Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2003-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: ucnv_ext.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2003jun13 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * Conversion extensions |
michael@0 | 17 | */ |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/utypes.h" |
michael@0 | 20 | |
michael@0 | 21 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
michael@0 | 22 | |
michael@0 | 23 | #include "unicode/uset.h" |
michael@0 | 24 | #include "ucnv_bld.h" |
michael@0 | 25 | #include "ucnv_cnv.h" |
michael@0 | 26 | #include "ucnv_ext.h" |
michael@0 | 27 | #include "cmemory.h" |
michael@0 | 28 | #include "uassert.h" |
michael@0 | 29 | |
michael@0 | 30 | /* to Unicode --------------------------------------------------------------- */ |
michael@0 | 31 | |
michael@0 | 32 | /* |
michael@0 | 33 | * @return lookup value for the byte, if found; else 0 |
michael@0 | 34 | */ |
michael@0 | 35 | static inline uint32_t |
michael@0 | 36 | ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { |
michael@0 | 37 | uint32_t word0, word; |
michael@0 | 38 | int32_t i, start, limit; |
michael@0 | 39 | |
michael@0 | 40 | /* check the input byte against the lowest and highest section bytes */ |
michael@0 | 41 | start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); |
michael@0 | 42 | limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); |
michael@0 | 43 | if(byte<start || limit<byte) { |
michael@0 | 44 | return 0; /* the byte is out of range */ |
michael@0 | 45 | } |
michael@0 | 46 | |
michael@0 | 47 | if(length==((limit-start)+1)) { |
michael@0 | 48 | /* direct access on a linear array */ |
michael@0 | 49 | return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ |
michael@0 | 50 | } |
michael@0 | 51 | |
michael@0 | 52 | /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ |
michael@0 | 53 | word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); |
michael@0 | 54 | |
michael@0 | 55 | /* |
michael@0 | 56 | * Shift byte once instead of each section word and add 0xffffff. |
michael@0 | 57 | * We will compare the shifted/added byte (bbffffff) against |
michael@0 | 58 | * section words which have byte values in the same bit position. |
michael@0 | 59 | * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv |
michael@0 | 60 | * for all v=0..f |
michael@0 | 61 | * so we need not mask off the lower 24 bits of each section word. |
michael@0 | 62 | */ |
michael@0 | 63 | word=word0|UCNV_EXT_TO_U_VALUE_MASK; |
michael@0 | 64 | |
michael@0 | 65 | /* binary search */ |
michael@0 | 66 | start=0; |
michael@0 | 67 | limit=length; |
michael@0 | 68 | for(;;) { |
michael@0 | 69 | i=limit-start; |
michael@0 | 70 | if(i<=1) { |
michael@0 | 71 | break; /* done */ |
michael@0 | 72 | } |
michael@0 | 73 | /* start<limit-1 */ |
michael@0 | 74 | |
michael@0 | 75 | if(i<=4) { |
michael@0 | 76 | /* linear search for the last part */ |
michael@0 | 77 | if(word0<=toUSection[start]) { |
michael@0 | 78 | break; |
michael@0 | 79 | } |
michael@0 | 80 | if(++start<limit && word0<=toUSection[start]) { |
michael@0 | 81 | break; |
michael@0 | 82 | } |
michael@0 | 83 | if(++start<limit && word0<=toUSection[start]) { |
michael@0 | 84 | break; |
michael@0 | 85 | } |
michael@0 | 86 | /* always break at start==limit-1 */ |
michael@0 | 87 | ++start; |
michael@0 | 88 | break; |
michael@0 | 89 | } |
michael@0 | 90 | |
michael@0 | 91 | i=(start+limit)/2; |
michael@0 | 92 | if(word<toUSection[i]) { |
michael@0 | 93 | limit=i; |
michael@0 | 94 | } else { |
michael@0 | 95 | start=i; |
michael@0 | 96 | } |
michael@0 | 97 | } |
michael@0 | 98 | |
michael@0 | 99 | /* did we really find it? */ |
michael@0 | 100 | if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { |
michael@0 | 101 | return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ |
michael@0 | 102 | } else { |
michael@0 | 103 | return 0; /* not found */ |
michael@0 | 104 | } |
michael@0 | 105 | } |
michael@0 | 106 | |
/*
 * TRUE if not an SI/SO stateful converter (sisoState<0),
 * or if the match length fits with the current converter state:
 * state 0 requires a match of exactly 1 byte,
 * any other state requires a match length other than 1.
 *
 * Both parameters are fully parenthesized so that arbitrary expressions
 * can be passed. sisoState is evaluated up to two times, so it must not
 * have side effects.
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
michael@0 | 113 | |
michael@0 | 114 | /* |
michael@0 | 115 | * this works like ucnv_extMatchFromU() except |
michael@0 | 116 | * - the first character is in pre |
michael@0 | 117 | * - no trie is used |
michael@0 | 118 | * - the returned matchLength is not offset by 2 |
michael@0 | 119 | */ |
michael@0 | 120 | static int32_t |
michael@0 | 121 | ucnv_extMatchToU(const int32_t *cx, int8_t sisoState, |
michael@0 | 122 | const char *pre, int32_t preLength, |
michael@0 | 123 | const char *src, int32_t srcLength, |
michael@0 | 124 | uint32_t *pMatchValue, |
michael@0 | 125 | UBool /*useFallback*/, UBool flush) { |
michael@0 | 126 | const uint32_t *toUTable, *toUSection; |
michael@0 | 127 | |
michael@0 | 128 | uint32_t value, matchValue; |
michael@0 | 129 | int32_t i, j, idx, length, matchLength; |
michael@0 | 130 | uint8_t b; |
michael@0 | 131 | |
michael@0 | 132 | if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) { |
michael@0 | 133 | return 0; /* no extension data, no match */ |
michael@0 | 134 | } |
michael@0 | 135 | |
michael@0 | 136 | /* initialize */ |
michael@0 | 137 | toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t); |
michael@0 | 138 | idx=0; |
michael@0 | 139 | |
michael@0 | 140 | matchValue=0; |
michael@0 | 141 | i=j=matchLength=0; |
michael@0 | 142 | |
michael@0 | 143 | if(sisoState==0) { |
michael@0 | 144 | /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ |
michael@0 | 145 | if(preLength>1) { |
michael@0 | 146 | return 0; /* no match of a DBCS sequence in SBCS mode */ |
michael@0 | 147 | } else if(preLength==1) { |
michael@0 | 148 | srcLength=0; |
michael@0 | 149 | } else /* preLength==0 */ { |
michael@0 | 150 | if(srcLength>1) { |
michael@0 | 151 | srcLength=1; |
michael@0 | 152 | } |
michael@0 | 153 | } |
michael@0 | 154 | flush=TRUE; |
michael@0 | 155 | } |
michael@0 | 156 | |
michael@0 | 157 | /* we must not remember fallback matches when not using fallbacks */ |
michael@0 | 158 | |
michael@0 | 159 | /* match input units until there is a full match or the input is consumed */ |
michael@0 | 160 | for(;;) { |
michael@0 | 161 | /* go to the next section */ |
michael@0 | 162 | toUSection=toUTable+idx; |
michael@0 | 163 | |
michael@0 | 164 | /* read first pair of the section */ |
michael@0 | 165 | value=*toUSection++; |
michael@0 | 166 | length=UCNV_EXT_TO_U_GET_BYTE(value); |
michael@0 | 167 | value=UCNV_EXT_TO_U_GET_VALUE(value); |
michael@0 | 168 | if( value!=0 && |
michael@0 | 169 | (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || |
michael@0 | 170 | TO_U_USE_FALLBACK(useFallback)) && |
michael@0 | 171 | UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) |
michael@0 | 172 | ) { |
michael@0 | 173 | /* remember longest match so far */ |
michael@0 | 174 | matchValue=value; |
michael@0 | 175 | matchLength=i+j; |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | /* match pre[] then src[] */ |
michael@0 | 179 | if(i<preLength) { |
michael@0 | 180 | b=(uint8_t)pre[i++]; |
michael@0 | 181 | } else if(j<srcLength) { |
michael@0 | 182 | b=(uint8_t)src[j++]; |
michael@0 | 183 | } else { |
michael@0 | 184 | /* all input consumed, partial match */ |
michael@0 | 185 | if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) { |
michael@0 | 186 | /* |
michael@0 | 187 | * end of the entire input stream, stop with the longest match so far |
michael@0 | 188 | * or: partial match must not be longer than UCNV_EXT_MAX_BYTES |
michael@0 | 189 | * because it must fit into state buffers |
michael@0 | 190 | */ |
michael@0 | 191 | break; |
michael@0 | 192 | } else { |
michael@0 | 193 | /* continue with more input next time */ |
michael@0 | 194 | return -length; |
michael@0 | 195 | } |
michael@0 | 196 | } |
michael@0 | 197 | |
michael@0 | 198 | /* search for the current UChar */ |
michael@0 | 199 | value=ucnv_extFindToU(toUSection, length, b); |
michael@0 | 200 | if(value==0) { |
michael@0 | 201 | /* no match here, stop with the longest match so far */ |
michael@0 | 202 | break; |
michael@0 | 203 | } else { |
michael@0 | 204 | if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { |
michael@0 | 205 | /* partial match, continue */ |
michael@0 | 206 | idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); |
michael@0 | 207 | } else { |
michael@0 | 208 | if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || |
michael@0 | 209 | TO_U_USE_FALLBACK(useFallback)) && |
michael@0 | 210 | UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) |
michael@0 | 211 | ) { |
michael@0 | 212 | /* full match, stop with result */ |
michael@0 | 213 | matchValue=value; |
michael@0 | 214 | matchLength=i+j; |
michael@0 | 215 | } else { |
michael@0 | 216 | /* full match on fallback not taken, stop with the longest match so far */ |
michael@0 | 217 | } |
michael@0 | 218 | break; |
michael@0 | 219 | } |
michael@0 | 220 | } |
michael@0 | 221 | } |
michael@0 | 222 | |
michael@0 | 223 | if(matchLength==0) { |
michael@0 | 224 | /* no match at all */ |
michael@0 | 225 | return 0; |
michael@0 | 226 | } |
michael@0 | 227 | |
michael@0 | 228 | /* return result */ |
michael@0 | 229 | *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); |
michael@0 | 230 | return matchLength; |
michael@0 | 231 | } |
michael@0 | 232 | |
michael@0 | 233 | static inline void |
michael@0 | 234 | ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, |
michael@0 | 235 | uint32_t value, |
michael@0 | 236 | UChar **target, const UChar *targetLimit, |
michael@0 | 237 | int32_t **offsets, int32_t srcIndex, |
michael@0 | 238 | UErrorCode *pErrorCode) { |
michael@0 | 239 | /* output the result */ |
michael@0 | 240 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { |
michael@0 | 241 | /* output a single code point */ |
michael@0 | 242 | ucnv_toUWriteCodePoint( |
michael@0 | 243 | cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), |
michael@0 | 244 | target, targetLimit, |
michael@0 | 245 | offsets, srcIndex, |
michael@0 | 246 | pErrorCode); |
michael@0 | 247 | } else { |
michael@0 | 248 | /* output a string - with correct data we have resultLength>0 */ |
michael@0 | 249 | ucnv_toUWriteUChars( |
michael@0 | 250 | cnv, |
michael@0 | 251 | UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+ |
michael@0 | 252 | UCNV_EXT_TO_U_GET_INDEX(value), |
michael@0 | 253 | UCNV_EXT_TO_U_GET_LENGTH(value), |
michael@0 | 254 | target, targetLimit, |
michael@0 | 255 | offsets, srcIndex, |
michael@0 | 256 | pErrorCode); |
michael@0 | 257 | } |
michael@0 | 258 | } |
michael@0 | 259 | |
/*
 * Get the SI/SO toU state of a converter:
 * - MBCS_OUTPUT_2_SISO: the converter mode (state 0 is for SBCS, 1 for DBCS)
 * - MBCS_OUTPUT_DBCS_ONLY: always 1
 * - any other output type: -1, the converter is not SI/SO stateful
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 *
 * The cnv argument is evaluated more than once.
 */
#define UCNV_SISO_STATE(cnv) \
    ( ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) \
        ? (int8_t)(cnv)->mode \
        : ( ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) ? 1 : -1 ) )
michael@0 | 271 | |
michael@0 | 272 | /* |
michael@0 | 273 | * target<targetLimit; set error code for overflow |
michael@0 | 274 | */ |
michael@0 | 275 | U_CFUNC UBool |
michael@0 | 276 | ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, |
michael@0 | 277 | int32_t firstLength, |
michael@0 | 278 | const char **src, const char *srcLimit, |
michael@0 | 279 | UChar **target, const UChar *targetLimit, |
michael@0 | 280 | int32_t **offsets, int32_t srcIndex, |
michael@0 | 281 | UBool flush, |
michael@0 | 282 | UErrorCode *pErrorCode) { |
michael@0 | 283 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
michael@0 | 284 | int32_t match; |
michael@0 | 285 | |
michael@0 | 286 | /* try to match */ |
michael@0 | 287 | match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), |
michael@0 | 288 | (const char *)cnv->toUBytes, firstLength, |
michael@0 | 289 | *src, (int32_t)(srcLimit-*src), |
michael@0 | 290 | &value, |
michael@0 | 291 | cnv->useFallback, flush); |
michael@0 | 292 | if(match>0) { |
michael@0 | 293 | /* advance src pointer for the consumed input */ |
michael@0 | 294 | *src+=match-firstLength; |
michael@0 | 295 | |
michael@0 | 296 | /* write result to target */ |
michael@0 | 297 | ucnv_extWriteToU(cnv, cx, |
michael@0 | 298 | value, |
michael@0 | 299 | target, targetLimit, |
michael@0 | 300 | offsets, srcIndex, |
michael@0 | 301 | pErrorCode); |
michael@0 | 302 | return TRUE; |
michael@0 | 303 | } else if(match<0) { |
michael@0 | 304 | /* save state for partial match */ |
michael@0 | 305 | const char *s; |
michael@0 | 306 | int32_t j; |
michael@0 | 307 | |
michael@0 | 308 | /* copy the first code point */ |
michael@0 | 309 | s=(const char *)cnv->toUBytes; |
michael@0 | 310 | cnv->preToUFirstLength=(int8_t)firstLength; |
michael@0 | 311 | for(j=0; j<firstLength; ++j) { |
michael@0 | 312 | cnv->preToU[j]=*s++; |
michael@0 | 313 | } |
michael@0 | 314 | |
michael@0 | 315 | /* now copy the newly consumed input */ |
michael@0 | 316 | s=*src; |
michael@0 | 317 | match=-match; |
michael@0 | 318 | for(; j<match; ++j) { |
michael@0 | 319 | cnv->preToU[j]=*s++; |
michael@0 | 320 | } |
michael@0 | 321 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ |
michael@0 | 322 | cnv->preToULength=(int8_t)match; |
michael@0 | 323 | return TRUE; |
michael@0 | 324 | } else /* match==0 no match */ { |
michael@0 | 325 | return FALSE; |
michael@0 | 326 | } |
michael@0 | 327 | } |
michael@0 | 328 | |
michael@0 | 329 | U_CFUNC UChar32 |
michael@0 | 330 | ucnv_extSimpleMatchToU(const int32_t *cx, |
michael@0 | 331 | const char *source, int32_t length, |
michael@0 | 332 | UBool useFallback) { |
michael@0 | 333 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
michael@0 | 334 | int32_t match; |
michael@0 | 335 | |
michael@0 | 336 | if(length<=0) { |
michael@0 | 337 | return 0xffff; |
michael@0 | 338 | } |
michael@0 | 339 | |
michael@0 | 340 | /* try to match */ |
michael@0 | 341 | match=ucnv_extMatchToU(cx, -1, |
michael@0 | 342 | source, length, |
michael@0 | 343 | NULL, 0, |
michael@0 | 344 | &value, |
michael@0 | 345 | useFallback, TRUE); |
michael@0 | 346 | if(match==length) { |
michael@0 | 347 | /* write result for simple, single-character conversion */ |
michael@0 | 348 | if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { |
michael@0 | 349 | return UCNV_EXT_TO_U_GET_CODE_POINT(value); |
michael@0 | 350 | } |
michael@0 | 351 | } |
michael@0 | 352 | |
michael@0 | 353 | /* |
michael@0 | 354 | * return no match because |
michael@0 | 355 | * - match>0 && value points to string: simple conversion cannot handle multiple code points |
michael@0 | 356 | * - match>0 && match!=length: not all input consumed, forbidden for this function |
michael@0 | 357 | * - match==0: no match found in the first place |
michael@0 | 358 | * - match<0: partial match, not supported for simple conversion (and flush==TRUE) |
michael@0 | 359 | */ |
michael@0 | 360 | return 0xfffe; |
michael@0 | 361 | } |
michael@0 | 362 | |
michael@0 | 363 | /* |
michael@0 | 364 | * continue partial match with new input |
michael@0 | 365 | * never called for simple, single-character conversion |
michael@0 | 366 | */ |
michael@0 | 367 | U_CFUNC void |
michael@0 | 368 | ucnv_extContinueMatchToU(UConverter *cnv, |
michael@0 | 369 | UConverterToUnicodeArgs *pArgs, int32_t srcIndex, |
michael@0 | 370 | UErrorCode *pErrorCode) { |
michael@0 | 371 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
michael@0 | 372 | int32_t match, length; |
michael@0 | 373 | |
michael@0 | 374 | match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv), |
michael@0 | 375 | cnv->preToU, cnv->preToULength, |
michael@0 | 376 | pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), |
michael@0 | 377 | &value, |
michael@0 | 378 | cnv->useFallback, pArgs->flush); |
michael@0 | 379 | if(match>0) { |
michael@0 | 380 | if(match>=cnv->preToULength) { |
michael@0 | 381 | /* advance src pointer for the consumed input */ |
michael@0 | 382 | pArgs->source+=match-cnv->preToULength; |
michael@0 | 383 | cnv->preToULength=0; |
michael@0 | 384 | } else { |
michael@0 | 385 | /* the match did not use all of preToU[] - keep the rest for replay */ |
michael@0 | 386 | length=cnv->preToULength-match; |
michael@0 | 387 | uprv_memmove(cnv->preToU, cnv->preToU+match, length); |
michael@0 | 388 | cnv->preToULength=(int8_t)-length; |
michael@0 | 389 | } |
michael@0 | 390 | |
michael@0 | 391 | /* write result */ |
michael@0 | 392 | ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes, |
michael@0 | 393 | value, |
michael@0 | 394 | &pArgs->target, pArgs->targetLimit, |
michael@0 | 395 | &pArgs->offsets, srcIndex, |
michael@0 | 396 | pErrorCode); |
michael@0 | 397 | } else if(match<0) { |
michael@0 | 398 | /* save state for partial match */ |
michael@0 | 399 | const char *s; |
michael@0 | 400 | int32_t j; |
michael@0 | 401 | |
michael@0 | 402 | /* just _append_ the newly consumed input to preToU[] */ |
michael@0 | 403 | s=pArgs->source; |
michael@0 | 404 | match=-match; |
michael@0 | 405 | for(j=cnv->preToULength; j<match; ++j) { |
michael@0 | 406 | cnv->preToU[j]=*s++; |
michael@0 | 407 | } |
michael@0 | 408 | pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ |
michael@0 | 409 | cnv->preToULength=(int8_t)match; |
michael@0 | 410 | } else /* match==0 */ { |
michael@0 | 411 | /* |
michael@0 | 412 | * no match |
michael@0 | 413 | * |
michael@0 | 414 | * We need to split the previous input into two parts: |
michael@0 | 415 | * |
michael@0 | 416 | * 1. The first codepage character is unmappable - that's how we got into |
michael@0 | 417 | * trying the extension data in the first place. |
michael@0 | 418 | * We need to move it from the preToU buffer |
michael@0 | 419 | * to the error buffer, set an error code, |
michael@0 | 420 | * and prepare the rest of the previous input for 2. |
michael@0 | 421 | * |
michael@0 | 422 | * 2. The rest of the previous input must be converted once we |
michael@0 | 423 | * come back from the callback for the first character. |
michael@0 | 424 | * At that time, we have to try again from scratch to convert |
michael@0 | 425 | * these input characters. |
michael@0 | 426 | * The replay will be handled by the ucnv.c conversion code. |
michael@0 | 427 | */ |
michael@0 | 428 | |
michael@0 | 429 | /* move the first codepage character to the error field */ |
michael@0 | 430 | uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); |
michael@0 | 431 | cnv->toULength=cnv->preToUFirstLength; |
michael@0 | 432 | |
michael@0 | 433 | /* move the rest up inside the buffer */ |
michael@0 | 434 | length=cnv->preToULength-cnv->preToUFirstLength; |
michael@0 | 435 | if(length>0) { |
michael@0 | 436 | uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); |
michael@0 | 437 | } |
michael@0 | 438 | |
michael@0 | 439 | /* mark preToU for replay */ |
michael@0 | 440 | cnv->preToULength=(int8_t)-length; |
michael@0 | 441 | |
michael@0 | 442 | /* set the error code for unassigned */ |
michael@0 | 443 | *pErrorCode=U_INVALID_CHAR_FOUND; |
michael@0 | 444 | } |
michael@0 | 445 | } |
michael@0 | 446 | |
michael@0 | 447 | /* from Unicode ------------------------------------------------------------- */ |
michael@0 | 448 | |
michael@0 | 449 | // Use roundtrips, "good one-way" mappings, and some normal fallbacks. |
michael@0 | 450 | static inline UBool |
michael@0 | 451 | extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) { |
michael@0 | 452 | return |
michael@0 | 453 | ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 || |
michael@0 | 454 | FROM_U_USE_FALLBACK(useFallback, firstCP)) && |
michael@0 | 455 | (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0; |
michael@0 | 456 | } |
michael@0 | 457 | |
michael@0 | 458 | /* |
michael@0 | 459 | * @return index of the UChar, if found; else <0 |
michael@0 | 460 | */ |
michael@0 | 461 | static inline int32_t |
michael@0 | 462 | ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { |
michael@0 | 463 | int32_t i, start, limit; |
michael@0 | 464 | |
michael@0 | 465 | /* binary search */ |
michael@0 | 466 | start=0; |
michael@0 | 467 | limit=length; |
michael@0 | 468 | for(;;) { |
michael@0 | 469 | i=limit-start; |
michael@0 | 470 | if(i<=1) { |
michael@0 | 471 | break; /* done */ |
michael@0 | 472 | } |
michael@0 | 473 | /* start<limit-1 */ |
michael@0 | 474 | |
michael@0 | 475 | if(i<=4) { |
michael@0 | 476 | /* linear search for the last part */ |
michael@0 | 477 | if(u<=fromUSection[start]) { |
michael@0 | 478 | break; |
michael@0 | 479 | } |
michael@0 | 480 | if(++start<limit && u<=fromUSection[start]) { |
michael@0 | 481 | break; |
michael@0 | 482 | } |
michael@0 | 483 | if(++start<limit && u<=fromUSection[start]) { |
michael@0 | 484 | break; |
michael@0 | 485 | } |
michael@0 | 486 | /* always break at start==limit-1 */ |
michael@0 | 487 | ++start; |
michael@0 | 488 | break; |
michael@0 | 489 | } |
michael@0 | 490 | |
michael@0 | 491 | i=(start+limit)/2; |
michael@0 | 492 | if(u<fromUSection[i]) { |
michael@0 | 493 | limit=i; |
michael@0 | 494 | } else { |
michael@0 | 495 | start=i; |
michael@0 | 496 | } |
michael@0 | 497 | } |
michael@0 | 498 | |
michael@0 | 499 | /* did we really find it? */ |
michael@0 | 500 | if(start<limit && u==fromUSection[start]) { |
michael@0 | 501 | return start; |
michael@0 | 502 | } else { |
michael@0 | 503 | return -1; /* not found */ |
michael@0 | 504 | } |
michael@0 | 505 | } |
michael@0 | 506 | |
michael@0 | 507 | /* |
michael@0 | 508 | * @param cx pointer to extension data; if NULL, returns 0 |
michael@0 | 509 | * @param firstCP the first code point before all the other UChars |
michael@0 | 510 | * @param pre UChars that must match; !initialMatch: partial match with them |
michael@0 | 511 | * @param preLength length of pre, >=0 |
michael@0 | 512 | * @param src UChars that can be used to complete a match |
michael@0 | 513 | * @param srcLength length of src, >=0 |
michael@0 | 514 | * @param pMatchValue [out] output result value for the match from the data structure |
michael@0 | 515 | * @param useFallback "use fallback" flag, usually from cnv->useFallback |
michael@0 | 516 | * @param flush TRUE if the end of the input stream is reached |
michael@0 | 517 | * @return >1: matched, return value=total match length (number of input units matched) |
michael@0 | 518 | * 1: matched, no mapping but request for <subchar1> |
michael@0 | 519 | * (only for the first code point) |
michael@0 | 520 | * 0: no match |
michael@0 | 521 | * <0: partial match, return value=negative total match length |
michael@0 | 522 | * (partial matches are never returned for flush==TRUE) |
michael@0 | 523 | * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) |
michael@0 | 524 | * the matchLength is 2 if only firstCP matched, and >2 if firstCP and |
michael@0 | 525 | * further code units matched |
michael@0 | 526 | */ |
michael@0 | 527 | static int32_t |
michael@0 | 528 | ucnv_extMatchFromU(const int32_t *cx, |
michael@0 | 529 | UChar32 firstCP, |
michael@0 | 530 | const UChar *pre, int32_t preLength, |
michael@0 | 531 | const UChar *src, int32_t srcLength, |
michael@0 | 532 | uint32_t *pMatchValue, |
michael@0 | 533 | UBool useFallback, UBool flush) { |
michael@0 | 534 | const uint16_t *stage12, *stage3; |
michael@0 | 535 | const uint32_t *stage3b; |
michael@0 | 536 | |
michael@0 | 537 | const UChar *fromUTableUChars, *fromUSectionUChars; |
michael@0 | 538 | const uint32_t *fromUTableValues, *fromUSectionValues; |
michael@0 | 539 | |
michael@0 | 540 | uint32_t value, matchValue; |
michael@0 | 541 | int32_t i, j, idx, length, matchLength; |
michael@0 | 542 | UChar c; |
michael@0 | 543 | |
michael@0 | 544 | if(cx==NULL) { |
michael@0 | 545 | return 0; /* no extension data, no match */ |
michael@0 | 546 | } |
michael@0 | 547 | |
michael@0 | 548 | /* trie lookup of firstCP */ |
michael@0 | 549 | idx=firstCP>>10; /* stage 1 index */ |
michael@0 | 550 | if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { |
michael@0 | 551 | return 0; /* the first code point is outside the trie */ |
michael@0 | 552 | } |
michael@0 | 553 | |
michael@0 | 554 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); |
michael@0 | 555 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); |
michael@0 | 556 | idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP); |
michael@0 | 557 | |
michael@0 | 558 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); |
michael@0 | 559 | value=stage3b[idx]; |
michael@0 | 560 | if(value==0) { |
michael@0 | 561 | return 0; |
michael@0 | 562 | } |
michael@0 | 563 | |
michael@0 | 564 | /* |
michael@0 | 565 | * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: |
michael@0 | 566 | * Do not interpret values with reserved bits used, for forward compatibility, |
michael@0 | 567 | * and do not even remember intermediate results with reserved bits used. |
michael@0 | 568 | */ |
michael@0 | 569 | |
michael@0 | 570 | if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { |
michael@0 | 571 | /* partial match, enter the loop below */ |
michael@0 | 572 | idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); |
michael@0 | 573 | |
michael@0 | 574 | /* initialize */ |
michael@0 | 575 | fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); |
michael@0 | 576 | fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); |
michael@0 | 577 | |
michael@0 | 578 | matchValue=0; |
michael@0 | 579 | i=j=matchLength=0; |
michael@0 | 580 | |
michael@0 | 581 | /* we must not remember fallback matches when not using fallbacks */ |
michael@0 | 582 | |
michael@0 | 583 | /* match input units until there is a full match or the input is consumed */ |
michael@0 | 584 | for(;;) { |
michael@0 | 585 | /* go to the next section */ |
michael@0 | 586 | fromUSectionUChars=fromUTableUChars+idx; |
michael@0 | 587 | fromUSectionValues=fromUTableValues+idx; |
michael@0 | 588 | |
michael@0 | 589 | /* read first pair of the section */ |
michael@0 | 590 | length=*fromUSectionUChars++; |
michael@0 | 591 | value=*fromUSectionValues++; |
michael@0 | 592 | if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) { |
michael@0 | 593 | /* remember longest match so far */ |
michael@0 | 594 | matchValue=value; |
michael@0 | 595 | matchLength=2+i+j; |
michael@0 | 596 | } |
michael@0 | 597 | |
michael@0 | 598 | /* match pre[] then src[] */ |
michael@0 | 599 | if(i<preLength) { |
michael@0 | 600 | c=pre[i++]; |
michael@0 | 601 | } else if(j<srcLength) { |
michael@0 | 602 | c=src[j++]; |
michael@0 | 603 | } else { |
michael@0 | 604 | /* all input consumed, partial match */ |
michael@0 | 605 | if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { |
michael@0 | 606 | /* |
michael@0 | 607 | * end of the entire input stream, stop with the longest match so far |
michael@0 | 608 | * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS |
michael@0 | 609 | * because it must fit into state buffers |
michael@0 | 610 | */ |
michael@0 | 611 | break; |
michael@0 | 612 | } else { |
michael@0 | 613 | /* continue with more input next time */ |
michael@0 | 614 | return -(2+length); |
michael@0 | 615 | } |
michael@0 | 616 | } |
michael@0 | 617 | |
michael@0 | 618 | /* search for the current UChar */ |
michael@0 | 619 | idx=ucnv_extFindFromU(fromUSectionUChars, length, c); |
michael@0 | 620 | if(idx<0) { |
michael@0 | 621 | /* no match here, stop with the longest match so far */ |
michael@0 | 622 | break; |
michael@0 | 623 | } else { |
michael@0 | 624 | value=fromUSectionValues[idx]; |
michael@0 | 625 | if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
michael@0 | 626 | /* partial match, continue */ |
michael@0 | 627 | idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); |
michael@0 | 628 | } else { |
michael@0 | 629 | if(extFromUUseMapping(useFallback, value, firstCP)) { |
michael@0 | 630 | /* full match, stop with result */ |
michael@0 | 631 | matchValue=value; |
michael@0 | 632 | matchLength=2+i+j; |
michael@0 | 633 | } else { |
michael@0 | 634 | /* full match on fallback not taken, stop with the longest match so far */ |
michael@0 | 635 | } |
michael@0 | 636 | break; |
michael@0 | 637 | } |
michael@0 | 638 | } |
michael@0 | 639 | } |
michael@0 | 640 | |
michael@0 | 641 | if(matchLength==0) { |
michael@0 | 642 | /* no match at all */ |
michael@0 | 643 | return 0; |
michael@0 | 644 | } |
michael@0 | 645 | } else /* result from firstCP trie lookup */ { |
michael@0 | 646 | if(extFromUUseMapping(useFallback, value, firstCP)) { |
michael@0 | 647 | /* full match, stop with result */ |
michael@0 | 648 | matchValue=value; |
michael@0 | 649 | matchLength=2; |
michael@0 | 650 | } else { |
michael@0 | 651 | /* fallback not taken */ |
michael@0 | 652 | return 0; |
michael@0 | 653 | } |
michael@0 | 654 | } |
michael@0 | 655 | |
michael@0 | 656 | /* return result */ |
michael@0 | 657 | if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { |
michael@0 | 658 | return 1; /* assert matchLength==2 */ |
michael@0 | 659 | } |
michael@0 | 660 | |
michael@0 | 661 | *pMatchValue=matchValue; |
michael@0 | 662 | return matchLength; |
michael@0 | 663 | } |
michael@0 | 664 | |
michael@0 | 665 | /* |
michael@0 | 666 | * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits |
michael@0 | 667 | */ |
michael@0 | 668 | static inline void |
michael@0 | 669 | ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, |
michael@0 | 670 | uint32_t value, |
michael@0 | 671 | char **target, const char *targetLimit, |
michael@0 | 672 | int32_t **offsets, int32_t srcIndex, |
michael@0 | 673 | UErrorCode *pErrorCode) { |
michael@0 | 674 | uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; |
michael@0 | 675 | const uint8_t *result; |
michael@0 | 676 | int32_t length, prevLength; |
michael@0 | 677 | |
michael@0 | 678 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); |
michael@0 | 679 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); |
michael@0 | 680 | |
michael@0 | 681 | /* output the result */ |
michael@0 | 682 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { |
michael@0 | 683 | /* |
michael@0 | 684 | * Generate a byte array and then write it below. |
michael@0 | 685 | * This is not the fastest possible way, but it should be ok for |
michael@0 | 686 | * extension mappings, and it is much simpler. |
michael@0 | 687 | * Offset and overflow handling are only done once this way. |
michael@0 | 688 | */ |
michael@0 | 689 | uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ |
michael@0 | 690 | switch(length) { |
michael@0 | 691 | case 3: |
michael@0 | 692 | *p++=(uint8_t)(value>>16); |
michael@0 | 693 | case 2: /*fall through*/ |
michael@0 | 694 | *p++=(uint8_t)(value>>8); |
michael@0 | 695 | case 1: /*fall through*/ |
michael@0 | 696 | *p++=(uint8_t)value; |
michael@0 | 697 | default: |
michael@0 | 698 | break; /* will never occur */ |
michael@0 | 699 | } |
michael@0 | 700 | result=buffer+1; |
michael@0 | 701 | } else { |
michael@0 | 702 | result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; |
michael@0 | 703 | } |
michael@0 | 704 | |
michael@0 | 705 | /* with correct data we have length>0 */ |
michael@0 | 706 | |
michael@0 | 707 | if((prevLength=cnv->fromUnicodeStatus)!=0) { |
michael@0 | 708 | /* handle SI/SO stateful output */ |
michael@0 | 709 | uint8_t shiftByte; |
michael@0 | 710 | |
michael@0 | 711 | if(prevLength>1 && length==1) { |
michael@0 | 712 | /* change from double-byte mode to single-byte */ |
michael@0 | 713 | shiftByte=(uint8_t)UCNV_SI; |
michael@0 | 714 | cnv->fromUnicodeStatus=1; |
michael@0 | 715 | } else if(prevLength==1 && length>1) { |
michael@0 | 716 | /* change from single-byte mode to double-byte */ |
michael@0 | 717 | shiftByte=(uint8_t)UCNV_SO; |
michael@0 | 718 | cnv->fromUnicodeStatus=2; |
michael@0 | 719 | } else { |
michael@0 | 720 | shiftByte=0; |
michael@0 | 721 | } |
michael@0 | 722 | |
michael@0 | 723 | if(shiftByte!=0) { |
michael@0 | 724 | /* prepend the shift byte to the result bytes */ |
michael@0 | 725 | buffer[0]=shiftByte; |
michael@0 | 726 | if(result!=buffer+1) { |
michael@0 | 727 | uprv_memcpy(buffer+1, result, length); |
michael@0 | 728 | } |
michael@0 | 729 | result=buffer; |
michael@0 | 730 | ++length; |
michael@0 | 731 | } |
michael@0 | 732 | } |
michael@0 | 733 | |
michael@0 | 734 | ucnv_fromUWriteBytes(cnv, (const char *)result, length, |
michael@0 | 735 | target, targetLimit, |
michael@0 | 736 | offsets, srcIndex, |
michael@0 | 737 | pErrorCode); |
michael@0 | 738 | } |
michael@0 | 739 | |
michael@0 | 740 | /* |
michael@0 | 741 | * target<targetLimit; set error code for overflow |
michael@0 | 742 | */ |
michael@0 | 743 | U_CFUNC UBool |
michael@0 | 744 | ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, |
michael@0 | 745 | UChar32 cp, |
michael@0 | 746 | const UChar **src, const UChar *srcLimit, |
michael@0 | 747 | char **target, const char *targetLimit, |
michael@0 | 748 | int32_t **offsets, int32_t srcIndex, |
michael@0 | 749 | UBool flush, |
michael@0 | 750 | UErrorCode *pErrorCode) { |
michael@0 | 751 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
michael@0 | 752 | int32_t match; |
michael@0 | 753 | |
michael@0 | 754 | /* try to match */ |
michael@0 | 755 | match=ucnv_extMatchFromU(cx, cp, |
michael@0 | 756 | NULL, 0, |
michael@0 | 757 | *src, (int32_t)(srcLimit-*src), |
michael@0 | 758 | &value, |
michael@0 | 759 | cnv->useFallback, flush); |
michael@0 | 760 | |
michael@0 | 761 | /* reject a match if the result is a single byte for DBCS-only */ |
michael@0 | 762 | if( match>=2 && |
michael@0 | 763 | !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && |
michael@0 | 764 | cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) |
michael@0 | 765 | ) { |
michael@0 | 766 | /* advance src pointer for the consumed input */ |
michael@0 | 767 | *src+=match-2; /* remove 2 for the initial code point */ |
michael@0 | 768 | |
michael@0 | 769 | /* write result to target */ |
michael@0 | 770 | ucnv_extWriteFromU(cnv, cx, |
michael@0 | 771 | value, |
michael@0 | 772 | target, targetLimit, |
michael@0 | 773 | offsets, srcIndex, |
michael@0 | 774 | pErrorCode); |
michael@0 | 775 | return TRUE; |
michael@0 | 776 | } else if(match<0) { |
michael@0 | 777 | /* save state for partial match */ |
michael@0 | 778 | const UChar *s; |
michael@0 | 779 | int32_t j; |
michael@0 | 780 | |
michael@0 | 781 | /* copy the first code point */ |
michael@0 | 782 | cnv->preFromUFirstCP=cp; |
michael@0 | 783 | |
michael@0 | 784 | /* now copy the newly consumed input */ |
michael@0 | 785 | s=*src; |
michael@0 | 786 | match=-match-2; /* remove 2 for the initial code point */ |
michael@0 | 787 | for(j=0; j<match; ++j) { |
michael@0 | 788 | cnv->preFromU[j]=*s++; |
michael@0 | 789 | } |
michael@0 | 790 | *src=s; /* same as *src=srcLimit; because we reached the end of input */ |
michael@0 | 791 | cnv->preFromULength=(int8_t)match; |
michael@0 | 792 | return TRUE; |
michael@0 | 793 | } else if(match==1) { |
michael@0 | 794 | /* matched, no mapping but request for <subchar1> */ |
michael@0 | 795 | cnv->useSubChar1=TRUE; |
michael@0 | 796 | return FALSE; |
michael@0 | 797 | } else /* match==0 no match */ { |
michael@0 | 798 | return FALSE; |
michael@0 | 799 | } |
michael@0 | 800 | } |
michael@0 | 801 | |
michael@0 | 802 | /* |
michael@0 | 803 | * Used by ISO 2022 implementation. |
michael@0 | 804 | * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping |
michael@0 | 805 | */ |
michael@0 | 806 | U_CFUNC int32_t |
michael@0 | 807 | ucnv_extSimpleMatchFromU(const int32_t *cx, |
michael@0 | 808 | UChar32 cp, uint32_t *pValue, |
michael@0 | 809 | UBool useFallback) { |
michael@0 | 810 | uint32_t value; |
michael@0 | 811 | int32_t match; |
michael@0 | 812 | |
michael@0 | 813 | /* try to match */ |
michael@0 | 814 | match=ucnv_extMatchFromU(cx, |
michael@0 | 815 | cp, |
michael@0 | 816 | NULL, 0, |
michael@0 | 817 | NULL, 0, |
michael@0 | 818 | &value, |
michael@0 | 819 | useFallback, TRUE); |
michael@0 | 820 | if(match>=2) { |
michael@0 | 821 | /* write result for simple, single-character conversion */ |
michael@0 | 822 | int32_t length; |
michael@0 | 823 | int isRoundtrip; |
michael@0 | 824 | |
michael@0 | 825 | isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); |
michael@0 | 826 | length=UCNV_EXT_FROM_U_GET_LENGTH(value); |
michael@0 | 827 | value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); |
michael@0 | 828 | |
michael@0 | 829 | if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { |
michael@0 | 830 | *pValue=value; |
michael@0 | 831 | return isRoundtrip ? length : -length; |
michael@0 | 832 | #if 0 /* not currently used */ |
michael@0 | 833 | } else if(length==4) { |
michael@0 | 834 | /* de-serialize a 4-byte result */ |
michael@0 | 835 | const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; |
michael@0 | 836 | *pValue= |
michael@0 | 837 | ((uint32_t)result[0]<<24)| |
michael@0 | 838 | ((uint32_t)result[1]<<16)| |
michael@0 | 839 | ((uint32_t)result[2]<<8)| |
michael@0 | 840 | result[3]; |
michael@0 | 841 | return isRoundtrip ? 4 : -4; |
michael@0 | 842 | #endif |
michael@0 | 843 | } |
michael@0 | 844 | } |
michael@0 | 845 | |
michael@0 | 846 | /* |
michael@0 | 847 | * return no match because |
michael@0 | 848 | * - match>1 && resultLength>4: result too long for simple conversion |
michael@0 | 849 | * - match==1: no match found, <subchar1> preferred |
michael@0 | 850 | * - match==0: no match found in the first place |
michael@0 | 851 | * - match<0: partial match, not supported for simple conversion (and flush==TRUE) |
michael@0 | 852 | */ |
michael@0 | 853 | return 0; |
michael@0 | 854 | } |
michael@0 | 855 | |
michael@0 | 856 | /* |
michael@0 | 857 | * continue partial match with new input, requires cnv->preFromUFirstCP>=0 |
michael@0 | 858 | * never called for simple, single-character conversion |
michael@0 | 859 | */ |
michael@0 | 860 | U_CFUNC void |
michael@0 | 861 | ucnv_extContinueMatchFromU(UConverter *cnv, |
michael@0 | 862 | UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, |
michael@0 | 863 | UErrorCode *pErrorCode) { |
michael@0 | 864 | uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ |
michael@0 | 865 | int32_t match; |
michael@0 | 866 | |
michael@0 | 867 | match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes, |
michael@0 | 868 | cnv->preFromUFirstCP, |
michael@0 | 869 | cnv->preFromU, cnv->preFromULength, |
michael@0 | 870 | pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), |
michael@0 | 871 | &value, |
michael@0 | 872 | cnv->useFallback, pArgs->flush); |
michael@0 | 873 | if(match>=2) { |
michael@0 | 874 | match-=2; /* remove 2 for the initial code point */ |
michael@0 | 875 | |
michael@0 | 876 | if(match>=cnv->preFromULength) { |
michael@0 | 877 | /* advance src pointer for the consumed input */ |
michael@0 | 878 | pArgs->source+=match-cnv->preFromULength; |
michael@0 | 879 | cnv->preFromULength=0; |
michael@0 | 880 | } else { |
michael@0 | 881 | /* the match did not use all of preFromU[] - keep the rest for replay */ |
michael@0 | 882 | int32_t length=cnv->preFromULength-match; |
michael@0 | 883 | uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR); |
michael@0 | 884 | cnv->preFromULength=(int8_t)-length; |
michael@0 | 885 | } |
michael@0 | 886 | |
michael@0 | 887 | /* finish the partial match */ |
michael@0 | 888 | cnv->preFromUFirstCP=U_SENTINEL; |
michael@0 | 889 | |
michael@0 | 890 | /* write result */ |
michael@0 | 891 | ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes, |
michael@0 | 892 | value, |
michael@0 | 893 | &pArgs->target, pArgs->targetLimit, |
michael@0 | 894 | &pArgs->offsets, srcIndex, |
michael@0 | 895 | pErrorCode); |
michael@0 | 896 | } else if(match<0) { |
michael@0 | 897 | /* save state for partial match */ |
michael@0 | 898 | const UChar *s; |
michael@0 | 899 | int32_t j; |
michael@0 | 900 | |
michael@0 | 901 | /* just _append_ the newly consumed input to preFromU[] */ |
michael@0 | 902 | s=pArgs->source; |
michael@0 | 903 | match=-match-2; /* remove 2 for the initial code point */ |
michael@0 | 904 | for(j=cnv->preFromULength; j<match; ++j) { |
michael@0 | 905 | U_ASSERT(j>=0); |
michael@0 | 906 | cnv->preFromU[j]=*s++; |
michael@0 | 907 | } |
michael@0 | 908 | pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ |
michael@0 | 909 | cnv->preFromULength=(int8_t)match; |
michael@0 | 910 | } else /* match==0 or 1 */ { |
michael@0 | 911 | /* |
michael@0 | 912 | * no match |
michael@0 | 913 | * |
michael@0 | 914 | * We need to split the previous input into two parts: |
michael@0 | 915 | * |
michael@0 | 916 | * 1. The first code point is unmappable - that's how we got into |
michael@0 | 917 | * trying the extension data in the first place. |
michael@0 | 918 | * We need to move it from the preFromU buffer |
michael@0 | 919 | * to the error buffer, set an error code, |
michael@0 | 920 | * and prepare the rest of the previous input for 2. |
michael@0 | 921 | * |
michael@0 | 922 | * 2. The rest of the previous input must be converted once we |
michael@0 | 923 | * come back from the callback for the first code point. |
michael@0 | 924 | * At that time, we have to try again from scratch to convert |
michael@0 | 925 | * these input characters. |
michael@0 | 926 | * The replay will be handled by the ucnv.c conversion code. |
michael@0 | 927 | */ |
michael@0 | 928 | |
michael@0 | 929 | if(match==1) { |
michael@0 | 930 | /* matched, no mapping but request for <subchar1> */ |
michael@0 | 931 | cnv->useSubChar1=TRUE; |
michael@0 | 932 | } |
michael@0 | 933 | |
michael@0 | 934 | /* move the first code point to the error field */ |
michael@0 | 935 | cnv->fromUChar32=cnv->preFromUFirstCP; |
michael@0 | 936 | cnv->preFromUFirstCP=U_SENTINEL; |
michael@0 | 937 | |
michael@0 | 938 | /* mark preFromU for replay */ |
michael@0 | 939 | cnv->preFromULength=-cnv->preFromULength; |
michael@0 | 940 | |
michael@0 | 941 | /* set the error code for unassigned */ |
michael@0 | 942 | *pErrorCode=U_INVALID_CHAR_FOUND; |
michael@0 | 943 | } |
michael@0 | 944 | } |
michael@0 | 945 | |
michael@0 | 946 | static UBool |
michael@0 | 947 | extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { |
michael@0 | 948 | if(which==UCNV_ROUNDTRIP_SET) { |
michael@0 | 949 | // Add only code points for which the roundtrip flag is set. |
michael@0 | 950 | // Do not add any fallbacks, even if ucnv_fromUnicode() would use them |
michael@0 | 951 | // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet(). |
michael@0 | 952 | // |
michael@0 | 953 | // By analogy, also do not add "good one-way" mappings. |
michael@0 | 954 | // |
michael@0 | 955 | // Do not add entries with reserved bits set. |
michael@0 | 956 | if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!= |
michael@0 | 957 | UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) { |
michael@0 | 958 | return FALSE; |
michael@0 | 959 | } |
michael@0 | 960 | } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { |
michael@0 | 961 | // Do not add entries with reserved bits set. |
michael@0 | 962 | if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) { |
michael@0 | 963 | return FALSE; |
michael@0 | 964 | } |
michael@0 | 965 | } |
michael@0 | 966 | // Do not add <subchar1> entries or other (future?) pseudo-entries |
michael@0 | 967 | // with an output length of 0. |
michael@0 | 968 | return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength; |
michael@0 | 969 | } |
michael@0 | 970 | |
michael@0 | 971 | static void |
michael@0 | 972 | ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, |
michael@0 | 973 | const int32_t *cx, |
michael@0 | 974 | const USetAdder *sa, |
michael@0 | 975 | UConverterUnicodeSet which, |
michael@0 | 976 | int32_t minLength, |
michael@0 | 977 | UChar32 firstCP, |
michael@0 | 978 | UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, |
michael@0 | 979 | int32_t sectionIndex, |
michael@0 | 980 | UErrorCode *pErrorCode) { |
michael@0 | 981 | const UChar *fromUSectionUChars; |
michael@0 | 982 | const uint32_t *fromUSectionValues; |
michael@0 | 983 | |
michael@0 | 984 | uint32_t value; |
michael@0 | 985 | int32_t i, count; |
michael@0 | 986 | |
michael@0 | 987 | fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex; |
michael@0 | 988 | fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; |
michael@0 | 989 | |
michael@0 | 990 | /* read first pair of the section */ |
michael@0 | 991 | count=*fromUSectionUChars++; |
michael@0 | 992 | value=*fromUSectionValues++; |
michael@0 | 993 | |
michael@0 | 994 | if(extSetUseMapping(which, minLength, value)) { |
michael@0 | 995 | if(length==U16_LENGTH(firstCP)) { |
michael@0 | 996 | /* add the initial code point */ |
michael@0 | 997 | sa->add(sa->set, firstCP); |
michael@0 | 998 | } else { |
michael@0 | 999 | /* add the string so far */ |
michael@0 | 1000 | sa->addString(sa->set, s, length); |
michael@0 | 1001 | } |
michael@0 | 1002 | } |
michael@0 | 1003 | |
michael@0 | 1004 | for(i=0; i<count; ++i) { |
michael@0 | 1005 | /* append this code unit and recurse or add the string */ |
michael@0 | 1006 | s[length]=fromUSectionUChars[i]; |
michael@0 | 1007 | value=fromUSectionValues[i]; |
michael@0 | 1008 | |
michael@0 | 1009 | if(value==0) { |
michael@0 | 1010 | /* no mapping, do nothing */ |
michael@0 | 1011 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
michael@0 | 1012 | ucnv_extGetUnicodeSetString( |
michael@0 | 1013 | sharedData, cx, sa, which, minLength, |
michael@0 | 1014 | firstCP, s, length+1, |
michael@0 | 1015 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), |
michael@0 | 1016 | pErrorCode); |
michael@0 | 1017 | } else if(extSetUseMapping(which, minLength, value)) { |
michael@0 | 1018 | sa->addString(sa->set, s, length+1); |
michael@0 | 1019 | } |
michael@0 | 1020 | } |
michael@0 | 1021 | } |
michael@0 | 1022 | |
michael@0 | 1023 | U_CFUNC void |
michael@0 | 1024 | ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, |
michael@0 | 1025 | const USetAdder *sa, |
michael@0 | 1026 | UConverterUnicodeSet which, |
michael@0 | 1027 | UConverterSetFilter filter, |
michael@0 | 1028 | UErrorCode *pErrorCode) { |
michael@0 | 1029 | const int32_t *cx; |
michael@0 | 1030 | const uint16_t *stage12, *stage3, *ps2, *ps3; |
michael@0 | 1031 | const uint32_t *stage3b; |
michael@0 | 1032 | |
michael@0 | 1033 | uint32_t value; |
michael@0 | 1034 | int32_t st1, stage1Length, st2, st3, minLength; |
michael@0 | 1035 | |
michael@0 | 1036 | UChar s[UCNV_EXT_MAX_UCHARS]; |
michael@0 | 1037 | UChar32 c; |
michael@0 | 1038 | int32_t length; |
michael@0 | 1039 | |
michael@0 | 1040 | cx=sharedData->mbcs.extIndexes; |
michael@0 | 1041 | if(cx==NULL) { |
michael@0 | 1042 | return; |
michael@0 | 1043 | } |
michael@0 | 1044 | |
michael@0 | 1045 | stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); |
michael@0 | 1046 | stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); |
michael@0 | 1047 | stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); |
michael@0 | 1048 | |
michael@0 | 1049 | stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; |
michael@0 | 1050 | |
michael@0 | 1051 | /* enumerate the from-Unicode trie table */ |
michael@0 | 1052 | c=0; /* keep track of the current code point while enumerating */ |
michael@0 | 1053 | |
michael@0 | 1054 | if(filter==UCNV_SET_FILTER_2022_CN) { |
michael@0 | 1055 | minLength=3; |
michael@0 | 1056 | } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || |
michael@0 | 1057 | filter!=UCNV_SET_FILTER_NONE |
michael@0 | 1058 | ) { |
michael@0 | 1059 | /* DBCS-only, ignore single-byte results */ |
michael@0 | 1060 | minLength=2; |
michael@0 | 1061 | } else { |
michael@0 | 1062 | minLength=1; |
michael@0 | 1063 | } |
michael@0 | 1064 | |
michael@0 | 1065 | /* |
michael@0 | 1066 | * the trie enumeration is almost the same as |
michael@0 | 1067 | * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 |
michael@0 | 1068 | */ |
michael@0 | 1069 | for(st1=0; st1<stage1Length; ++st1) { |
michael@0 | 1070 | st2=stage12[st1]; |
michael@0 | 1071 | if(st2>stage1Length) { |
michael@0 | 1072 | ps2=stage12+st2; |
michael@0 | 1073 | for(st2=0; st2<64; ++st2) { |
michael@0 | 1074 | if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { |
michael@0 | 1075 | /* read the stage 3 block */ |
michael@0 | 1076 | ps3=stage3+st3; |
michael@0 | 1077 | |
michael@0 | 1078 | do { |
michael@0 | 1079 | value=stage3b[*ps3++]; |
michael@0 | 1080 | if(value==0) { |
michael@0 | 1081 | /* no mapping, do nothing */ |
michael@0 | 1082 | } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
michael@0 | 1083 | // Recurse for partial results. |
michael@0 | 1084 | length=0; |
michael@0 | 1085 | U16_APPEND_UNSAFE(s, length, c); |
michael@0 | 1086 | ucnv_extGetUnicodeSetString( |
michael@0 | 1087 | sharedData, cx, sa, which, minLength, |
michael@0 | 1088 | c, s, length, |
michael@0 | 1089 | (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), |
michael@0 | 1090 | pErrorCode); |
michael@0 | 1091 | } else if(extSetUseMapping(which, minLength, value)) { |
michael@0 | 1092 | switch(filter) { |
michael@0 | 1093 | case UCNV_SET_FILTER_2022_CN: |
michael@0 | 1094 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { |
michael@0 | 1095 | continue; |
michael@0 | 1096 | } |
michael@0 | 1097 | break; |
michael@0 | 1098 | case UCNV_SET_FILTER_SJIS: |
michael@0 | 1099 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { |
michael@0 | 1100 | continue; |
michael@0 | 1101 | } |
michael@0 | 1102 | break; |
michael@0 | 1103 | case UCNV_SET_FILTER_GR94DBCS: |
michael@0 | 1104 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
michael@0 | 1105 | (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && |
michael@0 | 1106 | (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
michael@0 | 1107 | continue; |
michael@0 | 1108 | } |
michael@0 | 1109 | break; |
michael@0 | 1110 | case UCNV_SET_FILTER_HZ: |
michael@0 | 1111 | if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
michael@0 | 1112 | (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && |
michael@0 | 1113 | (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
michael@0 | 1114 | continue; |
michael@0 | 1115 | } |
michael@0 | 1116 | break; |
michael@0 | 1117 | default: |
michael@0 | 1118 | /* |
michael@0 | 1119 | * UCNV_SET_FILTER_NONE, |
michael@0 | 1120 | * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength |
michael@0 | 1121 | */ |
michael@0 | 1122 | break; |
michael@0 | 1123 | } |
michael@0 | 1124 | sa->add(sa->set, c); |
michael@0 | 1125 | } |
michael@0 | 1126 | } while((++c&0xf)!=0); |
michael@0 | 1127 | } else { |
michael@0 | 1128 | c+=16; /* empty stage 3 block */ |
michael@0 | 1129 | } |
michael@0 | 1130 | } |
michael@0 | 1131 | } else { |
michael@0 | 1132 | c+=1024; /* empty stage 2 block */ |
michael@0 | 1133 | } |
michael@0 | 1134 | } |
michael@0 | 1135 | } |
michael@0 | 1136 | |
michael@0 | 1137 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |