intl/icu/source/common/ucnv_ext.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2003-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  ucnv_ext.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2003jun13
    14 *   created by: Markus W. Scherer
    15 *
    16 *   Conversion extensions
    17 */
    19 #include "unicode/utypes.h"
    21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
    23 #include "unicode/uset.h"
    24 #include "ucnv_bld.h"
    25 #include "ucnv_cnv.h"
    26 #include "ucnv_ext.h"
    27 #include "cmemory.h"
    28 #include "uassert.h"
    30 /* to Unicode --------------------------------------------------------------- */
    32 /*
    33  * @return lookup value for the byte, if found; else 0
    34  */
    35 static inline uint32_t
    36 ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
    37     uint32_t word0, word;
    38     int32_t i, start, limit;
    40     /* check the input byte against the lowest and highest section bytes */
    41     start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
    42     limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
    43     if(byte<start || limit<byte) {
    44         return 0; /* the byte is out of range */
    45     }
    47     if(length==((limit-start)+1)) {
    48         /* direct access on a linear array */
    49         return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
    50     }
    52     /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
    53     word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
    55     /*
    56      * Shift byte once instead of each section word and add 0xffffff.
    57      * We will compare the shifted/added byte (bbffffff) against
    58      * section words which have byte values in the same bit position.
    59      * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
    60      * for all v=0..f
    61      * so we need not mask off the lower 24 bits of each section word.
    62      */
    63     word=word0|UCNV_EXT_TO_U_VALUE_MASK;
    65     /* binary search */
    66     start=0;
    67     limit=length;
    68     for(;;) {
    69         i=limit-start;
    70         if(i<=1) {
    71             break; /* done */
    72         }
    73         /* start<limit-1 */
    75         if(i<=4) {
    76             /* linear search for the last part */
    77             if(word0<=toUSection[start]) {
    78                 break;
    79             }
    80             if(++start<limit && word0<=toUSection[start]) {
    81                 break;
    82             }
    83             if(++start<limit && word0<=toUSection[start]) {
    84                 break;
    85             }
    86             /* always break at start==limit-1 */
    87             ++start;
    88             break;
    89         }
    91         i=(start+limit)/2;
    92         if(word<toUSection[i]) {
    93             limit=i;
    94         } else {
    95             start=i;
    96         }
    97     }
    99     /* did we really find it? */
   100     if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
   101         return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
   102     } else {
   103         return 0; /* not found */
   104     }
   105 }
   107 /*
   108  * TRUE if not an SI/SO stateful converter,
   109  * or if the match length fits with the current converter state
   110  */
   111 #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
   112     ((sisoState)<0 || ((sisoState)==0) == (match==1))
   114 /*
   115  * this works like ucnv_extMatchFromU() except
   116  * - the first character is in pre
   117  * - no trie is used
   118  * - the returned matchLength is not offset by 2
   119  */
   120 static int32_t
   121 ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
   122                  const char *pre, int32_t preLength,
   123                  const char *src, int32_t srcLength,
   124                  uint32_t *pMatchValue,
   125                  UBool /*useFallback*/, UBool flush) {
   126     const uint32_t *toUTable, *toUSection;
   128     uint32_t value, matchValue;
   129     int32_t i, j, idx, length, matchLength;
   130     uint8_t b;
   132     if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
   133         return 0; /* no extension data, no match */
   134     }
   136     /* initialize */
   137     toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
   138     idx=0;
   140     matchValue=0;
   141     i=j=matchLength=0;
   143     if(sisoState==0) {
   144         /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
   145         if(preLength>1) {
   146             return 0; /* no match of a DBCS sequence in SBCS mode */
   147         } else if(preLength==1) {
   148             srcLength=0;
   149         } else /* preLength==0 */ {
   150             if(srcLength>1) {
   151                 srcLength=1;
   152             }
   153         }
   154         flush=TRUE;
   155     }
   157     /* we must not remember fallback matches when not using fallbacks */
   159     /* match input units until there is a full match or the input is consumed */
   160     for(;;) {
   161         /* go to the next section */
   162         toUSection=toUTable+idx;
   164         /* read first pair of the section */
   165         value=*toUSection++;
   166         length=UCNV_EXT_TO_U_GET_BYTE(value);
   167         value=UCNV_EXT_TO_U_GET_VALUE(value);
   168         if( value!=0 &&
   169             (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
   170              TO_U_USE_FALLBACK(useFallback)) &&
   171             UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
   172         ) {
   173             /* remember longest match so far */
   174             matchValue=value;
   175             matchLength=i+j;
   176         }
   178         /* match pre[] then src[] */
   179         if(i<preLength) {
   180             b=(uint8_t)pre[i++];
   181         } else if(j<srcLength) {
   182             b=(uint8_t)src[j++];
   183         } else {
   184             /* all input consumed, partial match */
   185             if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
   186                 /*
   187                  * end of the entire input stream, stop with the longest match so far
   188                  * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
   189                  * because it must fit into state buffers
   190                  */
   191                 break;
   192             } else {
   193                 /* continue with more input next time */
   194                 return -length;
   195             }
   196         }
   198         /* search for the current UChar */
   199         value=ucnv_extFindToU(toUSection, length, b);
   200         if(value==0) {
   201             /* no match here, stop with the longest match so far */
   202             break;
   203         } else {
   204             if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
   205                 /* partial match, continue */
   206                 idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
   207             } else {
   208                 if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
   209                      TO_U_USE_FALLBACK(useFallback)) &&
   210                     UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
   211                 ) {
   212                     /* full match, stop with result */
   213                     matchValue=value;
   214                     matchLength=i+j;
   215                 } else {
   216                     /* full match on fallback not taken, stop with the longest match so far */
   217                 }
   218                 break;
   219             }
   220         }
   221     }
   223     if(matchLength==0) {
   224         /* no match at all */
   225         return 0;
   226     }
   228     /* return result */
   229     *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
   230     return matchLength;
   231 }
   233 static inline void
   234 ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
   235                  uint32_t value,
   236                  UChar **target, const UChar *targetLimit,
   237                  int32_t **offsets, int32_t srcIndex,
   238                  UErrorCode *pErrorCode) {
   239     /* output the result */
   240     if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
   241         /* output a single code point */
   242         ucnv_toUWriteCodePoint(
   243             cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
   244             target, targetLimit,
   245             offsets, srcIndex,
   246             pErrorCode);
   247     } else {
   248         /* output a string - with correct data we have resultLength>0 */
   249         ucnv_toUWriteUChars(
   250             cnv,
   251             UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
   252                 UCNV_EXT_TO_U_GET_INDEX(value),
   253             UCNV_EXT_TO_U_GET_LENGTH(value),
   254             target, targetLimit,
   255             offsets, srcIndex,
   256             pErrorCode);
   257     }
   258 }
   260 /*
   261  * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
   262  * or 1 for DBCS-only,
   263  * or -1 if the converter is not SI/SO stateful
   264  *
   265  * Note: For SI/SO stateful converters getting here,
   266  * cnv->mode==0 is equivalent to firstLength==1.
   267  */
   268 #define UCNV_SISO_STATE(cnv) \
   269     ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
   270      (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
   272 /*
   273  * target<targetLimit; set error code for overflow
   274  */
   275 U_CFUNC UBool
   276 ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
   277                         int32_t firstLength,
   278                         const char **src, const char *srcLimit,
   279                         UChar **target, const UChar *targetLimit,
   280                         int32_t **offsets, int32_t srcIndex,
   281                         UBool flush,
   282                         UErrorCode *pErrorCode) {
   283     uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   284     int32_t match;
   286     /* try to match */
   287     match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
   288                            (const char *)cnv->toUBytes, firstLength,
   289                            *src, (int32_t)(srcLimit-*src),
   290                            &value,
   291                            cnv->useFallback, flush);
   292     if(match>0) {
   293         /* advance src pointer for the consumed input */
   294         *src+=match-firstLength;
   296         /* write result to target */
   297         ucnv_extWriteToU(cnv, cx,
   298                          value,
   299                          target, targetLimit,
   300                          offsets, srcIndex,
   301                          pErrorCode);
   302         return TRUE;
   303     } else if(match<0) {
   304         /* save state for partial match */
   305         const char *s;
   306         int32_t j;
   308         /* copy the first code point */
   309         s=(const char *)cnv->toUBytes;
   310         cnv->preToUFirstLength=(int8_t)firstLength;
   311         for(j=0; j<firstLength; ++j) {
   312             cnv->preToU[j]=*s++;
   313         }
   315         /* now copy the newly consumed input */
   316         s=*src;
   317         match=-match;
   318         for(; j<match; ++j) {
   319             cnv->preToU[j]=*s++;
   320         }
   321         *src=s; /* same as *src=srcLimit; because we reached the end of input */
   322         cnv->preToULength=(int8_t)match;
   323         return TRUE;
   324     } else /* match==0 no match */ {
   325         return FALSE;
   326     }
   327 }
   329 U_CFUNC UChar32
   330 ucnv_extSimpleMatchToU(const int32_t *cx,
   331                        const char *source, int32_t length,
   332                        UBool useFallback) {
   333     uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   334     int32_t match;
   336     if(length<=0) {
   337         return 0xffff;
   338     }
   340     /* try to match */
   341     match=ucnv_extMatchToU(cx, -1,
   342                            source, length,
   343                            NULL, 0,
   344                            &value,
   345                            useFallback, TRUE);
   346     if(match==length) {
   347         /* write result for simple, single-character conversion */
   348         if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
   349             return UCNV_EXT_TO_U_GET_CODE_POINT(value);
   350         }
   351     }
   353     /*
   354      * return no match because
   355      * - match>0 && value points to string: simple conversion cannot handle multiple code points
   356      * - match>0 && match!=length: not all input consumed, forbidden for this function
   357      * - match==0: no match found in the first place
   358      * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
   359      */
   360     return 0xfffe;
   361 }
   363 /*
   364  * continue partial match with new input
   365  * never called for simple, single-character conversion
   366  */
   367 U_CFUNC void
   368 ucnv_extContinueMatchToU(UConverter *cnv,
   369                          UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
   370                          UErrorCode *pErrorCode) {
   371     uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   372     int32_t match, length;
   374     match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
   375                            cnv->preToU, cnv->preToULength,
   376                            pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
   377                            &value,
   378                            cnv->useFallback, pArgs->flush);
   379     if(match>0) {
   380         if(match>=cnv->preToULength) {
   381             /* advance src pointer for the consumed input */
   382             pArgs->source+=match-cnv->preToULength;
   383             cnv->preToULength=0;
   384         } else {
   385             /* the match did not use all of preToU[] - keep the rest for replay */
   386             length=cnv->preToULength-match;
   387             uprv_memmove(cnv->preToU, cnv->preToU+match, length);
   388             cnv->preToULength=(int8_t)-length;
   389         }
   391         /* write result */
   392         ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
   393                          value,
   394                          &pArgs->target, pArgs->targetLimit,
   395                          &pArgs->offsets, srcIndex,
   396                          pErrorCode);
   397     } else if(match<0) {
   398         /* save state for partial match */
   399         const char *s;
   400         int32_t j;
   402         /* just _append_ the newly consumed input to preToU[] */
   403         s=pArgs->source;
   404         match=-match;
   405         for(j=cnv->preToULength; j<match; ++j) {
   406             cnv->preToU[j]=*s++;
   407         }
   408         pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
   409         cnv->preToULength=(int8_t)match;
   410     } else /* match==0 */ {
   411         /*
   412          * no match
   413          *
   414          * We need to split the previous input into two parts:
   415          *
   416          * 1. The first codepage character is unmappable - that's how we got into
   417          *    trying the extension data in the first place.
   418          *    We need to move it from the preToU buffer
   419          *    to the error buffer, set an error code,
   420          *    and prepare the rest of the previous input for 2.
   421          *
   422          * 2. The rest of the previous input must be converted once we
   423          *    come back from the callback for the first character.
   424          *    At that time, we have to try again from scratch to convert
   425          *    these input characters.
   426          *    The replay will be handled by the ucnv.c conversion code.
   427          */
   429         /* move the first codepage character to the error field */
   430         uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
   431         cnv->toULength=cnv->preToUFirstLength;
   433         /* move the rest up inside the buffer */
   434         length=cnv->preToULength-cnv->preToUFirstLength;
   435         if(length>0) {
   436             uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
   437         }
   439         /* mark preToU for replay */
   440         cnv->preToULength=(int8_t)-length;
   442         /* set the error code for unassigned */
   443         *pErrorCode=U_INVALID_CHAR_FOUND;
   444     }
   445 }
   447 /* from Unicode ------------------------------------------------------------- */
   449 // Use roundtrips, "good one-way" mappings, and some normal fallbacks.
   450 static inline UBool
   451 extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
   452     return
   453         ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
   454             FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
   455         (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
   456 }
   458 /*
   459  * @return index of the UChar, if found; else <0
   460  */
   461 static inline int32_t
   462 ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
   463     int32_t i, start, limit;
   465     /* binary search */
   466     start=0;
   467     limit=length;
   468     for(;;) {
   469         i=limit-start;
   470         if(i<=1) {
   471             break; /* done */
   472         }
   473         /* start<limit-1 */
   475         if(i<=4) {
   476             /* linear search for the last part */
   477             if(u<=fromUSection[start]) {
   478                 break;
   479             }
   480             if(++start<limit && u<=fromUSection[start]) {
   481                 break;
   482             }
   483             if(++start<limit && u<=fromUSection[start]) {
   484                 break;
   485             }
   486             /* always break at start==limit-1 */
   487             ++start;
   488             break;
   489         }
   491         i=(start+limit)/2;
   492         if(u<fromUSection[i]) {
   493             limit=i;
   494         } else {
   495             start=i;
   496         }
   497     }
   499     /* did we really find it? */
   500     if(start<limit && u==fromUSection[start]) {
   501         return start;
   502     } else {
   503         return -1; /* not found */
   504     }
   505 }
   507 /*
   508  * @param cx pointer to extension data; if NULL, returns 0
   509  * @param firstCP the first code point before all the other UChars
   510  * @param pre UChars that must match; !initialMatch: partial match with them
   511  * @param preLength length of pre, >=0
   512  * @param src UChars that can be used to complete a match
   513  * @param srcLength length of src, >=0
   514  * @param pMatchValue [out] output result value for the match from the data structure
   515  * @param useFallback "use fallback" flag, usually from cnv->useFallback
   516  * @param flush TRUE if the end of the input stream is reached
   517  * @return >1: matched, return value=total match length (number of input units matched)
   518  *          1: matched, no mapping but request for <subchar1>
   519  *             (only for the first code point)
   520  *          0: no match
   521  *         <0: partial match, return value=negative total match length
   522  *             (partial matches are never returned for flush==TRUE)
   523  *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
   524  *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
   525  *         further code units matched
   526  */
   527 static int32_t
   528 ucnv_extMatchFromU(const int32_t *cx,
   529                    UChar32 firstCP,
   530                    const UChar *pre, int32_t preLength,
   531                    const UChar *src, int32_t srcLength,
   532                    uint32_t *pMatchValue,
   533                    UBool useFallback, UBool flush) {
   534     const uint16_t *stage12, *stage3;
   535     const uint32_t *stage3b;
   537     const UChar *fromUTableUChars, *fromUSectionUChars;
   538     const uint32_t *fromUTableValues, *fromUSectionValues;
   540     uint32_t value, matchValue;
   541     int32_t i, j, idx, length, matchLength;
   542     UChar c;
   544     if(cx==NULL) {
   545         return 0; /* no extension data, no match */
   546     }
   548     /* trie lookup of firstCP */
   549     idx=firstCP>>10; /* stage 1 index */
   550     if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
   551         return 0; /* the first code point is outside the trie */
   552     }
   554     stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
   555     stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
   556     idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
   558     stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
   559     value=stage3b[idx];
   560     if(value==0) {
   561         return 0;
   562     }
   564     /*
   565      * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
   566      * Do not interpret values with reserved bits used, for forward compatibility,
   567      * and do not even remember intermediate results with reserved bits used.
   568      */
   570     if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
   571         /* partial match, enter the loop below */
   572         idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
   574         /* initialize */
   575         fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
   576         fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
   578         matchValue=0;
   579         i=j=matchLength=0;
   581         /* we must not remember fallback matches when not using fallbacks */
   583         /* match input units until there is a full match or the input is consumed */
   584         for(;;) {
   585             /* go to the next section */
   586             fromUSectionUChars=fromUTableUChars+idx;
   587             fromUSectionValues=fromUTableValues+idx;
   589             /* read first pair of the section */
   590             length=*fromUSectionUChars++;
   591             value=*fromUSectionValues++;
   592             if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
   593                 /* remember longest match so far */
   594                 matchValue=value;
   595                 matchLength=2+i+j;
   596             }
   598             /* match pre[] then src[] */
   599             if(i<preLength) {
   600                 c=pre[i++];
   601             } else if(j<srcLength) {
   602                 c=src[j++];
   603             } else {
   604                 /* all input consumed, partial match */
   605                 if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
   606                     /*
   607                      * end of the entire input stream, stop with the longest match so far
   608                      * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
   609                      * because it must fit into state buffers
   610                      */
   611                     break;
   612                 } else {
   613                     /* continue with more input next time */
   614                     return -(2+length);
   615                 }
   616             }
   618             /* search for the current UChar */
   619             idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
   620             if(idx<0) {
   621                 /* no match here, stop with the longest match so far */
   622                 break;
   623             } else {
   624                 value=fromUSectionValues[idx];
   625                 if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
   626                     /* partial match, continue */
   627                     idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
   628                 } else {
   629                     if(extFromUUseMapping(useFallback, value, firstCP)) {
   630                         /* full match, stop with result */
   631                         matchValue=value;
   632                         matchLength=2+i+j;
   633                     } else {
   634                         /* full match on fallback not taken, stop with the longest match so far */
   635                     }
   636                     break;
   637                 }
   638             }
   639         }
   641         if(matchLength==0) {
   642             /* no match at all */
   643             return 0;
   644         }
   645     } else /* result from firstCP trie lookup */ {
   646         if(extFromUUseMapping(useFallback, value, firstCP)) {
   647             /* full match, stop with result */
   648             matchValue=value;
   649             matchLength=2;
   650         } else {
   651             /* fallback not taken */
   652             return 0;
   653         }
   654     }
   656     /* return result */
   657     if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
   658         return 1; /* assert matchLength==2 */
   659     }
   661     *pMatchValue=matchValue;
   662     return matchLength;
   663 }
   665 /*
   666  * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
   667  */
   668 static inline void
   669 ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
   670                    uint32_t value,
   671                    char **target, const char *targetLimit,
   672                    int32_t **offsets, int32_t srcIndex,
   673                    UErrorCode *pErrorCode) {
   674     uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
   675     const uint8_t *result;
   676     int32_t length, prevLength;
   678     length=UCNV_EXT_FROM_U_GET_LENGTH(value);
   679     value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
   681     /* output the result */
   682     if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
   683         /*
   684          * Generate a byte array and then write it below.
   685          * This is not the fastest possible way, but it should be ok for
   686          * extension mappings, and it is much simpler.
   687          * Offset and overflow handling are only done once this way.
   688          */
   689         uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
   690         switch(length) {
   691         case 3:
   692             *p++=(uint8_t)(value>>16);
   693         case 2: /*fall through*/
   694             *p++=(uint8_t)(value>>8);
   695         case 1: /*fall through*/
   696             *p++=(uint8_t)value;
   697         default:
   698             break; /* will never occur */
   699         }
   700         result=buffer+1;
   701     } else {
   702         result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
   703     }
   705     /* with correct data we have length>0 */
   707     if((prevLength=cnv->fromUnicodeStatus)!=0) {
   708         /* handle SI/SO stateful output */
   709         uint8_t shiftByte;
   711         if(prevLength>1 && length==1) {
   712             /* change from double-byte mode to single-byte */
   713             shiftByte=(uint8_t)UCNV_SI;
   714             cnv->fromUnicodeStatus=1;
   715         } else if(prevLength==1 && length>1) {
   716             /* change from single-byte mode to double-byte */
   717             shiftByte=(uint8_t)UCNV_SO;
   718             cnv->fromUnicodeStatus=2;
   719         } else {
   720             shiftByte=0;
   721         }
   723         if(shiftByte!=0) {
   724             /* prepend the shift byte to the result bytes */
   725             buffer[0]=shiftByte;
   726             if(result!=buffer+1) {
   727                 uprv_memcpy(buffer+1, result, length);
   728             }
   729             result=buffer;
   730             ++length;
   731         }
   732     }
   734     ucnv_fromUWriteBytes(cnv, (const char *)result, length,
   735                          target, targetLimit,
   736                          offsets, srcIndex,
   737                          pErrorCode);
   738 }
   740 /*
   741  * target<targetLimit; set error code for overflow
   742  */
   743 U_CFUNC UBool
   744 ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
   745                           UChar32 cp,
   746                           const UChar **src, const UChar *srcLimit,
   747                           char **target, const char *targetLimit,
   748                           int32_t **offsets, int32_t srcIndex,
   749                           UBool flush,
   750                           UErrorCode *pErrorCode) {
   751     uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   752     int32_t match;
   754     /* try to match */
   755     match=ucnv_extMatchFromU(cx, cp,
   756                              NULL, 0,
   757                              *src, (int32_t)(srcLimit-*src),
   758                              &value,
   759                              cnv->useFallback, flush);
   761     /* reject a match if the result is a single byte for DBCS-only */
   762     if( match>=2 &&
   763         !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
   764           cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
   765     ) {
   766         /* advance src pointer for the consumed input */
   767         *src+=match-2; /* remove 2 for the initial code point */
   769         /* write result to target */
   770         ucnv_extWriteFromU(cnv, cx,
   771                            value,
   772                            target, targetLimit,
   773                            offsets, srcIndex,
   774                            pErrorCode);
   775         return TRUE;
   776     } else if(match<0) {
   777         /* save state for partial match */
   778         const UChar *s;
   779         int32_t j;
   781         /* copy the first code point */
   782         cnv->preFromUFirstCP=cp;
   784         /* now copy the newly consumed input */
   785         s=*src;
   786         match=-match-2; /* remove 2 for the initial code point */
   787         for(j=0; j<match; ++j) {
   788             cnv->preFromU[j]=*s++;
   789         }
   790         *src=s; /* same as *src=srcLimit; because we reached the end of input */
   791         cnv->preFromULength=(int8_t)match;
   792         return TRUE;
   793     } else if(match==1) {
   794         /* matched, no mapping but request for <subchar1> */
   795         cnv->useSubChar1=TRUE;
   796         return FALSE;
   797     } else /* match==0 no match */ {
   798         return FALSE;
   799     }
   800 }
   802 /*
   803  * Used by ISO 2022 implementation.
   804  * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
   805  */
   806 U_CFUNC int32_t
   807 ucnv_extSimpleMatchFromU(const int32_t *cx,
   808                          UChar32 cp, uint32_t *pValue,
   809                          UBool useFallback) {
   810     uint32_t value;
   811     int32_t match;
   813     /* try to match */
   814     match=ucnv_extMatchFromU(cx,
   815                              cp,
   816                              NULL, 0,
   817                              NULL, 0,
   818                              &value,
   819                              useFallback, TRUE);
   820     if(match>=2) {
   821         /* write result for simple, single-character conversion */
   822         int32_t length;
   823         int isRoundtrip;
   825         isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
   826         length=UCNV_EXT_FROM_U_GET_LENGTH(value);
   827         value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
   829         if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
   830             *pValue=value;
   831             return isRoundtrip ? length : -length;
   832 #if 0 /* not currently used */
   833         } else if(length==4) {
   834             /* de-serialize a 4-byte result */
   835             const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
   836             *pValue=
   837                 ((uint32_t)result[0]<<24)|
   838                 ((uint32_t)result[1]<<16)|
   839                 ((uint32_t)result[2]<<8)|
   840                 result[3];
   841             return isRoundtrip ? 4 : -4;
   842 #endif
   843         }
   844     }
   846     /*
   847      * return no match because
   848      * - match>1 && resultLength>4: result too long for simple conversion
   849      * - match==1: no match found, <subchar1> preferred
   850      * - match==0: no match found in the first place
   851      * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
   852      */
   853     return 0;
   854 }
   856 /*
   857  * continue partial match with new input, requires cnv->preFromUFirstCP>=0
   858  * never called for simple, single-character conversion
   859  */
   860 U_CFUNC void
   861 ucnv_extContinueMatchFromU(UConverter *cnv,
   862                            UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
   863                            UErrorCode *pErrorCode) {
   864     uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   865     int32_t match;
   867     match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
   868                              cnv->preFromUFirstCP,
   869                              cnv->preFromU, cnv->preFromULength,
   870                              pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
   871                              &value,
   872                              cnv->useFallback, pArgs->flush);
   873     if(match>=2) {
   874         match-=2; /* remove 2 for the initial code point */
   876         if(match>=cnv->preFromULength) {
   877             /* advance src pointer for the consumed input */
   878             pArgs->source+=match-cnv->preFromULength;
   879             cnv->preFromULength=0;
   880         } else {
   881             /* the match did not use all of preFromU[] - keep the rest for replay */
   882             int32_t length=cnv->preFromULength-match;
   883             uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
   884             cnv->preFromULength=(int8_t)-length;
   885         }
   887         /* finish the partial match */
   888         cnv->preFromUFirstCP=U_SENTINEL;
   890         /* write result */
   891         ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
   892                            value,
   893                            &pArgs->target, pArgs->targetLimit,
   894                            &pArgs->offsets, srcIndex,
   895                            pErrorCode);
   896     } else if(match<0) {
   897         /* save state for partial match */
   898         const UChar *s;
   899         int32_t j;
   901         /* just _append_ the newly consumed input to preFromU[] */
   902         s=pArgs->source;
   903         match=-match-2; /* remove 2 for the initial code point */
   904         for(j=cnv->preFromULength; j<match; ++j) {
   905             U_ASSERT(j>=0);
   906             cnv->preFromU[j]=*s++;
   907         }
   908         pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
   909         cnv->preFromULength=(int8_t)match;
   910     } else /* match==0 or 1 */ {
   911         /*
   912          * no match
   913          *
   914          * We need to split the previous input into two parts:
   915          *
   916          * 1. The first code point is unmappable - that's how we got into
   917          *    trying the extension data in the first place.
   918          *    We need to move it from the preFromU buffer
   919          *    to the error buffer, set an error code,
   920          *    and prepare the rest of the previous input for 2.
   921          *
   922          * 2. The rest of the previous input must be converted once we
   923          *    come back from the callback for the first code point.
   924          *    At that time, we have to try again from scratch to convert
   925          *    these input characters.
   926          *    The replay will be handled by the ucnv.c conversion code.
   927          */
   929         if(match==1) {
   930             /* matched, no mapping but request for <subchar1> */
   931             cnv->useSubChar1=TRUE;
   932         }
   934         /* move the first code point to the error field */
   935         cnv->fromUChar32=cnv->preFromUFirstCP;
   936         cnv->preFromUFirstCP=U_SENTINEL;
   938         /* mark preFromU for replay */
   939         cnv->preFromULength=-cnv->preFromULength;
   941         /* set the error code for unassigned */
   942         *pErrorCode=U_INVALID_CHAR_FOUND;
   943     }
   944 }
   946 static UBool
   947 extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
   948     if(which==UCNV_ROUNDTRIP_SET) {
   949         // Add only code points for which the roundtrip flag is set.
   950         // Do not add any fallbacks, even if ucnv_fromUnicode() would use them
   951         // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet().
   952         //
   953         // By analogy, also do not add "good one-way" mappings.
   954         //
   955         // Do not add entries with reserved bits set.
   956         if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!=
   957                 UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) {
   958             return FALSE;
   959         }
   960     } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
   961         // Do not add entries with reserved bits set.
   962         if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) {
   963             return FALSE;
   964         }
   965     }
   966     // Do not add <subchar1> entries or other (future?) pseudo-entries
   967     // with an output length of 0.
   968     return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
   969 }
   971 static void
   972 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
   973                             const int32_t *cx,
   974                             const USetAdder *sa,
   975                             UConverterUnicodeSet which,
   976                             int32_t minLength,
   977                             UChar32 firstCP,
   978                             UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
   979                             int32_t sectionIndex,
   980                             UErrorCode *pErrorCode) {
   981     const UChar *fromUSectionUChars;
   982     const uint32_t *fromUSectionValues;
   984     uint32_t value;
   985     int32_t i, count;
   987     fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
   988     fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
   990     /* read first pair of the section */
   991     count=*fromUSectionUChars++;
   992     value=*fromUSectionValues++;
   994     if(extSetUseMapping(which, minLength, value)) {
   995         if(length==U16_LENGTH(firstCP)) {
   996             /* add the initial code point */
   997             sa->add(sa->set, firstCP);
   998         } else {
   999             /* add the string so far */
  1000             sa->addString(sa->set, s, length);
  1004     for(i=0; i<count; ++i) {
  1005         /* append this code unit and recurse or add the string */
  1006         s[length]=fromUSectionUChars[i];
  1007         value=fromUSectionValues[i];
  1009         if(value==0) {
  1010             /* no mapping, do nothing */
  1011         } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  1012             ucnv_extGetUnicodeSetString(
  1013                 sharedData, cx, sa, which, minLength,
  1014                 firstCP, s, length+1,
  1015                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
  1016                 pErrorCode);
  1017         } else if(extSetUseMapping(which, minLength, value)) {
  1018             sa->addString(sa->set, s, length+1);
  1023 U_CFUNC void
  1024 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
  1025                       const USetAdder *sa,
  1026                       UConverterUnicodeSet which,
  1027                       UConverterSetFilter filter,
  1028                       UErrorCode *pErrorCode) {
  1029     const int32_t *cx;
  1030     const uint16_t *stage12, *stage3, *ps2, *ps3;
  1031     const uint32_t *stage3b;
  1033     uint32_t value;
  1034     int32_t st1, stage1Length, st2, st3, minLength;
  1036     UChar s[UCNV_EXT_MAX_UCHARS];
  1037     UChar32 c;
  1038     int32_t length;
  1040     cx=sharedData->mbcs.extIndexes;
  1041     if(cx==NULL) {
  1042         return;
  1045     stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
  1046     stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
  1047     stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
  1049     stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
  1051     /* enumerate the from-Unicode trie table */
  1052     c=0; /* keep track of the current code point while enumerating */
  1054     if(filter==UCNV_SET_FILTER_2022_CN) {
  1055         minLength=3;
  1056     } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
  1057                filter!=UCNV_SET_FILTER_NONE
  1058     ) {
  1059         /* DBCS-only, ignore single-byte results */
  1060         minLength=2;
  1061     } else {
  1062         minLength=1;
  1065     /*
  1066      * the trie enumeration is almost the same as
  1067      * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
  1068      */
  1069     for(st1=0; st1<stage1Length; ++st1) {
  1070         st2=stage12[st1];
  1071         if(st2>stage1Length) {
  1072             ps2=stage12+st2;
  1073             for(st2=0; st2<64; ++st2) {
  1074                 if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
  1075                     /* read the stage 3 block */
  1076                     ps3=stage3+st3;
  1078                     do {
  1079                         value=stage3b[*ps3++];
  1080                         if(value==0) {
  1081                             /* no mapping, do nothing */
  1082                         } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  1083                             // Recurse for partial results.
  1084                             length=0;
  1085                             U16_APPEND_UNSAFE(s, length, c);
  1086                             ucnv_extGetUnicodeSetString(
  1087                                 sharedData, cx, sa, which, minLength,
  1088                                 c, s, length,
  1089                                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
  1090                                 pErrorCode);
  1091                         } else if(extSetUseMapping(which, minLength, value)) {
  1092                             switch(filter) {
  1093                             case UCNV_SET_FILTER_2022_CN:
  1094                                 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
  1095                                     continue;
  1097                                 break;
  1098                             case UCNV_SET_FILTER_SJIS:
  1099                                 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
  1100                                     continue;
  1102                                 break;
  1103                             case UCNV_SET_FILTER_GR94DBCS:
  1104                                 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  1105                                      (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
  1106                                      (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  1107                                     continue;
  1109                                 break;
  1110                             case UCNV_SET_FILTER_HZ:
  1111                                 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  1112                                      (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
  1113                                      (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  1114                                     continue;
  1116                                 break;
  1117                             default:
  1118                                 /*
  1119                                  * UCNV_SET_FILTER_NONE,
  1120                                  * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
  1121                                  */
  1122                                 break;
  1124                             sa->add(sa->set, c);
  1126                     } while((++c&0xf)!=0);
  1127                 } else {
  1128                     c+=16; /* empty stage 3 block */
  1131         } else {
  1132             c+=1024; /* empty stage 2 block */
  1137 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

mercurial