intl/icu/source/common/ucnvscsu.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2000-2011, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  ucnvscsu.c
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2000nov18
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This is an implementation of the Standard Compression Scheme for Unicode
    17 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
    18 *   Reserved commands and window settings are treated as illegal sequences and
    19 *   will result in callback calls.
    20 */
    22 #include "unicode/utypes.h"
    24 #if !UCONFIG_NO_CONVERSION
    26 #include "unicode/ucnv.h"
    27 #include "unicode/ucnv_cb.h"
    28 #include "unicode/utf16.h"
    29 #include "ucnv_bld.h"
    30 #include "ucnv_cnv.h"
    31 #include "cmemory.h"
    33 /* SCSU definitions --------------------------------------------------------- */
    35 /* SCSU command byte values */
    36 enum {
           /* commands recognized in single-byte mode (all are below 0x20) */
    37     SQ0=0x01, /* Quote from window pair 0 */
    38     SQ7=0x08, /* Quote from window pair 7 */
    39     SDX=0x0B, /* Define a window as extended */
    40     Srs=0x0C, /* reserved; the decoder reports it as an illegal byte */
    41     SQU=0x0E, /* Quote a single Unicode character */
    42     SCU=0x0F, /* Change to Unicode mode */
    43     SC0=0x10, /* Select window 0 */
    44     SC7=0x17, /* Select window 7 */
    45     SD0=0x18, /* Define and select window 0 */
    46     SD7=0x1F, /* Define and select window 7 */
           /*
            * commands recognized in Unicode mode (0xE0..0xF2); any other lead byte
            * there is the first byte of a two-byte UTF-16 code unit
            */
    48     UC0=0xE0, /* Select window 0 and change to single-byte mode */
    49     UC7=0xE7, /* Select window 7 and change to single-byte mode */
    50     UD0=0xE8, /* Define and select window 0, change to single-byte mode */
    51     UD7=0xEF, /* Define and select window 7, change to single-byte mode */
    52     UQU=0xF0, /* Quote a single Unicode character */
    53     UDX=0xF1, /* Define a Window as extended, change to single-byte mode */
    54     Urs=0xF2  /* reserved; the decoder reports it as an illegal byte */
    55 };
    57 enum {
           /*
            * These values classify the offset byte b that follows an SDn/UDn
            * command (defineOne state of the decoder):
            *   0                                 reserved (illegal)
            *   1..gapThreshold-1                 offset = b<<7
            *   gapThreshold..reservedStart-1     offset = (b<<7)+gapOffset
            *   reservedStart..fixedThreshold-1   reserved (illegal)
            *   fixedThreshold..0xFF              offset = fixedOffsets[b-fixedThreshold]
            */
    58     /*
    59      * Unicode code points from 3400 to E000 are not addressable by
    60      * dynamic windows, since in these areas no short run alphabets are
    61      * found. Therefore add gapOffset to all values from gapThreshold.
    62      */
    63     gapThreshold=0x68,
    64     gapOffset=0xAC00,
    66     /* values between reservedStart and fixedThreshold are reserved */
    67     reservedStart=0xA8,
    69     /* use table of predefined fixed offsets for values from fixedThreshold */
    70     fixedThreshold=0xF9
    71 };
    73 /* constant offsets for the 8 static windows, indexed by the window number n of an SQn quote command */
    74 static const uint32_t staticOffsets[8]={
    75     0x0000, /* ASCII for quoted tags */
    76     0x0080, /* Latin-1 Supplement (for access to punctuation) */
    77     0x0100, /* Latin Extended-A */
    78     0x0300, /* Combining Diacritical Marks */
    79     0x2000, /* General Punctuation */
    80     0x2080, /* Currency Symbols */
    81     0x2100, /* Letterlike Symbols and Number Forms */
    82     0x3000  /* CJK Symbols and punctuation */
    83 };
    85 /* initial offsets for the 8 dynamic (sliding) windows; _SCSUReset copies them into SCSUData */
    86 static const uint32_t initialDynamicOffsets[8]={
    87     0x0080, /* Latin-1 */
    88     0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    89     0x0400, /* Cyrillic */
    90     0x0600, /* Arabic */
    91     0x0900, /* Devanagari */
    92     0x3040, /* Hiragana */
    93     0x30A0, /* Katakana */
    94     0xFF00  /* Fullwidth ASCII */
    95 };
    97 /* Table of fixed predefined offsets, indexed by (offset byte - fixedThreshold) */
    98 static const uint32_t fixedOffsets[]={
    99     /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
   100     /* 0xFA */ 0x0250, /* IPA extensions */
   101     /* 0xFB */ 0x0370, /* Greek */
   102     /* 0xFC */ 0x0530, /* Armenian */
   103     /* 0xFD */ 0x3040, /* Hiragana */
   104     /* 0xFE */ 0x30A0, /* Katakana */
   105     /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
   106 };
   108 /* state values of the toUnicode state machine (stored in SCSUData.toUState) */
   109 enum {
   110     readCommand,   /* not inside a multi-byte sequence: next byte is a command or a character */
   111     quotePairOne,  /* after SQU/UQU: next byte is the first byte of the quoted UTF-16 unit */
   112     quotePairTwo,  /* next byte is the second byte of a UTF-16 unit */
   113     quoteOne,      /* after SQn: next byte is quoted from static/dynamic window n */
   114     definePairOne, /* after SDX/UDX: next byte carries the window number and high offset bits */
   115     definePairTwo, /* next byte carries the low offset bits of an extended window */
   116     defineOne      /* after SDn/UDn: next byte is the window offset byte */
   117 };
   119 typedef struct SCSUData {
           /* Per-converter SCSU state; allocated in _SCSUOpen and kept in cnv->extraInfo. */
   120     /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
   121     uint32_t toUDynamicOffsets[8];
   122     uint32_t fromUDynamicOffsets[8];
   124     /* state machine state - toUnicode */
   125     UBool toUIsSingleByteMode;  /* TRUE: single-byte mode, FALSE: Unicode mode */
   126     uint8_t toUState;           /* one of readCommand..defineOne */
   127     int8_t toUQuoteWindow, toUDynamicWindow; /* window numbers 0..7 */
   128     uint8_t toUByteOne;         /* first data byte of a two-byte sequence split across calls */
   129     uint8_t toUPadding[3];      /* not referenced by the code visible here */
   131     /* state machine state - fromUnicode */
   132     UBool fromUIsSingleByteMode;
   133     int8_t fromUDynamicWindow;
   135     /*
   136      * windowUse[] keeps track of the use of the dynamic windows:
   137      * At nextWindowUseIndex there is the least recently used window,
   138      * and the following windows (in a wrapping manner) are more and more
   139      * recently used.
   140      * At nextWindowUseIndex-1 there is the most recently used window.
   141      */
   142     uint8_t locale;             /* lGeneric or l_ja, set in _SCSUOpen; selects the initial windowUse[] */
   143     int8_t nextWindowUseIndex;
   144     int8_t windowUse[8];
   145 } SCSUData;
   147 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; /* initial windowUse[] for lGeneric */
   148 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; /* initial windowUse[] for l_ja */
       /* values for SCSUData.locale */
   150 enum {
   151     lGeneric, l_ja
   152 };
   154 /* SCSU setup functions ----------------------------------------------------- */
   156 static void
   157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
           /*
            * Restore the SCSU state in cnv->extraInfo to its initial values.
            * The toUnicode half is reset when choice<=UCNV_RESET_TO_UNICODE,
            * the fromUnicode half when choice!=UCNV_RESET_TO_UNICODE.
            * NOTE(review): this presumably relies on UCNV_RESET_BOTH comparing
            * less than UCNV_RESET_TO_UNICODE so that it takes both branches -
            * confirm against the UConverterResetChoice declaration.
            * cnv->extraInfo is dereferenced without a NULL check.
            */
   158     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
   160     if(choice<=UCNV_RESET_TO_UNICODE) {
   161         /* reset toUnicode */
               /* 32 bytes == the whole uint32_t[8] offsets array */
   162         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
   164         scsu->toUIsSingleByteMode=TRUE;
   165         scsu->toUState=readCommand;
   166         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
   167         scsu->toUByteOne=0;
   169         cnv->toULength=0;
   170     }
   171     if(choice!=UCNV_RESET_TO_UNICODE) {
   172         /* reset fromUnicode */
               /* 32 bytes == the whole uint32_t[8] offsets array */
   173         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
   175         scsu->fromUIsSingleByteMode=TRUE;
   176         scsu->fromUDynamicWindow=0;
   178         scsu->nextWindowUseIndex=0;
               /* the initial window-use order depends on the locale chosen in _SCSUOpen */
   179         switch(scsu->locale) {
   180         case l_ja:
                   /* 8 bytes == the whole int8_t[8] windowUse array */
   181             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
   182             break;
   183         default:
   184             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
   185             break;
   186         }
   188         cnv->fromUChar32=0;
   189     }
   190 }
   192 static void
   193 _SCSUOpen(UConverter *cnv,
   194           UConverterLoadArgs *pArgs,
   195           UErrorCode *pErrorCode) {
           /*
            * Allocate and initialize the SCSUData for a new converter.
            * Returns without doing anything when pArgs->onlyTestIsLoadable is set.
            * A locale of "ja" or one starting with "ja_" selects l_ja, anything
            * else (including NULL) selects lGeneric.
            * Sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR if the allocation fails;
            * the substitution character is set in either case.
            */
   196     const char *locale=pArgs->locale;
   197     if(pArgs->onlyTestIsLoadable) {
   198         return;
   199     }
   200     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
   201     if(cnv->extraInfo!=NULL) {
   202         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
   203             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
   204         } else {
   205             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
   206         }
               /* locale must be set before the reset: _SCSUReset reads it to pick windowUse[] */
   207         _SCSUReset(cnv, UCNV_RESET_BOTH);
   208     } else {
   209         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   210     }
   212     /* Set the substitution character U+fffd as a Unicode string. */
   213     cnv->subUChars[0]=0xfffd;
   214     cnv->subCharLen=-1;
   215 }
   217 static void
   218 _SCSUClose(UConverter *cnv) {
           /*
            * Release the SCSUData of a converter. The memory is freed only when
            * cnv->isExtraLocal is not set; the pointer is cleared in either case.
            */
   219     if(cnv->extraInfo!=NULL) {
   220         if(!cnv->isExtraLocal) {
   221             uprv_free(cnv->extraInfo);
   222         }
   223         cnv->extraInfo=NULL;
   224     }
   225 }
   227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
   229 static void
   230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   231                           UErrorCode *pErrorCode) {
           /*
            * Decode SCSU bytes from pArgs->source into UTF-16 at pArgs->target and,
            * when pArgs->offsets is not NULL, record for every UChar written the
            * source index at which its SCSU sequence began (-1 if it began in a
            * previous buffer).
            *
            * The decoder state (mode, state, windows, pending byte) is loaded from
            * the SCSUData in cnv->extraInfo on entry and stored back at endloop, so
            * a multi-byte sequence may be split across calls.
            *
            * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full
            * and to U_ILLEGAL_CHAR_FOUND for the reserved commands Srs/Urs; the
            * bytes of the current sequence are kept in cnv->toUBytes[]/toULength.
            * pArgs->source, pArgs->target and pArgs->offsets are advanced.
            */
   232     UConverter *cnv;
   233     SCSUData *scsu;
   234     const uint8_t *source, *sourceLimit;
   235     UChar *target;
   236     const UChar *targetLimit;
   237     int32_t *offsets;
   238     UBool isSingleByteMode;
   239     uint8_t state, byteOne;
   240     int8_t quoteWindow, dynamicWindow;
   242     int32_t sourceIndex, nextSourceIndex;
   244     uint8_t b;
   246     /* set up the local pointers */
   247     cnv=pArgs->converter;
   248     scsu=(SCSUData *)cnv->extraInfo;
   250     source=(const uint8_t *)pArgs->source;
   251     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   252     target=pArgs->target;
   253     targetLimit=pArgs->targetLimit;
   254     offsets=pArgs->offsets;
   256     /* get the state machine state */
   257     isSingleByteMode=scsu->toUIsSingleByteMode;
   258     state=scsu->toUState;
   259     quoteWindow=scsu->toUQuoteWindow;
   260     dynamicWindow=scsu->toUDynamicWindow;
   261     byteOne=scsu->toUByteOne;
   263     /* sourceIndex=-1 if the current character began in the previous buffer */
   264     sourceIndex=state==readCommand ? 0 : -1;
   265     nextSourceIndex=0;
   267     /*
   268      * conversion "loop"
   269      *
   270      * For performance, this is not a normal C loop.
   271      * Instead, there are two code blocks for the two SCSU modes.
   272      * The function branches to either one, and a change of the mode is done with a goto to
   273      * the other branch.
   274      *
   275      * Each branch has two conventional loops:
   276      * - a fast-path loop for the most common codes in the mode
   277      * - a loop for all other codes in the mode
   278      * When the fast-path runs into a code that it cannot handle, its loop ends and it
   279      * runs into the following loop to handle the other codes.
   280      * The end of the input or output buffer is also handled by the slower loop.
   281      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
   282      *
   283      * The callback handling is done by returning with an error code.
   284      * The conversion framework actually calls the callback function.
   285      */
   286     if(isSingleByteMode) {
   287         /* fast path for single-byte mode */
   288         if(state==readCommand) {
   289 fastSingle:
                   /* bytes below 0x20 are commands or C0 controls and end the fast path */
   290             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
   291                 ++source;
   292                 ++nextSourceIndex;
   293                 if(b<=0x7f) {
   294                     /* write US-ASCII graphic character or DEL */
   295                     *target++=(UChar)b;
   296                     if(offsets!=NULL) {
   297                         *offsets++=sourceIndex;
   298                     }
   299                 } else {
   300                     /* write from dynamic window */
   301                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
   302                     if(c<=0xffff) {
   303                         *target++=(UChar)c;
   304                         if(offsets!=NULL) {
   305                             *offsets++=sourceIndex;
   306                         }
   307                     } else {
   308                         /* output surrogate pair */
                               /* lead surrogate: 0xd7c0 is 0xd800-(0x10000>>10) */
   309                         *target++=(UChar)(0xd7c0+(c>>10));
   310                         if(target<targetLimit) {
   311                             *target++=(UChar)(0xdc00|(c&0x3ff));
   312                             if(offsets!=NULL) {
   313                                 *offsets++=sourceIndex;
   314                                 *offsets++=sourceIndex;
   315                             }
   316                         } else {
   317                             /* target overflow */
                                   /* only the lead surrogate fit; the trail surrogate goes to the overflow buffer */
   318                             if(offsets!=NULL) {
   319                                 *offsets++=sourceIndex;
   320                             }
   321                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
   322                             cnv->UCharErrorBufferLength=1;
   323                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   324                             goto endloop;
   325                         }
   326                     }
   327                 }
   328                 sourceIndex=nextSourceIndex;
   329             }
   330         }
   332         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
   333 singleByteMode:
   334         while(source<sourceLimit) {
   335             if(target>=targetLimit) {
   336                 /* target is full */
   337                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   338                 break;
   339             }
   340             b=*source++;
   341             ++nextSourceIndex;
   342             switch(state) {
   343             case readCommand:
   344                 /* redundant conditions are commented out */
   345                 /* here: b<0x20 because otherwise we would be in fastSingle */
   346                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   347                     /* CR/LF/TAB/NUL */
   348                     *target++=(UChar)b;
   349                     if(offsets!=NULL) {
   350                         *offsets++=sourceIndex;
   351                     }
   352                     sourceIndex=nextSourceIndex;
   353                     goto fastSingle;
   354                 } else if(SC0<=b) {
   355                     if(b<=SC7) {
   356                         dynamicWindow=(int8_t)(b-SC0);
   357                         sourceIndex=nextSourceIndex;
   358                         goto fastSingle;
   359                     } else /* if(SD0<=b && b<=SD7) */ {
   360                         dynamicWindow=(int8_t)(b-SD0);
   361                         state=defineOne;
   362                     }
   363                 } else if(/* SQ0<=b && */ b<=SQ7) {
   364                     quoteWindow=(int8_t)(b-SQ0);
   365                     state=quoteOne;
   366                 } else if(b==SDX) {
   367                     state=definePairOne;
   368                 } else if(b==SQU) {
   369                     state=quotePairOne;
   370                 } else if(b==SCU) {
   371                     sourceIndex=nextSourceIndex;
   372                     isSingleByteMode=FALSE;
   373                     goto fastUnicode;
   374                 } else /* Srs */ {
   375                     /* callback(illegal) */
   376                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   377                     cnv->toUBytes[0]=b;
   378                     cnv->toULength=1;
   379                     goto endloop;
   380                 }
   382                 /* store the first byte of a multibyte sequence in toUBytes[] */
   383                 cnv->toUBytes[0]=b;
   384                 cnv->toULength=1;
   385                 break;
   386             case quotePairOne:
   387                 byteOne=b;
   388                 cnv->toUBytes[1]=b;
   389                 cnv->toULength=2;
   390                 state=quotePairTwo;
   391                 break;
   392             case quotePairTwo:
                       /* the quoted UTF-16 code unit is stored big-endian: byteOne is the high byte */
   393                 *target++=(UChar)((byteOne<<8)|b);
   394                 if(offsets!=NULL) {
   395                     *offsets++=sourceIndex;
   396                 }
   397                 sourceIndex=nextSourceIndex;
   398                 state=readCommand;
   399                 goto fastSingle;
   400             case quoteOne:
   401                 if(b<0x80) {
   402                     /* all static offsets are in the BMP */
   403                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
   404                     if(offsets!=NULL) {
   405                         *offsets++=sourceIndex;
   406                     }
   407                 } else {
   408                     /* write from dynamic window */
   409                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
   410                     if(c<=0xffff) {
   411                         *target++=(UChar)c;
   412                         if(offsets!=NULL) {
   413                             *offsets++=sourceIndex;
   414                         }
   415                     } else {
   416                         /* output surrogate pair */
   417                         *target++=(UChar)(0xd7c0+(c>>10));
   418                         if(target<targetLimit) {
   419                             *target++=(UChar)(0xdc00|(c&0x3ff));
   420                             if(offsets!=NULL) {
   421                                 *offsets++=sourceIndex;
   422                                 *offsets++=sourceIndex;
   423                             }
   424                         } else {
   425                             /* target overflow */
   426                             if(offsets!=NULL) {
   427                                 *offsets++=sourceIndex;
   428                             }
   429                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
   430                             cnv->UCharErrorBufferLength=1;
   431                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   432                             goto endloop;
   433                         }
   434                     }
   435                 }
   436                 sourceIndex=nextSourceIndex;
   437                 state=readCommand;
   438                 goto fastSingle;
   439             case definePairOne:
                       /* top 3 bits: window number; low 5 bits: high part of the extended offset */
   440                 dynamicWindow=(int8_t)((b>>5)&7);
   441                 byteOne=(uint8_t)(b&0x1f);
   442                 cnv->toUBytes[1]=b;
   443                 cnv->toULength=2;
   444                 state=definePairTwo;
   445                 break;
   446             case definePairTwo:
                       /* extended windows start at or above U+10000, in steps of 0x80 */
   447                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
   448                 sourceIndex=nextSourceIndex;
   449                 state=readCommand;
   450                 goto fastSingle;
   451             case defineOne:
   452                 if(b==0) {
   453                     /* callback(illegal): Reserved window offset value 0 */
                           /*
                            * NOTE(review): unlike the Srs branch, *pErrorCode is not set
                            * here, so endloop leaves state==defineOne and the function
                            * returns without an error. This looks unintended given the
                            * "callback(illegal)" comment - confirm. _SCSUToUnicode has
                            * the same code and would need the same change.
                            */
   454                     cnv->toUBytes[1]=b;
   455                     cnv->toULength=2;
   456                     goto endloop;
   457                 } else if(b<gapThreshold) {
   458                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
                       /* single unsigned comparison for gapThreshold<=b && b<reservedStart */
   459                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
   460                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
   461                 } else if(b>=fixedThreshold) {
   462                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
   463                 } else {
   464                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
                           /* NOTE(review): *pErrorCode is not set here either - see the b==0 branch above */
   465                     cnv->toUBytes[1]=b;
   466                     cnv->toULength=2;
   467                     goto endloop;
   468                 }
   469                 sourceIndex=nextSourceIndex;
   470                 state=readCommand;
   471                 goto fastSingle;
   472             }
   473         }
   474     } else {
   475         /* fast path for Unicode mode */
   476         if(state==readCommand) {
   477 fastUnicode:
                   /*
                    * needs two source bytes; the unsigned comparison is true when b is
                    * outside the command range UC0..Urs (0xE0..0xF2)
                    */
   478             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
   479                 *target++=(UChar)((b<<8)|source[1]);
   480                 if(offsets!=NULL) {
   481                     *offsets++=sourceIndex;
   482                 }
   483                 sourceIndex=nextSourceIndex;
   484                 nextSourceIndex+=2;
   485                 source+=2;
   486             }
   487         }
   489         /* normal state machine for Unicode mode */
   490 /* unicodeByteMode: */
   491         while(source<sourceLimit) {
   492             if(target>=targetLimit) {
   493                 /* target is full */
   494                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   495                 break;
   496             }
   497             b=*source++;
   498             ++nextSourceIndex;
   499             switch(state) {
   500             case readCommand:
   501                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
                           /* not a command: first byte of a UTF-16 code unit */
   502                     byteOne=b;
   503                     cnv->toUBytes[0]=b;
   504                     cnv->toULength=1;
   505                     state=quotePairTwo;
   506                 } else if(/* UC0<=b && */ b<=UC7) {
   507                     dynamicWindow=(int8_t)(b-UC0);
   508                     sourceIndex=nextSourceIndex;
   509                     isSingleByteMode=TRUE;
   510                     goto fastSingle;
   511                 } else if(/* UD0<=b && */ b<=UD7) {
   512                     dynamicWindow=(int8_t)(b-UD0);
   513                     isSingleByteMode=TRUE;
   514                     cnv->toUBytes[0]=b;
   515                     cnv->toULength=1;
   516                     state=defineOne;
                           /* the offset byte is handled by the single-byte state machine */
   517                     goto singleByteMode;
   518                 } else if(b==UDX) {
   519                     isSingleByteMode=TRUE;
   520                     cnv->toUBytes[0]=b;
   521                     cnv->toULength=1;
   522                     state=definePairOne;
   523                     goto singleByteMode;
   524                 } else if(b==UQU) {
   525                     cnv->toUBytes[0]=b;
   526                     cnv->toULength=1;
   527                     state=quotePairOne;
   528                 } else /* Urs */ {
   529                     /* callback(illegal) */
   530                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   531                     cnv->toUBytes[0]=b;
   532                     cnv->toULength=1;
   533                     goto endloop;
   534                 }
   535                 break;
   536             case quotePairOne:
   537                 byteOne=b;
   538                 cnv->toUBytes[1]=b;
   539                 cnv->toULength=2;
   540                 state=quotePairTwo;
   541                 break;
   542             case quotePairTwo:
   543                 *target++=(UChar)((byteOne<<8)|b);
   544                 if(offsets!=NULL) {
   545                     *offsets++=sourceIndex;
   546                 }
   547                 sourceIndex=nextSourceIndex;
   548                 state=readCommand;
   549                 goto fastUnicode;
   550             }
   551         }
   552     }
   553 endloop:
   555     /* set the converter state back into UConverter */
   556     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
   557         /* reset to deal with the next character */
   558         state=readCommand;
   559     } else if(state==readCommand) {
   560         /* not in a multi-byte sequence, reset toULength */
   561         cnv->toULength=0;
   562     }
           /* otherwise a partial sequence is pending: state, byteOne and toUBytes[] carry over to the next call */
   563     scsu->toUIsSingleByteMode=isSingleByteMode;
   564     scsu->toUState=state;
   565     scsu->toUQuoteWindow=quoteWindow;
   566     scsu->toUDynamicWindow=dynamicWindow;
   567     scsu->toUByteOne=byteOne;
   569     /* write back the updated pointers */
   570     pArgs->source=(const char *)source;
   571     pArgs->target=target;
   572     pArgs->offsets=offsets;
   573     return;
   574 }
   576 /*
   577  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
   578  * If a change is made in the original function, then either
   579  * change this function the same way or
   580  * re-copy the original function and remove the variables
   581  * offsets, sourceIndex, and nextSourceIndex.
   582  */
   583 static void
   584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
   585                UErrorCode *pErrorCode) {
   586     UConverter *cnv;
   587     SCSUData *scsu;
   588     const uint8_t *source, *sourceLimit;
   589     UChar *target;
   590     const UChar *targetLimit;
   591     UBool isSingleByteMode;
   592     uint8_t state, byteOne;
   593     int8_t quoteWindow, dynamicWindow;
   595     uint8_t b;
   597     /* set up the local pointers */
   598     cnv=pArgs->converter;
   599     scsu=(SCSUData *)cnv->extraInfo;
   601     source=(const uint8_t *)pArgs->source;
   602     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   603     target=pArgs->target;
   604     targetLimit=pArgs->targetLimit;
   606     /* get the state machine state */
   607     isSingleByteMode=scsu->toUIsSingleByteMode;
   608     state=scsu->toUState;
   609     quoteWindow=scsu->toUQuoteWindow;
   610     dynamicWindow=scsu->toUDynamicWindow;
   611     byteOne=scsu->toUByteOne;
   613     /*
   614      * conversion "loop"
   615      *
   616      * For performance, this is not a normal C loop.
   617      * Instead, there are two code blocks for the two SCSU modes.
   618      * The function branches to either one, and a change of the mode is done with a goto to
   619      * the other branch.
   620      *
   621      * Each branch has two conventional loops:
   622      * - a fast-path loop for the most common codes in the mode
   623      * - a loop for all other codes in the mode
   624      * When the fast-path runs into a code that it cannot handle, its loop ends and it
   625      * runs into the following loop to handle the other codes.
   626      * The end of the input or output buffer is also handled by the slower loop.
   627      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
   628      *
   629      * The callback handling is done by returning with an error code.
   630      * The conversion framework actually calls the callback function.
   631      */
   632     if(isSingleByteMode) {
   633         /* fast path for single-byte mode */
   634         if(state==readCommand) {
   635 fastSingle:
   636             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
   637                 ++source;
   638                 if(b<=0x7f) {
   639                     /* write US-ASCII graphic character or DEL */
   640                     *target++=(UChar)b;
   641                 } else {
   642                     /* write from dynamic window */
   643                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
   644                     if(c<=0xffff) {
   645                         *target++=(UChar)c;
   646                     } else {
   647                         /* output surrogate pair */
   648                         *target++=(UChar)(0xd7c0+(c>>10));
   649                         if(target<targetLimit) {
   650                             *target++=(UChar)(0xdc00|(c&0x3ff));
   651                         } else {
   652                             /* target overflow */
   653                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
   654                             cnv->UCharErrorBufferLength=1;
   655                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   656                             goto endloop;
   657                         }
   658                     }
   659                 }
   660             }
   661         }
   663         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
   664 singleByteMode:
   665         while(source<sourceLimit) {
   666             if(target>=targetLimit) {
   667                 /* target is full */
   668                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   669                 break;
   670             }
   671             b=*source++;
   672             switch(state) {
   673             case readCommand:
   674                 /* redundant conditions are commented out */
   675                 /* here: b<0x20 because otherwise we would be in fastSingle */
   676                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   677                     /* CR/LF/TAB/NUL */
   678                     *target++=(UChar)b;
   679                     goto fastSingle;
   680                 } else if(SC0<=b) {
   681                     if(b<=SC7) {
   682                         dynamicWindow=(int8_t)(b-SC0);
   683                         goto fastSingle;
   684                     } else /* if(SD0<=b && b<=SD7) */ {
   685                         dynamicWindow=(int8_t)(b-SD0);
   686                         state=defineOne;
   687                     }
   688                 } else if(/* SQ0<=b && */ b<=SQ7) {
   689                     quoteWindow=(int8_t)(b-SQ0);
   690                     state=quoteOne;
   691                 } else if(b==SDX) {
   692                     state=definePairOne;
   693                 } else if(b==SQU) {
   694                     state=quotePairOne;
   695                 } else if(b==SCU) {
   696                     isSingleByteMode=FALSE;
   697                     goto fastUnicode;
   698                 } else /* Srs */ {
   699                     /* callback(illegal) */
   700                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   701                     cnv->toUBytes[0]=b;
   702                     cnv->toULength=1;
   703                     goto endloop;
   704                 }
   706                 /* store the first byte of a multibyte sequence in toUBytes[] */
   707                 cnv->toUBytes[0]=b;
   708                 cnv->toULength=1;
   709                 break;
   710             case quotePairOne:
   711                 byteOne=b;
   712                 cnv->toUBytes[1]=b;
   713                 cnv->toULength=2;
   714                 state=quotePairTwo;
   715                 break;
   716             case quotePairTwo:
   717                 *target++=(UChar)((byteOne<<8)|b);
   718                 state=readCommand;
   719                 goto fastSingle;
   720             case quoteOne:
   721                 if(b<0x80) {
   722                     /* all static offsets are in the BMP */
   723                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
   724                 } else {
   725                     /* write from dynamic window */
   726                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
   727                     if(c<=0xffff) {
   728                         *target++=(UChar)c;
   729                     } else {
   730                         /* output surrogate pair */
   731                         *target++=(UChar)(0xd7c0+(c>>10));
   732                         if(target<targetLimit) {
   733                             *target++=(UChar)(0xdc00|(c&0x3ff));
   734                         } else {
   735                             /* target overflow */
   736                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
   737                             cnv->UCharErrorBufferLength=1;
   738                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   739                             goto endloop;
   740                         }
   741                     }
   742                 }
   743                 state=readCommand;
   744                 goto fastSingle;
   745             case definePairOne:
   746                 dynamicWindow=(int8_t)((b>>5)&7);
   747                 byteOne=(uint8_t)(b&0x1f);
   748                 cnv->toUBytes[1]=b;
   749                 cnv->toULength=2;
   750                 state=definePairTwo;
   751                 break;
   752             case definePairTwo:
   753                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
   754                 state=readCommand;
   755                 goto fastSingle;
   756             case defineOne:
   757                 if(b==0) {
   758                     /* callback(illegal): Reserved window offset value 0 */
   759                     cnv->toUBytes[1]=b;
   760                     cnv->toULength=2;
   761                     goto endloop;
   762                 } else if(b<gapThreshold) {
   763                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
   764                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
   765                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
   766                 } else if(b>=fixedThreshold) {
   767                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
   768                 } else {
   769                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
   770                     cnv->toUBytes[1]=b;
   771                     cnv->toULength=2;
   772                     goto endloop;
   773                 }
   774                 state=readCommand;
   775                 goto fastSingle;
   776             }
   777         }
   778     } else {
   779         /* fast path for Unicode mode */
   780         if(state==readCommand) {
   781 fastUnicode:
   782             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
   783                 *target++=(UChar)((b<<8)|source[1]);
   784                 source+=2;
   785             }
   786         }
   788         /* normal state machine for Unicode mode */
   789 /* unicodeByteMode: */
   790         while(source<sourceLimit) {
   791             if(target>=targetLimit) {
   792                 /* target is full */
   793                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   794                 break;
   795             }
   796             b=*source++;
   797             switch(state) {
   798             case readCommand:
   799                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
   800                     byteOne=b;
   801                     cnv->toUBytes[0]=b;
   802                     cnv->toULength=1;
   803                     state=quotePairTwo;
   804                 } else if(/* UC0<=b && */ b<=UC7) {
   805                     dynamicWindow=(int8_t)(b-UC0);
   806                     isSingleByteMode=TRUE;
   807                     goto fastSingle;
   808                 } else if(/* UD0<=b && */ b<=UD7) {
   809                     dynamicWindow=(int8_t)(b-UD0);
   810                     isSingleByteMode=TRUE;
   811                     cnv->toUBytes[0]=b;
   812                     cnv->toULength=1;
   813                     state=defineOne;
   814                     goto singleByteMode;
   815                 } else if(b==UDX) {
   816                     isSingleByteMode=TRUE;
   817                     cnv->toUBytes[0]=b;
   818                     cnv->toULength=1;
   819                     state=definePairOne;
   820                     goto singleByteMode;
   821                 } else if(b==UQU) {
   822                     cnv->toUBytes[0]=b;
   823                     cnv->toULength=1;
   824                     state=quotePairOne;
   825                 } else /* Urs */ {
   826                     /* callback(illegal) */
   827                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   828                     cnv->toUBytes[0]=b;
   829                     cnv->toULength=1;
   830                     goto endloop;
   831                 }
   832                 break;
   833             case quotePairOne:
   834                 byteOne=b;
   835                 cnv->toUBytes[1]=b;
   836                 cnv->toULength=2;
   837                 state=quotePairTwo;
   838                 break;
   839             case quotePairTwo:
   840                 *target++=(UChar)((byteOne<<8)|b);
   841                 state=readCommand;
   842                 goto fastUnicode;
   843             }
   844         }
   845     }
   846 endloop:
   848     /* set the converter state back into UConverter */
   849     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
   850         /* reset to deal with the next character */
   851         state=readCommand;
   852     } else if(state==readCommand) {
   853         /* not in a multi-byte sequence, reset toULength */
   854         cnv->toULength=0;
   855     }
   856     scsu->toUIsSingleByteMode=isSingleByteMode;
   857     scsu->toUState=state;
   858     scsu->toUQuoteWindow=quoteWindow;
   859     scsu->toUDynamicWindow=dynamicWindow;
   860     scsu->toUByteOne=byteOne;
   862     /* write back the updated pointers */
   863     pArgs->source=(const char *)source;
   864     pArgs->target=target;
   865     return;
   866 }
   868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
   870 /*
   871  * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
   872  * reasonable results. The lookahead is minimal.
   873  * Many cases are simple:
   874  * A character fits directly into the current mode, a dynamic or static window,
   875  * or is not compressible. These cases are tested first.
   876  * Real compression heuristics are applied to the rest, in code branches for
   877  * single/Unicode mode and BMP/supplementary code points.
   878  * The heuristics used here are extremely simple.
   879  */
   881 /* get the number of the window that this character is in, or -1 */
   882 static int8_t
   883 getWindow(const uint32_t offsets[8], uint32_t c) {
   884     int i;
   885     for(i=0; i<8; ++i) {
   886         if((uint32_t)(c-offsets[i])<=0x7f) {
   887             return (int8_t)(i);
   888         }
   889     }
   890     return -1;
   891 }
   893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
   894 static UBool
   895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
   896     return (UBool)(c<=offset+0x7f &&
   897           (c>=offset || (c<=0x7f &&
   898                         (c>=0x20 || (1UL<<c)&0x2601))));
   899                                 /* binary 0010 0110 0000 0001,
   900                                    check for b==0xd || b==0xa || b==9 || b==0 */
   901 }
   903 /*
   904  * getNextDynamicWindow returns the next dynamic window to be redefined
   905  */
   906 static int8_t
   907 getNextDynamicWindow(SCSUData *scsu) {
   908     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
   909     if(++scsu->nextWindowUseIndex==8) {
   910         scsu->nextWindowUseIndex=0;
   911     }
   912     return window;
   913 }
   915 /*
   916  * useDynamicWindow() adjusts
   917  * windowUse[] and nextWindowUseIndex for the algorithm to choose
   918  * the next dynamic window to be defined;
   919  * a subclass may override it and provide its own algorithm.
   920  */
   921 static void
   922 useDynamicWindow(SCSUData *scsu, int8_t window) {
   923     /*
   924      * move the existing window, which just became the most recently used one,
   925      * up in windowUse[] to nextWindowUseIndex-1
   926      */
   928     /* first, find the index of the window - backwards to favor the more recently used windows */
   929     int i, j;
   931     i=scsu->nextWindowUseIndex;
   932     do {
   933         if(--i<0) {
   934             i=7;
   935         }
   936     } while(scsu->windowUse[i]!=window);
   938     /* now copy each windowUse[i+1] to [i] */
   939     j=i+1;
   940     if(j==8) {
   941         j=0;
   942     }
   943     while(j!=scsu->nextWindowUseIndex) {
   944         scsu->windowUse[i]=scsu->windowUse[j];
   945         i=j;
   946         if(++j==8) { j=0; }
   947     }
   949     /* finally, set the window into the most recently used index */
   950     scsu->windowUse[i]=window;
   951 }
   953 /*
   954  * calculate the offset and the code for a dynamic window that contains the character
   955  * takes fixed offsets into account
   956  * the offset of the window is stored in the offset variable,
   957  * the code is returned
   958  *
   959  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
   960  */
   961 static int
   962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
   963     int i;
   965     for(i=0; i<7; ++i) {
   966         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
   967             *pOffset=fixedOffsets[i];
   968             return 0xf9+i;
   969         }
   970     }
   972     if(c<0x80) {
   973         /* No dynamic window for US-ASCII. */
   974         return -1;
   975     } else if(c<0x3400 ||
   976               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
   977               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
   978     ) {
   979         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
   980         *pOffset=c&0x7fffff80;
   981         return (int)(c>>7);
   982     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
   983         /* For these characters we need to take the gapOffset into account. */
   984         *pOffset=c&0x7fffff80;
   985         return (int)((c-gapOffset)>>7);
   986     } else {
   987         return -1;
   988     }
   989 }
   991 /*
   992  * Idea for compression:
   993  *  - save SCSUData and other state before really starting work
   994  *  - at endloop, see if compression could be better with just unicode mode
   995  *  - don't do this if a callback has been called
   996  *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
   997  *  - different buffer handling!
   998  *
   999  * Drawback or need for corrective handling:
  1000  * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
  1001  * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
  1002  * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
  1004  * How to achieve both?
  1005  *  - Only replace the result after an SDX or SCU?
  1006  */
  1008 static void
  1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  1010                             UErrorCode *pErrorCode) {
  1011     UConverter *cnv;
  1012     SCSUData *scsu;
  1013     const UChar *source, *sourceLimit;
  1014     uint8_t *target;
  1015     int32_t targetCapacity;
  1016     int32_t *offsets;
  1018     UBool isSingleByteMode;
  1019     uint8_t dynamicWindow;
  1020     uint32_t currentOffset;
  1022     uint32_t c, delta;
  1024     int32_t sourceIndex, nextSourceIndex;
  1026     int32_t length;
  1028     /* variables for compression heuristics */
  1029     uint32_t offset;
  1030     UChar lead, trail;
  1031     int code;
  1032     int8_t window;
  1034     /* set up the local pointers */
  1035     cnv=pArgs->converter;
  1036     scsu=(SCSUData *)cnv->extraInfo;
  1038     /* set up the local pointers */
  1039     source=pArgs->source;
  1040     sourceLimit=pArgs->sourceLimit;
  1041     target=(uint8_t *)pArgs->target;
  1042     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  1043     offsets=pArgs->offsets;
  1045     /* get the state machine state */
  1046     isSingleByteMode=scsu->fromUIsSingleByteMode;
  1047     dynamicWindow=scsu->fromUDynamicWindow;
  1048     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1050     c=cnv->fromUChar32;
  1052     /* sourceIndex=-1 if the current character began in the previous buffer */
  1053     sourceIndex= c==0 ? 0 : -1;
  1054     nextSourceIndex=0;
  1056     /* similar conversion "loop" as in toUnicode */
  1057 loop:
  1058     if(isSingleByteMode) {
  1059         if(c!=0 && targetCapacity>0) {
  1060             goto getTrailSingle;
  1063         /* state machine for single-byte mode */
  1064 /* singleByteMode: */
  1065         while(source<sourceLimit) {
  1066             if(targetCapacity<=0) {
  1067                 /* target is full */
  1068                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1069                 break;
  1071             c=*source++;
  1072             ++nextSourceIndex;
  1074             if((c-0x20)<=0x5f) {
  1075                 /* pass US-ASCII graphic character through */
  1076                 *target++=(uint8_t)c;
  1077                 if(offsets!=NULL) {
  1078                     *offsets++=sourceIndex;
  1080                 --targetCapacity;
  1081             } else if(c<0x20) {
  1082                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1083                     /* CR/LF/TAB/NUL */
  1084                     *target++=(uint8_t)c;
  1085                     if(offsets!=NULL) {
  1086                         *offsets++=sourceIndex;
  1088                     --targetCapacity;
  1089                 } else {
  1090                     /* quote C0 control character */
  1091                     c|=SQ0<<8;
  1092                     length=2;
  1093                     goto outputBytes;
  1095             } else if((delta=c-currentOffset)<=0x7f) {
  1096                 /* use the current dynamic window */
  1097                 *target++=(uint8_t)(delta|0x80);
  1098                 if(offsets!=NULL) {
  1099                     *offsets++=sourceIndex;
  1101                 --targetCapacity;
  1102             } else if(U16_IS_SURROGATE(c)) {
  1103                 if(U16_IS_SURROGATE_LEAD(c)) {
  1104 getTrailSingle:
  1105                     lead=(UChar)c;
  1106                     if(source<sourceLimit) {
  1107                         /* test the following code unit */
  1108                         trail=*source;
  1109                         if(U16_IS_TRAIL(trail)) {
  1110                             ++source;
  1111                             ++nextSourceIndex;
  1112                             c=U16_GET_SUPPLEMENTARY(c, trail);
  1113                             /* convert this surrogate code point */
  1114                             /* exit this condition tree */
  1115                         } else {
  1116                             /* this is an unmatched lead code unit (1st surrogate) */
  1117                             /* callback(illegal) */
  1118                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1119                             goto endloop;
  1121                     } else {
  1122                         /* no more input */
  1123                         break;
  1125                 } else {
  1126                     /* this is an unmatched trail code unit (2nd surrogate) */
  1127                     /* callback(illegal) */
  1128                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1129                     goto endloop;
  1132                 /* compress supplementary character U+10000..U+10ffff */
  1133                 if((delta=c-currentOffset)<=0x7f) {
  1134                     /* use the current dynamic window */
  1135                     *target++=(uint8_t)(delta|0x80);
  1136                     if(offsets!=NULL) {
  1137                         *offsets++=sourceIndex;
  1139                     --targetCapacity;
  1140                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1141                     /* there is a dynamic window that contains this character, change to it */
  1142                     dynamicWindow=window;
  1143                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1144                     useDynamicWindow(scsu, dynamicWindow);
  1145                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1146                     length=2;
  1147                     goto outputBytes;
  1148                 } else if((code=getDynamicOffset(c, &offset))>=0) {
  1149                     /* might check if there are more characters in this window to come */
  1150                     /* define an extended window with this character */
  1151                     code-=0x200;
  1152                     dynamicWindow=getNextDynamicWindow(scsu);
  1153                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1154                     useDynamicWindow(scsu, dynamicWindow);
  1155                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1156                     length=4;
  1157                     goto outputBytes;
  1158                 } else {
  1159                     /* change to Unicode mode and output this (lead, trail) pair */
  1160                     isSingleByteMode=FALSE;
  1161                     *target++=(uint8_t)SCU;
  1162                     if(offsets!=NULL) {
  1163                         *offsets++=sourceIndex;
  1165                     --targetCapacity;
  1166                     c=((uint32_t)lead<<16)|trail;
  1167                     length=4;
  1168                     goto outputBytes;
  1170             } else if(c<0xa0) {
  1171                 /* quote C1 control character */
  1172                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1173                 length=2;
  1174                 goto outputBytes;
  1175             } else if(c==0xfeff || c>=0xfff0) {
  1176                 /* quote signature character=byte order mark and specials */
  1177                 c|=SQU<<16;
  1178                 length=3;
  1179                 goto outputBytes;
  1180             } else {
  1181                 /* compress all other BMP characters */
  1182                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1183                     /* there is a window defined that contains this character - switch to it or quote from it? */
  1184                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1185                         /* change to dynamic window */
  1186                         dynamicWindow=window;
  1187                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1188                         useDynamicWindow(scsu, dynamicWindow);
  1189                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1190                         length=2;
  1191                         goto outputBytes;
  1192                     } else {
  1193                         /* quote from dynamic window */
  1194                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1195                         length=2;
  1196                         goto outputBytes;
  1198                 } else if((window=getWindow(staticOffsets, c))>=0) {
  1199                     /* quote from static window */
  1200                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1201                     length=2;
  1202                     goto outputBytes;
  1203                 } else if((code=getDynamicOffset(c, &offset))>=0) {
  1204                     /* define a dynamic window with this character */
  1205                     dynamicWindow=getNextDynamicWindow(scsu);
  1206                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1207                     useDynamicWindow(scsu, dynamicWindow);
  1208                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1209                     length=3;
  1210                     goto outputBytes;
  1211                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
  1212                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1213                 ) {
  1214                     /*
  1215                      * this character is not compressible (a BMP ideograph or similar);
  1216                      * switch to Unicode mode if this is the last character in the block
  1217                      * or there is at least one more ideograph following immediately
  1218                      */
  1219                     isSingleByteMode=FALSE;
  1220                     c|=SCU<<16;
  1221                     length=3;
  1222                     goto outputBytes;
  1223                 } else {
  1224                     /* quote Unicode */
  1225                     c|=SQU<<16;
  1226                     length=3;
  1227                     goto outputBytes;
  1231             /* normal end of conversion: prepare for a new character */
  1232             c=0;
  1233             sourceIndex=nextSourceIndex;
  1235     } else {
  1236         if(c!=0 && targetCapacity>0) {
  1237             goto getTrailUnicode;
  1240         /* state machine for Unicode mode */
  1241 /* unicodeByteMode: */
  1242         while(source<sourceLimit) {
  1243             if(targetCapacity<=0) {
  1244                 /* target is full */
  1245                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1246                 break;
  1248             c=*source++;
  1249             ++nextSourceIndex;
  1251             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
  1252                 /* not compressible, write character directly */
  1253                 if(targetCapacity>=2) {
  1254                     *target++=(uint8_t)(c>>8);
  1255                     *target++=(uint8_t)c;
  1256                     if(offsets!=NULL) {
  1257                         *offsets++=sourceIndex;
  1258                         *offsets++=sourceIndex;
  1260                     targetCapacity-=2;
  1261                 } else {
  1262                     length=2;
  1263                     goto outputBytes;
  1265             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
  1266                 /* compress BMP character if the following one is not an uncompressible ideograph */
  1267                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1268                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
  1269                         /* ASCII digit or letter */
  1270                         isSingleByteMode=TRUE;
  1271                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1272                         length=2;
  1273                         goto outputBytes;
  1274                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1275                         /* there is a dynamic window that contains this character, change to it */
  1276                         isSingleByteMode=TRUE;
  1277                         dynamicWindow=window;
  1278                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1279                         useDynamicWindow(scsu, dynamicWindow);
  1280                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1281                         length=2;
  1282                         goto outputBytes;
  1283                     } else if((code=getDynamicOffset(c, &offset))>=0) {
  1284                         /* define a dynamic window with this character */
  1285                         isSingleByteMode=TRUE;
  1286                         dynamicWindow=getNextDynamicWindow(scsu);
  1287                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1288                         useDynamicWindow(scsu, dynamicWindow);
  1289                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1290                         length=3;
  1291                         goto outputBytes;
  1295                 /* don't know how to compress this character, just write it directly */
  1296                 length=2;
  1297                 goto outputBytes;
  1298             } else if(c<0xe000) {
  1299                 /* c is a surrogate */
  1300                 if(U16_IS_SURROGATE_LEAD(c)) {
  1301 getTrailUnicode:
  1302                     lead=(UChar)c;
  1303                     if(source<sourceLimit) {
  1304                         /* test the following code unit */
  1305                         trail=*source;
  1306                         if(U16_IS_TRAIL(trail)) {
  1307                             ++source;
  1308                             ++nextSourceIndex;
  1309                             c=U16_GET_SUPPLEMENTARY(c, trail);
  1310                             /* convert this surrogate code point */
  1311                             /* exit this condition tree */
  1312                         } else {
  1313                             /* this is an unmatched lead code unit (1st surrogate) */
  1314                             /* callback(illegal) */
  1315                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1316                             goto endloop;
  1318                     } else {
  1319                         /* no more input */
  1320                         break;
  1322                 } else {
  1323                     /* this is an unmatched trail code unit (2nd surrogate) */
  1324                     /* callback(illegal) */
  1325                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1326                     goto endloop;
  1329                 /* compress supplementary character */
  1330                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1331                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1332                 ) {
  1333                     /*
  1334                      * there is a dynamic window that contains this character and
  1335                      * the following character is not uncompressible,
  1336                      * change to the window
  1337                      */
  1338                     isSingleByteMode=TRUE;
  1339                     dynamicWindow=window;
  1340                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1341                     useDynamicWindow(scsu, dynamicWindow);
  1342                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1343                     length=2;
  1344                     goto outputBytes;
  1345                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1346                           (code=getDynamicOffset(c, &offset))>=0
  1347                 ) {
  1348                     /* two supplementary characters in (probably) the same window - define an extended one */
  1349                     isSingleByteMode=TRUE;
  1350                     code-=0x200;
  1351                     dynamicWindow=getNextDynamicWindow(scsu);
  1352                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1353                     useDynamicWindow(scsu, dynamicWindow);
  1354                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1355                     length=4;
  1356                     goto outputBytes;
  1357                 } else {
  1358                     /* don't know how to compress this character, just write it directly */
  1359                     c=((uint32_t)lead<<16)|trail;
  1360                     length=4;
  1361                     goto outputBytes;
  1363             } else /* 0xe000<=c<0xf300 */ {
  1364                 /* quote to avoid SCSU tags */
  1365                 c|=UQU<<16;
  1366                 length=3;
  1367                 goto outputBytes;
  1370             /* normal end of conversion: prepare for a new character */
  1371             c=0;
  1372             sourceIndex=nextSourceIndex;
  1375 endloop:
  1377     /* set the converter state back into UConverter */
  1378     scsu->fromUIsSingleByteMode=isSingleByteMode;
  1379     scsu->fromUDynamicWindow=dynamicWindow;
  1381     cnv->fromUChar32=c;
  1383     /* write back the updated pointers */
  1384     pArgs->source=source;
  1385     pArgs->target=(char *)target;
  1386     pArgs->offsets=offsets;
  1387     return;
  1389 outputBytes:
  1390     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1391     /* from the first if in the loop we know that targetCapacity>0 */
  1392     if(length<=targetCapacity) {
  1393         if(offsets==NULL) {
  1394             switch(length) {
  1395                 /* each branch falls through to the next one */
  1396             case 4:
  1397                 *target++=(uint8_t)(c>>24);
  1398             case 3: /*fall through*/
  1399                 *target++=(uint8_t)(c>>16);
  1400             case 2: /*fall through*/
  1401                 *target++=(uint8_t)(c>>8);
  1402             case 1: /*fall through*/
  1403                 *target++=(uint8_t)c;
  1404             default:
  1405                 /* will never occur */
  1406                 break;
  1408         } else {
  1409             switch(length) {
  1410                 /* each branch falls through to the next one */
  1411             case 4:
  1412                 *target++=(uint8_t)(c>>24);
  1413                 *offsets++=sourceIndex;
  1414             case 3: /*fall through*/
  1415                 *target++=(uint8_t)(c>>16);
  1416                 *offsets++=sourceIndex;
  1417             case 2: /*fall through*/
  1418                 *target++=(uint8_t)(c>>8);
  1419                 *offsets++=sourceIndex;
  1420             case 1: /*fall through*/
  1421                 *target++=(uint8_t)c;
  1422                 *offsets++=sourceIndex;
  1423             default:
  1424                 /* will never occur */
  1425                 break;
  1428         targetCapacity-=length;
  1430         /* normal end of conversion: prepare for a new character */
  1431         c=0;
  1432         sourceIndex=nextSourceIndex;
  1433         goto loop;
  1434     } else {
  1435         uint8_t *p;
  1437         /*
  1438          * We actually do this backwards here:
  1439          * In order to save an intermediate variable, we output
  1440          * first to the overflow buffer what does not fit into the
  1441          * regular target.
  1442          */
  1443         /* we know that 0<=targetCapacity<length<=4 */
  1444         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  1445         length-=targetCapacity;
  1446         p=(uint8_t *)cnv->charErrorBuffer;
  1447         switch(length) {
  1448             /* each branch falls through to the next one */
  1449         case 4:
  1450             *p++=(uint8_t)(c>>24);
  1451         case 3: /*fall through*/
  1452             *p++=(uint8_t)(c>>16);
  1453         case 2: /*fall through*/
  1454             *p++=(uint8_t)(c>>8);
  1455         case 1: /*fall through*/
  1456             *p=(uint8_t)c;
  1457         default:
  1458             /* will never occur */
  1459             break;
  1461         cnv->charErrorBufferLength=(int8_t)length;
  1463         /* now output what fits into the regular target */
  1464         c>>=8*length; /* length was reduced by targetCapacity */
  1465         switch(targetCapacity) {
  1466             /* each branch falls through to the next one */
  1467         case 3:
  1468             *target++=(uint8_t)(c>>16);
  1469             if(offsets!=NULL) {
  1470                 *offsets++=sourceIndex;
  1472         case 2: /*fall through*/
  1473             *target++=(uint8_t)(c>>8);
  1474             if(offsets!=NULL) {
  1475                 *offsets++=sourceIndex;
  1477         case 1: /*fall through*/
  1478             *target++=(uint8_t)c;
  1479             if(offsets!=NULL) {
  1480                 *offsets++=sourceIndex;
  1482         default:
  1483             break;
  1486         /* target overflow */
  1487         targetCapacity=0;
  1488         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1489         c=0;
  1490         goto endloop;
  1494 /*
  1495  * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
  1496  * If a change is made in the original function, then either
  1497  * change this function the same way or
  1498  * re-copy the original function and remove the variables
  1499  * offsets, sourceIndex, and nextSourceIndex.
  1500  */
  1501 static void
  1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
  1503                  UErrorCode *pErrorCode) {
  1504     UConverter *cnv;
  1505     SCSUData *scsu;
  1506     const UChar *source, *sourceLimit;
  1507     uint8_t *target;
  1508     int32_t targetCapacity;
  1510     UBool isSingleByteMode;
  1511     uint8_t dynamicWindow;
  1512     uint32_t currentOffset;
  1514     uint32_t c, delta;
  1516     int32_t length;
  1518     /* variables for compression heuristics */
  1519     uint32_t offset;
  1520     UChar lead, trail;
  1521     int code;
  1522     int8_t window;
  1524     /* set up the local pointers */
  1525     cnv=pArgs->converter;
  1526     scsu=(SCSUData *)cnv->extraInfo;
  1528     /* set up the local pointers */
  1529     source=pArgs->source;
  1530     sourceLimit=pArgs->sourceLimit;
  1531     target=(uint8_t *)pArgs->target;
  1532     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  1534     /* get the state machine state */
  1535     isSingleByteMode=scsu->fromUIsSingleByteMode;
  1536     dynamicWindow=scsu->fromUDynamicWindow;
  1537     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1539     c=cnv->fromUChar32;
  1541     /* similar conversion "loop" as in toUnicode */
  1542 loop:
  1543     if(isSingleByteMode) {
  1544         if(c!=0 && targetCapacity>0) {
  1545             goto getTrailSingle;
  1548         /* state machine for single-byte mode */
  1549 /* singleByteMode: */
  1550         while(source<sourceLimit) {
  1551             if(targetCapacity<=0) {
  1552                 /* target is full */
  1553                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1554                 break;
  1556             c=*source++;
  1558             if((c-0x20)<=0x5f) {
  1559                 /* pass US-ASCII graphic character through */
  1560                 *target++=(uint8_t)c;
  1561                 --targetCapacity;
  1562             } else if(c<0x20) {
  1563                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1564                     /* CR/LF/TAB/NUL */
  1565                     *target++=(uint8_t)c;
  1566                     --targetCapacity;
  1567                 } else {
  1568                     /* quote C0 control character */
  1569                     c|=SQ0<<8;
  1570                     length=2;
  1571                     goto outputBytes;
  1573             } else if((delta=c-currentOffset)<=0x7f) {
  1574                 /* use the current dynamic window */
  1575                 *target++=(uint8_t)(delta|0x80);
  1576                 --targetCapacity;
  1577             } else if(U16_IS_SURROGATE(c)) {
  1578                 if(U16_IS_SURROGATE_LEAD(c)) {
  1579 getTrailSingle:
  1580                     lead=(UChar)c;
  1581                     if(source<sourceLimit) {
  1582                         /* test the following code unit */
  1583                         trail=*source;
  1584                         if(U16_IS_TRAIL(trail)) {
  1585                             ++source;
  1586                             c=U16_GET_SUPPLEMENTARY(c, trail);
  1587                             /* convert this surrogate code point */
  1588                             /* exit this condition tree */
  1589                         } else {
  1590                             /* this is an unmatched lead code unit (1st surrogate) */
  1591                             /* callback(illegal) */
  1592                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1593                             goto endloop;
  1595                     } else {
  1596                         /* no more input */
  1597                         break;
  1599                 } else {
  1600                     /* this is an unmatched trail code unit (2nd surrogate) */
  1601                     /* callback(illegal) */
  1602                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1603                     goto endloop;
  1606                 /* compress supplementary character U+10000..U+10ffff */
  1607                 if((delta=c-currentOffset)<=0x7f) {
  1608                     /* use the current dynamic window */
  1609                     *target++=(uint8_t)(delta|0x80);
  1610                     --targetCapacity;
  1611                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1612                     /* there is a dynamic window that contains this character, change to it */
  1613                     dynamicWindow=window;
  1614                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1615                     useDynamicWindow(scsu, dynamicWindow);
  1616                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1617                     length=2;
  1618                     goto outputBytes;
  1619                 } else if((code=getDynamicOffset(c, &offset))>=0) {
  1620                     /* might check if there are more characters in this window to come */
  1621                     /* define an extended window with this character */
  1622                     code-=0x200;
  1623                     dynamicWindow=getNextDynamicWindow(scsu);
  1624                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1625                     useDynamicWindow(scsu, dynamicWindow);
  1626                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1627                     length=4;
  1628                     goto outputBytes;
  1629                 } else {
  1630                     /* change to Unicode mode and output this (lead, trail) pair */
  1631                     isSingleByteMode=FALSE;
  1632                     *target++=(uint8_t)SCU;
  1633                     --targetCapacity;
  1634                     c=((uint32_t)lead<<16)|trail;
  1635                     length=4;
  1636                     goto outputBytes;
  1638             } else if(c<0xa0) {
  1639                 /* quote C1 control character */
  1640                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1641                 length=2;
  1642                 goto outputBytes;
  1643             } else if(c==0xfeff || c>=0xfff0) {
  1644                 /* quote signature character=byte order mark and specials */
  1645                 c|=SQU<<16;
  1646                 length=3;
  1647                 goto outputBytes;
  1648             } else {
  1649                 /* compress all other BMP characters */
  1650                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1651                     /* there is a window defined that contains this character - switch to it or quote from it? */
  1652                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1653                         /* change to dynamic window */
  1654                         dynamicWindow=window;
  1655                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1656                         useDynamicWindow(scsu, dynamicWindow);
  1657                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1658                         length=2;
  1659                         goto outputBytes;
  1660                     } else {
  1661                         /* quote from dynamic window */
  1662                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1663                         length=2;
  1664                         goto outputBytes;
  1666                 } else if((window=getWindow(staticOffsets, c))>=0) {
  1667                     /* quote from static window */
  1668                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1669                     length=2;
  1670                     goto outputBytes;
  1671                 } else if((code=getDynamicOffset(c, &offset))>=0) {
  1672                     /* define a dynamic window with this character */
  1673                     dynamicWindow=getNextDynamicWindow(scsu);
  1674                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1675                     useDynamicWindow(scsu, dynamicWindow);
  1676                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1677                     length=3;
  1678                     goto outputBytes;
  1679                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
  1680                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1681                 ) {
  1682                     /*
  1683                      * this character is not compressible (a BMP ideograph or similar);
  1684                      * switch to Unicode mode if this is the last character in the block
  1685                      * or there is at least one more ideograph following immediately
  1686                      */
  1687                     isSingleByteMode=FALSE;
  1688                     c|=SCU<<16;
  1689                     length=3;
  1690                     goto outputBytes;
  1691                 } else {
  1692                     /* quote Unicode */
  1693                     c|=SQU<<16;
  1694                     length=3;
  1695                     goto outputBytes;
  1699             /* normal end of conversion: prepare for a new character */
  1700             c=0;
  1702     } else {
  1703         if(c!=0 && targetCapacity>0) {
  1704             goto getTrailUnicode;
  1707         /* state machine for Unicode mode */
  1708 /* unicodeByteMode: */
  1709         while(source<sourceLimit) {
  1710             if(targetCapacity<=0) {
  1711                 /* target is full */
  1712                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1713                 break;
  1715             c=*source++;
  1717             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
  1718                 /* not compressible, write character directly */
  1719                 if(targetCapacity>=2) {
  1720                     *target++=(uint8_t)(c>>8);
  1721                     *target++=(uint8_t)c;
  1722                     targetCapacity-=2;
  1723                 } else {
  1724                     length=2;
  1725                     goto outputBytes;
  1727             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
  1728                 /* compress BMP character if the following one is not an uncompressible ideograph */
  1729                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1730                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
  1731                         /* ASCII digit or letter */
  1732                         isSingleByteMode=TRUE;
  1733                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1734                         length=2;
  1735                         goto outputBytes;
  1736                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1737                         /* there is a dynamic window that contains this character, change to it */
  1738                         isSingleByteMode=TRUE;
  1739                         dynamicWindow=window;
  1740                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1741                         useDynamicWindow(scsu, dynamicWindow);
  1742                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1743                         length=2;
  1744                         goto outputBytes;
  1745                     } else if((code=getDynamicOffset(c, &offset))>=0) {
  1746                         /* define a dynamic window with this character */
  1747                         isSingleByteMode=TRUE;
  1748                         dynamicWindow=getNextDynamicWindow(scsu);
  1749                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1750                         useDynamicWindow(scsu, dynamicWindow);
  1751                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1752                         length=3;
  1753                         goto outputBytes;
  1757                 /* don't know how to compress this character, just write it directly */
  1758                 length=2;
  1759                 goto outputBytes;
  1760             } else if(c<0xe000) {
  1761                 /* c is a surrogate */
  1762                 if(U16_IS_SURROGATE_LEAD(c)) {
  1763 getTrailUnicode:
  1764                     lead=(UChar)c;
  1765                     if(source<sourceLimit) {
  1766                         /* test the following code unit */
  1767                         trail=*source;
  1768                         if(U16_IS_TRAIL(trail)) {
  1769                             ++source;
  1770                             c=U16_GET_SUPPLEMENTARY(c, trail);
  1771                             /* convert this surrogate code point */
  1772                             /* exit this condition tree */
  1773                         } else {
  1774                             /* this is an unmatched lead code unit (1st surrogate) */
  1775                             /* callback(illegal) */
  1776                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1777                             goto endloop;
  1779                     } else {
  1780                         /* no more input */
  1781                         break;
  1783                 } else {
  1784                     /* this is an unmatched trail code unit (2nd surrogate) */
  1785                     /* callback(illegal) */
  1786                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1787                     goto endloop;
  1790                 /* compress supplementary character */
  1791                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1792                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1793                 ) {
  1794                     /*
  1795                      * there is a dynamic window that contains this character and
  1796                      * the following character is not uncompressible,
  1797                      * change to the window
  1798                      */
  1799                     isSingleByteMode=TRUE;
  1800                     dynamicWindow=window;
  1801                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1802                     useDynamicWindow(scsu, dynamicWindow);
  1803                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1804                     length=2;
  1805                     goto outputBytes;
  1806                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1807                           (code=getDynamicOffset(c, &offset))>=0
  1808                 ) {
  1809                     /* two supplementary characters in (probably) the same window - define an extended one */
  1810                     isSingleByteMode=TRUE;
  1811                     code-=0x200;
  1812                     dynamicWindow=getNextDynamicWindow(scsu);
  1813                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1814                     useDynamicWindow(scsu, dynamicWindow);
  1815                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1816                     length=4;
  1817                     goto outputBytes;
  1818                 } else {
  1819                     /* don't know how to compress this character, just write it directly */
  1820                     c=((uint32_t)lead<<16)|trail;
  1821                     length=4;
  1822                     goto outputBytes;
  1824             } else /* 0xe000<=c<0xf300 */ {
  1825                 /* quote to avoid SCSU tags */
  1826                 c|=UQU<<16;
  1827                 length=3;
  1828                 goto outputBytes;
  1831             /* normal end of conversion: prepare for a new character */
  1832             c=0;
  1835 endloop:
  1837     /* set the converter state back into UConverter */
  1838     scsu->fromUIsSingleByteMode=isSingleByteMode;
  1839     scsu->fromUDynamicWindow=dynamicWindow;
  1841     cnv->fromUChar32=c;
  1843     /* write back the updated pointers */
  1844     pArgs->source=source;
  1845     pArgs->target=(char *)target;
  1846     return;
  1848 outputBytes:
  1849     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1850     /* from the first if in the loop we know that targetCapacity>0 */
  1851     if(length<=targetCapacity) {
  1852         switch(length) {
  1853             /* each branch falls through to the next one */
  1854         case 4:
  1855             *target++=(uint8_t)(c>>24);
  1856         case 3: /*fall through*/
  1857             *target++=(uint8_t)(c>>16);
  1858         case 2: /*fall through*/
  1859             *target++=(uint8_t)(c>>8);
  1860         case 1: /*fall through*/
  1861             *target++=(uint8_t)c;
  1862         default:
  1863             /* will never occur */
  1864             break;
  1866         targetCapacity-=length;
  1868         /* normal end of conversion: prepare for a new character */
  1869         c=0;
  1870         goto loop;
  1871     } else {
  1872         uint8_t *p;
  1874         /*
  1875          * We actually do this backwards here:
  1876          * In order to save an intermediate variable, we output
  1877          * first to the overflow buffer what does not fit into the
  1878          * regular target.
  1879          */
  1880         /* we know that 0<=targetCapacity<length<=4 */
  1881         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  1882         length-=targetCapacity;
  1883         p=(uint8_t *)cnv->charErrorBuffer;
  1884         switch(length) {
  1885             /* each branch falls through to the next one */
  1886         case 4:
  1887             *p++=(uint8_t)(c>>24);
  1888         case 3: /*fall through*/
  1889             *p++=(uint8_t)(c>>16);
  1890         case 2: /*fall through*/
  1891             *p++=(uint8_t)(c>>8);
  1892         case 1: /*fall through*/
  1893             *p=(uint8_t)c;
  1894         default:
  1895             /* will never occur */
  1896             break;
  1898         cnv->charErrorBufferLength=(int8_t)length;
  1900         /* now output what fits into the regular target */
  1901         c>>=8*length; /* length was reduced by targetCapacity */
  1902         switch(targetCapacity) {
  1903             /* each branch falls through to the next one */
  1904         case 3:
  1905             *target++=(uint8_t)(c>>16);
  1906         case 2: /*fall through*/
  1907             *target++=(uint8_t)(c>>8);
  1908         case 1: /*fall through*/
  1909             *target++=(uint8_t)c;
  1910         default:
  1911             break;
  1914         /* target overflow */
  1915         targetCapacity=0;
  1916         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1917         c=0;
  1918         goto endloop;
  1922 /* miscellaneous ------------------------------------------------------------ */
  1924 static const char *
  1925 _SCSUGetName(const UConverter *cnv) {
  1926     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
  1928     switch(scsu->locale) {
  1929     case l_ja:
  1930         return "SCSU,locale=ja";
  1931     default:
  1932         return "SCSU";
  1936 /*
        * structure for SafeClone calculations:
        * _SCSUSafeClone() reports sizeof(struct cloneSCSUStruct) as the buffer
        * size it needs and then treats the caller's buffer as this struct, so
        * that one buffer holds the converter and its private SCSU state.
        */
  1937 struct cloneSCSUStruct
  1939     UConverter cnv; /* the clone itself; _SCSUSafeClone() returns its address */
  1940     SCSUData mydata; /* copy of the original's extraInfo; cnv.extraInfo is pointed here */
  1941 };
  1943 static UConverter * 
  1944 _SCSUSafeClone(const UConverter *cnv, 
  1945                void *stackBuffer, 
  1946                int32_t *pBufferSize, 
  1947                UErrorCode *status)
  1949     struct cloneSCSUStruct * localClone;
  1950     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
  1952     if (U_FAILURE(*status)){
  1953         return 0;
  1956     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
  1957         *pBufferSize = bufferSizeNeeded;
  1958         return 0;
  1961     localClone = (struct cloneSCSUStruct *)stackBuffer;
  1962     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  1964     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
  1965     localClone->cnv.extraInfo = &localClone->mydata;
  1966     localClone->cnv.isExtraLocal = TRUE;
  1968     return &localClone->cnv;
  1972 static const UConverterImpl _SCSUImpl={
           /*
            * Function table for the SCSU converter.
            * The entries are positional; what each slot means is fixed by the
            * UConverterImpl declaration, which is not part of this file
            * (presumably in ucnv_bld.h or ucnv_cnv.h - confirm there before
            * reordering entries or filling one of the NULL slots).
            */
  1973     UCNV_SCSU, /* converter type constant */
  1975     NULL, /* slot not provided by SCSU */
  1976     NULL, /* slot not provided by SCSU */
  1978     _SCSUOpen,
  1979     _SCSUClose,
  1980     _SCSUReset,
  1982     _SCSUToUnicode,
  1983     _SCSUToUnicodeWithOffsets,
  1984     _SCSUFromUnicode, /* UTF-16 to SCSU, no offsets */
  1985     _SCSUFromUnicodeWithOffsets, /* same conversion, also filling the offsets array */
  1986     NULL, /* slot not provided by SCSU */
  1988     NULL, /* slot not provided by SCSU */
  1989     _SCSUGetName, /* returns "SCSU" or "SCSU,locale=ja" */
  1990     NULL, /* slot not provided by SCSU */
  1991     _SCSUSafeClone, /* clones the SCSU state into a caller-provided buffer */
  1992     ucnv_getCompleteUnicodeSet /* shared helper, not defined in this file */
  1993 };
  1995 static const UConverterStaticData _SCSUStaticData={
           /*
            * Static description of the SCSU converter, referenced from _SCSUData.
            * Fields are positional; see the UConverterStaticData declaration
            * (outside this file) for their exact meaning.
            */
  1996     sizeof(UConverterStaticData),
  1997     "SCSU", /* converter name */
  1998     1212, /* CCSID for SCSU */
  1999     UCNV_IBM, UCNV_SCSU,
  2000     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
  2001     /*
  2002      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
  2003      * substitution string.
  2004      */
  2005     { 0x0e, 0xff, 0xfd, 0 }, 3, /* 0x0e is SQU (quote one Unicode character), then 0xff 0xfd; presumably 3 is the byte count */
  2006     FALSE, FALSE,
  2007     0,
  2008     0,
  2009     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  2010 };
  2012 const UConverterSharedData _SCSUData={
           /*
            * Shared data object for SCSU; the only definition in this part of
            * the file with external linkage. It ties together the static
            * description (_SCSUStaticData) and the function table (_SCSUImpl).
            * NOTE(review): ~((uint32_t)0) is presumably a reference counter
            * pinned at its maximum so this static object is never released -
            * confirm against the UConverterSharedData declaration.
            */
  2013     sizeof(UConverterSharedData), ~((uint32_t)0),
  2014     NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
  2016 };
  2018 #endif

mercurial