intl/icu/source/common/ucnv2022.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnv2022.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,3951 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2000-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnv2022.cpp
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2000feb03
    1.15 +*   created by: Markus W. Scherer
    1.16 +*
    1.17 +*   Change history:
    1.18 +*
    1.19 +*   06/29/2000  helena  Major rewrite of the callback APIs.
    1.20 +*   08/08/2000  Ram     Included support for ISO-2022-JP-2
    1.21 +*                       Changed implementation of toUnicode
    1.22 +*                       function
    1.23 +*   08/21/2000  Ram     Added support for ISO-2022-KR
    1.24 +*   08/29/2000  Ram     Separated implementation of EBCDIC to
    1.25 +*                       ucnvebdc.c
    1.26 +*   09/20/2000  Ram     Added support for ISO-2022-CN
    1.27 +*                       Added implementations for getNextUChar()
    1.28 +*                       for specific 2022 country variants.
    1.29 +*   10/31/2000  Ram     Implemented offsets logic functions
    1.30 +*/
    1.31 +
    1.32 +#include "unicode/utypes.h"
    1.33 +
    1.34 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
    1.35 +
    1.36 +#include "unicode/ucnv.h"
    1.37 +#include "unicode/uset.h"
    1.38 +#include "unicode/ucnv_err.h"
    1.39 +#include "unicode/ucnv_cb.h"
    1.40 +#include "unicode/utf16.h"
    1.41 +#include "ucnv_imp.h"
    1.42 +#include "ucnv_bld.h"
    1.43 +#include "ucnv_cnv.h"
    1.44 +#include "ucnvmbcs.h"
    1.45 +#include "cstring.h"
    1.46 +#include "cmemory.h"
    1.47 +#include "uassert.h"
    1.48 +
    1.49 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.50 +
    1.51 +#ifdef U_ENABLE_GENERIC_ISO_2022
    1.52 +/*
    1.53 + * I am disabling the generic ISO-2022 converter after proposing to do so on
    1.54 + * the icu mailing list two days ago.
    1.55 + *
    1.56 + * Reasons:
    1.57 + * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
    1.58 + *    its designation sequences, single shifts with return to the previous state,
    1.59 + *    switch-with-no-return to UTF-16BE or similar, etc.
    1.60 + *    This is unlike the language-specific variants like ISO-2022-JP which
    1.61 + *    require a much smaller repertoire of ISO-2022 features.
    1.62 + *    These variants continue to be supported.
    1.63 + * 2. I believe that no one is really using the generic ISO-2022 converter
    1.64 + *    but rather always one of the language-specific variants.
    1.65 + *    Note that ICU's generic ISO-2022 converter has always output one escape
    1.66 + *    sequence followed by UTF-8 for the whole stream.
    1.67 + * 3. Switching between subcharsets is extremely slow, because each time
    1.68 + *    the previous converter is closed and a new one opened,
    1.69 + *    without any kind of caching, least-recently-used list, etc.
    1.70 + * 4. The code is currently buggy, and given the above it does not seem
    1.71 + *    reasonable to spend the time on maintenance.
    1.72 + * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
    1.73 + *    This means, for example, that when ISO-8859-7 is designated, the following
    1.74 + *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
    1.75 + *    The ICU ISO-2022 converter does not handle this - and has no information
    1.76 + *    about which subconverter would have to be shifted vs. which is designed
    1.77 + *    for 7-bit ISO-2022.
    1.78 + *
    1.79 + * Markus Scherer 2003-dec-03
    1.80 + */
    1.81 +#endif
    1.82 +
    1.83 +static const char SHIFT_IN_STR[]  = "\x0F";
    1.84 +// static const char SHIFT_OUT_STR[] = "\x0E";
    1.85 +
    1.86 +#define CR      0x0D
    1.87 +#define LF      0x0A
    1.88 +#define H_TAB   0x09
    1.89 +#define V_TAB   0x0B
    1.90 +#define SPACE   0x20
    1.91 +
    1.92 +enum {
    1.93 +    HWKANA_START=0xff61,
    1.94 +    HWKANA_END=0xff9f
    1.95 +};
    1.96 +
    1.97 +/*
    1.98 + * 94-character sets with native byte values A1..FE are encoded in ISO 2022
    1.99 + * as bytes 21..7E. (Subtract 0x80.)
   1.100 + * 96-character sets with native byte values A0..FF are encoded in ISO 2022
   1.101 + * as bytes 20..7F. (Subtract 0x80.)
   1.102 + * Do not encode C1 control codes with native bytes 80..9F
   1.103 + * as bytes 00..1F (C0 control codes).
   1.104 + */
   1.105 +enum {
   1.106 +    GR94_START=0xa1,
   1.107 +    GR94_END=0xfe,
   1.108 +    GR96_START=0xa0,
   1.109 +    GR96_END=0xff
   1.110 +};
   1.111 +
   1.112 +/*
   1.113 + * ISO 2022 control codes must not be converted from Unicode
   1.114 + * because they would mess up the byte stream.
   1.115 + * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
   1.116 + * corresponding to SO, SI, and ESC.
   1.117 + */
   1.118 +#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
   1.119 +
   1.120 +/* for ISO-2022-JP and -CN implementations */
   1.121 +typedef enum  { /* charset/state numbers for the JP and CN variants; the JP and CN groups below reuse the values 1..3 */
   1.122 +        /* shared values */
   1.123 +        INVALID_STATE=-1,
   1.124 +        ASCII = 0,
   1.125 +
   1.126 +        SS2_STATE=0x10,
   1.127 +        SS3_STATE, /* implicit value 0x11 */
   1.128 +
   1.129 +        /* JP: these values index myConverterArray[] and are the bit positions that CSM() sets in jpCharsetMasks[] */
   1.130 +        ISO8859_1 = 1 ,
   1.131 +        ISO8859_7 = 2 ,
   1.132 +        JISX201  = 3,
   1.133 +        JISX208 = 4,
   1.134 +        JISX212 = 5,
   1.135 +        GB2312  =6,
   1.136 +        KSC5601 =7,
   1.137 +        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
   1.138 +
   1.139 +        /* CN: same numeric values as ISO8859_1, ISO8859_7 and JISX201 above */
   1.140 +        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
   1.141 +        GB2312_1=1,
   1.142 +        ISO_IR_165=2,
   1.143 +        CNS_11643=3,
   1.144 +
   1.145 +        /*
   1.146 +         * these are used in StateEnum and ISO2022State variables,
   1.147 +         * but CNS_11643 must be used to index into myConverterArray[]
   1.148 +         */
   1.149 +        CNS_11643_0=0x20,
   1.150 +        CNS_11643_1, /* implicit values continue from here: 0x21 .. 0x27 */
   1.151 +        CNS_11643_2,
   1.152 +        CNS_11643_3,
   1.153 +        CNS_11643_4,
   1.154 +        CNS_11643_5,
   1.155 +        CNS_11643_6,
   1.156 +        CNS_11643_7
   1.157 +} StateEnum;
   1.158 +
   1.159 +/* is the StateEnum charset value for a DBCS charset? */
   1.160 +#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
   1.161 +
   1.162 +#define CSM(cs) ((uint16_t)1<<(cs))
   1.163 +
   1.164 +/*
   1.165 + * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
   1.166 + * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
   1.167 + *
   1.168 + * Note: The converter uses some leniency:
   1.169 + * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
   1.170 + *   all versions, not just JIS7 and JIS8.
   1.171 + * - ICU does not distinguish between different versions of JIS X 0208.
   1.172 + */
   1.173 +enum { MAX_JA_VERSION=4 }; /* highest valid index into jpCharsetMasks[] */
   1.174 +static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ /* one CSM() bit mask per version, indexed by version number */
   1.175 +    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), /* version 0 */
   1.176 +    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), /* version 1: adds JISX212 */
   1.177 +    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 2: adds GB2312, KSC5601, ISO8859_1, ISO8859_7 */
   1.178 +    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 3: same mask as version 2 */
   1.179 +    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) /* version 4: same mask as version 2 */
   1.180 +};
   1.181 +
   1.182 +typedef enum {
   1.183 +        ASCII1=0,
   1.184 +        LATIN1,
   1.185 +        SBCS,
   1.186 +        DBCS,
   1.187 +        MBCS,
   1.188 +        HWKANA
   1.189 +}Cnv2022Type;
   1.190 +
   1.191 +typedef struct ISO2022State { /* designation and shift state; UConverterDataISO2022 holds two of these (toU2022State, fromU2022State) */
   1.192 +    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
   1.193 +    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
   1.194 +    int8_t prevG;       /* g before single shift (SS2 or SS3) */
   1.195 +} ISO2022State;
   1.196 +
   1.197 +#define UCNV_OPTIONS_VERSION_MASK 0xf
   1.198 +#define UCNV_2022_MAX_CONVERTERS 10
   1.199 +
   1.200 +typedef struct{ /* per-converter ISO-2022 data; _ISO2022Open() allocates it, zero-fills it and stores it in cnv->extraInfo */
   1.201 +    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* shared data loaded via ucnv_loadSharedData(), indexed by StateEnum charset number */
   1.202 +    UConverter *currentConverter; /* for the ko variant: converter opened with ucnv_open() in _ISO2022Open() */
   1.203 +    Cnv2022Type currentType; /* set to ASCII1 by _ISO2022Open() */
   1.204 +    ISO2022State toU2022State, fromU2022State; /* NOTE(review): by their names, one state per conversion direction - confirm */
   1.205 +    uint32_t key; /* NOTE(review): presumably the escape-sequence key looked up in escSeqStateTable_Key_2022[] - confirm against its users */
   1.206 +    uint32_t version; /* pArgs->options & UCNV_OPTIONS_VERSION_MASK, then adjusted per locale in _ISO2022Open() */
   1.207 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.208 +    UBool isFirstBuffer; /* set to TRUE by _ISO2022Open() for the generic converter */
   1.209 +#endif
   1.210 +    UBool isEmptySegment;
   1.211 +    char name[30]; /* converter name, e.g. "ISO_2022,locale=ja,version=0" */
   1.212 +    char locale[3]; /* two-letter tag plus NUL; _ISO2022Open() writes "ja", "ko" or "cn" */
   1.213 +}UConverterDataISO2022;
   1.214 +
   1.215 +/* Protos */
   1.216 +/* ISO-2022 ----------------------------------------------------------------- */
   1.217 +
   1.218 +/*Forward declaration */
   1.219 +U_CFUNC void
   1.220 +ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
   1.221 +                      UErrorCode * err);
   1.222 +U_CFUNC void
   1.223 +ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
   1.224 +                                    UErrorCode * err);
   1.225 +
   1.226 +#define ESC_2022 0x1B /*ESC*/
   1.227 +
   1.228 +typedef enum
   1.229 +{
   1.230 +        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
   1.231 +        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
   1.232 +        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
   1.233 +        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
   1.234 +} UCNV_TableStates_2022;
   1.235 +
   1.236 +/*
   1.237 +* The way these state transition arrays work is:
   1.238 +* ex : ESC$B is the sequence for JISX208
   1.239 +*      a) First Iteration: char is ESC
   1.240 +*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
   1.241 +*             int x = normalize_esq_chars_2022[27] which is equal to 1
   1.242 +*         ii) Search for this value in escSeqStateTable_Key_2022[]
   1.243 +*             value of x is stored at escSeqStateTable_Key_2022[0]
   1.244 +*        iii) Save this index as offset
   1.245 +*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
   1.246 +*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
   1.247 +*     b) Switch on this state and continue to next char
   1.248 +*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
   1.249 +*             which is normalize_esq_chars_2022[36] == 4
   1.250 +*         ii) x is currently 1(from above)
   1.251 +*               x<<=5 -- x is now 32
   1.252 +*               x+=normalize_esq_chars_2022[36]
   1.253 +*               now x is 36
   1.254 +*        iii) Search for this value in escSeqStateTable_Key_2022[]
   1.255 +*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
   1.256 +*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
   1.257 +*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
   1.258 +*     c) Switch on this state and continue to next char
   1.259 +*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
   1.260 +*        ii) x is currently 36 (from above)
   1.261 +*            x<<=5 -- x is now 1152
   1.262 +*            x+=normalize_esq_chars_2022[66]
   1.263 +*            now x is 1161
   1.264 +*       iii) Search for this value in escSeqStateTable_Key_2022[]
   1.265 +*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
   1.266 +*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
   1.267 +*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
   1.268 +*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
   1.269 +*/
   1.270 +
   1.271 +
   1.272 +/*Below are the 3 arrays depicting a state transition table*/
   1.273 +static const int8_t normalize_esq_chars_2022[256] = {
   1.274 +/*       0      1       2       3       4      5       6        7       8       9           */
   1.275 +
   1.276 +         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.277 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.278 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
   1.279 +        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
   1.280 +        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
   1.281 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.282 +        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
   1.283 +        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
   1.284 +        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.285 +        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.286 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.287 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.288 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.289 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.290 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.291 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.292 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.293 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.294 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.295 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.296 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.297 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.298 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.299 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.300 +        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
   1.301 +        ,0     ,0      ,0      ,0      ,0      ,0
   1.302 +};
   1.303 +
   1.304 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.305 +/*
   1.306 + * When the generic ISO-2022 converter is completely removed, not just disabled
   1.307 + * per #ifdef, then the following state table and the associated tables that are
   1.308 + * dimensioned with MAX_STATES_2022 should be trimmed.
   1.309 + *
   1.310 + * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
   1.311 + * the associated escape sequences starting with ESC ( B should be removed.
   1.312 + * This includes the ones with key values 1097 and all of the ones above 1000000.
   1.313 + *
   1.314 + * For the latter, the tables can simply be truncated.
   1.315 + * For the former, since the tables must be kept parallel, it is probably best
   1.316 + * to simply duplicate an adjacent table cell, parallel in all tables.
   1.317 + *
   1.318 + * It may make sense to restructure the tables, especially by using small search
   1.319 + * tables for the variants instead of indexing them parallel to the table here.
   1.320 + */
   1.321 +#endif
   1.322 +
   1.323 +#define MAX_STATES_2022 74
   1.324 +static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { /* keys in ascending order; each key is built as x = (x<<5) + normalize_esq_chars_2022[byte] over the escape sequence; entry i pairs with escSeqStateTable_Value_2022[i] */
   1.325 +/*   0           1           2           3           4           5           6           7           8           9           */
   1.326 +
   1.327 +     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
   1.328 +    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
   1.329 +    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
   1.330 +    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
   1.331 +    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
   1.332 +    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
   1.333 +    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
   1.334 +    ,35947631   ,35947635   ,35947636   ,35947638
   1.335 +};
   1.336 +
   1.337 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.338 +
   1.339 +static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
   1.340 + /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
   1.341 +
   1.342 +     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
   1.343 +    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
   1.344 +    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
   1.345 +    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
   1.346 +    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
   1.347 +    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
   1.348 +    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
   1.349 +    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
   1.350 +};
   1.351 +
   1.352 +#endif
   1.353 +
   1.354 +static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
   1.355 +/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
   1.356 +     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.357 +    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.358 +    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
   1.359 +    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.360 +    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.361 +    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.362 +    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.363 +    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
   1.364 +};
   1.365 +
   1.366 +
   1.367 +/* Type def for refactoring changeState_2022 code*/
   1.368 +typedef enum{
   1.369 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.370 +    ISO_2022=0,
   1.371 +#endif
   1.372 +    ISO_2022_JP=1,
   1.373 +    ISO_2022_KR=2,
   1.374 +    ISO_2022_CN=3
   1.375 +} Variant2022;
   1.376 +
   1.377 +/*********** ISO 2022 Converter Protos ***********/
   1.378 +static void
   1.379 +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
   1.380 +
   1.381 +static void
   1.382 + _ISO2022Close(UConverter *converter);
   1.383 +
   1.384 +static void
   1.385 +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
   1.386 +
   1.387 +static const char*
   1.388 +_ISO2022getName(const UConverter* cnv);
   1.389 +
   1.390 +static void
   1.391 +_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
   1.392 +
   1.393 +static UConverter *
   1.394 +_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
   1.395 +
   1.396 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.397 +static void
   1.398 +T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
   1.399 +#endif
   1.400 +
   1.401 +namespace {
   1.402 +
   1.403 +/*const UConverterSharedData _ISO2022Data;*/
   1.404 +extern const UConverterSharedData _ISO2022JPData;
   1.405 +extern const UConverterSharedData _ISO2022KRData;
   1.406 +extern const UConverterSharedData _ISO2022CNData;
   1.407 +
   1.408 +}  // namespace
   1.409 +
   1.410 +/*************** Converter implementations ******************/
   1.411 +
   1.412 +/* Lets callers that hold a uint8_t* target use ucnv_fromUWriteBytes() (which takes char**) without gcc compiler warnings. */
   1.413 +static inline void
   1.414 +fromUWriteUInt8(UConverter *cnv,
   1.415 +                const char *bytes,
   1.416 +                int32_t length,
   1.417 +                uint8_t **target,
   1.418 +                const char *targetLimit,
   1.419 +                int32_t **offsets, int32_t sourceIndex, UErrorCode *pErrorCode)
   1.420 +{
   1.421 +    /* hand the callee a char* copy of the caller's target pointer */
   1.422 +    char *cursor = (char *)*target;
   1.423 +    ucnv_fromUWriteBytes(cnv, bytes, length, &cursor, targetLimit, offsets, sourceIndex, pErrorCode);
   1.424 +    /* store the (possibly updated) pointer back in the caller's uint8_t* variable */
   1.425 +    *target = (uint8_t*)cursor;
   1.426 +}
   1.427 +
   1.428 +static inline void
   1.429 +setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
   1.430 +    if(myConverterData->version != 1) {     /* nothing is reset for other versions */
   1.431 +        return;
   1.432 +    }
   1.433 +    UConverter *sub = myConverterData->currentConverter;
   1.434 +    sub->toULength=0;           /* byteIndex */
   1.435 +    sub->mode=0;                /* state */
   1.436 +    sub->toUnicodeStatus=0;     /* offset */
   1.437 +}
   1.438 +
   1.439 +static inline void
   1.440 +setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
   1.441 +    /*
   1.442 +     * In ISO-2022-KR the designator sequence appears only once in a file,
   1.443 +     * so it is placed in charErrorBuffer only while that buffer is empty.
   1.444 +     */
   1.445 +    if(converter->charErrorBufferLength==0) {
   1.446 +        converter->charErrorBuffer[0] = 0x1b;   /* ESC */
   1.447 +        converter->charErrorBuffer[1] = 0x24;   /* $ */
   1.448 +        converter->charErrorBuffer[2] = 0x29;   /* ) */
   1.449 +        converter->charErrorBuffer[3] = 0x43;   /* C */
   1.450 +        converter->charErrorBufferLength = 4;
   1.451 +    }
   1.452 +    /* version 1 also resets the fromUnicode fields of the current sub-converter */
   1.453 +    if(myConverterData->version == 1) {
   1.454 +        UConverter *sub = myConverterData->currentConverter;
   1.455 +        sub->fromUnicodeStatus=1;   /* prevLength */
   1.456 +        sub->fromUChar32=0;
   1.457 +    }
   1.458 +}
   1.459 +
   1.460 +static void
   1.461 +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
   1.462 +
   1.463 +    char myLocale[6]={' ',' ',' ',' ',' ',' '};
   1.464 +
   1.465 +    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
   1.466 +    if(cnv->extraInfo != NULL) {
   1.467 +        UConverterNamePieces stackPieces;
   1.468 +        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
   1.469 +        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
   1.470 +        uint32_t version;
   1.471 +
   1.472 +        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
   1.473 +
   1.474 +        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
   1.475 +        myConverterData->currentType = ASCII1;
   1.476 +        cnv->fromUnicodeStatus =FALSE;
   1.477 +        if(pArgs->locale){
   1.478 +            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
   1.479 +        }
   1.480 +        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
   1.481 +        myConverterData->version = version;
   1.482 +        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
   1.483 +            (myLocale[2]=='_' || myLocale[2]=='\0'))
   1.484 +        {
   1.485 +            size_t len=0;
   1.486 +            /* open the required converters and cache them */
   1.487 +            if(version>MAX_JA_VERSION) {
   1.488 +                /* prevent indexing beyond jpCharsetMasks[] */
   1.489 +                myConverterData->version = version = 0;
   1.490 +            }
   1.491 +            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
   1.492 +                myConverterData->myConverterArray[ISO8859_7] =
   1.493 +                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
   1.494 +            }
   1.495 +            myConverterData->myConverterArray[JISX208] =
   1.496 +                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
   1.497 +            if(jpCharsetMasks[version]&CSM(JISX212)) {
   1.498 +                myConverterData->myConverterArray[JISX212] =
   1.499 +                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
   1.500 +            }
   1.501 +            if(jpCharsetMasks[version]&CSM(GB2312)) {
   1.502 +                myConverterData->myConverterArray[GB2312] =
   1.503 +                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
   1.504 +            }
   1.505 +            if(jpCharsetMasks[version]&CSM(KSC5601)) {
   1.506 +                myConverterData->myConverterArray[KSC5601] =
   1.507 +                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
   1.508 +            }
   1.509 +
   1.510 +            /* set the function pointers to appropriate funtions */
   1.511 +            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
   1.512 +            uprv_strcpy(myConverterData->locale,"ja");
   1.513 +
   1.514 +            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
   1.515 +            len = uprv_strlen(myConverterData->name);
   1.516 +            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
   1.517 +            myConverterData->name[len+1]='\0';
   1.518 +        }
   1.519 +        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
   1.520 +            (myLocale[2]=='_' || myLocale[2]=='\0'))
   1.521 +        {
   1.522 +            const char *cnvName;
   1.523 +            if(version==1) {
   1.524 +                cnvName="icu-internal-25546";
   1.525 +            } else {
   1.526 +                cnvName="ibm-949";
   1.527 +                myConverterData->version=version=0;
   1.528 +            }
   1.529 +            if(pArgs->onlyTestIsLoadable) {
   1.530 +                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
   1.531 +                uprv_free(cnv->extraInfo);
   1.532 +                cnv->extraInfo=NULL;
   1.533 +                return;
   1.534 +            } else {
   1.535 +                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
   1.536 +                if (U_FAILURE(*errorCode)) {
   1.537 +                    _ISO2022Close(cnv);
   1.538 +                    return;
   1.539 +                }
   1.540 +
   1.541 +                if(version==1) {
   1.542 +                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
   1.543 +                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
   1.544 +                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
   1.545 +                }else{
   1.546 +                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
   1.547 +                }
   1.548 +
   1.549 +                /* initialize the state variables */
   1.550 +                setInitialStateToUnicodeKR(cnv, myConverterData);
   1.551 +                setInitialStateFromUnicodeKR(cnv, myConverterData);
   1.552 +
   1.553 +                /* set the function pointers to appropriate funtions */
   1.554 +                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
   1.555 +                uprv_strcpy(myConverterData->locale,"ko");
   1.556 +            }
   1.557 +        }
   1.558 +        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
   1.559 +            (myLocale[2]=='_' || myLocale[2]=='\0'))
   1.560 +        {
   1.561 +
   1.562 +            /* open the required converters and cache them */
   1.563 +            myConverterData->myConverterArray[GB2312_1] =
   1.564 +                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
   1.565 +            if(version==1) {
   1.566 +                myConverterData->myConverterArray[ISO_IR_165] =
   1.567 +                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
   1.568 +            }
   1.569 +            myConverterData->myConverterArray[CNS_11643] =
   1.570 +                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
   1.571 +
   1.572 +
   1.573 +            /* set the function pointers to appropriate funtions */
   1.574 +            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
   1.575 +            uprv_strcpy(myConverterData->locale,"cn");
   1.576 +
   1.577 +            if (version==0){
   1.578 +                myConverterData->version = 0;
   1.579 +                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
   1.580 +            }else if (version==1){
   1.581 +                myConverterData->version = 1;
   1.582 +                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
   1.583 +            }else {
   1.584 +                myConverterData->version = 2;
   1.585 +                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
   1.586 +            }
   1.587 +        }
   1.588 +        else{
   1.589 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.590 +            myConverterData->isFirstBuffer = TRUE;
   1.591 +
   1.592 +            /* append the UTF-8 escape sequence */
   1.593 +            cnv->charErrorBufferLength = 3;
   1.594 +            cnv->charErrorBuffer[0] = 0x1b;
   1.595 +            cnv->charErrorBuffer[1] = 0x25;
   1.596 +            cnv->charErrorBuffer[2] = 0x42;
   1.597 +
   1.598 +            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
   1.599 +            /* initialize the state variables */
   1.600 +            uprv_strcpy(myConverterData->name,"ISO_2022");
   1.601 +#else
   1.602 +            *errorCode = U_UNSUPPORTED_ERROR;
   1.603 +            return;
   1.604 +#endif
   1.605 +        }
   1.606 +
   1.607 +        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
   1.608 +
   1.609 +        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
   1.610 +            _ISO2022Close(cnv);
   1.611 +        }
   1.612 +    } else {
   1.613 +        *errorCode = U_MEMORY_ALLOCATION_ERROR;
   1.614 +    }
   1.615 +}
   1.616 +
   1.617 +
   1.618 +static void
   1.619 +_ISO2022Close(UConverter *converter) {  /* releases what is attached to converter->extraInfo; does nothing when extraInfo is NULL */
   1.620 +    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
   1.621 +    UConverterSharedData **array;  /* assigned only after the NULL check below */
   1.622 +    int32_t i;
   1.623 +
   1.624 +    if (myData != NULL) {
   1.625 +        /* unload every cached shared-data entry; the array address is formed only now that myData is known to be non-NULL */
   1.626 +        for (i=0, array=myData->myConverterArray; i<UCNV_2022_MAX_CONVERTERS; i++) {
   1.627 +            if(array[i]!=NULL) {
   1.628 +                ucnv_unloadSharedDataIfReady(array[i]);
   1.629 +            }
   1.630 +        }
   1.631 +
   1.632 +        ucnv_close(myData->currentConverter);  /* the sub-converter held in currentConverter, if any */
   1.633 +
   1.634 +        if(!converter->isExtraLocal){  /* presumably extraInfo lives in caller-provided storage when isExtraLocal is set -- confirm */
   1.635 +            uprv_free (converter->extraInfo);
   1.636 +            converter->extraInfo = NULL;
   1.637 +        }
   1.638 +    }
   1.639 +}
   1.640 +
   1.641 +static void
   1.642 +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {  /* clears the toUnicode and/or fromUnicode ISO-2022 state selected by choice */
   1.643 +    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
   1.644 +    if(choice<=UCNV_RESET_TO_UNICODE) {  /* presumably "both" or "to Unicode" -- depends on the enum order, confirm */
   1.645 +        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
   1.646 +        myConverterData->key = 0;  /* forget any partially read escape sequence */
   1.647 +        myConverterData->isEmptySegment = FALSE;
   1.648 +    }
   1.649 +    if(choice!=UCNV_RESET_TO_UNICODE) {  /* every choice except a toUnicode-only reset */
   1.650 +        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
   1.651 +    }
   1.652 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.653 +    if(myConverterData->locale[0] == 0){  /* empty locale: presumably the generic ISO-2022 converter -- confirm */
   1.654 +        if(choice<=UCNV_RESET_TO_UNICODE) {
   1.655 +            myConverterData->isFirstBuffer = TRUE;
   1.656 +            myConverterData->key = 0;
   1.657 +            if (converter->mode == UCNV_SO){
   1.658 +                ucnv_close (myConverterData->currentConverter);
   1.659 +                myConverterData->currentConverter=NULL;
   1.660 +            }
   1.661 +            converter->mode = UCNV_SI;
   1.662 +        }
   1.663 +        if(choice!=UCNV_RESET_TO_UNICODE) {
   1.664 +            /* re-append escape sequence; NOTE(review): these bytes are ESC ( B (0x28), while the open function appends ESC % B (0x25) as "the UTF-8 escape sequence" -- confirm which one is intended */
   1.665 +            converter->charErrorBufferLength = 3;
   1.666 +            converter->charErrorBuffer[0] = 0x1b;
   1.667 +            converter->charErrorBuffer[1] = 0x28;
   1.668 +            converter->charErrorBuffer[2] = 0x42;
   1.669 +        }
   1.670 +    }
   1.671 +    else
   1.672 +#endif
   1.673 +    {
   1.674 +        /* reset the state variables; only the "ko" locale has extra initial state to restore here */
   1.675 +        if(myConverterData->locale[0] == 'k'){
   1.676 +            if(choice<=UCNV_RESET_TO_UNICODE) {
   1.677 +                setInitialStateToUnicodeKR(converter, myConverterData);
   1.678 +            }
   1.679 +            if(choice!=UCNV_RESET_TO_UNICODE) {
   1.680 +                setInitialStateFromUnicodeKR(converter, myConverterData);
   1.681 +            }
   1.682 +        }
   1.683 +    }
   1.684 +}
   1.685 +
   1.686 +static const char*
   1.687 +_ISO2022getName(const UConverter* cnv){  /* returns the name stored in the ISO-2022 extra data, or NULL when none is attached */
   1.688 +    const UConverterDataISO2022* info = (const UConverterDataISO2022*)cnv->extraInfo;
   1.689 +    if(info == NULL){
   1.690 +        return NULL;  /* no ISO-2022 data attached to this converter */
   1.691 +    }
   1.692 +    return info->name;  /* e.g. "ISO_2022,locale=ko,version=1", as copied in by the open function */
   1.693 +}
   1.694 +
   1.695 +
   1.696 +/*************** to unicode *******************/
   1.697 +/****************************************************************************
   1.698 + * Recognized escape sequences are
   1.699 + * <ESC>(B  ASCII
   1.700 + * <ESC>.A  ISO-8859-1
   1.701 + * <ESC>.F  ISO-8859-7
   1.702 + * <ESC>(J  JISX-201
   1.703 + * <ESC>(I  JISX-201
   1.704 + * <ESC>$B  JISX-208
   1.705 + * <ESC>$@  JISX-208
   1.706 + * <ESC>$(D JISX-212
   1.707 + * <ESC>$A  GB2312
   1.708 + * <ESC>$(C KSC5601
   1.709 + */
   1.710 +static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {  /* indexed by the table offset that getKey_2022() reports; changeState_2022() casts the entry to StateEnum for ISO_2022_JP */
   1.711 +/*      0                1               2               3               4               5               6               7               8               9    */
   1.712 +    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.713 +    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
   1.714 +    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.715 +    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
   1.716 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.717 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.718 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.719 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.720 +};
   1.721 +
   1.722 +/*************** to unicode *******************/
   1.723 +static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {  /* indexed by the table offset that getKey_2022() reports; changeState_2022() casts the entry to StateEnum for ISO_2022_CN */
   1.724 +/*      0                1               2               3               4               5               6               7               8               9    */
   1.725 +     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.726 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.727 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.728 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.729 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
   1.730 +    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.731 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.732 +    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
   1.733 +};
   1.734 +
   1.735 +
   1.736 +static UCNV_TableStates_2022
   1.737 +getKey_2022(char c,int32_t* key,int32_t* offset){  /* folds byte c into *key and looks the new key up; a hit stores the table index in *offset, a miss zeroes *key and *offset and returns INVALID_2022 */
   1.738 +    int32_t togo;
   1.739 +    int32_t low = 0;
   1.740 +    int32_t hi = MAX_STATES_2022;
   1.741 +    int32_t oldmid=0;
   1.742 +
   1.743 +    togo = normalize_esq_chars_2022[(uint8_t)c];
   1.744 +    if(togo == 0) {
   1.745 +        /* not a valid character anywhere in an escape sequence */
   1.746 +        *key = 0;
   1.747 +        *offset = 0;
   1.748 +        return INVALID_2022;
   1.749 +    }
   1.750 +    togo = (*key << 5) + togo;  /* shift the key accumulated so far by 5 bits and add this byte's normalized code */
   1.751 +
   1.752 +    while (hi != low)  /*binary search over escSeqStateTable_Key_2022*/{
   1.753 +
   1.754 +        int32_t mid = (hi+low) >> 1; /* midpoint; plain local: the 'register' specifier is deprecated in C++11 and ill-formed in C++17 */
   1.755 +
   1.756 +        if (mid == oldmid)  /* midpoint did not move, so the range cannot shrink any further: not found */
   1.757 +            break;
   1.758 +
   1.759 +        if (escSeqStateTable_Key_2022[mid] > togo){
   1.760 +            hi = mid;
   1.761 +        }
   1.762 +        else if (escSeqStateTable_Key_2022[mid] < togo){
   1.763 +            low = mid;
   1.764 +        }
   1.765 +        else /*we found it*/{
   1.766 +            *key = togo;
   1.767 +            *offset = mid;
   1.768 +            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
   1.769 +        }
   1.770 +        oldmid = mid;
   1.771 +
   1.772 +    }
   1.773 +
   1.774 +    *key = 0;  /* no table entry for this key: reset the caller's state */
   1.775 +    *offset = 0;
   1.776 +    return INVALID_2022;
   1.777 +}
   1.778 +
   1.779 +/* Runs the escape-sequence state machine over the source bytes and, once a sequence is complete, applies the charset switch it designates for the given variant.
   1.780 + */
   1.781 +static void
   1.782 +changeState_2022(UConverter* _this,
   1.783 +                const char** source,
   1.784 +                const char* sourceLimit,
   1.785 +                Variant2022 var,
   1.786 +                UErrorCode* err){
   1.787 +    UCNV_TableStates_2022 value;
   1.788 +    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
   1.789 +    uint32_t key = myData2022->key;  /* non-zero when an earlier call stopped in the middle of a sequence */
   1.790 +    int32_t offset = 0;
   1.791 +    int8_t initialToULength = _this->toULength;  /* bytes already held in toUBytes on entry */
   1.792 +    char c;
   1.793 +
   1.794 +    value = VALID_NON_TERMINAL_2022;
   1.795 +    while (*source < sourceLimit) {
   1.796 +        c = *(*source)++;
   1.797 +        _this->toUBytes[_this->toULength++]=(uint8_t)c;  /* record every consumed byte; the error path below reports or backs out from this buffer */
   1.798 +        value = getKey_2022(c,(int32_t *) &key, &offset);
   1.799 +
   1.800 +        switch (value){
   1.801 +
   1.802 +        case VALID_NON_TERMINAL_2022 :
   1.803 +            /* continue with the loop */
   1.804 +            break;
   1.805 +
   1.806 +        case VALID_TERMINAL_2022:
   1.807 +            key = 0;
   1.808 +            goto DONE;
   1.809 +
   1.810 +        case INVALID_2022:
   1.811 +            goto DONE;
   1.812 +
   1.813 +        case VALID_MAYBE_TERMINAL_2022:
   1.814 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.815 +            /* ESC ( B is ambiguous only for ISO_2022 itself */
   1.816 +            if(var == ISO_2022) {
   1.817 +                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
   1.818 +                _this->toULength = 0;
   1.819 +
   1.820 +                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
   1.821 +
   1.822 +                /* continue with the loop */
   1.823 +                value = VALID_NON_TERMINAL_2022;
   1.824 +                break;
   1.825 +            } else
   1.826 +#endif
   1.827 +            {
   1.828 +                /* not ISO_2022 itself, finish here */
   1.829 +                value = VALID_TERMINAL_2022;
   1.830 +                key = 0;
   1.831 +                goto DONE;
   1.832 +            }
   1.833 +        }
   1.834 +    }
   1.835 +
   1.836 +DONE:
   1.837 +    myData2022->key = key;  /* persisted so that a later call can resume an incomplete sequence */
   1.838 +
   1.839 +    if (value == VALID_NON_TERMINAL_2022) {
   1.840 +        /* indicate that the escape sequence is incomplete: key!=0 */
   1.841 +        return;
   1.842 +    } else if (value == INVALID_2022 ) {
   1.843 +        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.844 +    } else /* value == VALID_TERMINAL_2022 */ {
   1.845 +        switch(var){
   1.846 +#ifdef U_ENABLE_GENERIC_ISO_2022
   1.847 +        case ISO_2022:
   1.848 +        {
   1.849 +            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
   1.850 +            if(chosenConverterName == NULL) {
   1.851 +                /* SS2 or SS3 */
   1.852 +                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.853 +                _this->toUCallbackReason = UCNV_UNASSIGNED;
   1.854 +                return;
   1.855 +            }
   1.856 +
   1.857 +            _this->mode = UCNV_SI;
   1.858 +            ucnv_close(myData2022->currentConverter);
   1.859 +            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);  /* NOTE(review): myUConverter is not declared in the visible part of this function -- this branch looks like it cannot compile as written; confirm */
   1.860 +            if(U_SUCCESS(*err)) {
   1.861 +                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
   1.862 +                _this->mode = UCNV_SO;
   1.863 +            }
   1.864 +            break;
   1.865 +        }
   1.866 +#endif
   1.867 +        case ISO_2022_JP:
   1.868 +            {
   1.869 +                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];  /* offset is the table index reported by getKey_2022() */
   1.870 +                switch(tempState) {
   1.871 +                case INVALID_STATE:
   1.872 +                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.873 +                    break;
   1.874 +                case SS2_STATE:
   1.875 +                    if(myData2022->toU2022State.cs[2]!=0) {
   1.876 +                        if(myData2022->toU2022State.g<2) {
   1.877 +                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
   1.878 +                        }
   1.879 +                        myData2022->toU2022State.g=2;
   1.880 +                    } else {
   1.881 +                        /* illegal to have SS2 before a matching designator */
   1.882 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.883 +                    }
   1.884 +                    break;
   1.885 +                /* case SS3_STATE: not used in ISO-2022-JP-x */
   1.886 +                case ISO8859_1:
   1.887 +                case ISO8859_7:
   1.888 +                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
   1.889 +                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.890 +                    } else {
   1.891 +                        /* G2 charset for SS2 */
   1.892 +                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
   1.893 +                    }
   1.894 +                    break;
   1.895 +                default:
   1.896 +                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
   1.897 +                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.898 +                    } else {
   1.899 +                        /* G0 charset */
   1.900 +                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
   1.901 +                    }
   1.902 +                    break;
   1.903 +                }
   1.904 +            }
   1.905 +            break;
   1.906 +        case ISO_2022_CN:
   1.907 +            {
   1.908 +                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
   1.909 +                switch(tempState) {
   1.910 +                case INVALID_STATE:
   1.911 +                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.912 +                    break;
   1.913 +                case SS2_STATE:
   1.914 +                    if(myData2022->toU2022State.cs[2]!=0) {
   1.915 +                        if(myData2022->toU2022State.g<2) {
   1.916 +                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
   1.917 +                        }
   1.918 +                        myData2022->toU2022State.g=2;
   1.919 +                    } else {
   1.920 +                        /* illegal to have SS2 before a matching designator */
   1.921 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.922 +                    }
   1.923 +                    break;
   1.924 +                case SS3_STATE:
   1.925 +                    if(myData2022->toU2022State.cs[3]!=0) {
   1.926 +                        if(myData2022->toU2022State.g<2) {
   1.927 +                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
   1.928 +                        }
   1.929 +                        myData2022->toU2022State.g=3;
   1.930 +                    } else {
   1.931 +                        /* illegal to have SS3 before a matching designator */
   1.932 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.933 +                    }
   1.934 +                    break;
   1.935 +                case ISO_IR_165:
   1.936 +                    if(myData2022->version==0) {
   1.937 +                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.938 +                        break;
   1.939 +                    }
   1.940 +                    /*fall through*/
   1.941 +                case GB2312_1:
   1.942 +                    /*fall through*/
   1.943 +                case CNS_11643_1:
   1.944 +                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
   1.945 +                    break;
   1.946 +                case CNS_11643_2:
   1.947 +                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
   1.948 +                    break;
   1.949 +                default:
   1.950 +                    /* other CNS 11643 planes */
   1.951 +                    if(myData2022->version==0) {
   1.952 +                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.953 +                    } else {
   1.954 +                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
   1.955 +                    }
   1.956 +                    break;
   1.957 +                }
   1.958 +            }
   1.959 +            break;
   1.960 +        case ISO_2022_KR:
   1.961 +            if(offset==0x30){  /* presumably the table index of the one designator ISO-2022-KR uses -- TODO confirm against the escape sequence tables */
   1.962 +                /* nothing to be done, just accept this one escape sequence */
   1.963 +            } else {
   1.964 +                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1.965 +            }
   1.966 +            break;
   1.967 +
   1.968 +        default:
   1.969 +            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.970 +            break;
   1.971 +        }
   1.972 +    }
   1.973 +    if(U_SUCCESS(*err)) {
   1.974 +        _this->toULength = 0;  /* sequence accepted: drop the recorded bytes */
   1.975 +    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
   1.976 +        if(_this->toULength>1) {
   1.977 +            /*
   1.978 +             * Ticket 5691: consistent illegal sequences:
   1.979 +             * - We include at least the first byte (ESC) in the illegal sequence.
   1.980 +             * - If any of the non-initial bytes could be the start of a character,
   1.981 +             *   we stop the illegal sequence before the first one of those.
   1.982 +             *   In escape sequences, all following bytes are "printable", that is,
   1.983 +             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
   1.984 +             *   they are valid single/lead bytes.
   1.985 +             *   For simplicity, we always only report the initial ESC byte as the
   1.986 +             *   illegal sequence and back out all other bytes we looked at.
   1.987 +             */
   1.988 +            /* Back out some bytes. */
   1.989 +            int8_t backOutDistance=_this->toULength-1;
   1.990 +            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
   1.991 +            if(backOutDistance<=bytesFromThisBuffer) {
   1.992 +                /* same as initialToULength<=1 */
   1.993 +                *source-=backOutDistance;
   1.994 +            } else {
   1.995 +                /* Back out bytes from the previous buffer: Need to replay them. */
   1.996 +                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   1.997 +                /* same as -(initialToULength-1) */
   1.998 +                /* preToULength is negative! */
   1.999 +                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
  1.1000 +                *source-=bytesFromThisBuffer;
  1.1001 +            }
  1.1002 +            _this->toULength=1;  /* only the leading ESC byte stays in the reported illegal sequence */
  1.1003 +        }
  1.1004 +    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
  1.1005 +        _this->toUCallbackReason = UCNV_UNASSIGNED;
  1.1006 +    }
  1.1007 +}
  1.1008 +
  1.1009 +/* Locates the end of the run of bytes that can be converted without
  1.1010 + * interpreting an escape sequence: scans forward from *source for ESC_2022.
  1.1011 + * Returns a pointer to that ESC byte, or sourceLimit when the rest of the
  1.1012 + * buffer contains none.
  1.1013 + * The flush argument is read only when U_ENABLE_GENERIC_ISO_2022 is defined.
  1.1014 + * It is a named parameter so that branch compiles, and it is explicitly
  1.1015 + * marked as used so the default build does not warn about it.
  1.1016 + */
  1.1017 +static inline const char*
  1.1018 +getEndOfBuffer_2022(const char** source,
  1.1019 +                   const char* sourceLimit,
  1.1020 +                   UBool flush){
  1.1021 +    (void)flush;  /* unused unless U_ENABLE_GENERIC_ISO_2022 is defined */
  1.1022 +    const char* mySource = *source;
  1.1023 +
  1.1024 +#ifdef U_ENABLE_GENERIC_ISO_2022
  1.1025 +    if (*source >= sourceLimit)
  1.1026 +        return sourceLimit;
  1.1027 +
  1.1028 +    do{
  1.1029 +
  1.1030 +        if (*mySource == ESC_2022){
  1.1031 +            int8_t i;
  1.1032 +            int32_t key = 0;
  1.1033 +            int32_t offset;
  1.1034 +            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
  1.1035 +
  1.1036 +            /* Kludge: I could not
  1.1037 +            * figure out the reason for validating an escape sequence
  1.1038 +            * twice - once here and once in changeState_2022().
  1.1039 +            * is it possible to have an ESC character in a ISO2022
  1.1040 +            * byte stream which is valid in a code page? Is it legal?
  1.1041 +            */
  1.1042 +            for (i=0;
  1.1043 +            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
  1.1044 +            i++) {
  1.1045 +                value =  getKey_2022(*(mySource+i), &key, &offset);
  1.1046 +            }
  1.1047 +            if (value > 0 || *mySource==ESC_2022)  /* the second operand is always true inside this if, so the start of the sequence is always returned here */
  1.1048 +                return mySource;
  1.1049 +
  1.1050 +            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
  1.1051 +                return sourceLimit;
  1.1052 +        }
  1.1053 +    }while (++mySource < sourceLimit);
  1.1054 +
  1.1055 +    return sourceLimit;
  1.1056 +#else
  1.1057 +    while(mySource < sourceLimit && *mySource != ESC_2022) {
  1.1058 +        ++mySource;
  1.1059 +    }
  1.1060 +    return mySource;
  1.1061 +#endif
  1.1062 +}
  1.1063 +
  1.1064 +
  1.1065 +/* Looks up the codepage bytes for code point c. This inline function replicates code in the _MBCSFromUChar32() function in ucnvmbcs.c;
  1.1066 + * any future change in the _MBCSFromUChar32() function should be reflected here.
  1.1067 + * @return number of bytes in *value; negative number if fallback; 0 if no mapping
  1.1068 + */
  1.1069 +static inline int32_t
  1.1070 +MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
  1.1071 +                                         UChar32 c,
  1.1072 +                                         uint32_t* value,
  1.1073 +                                         UBool useFallback,
  1.1074 +                                         int outputType)
  1.1075 +{
  1.1076 +    const int32_t *cx;
  1.1077 +    const uint16_t *table;
  1.1078 +    uint32_t stage2Entry;
  1.1079 +    uint32_t myValue;
  1.1080 +    int32_t length;
  1.1081 +    const uint8_t *p;
  1.1082 +    /*
  1.1083 +     * TODO(markus): Use and require new, faster MBCS conversion table structures.
  1.1084 +     * Use internal version of ucnv_open() that verifies that the new structures are available,
  1.1085 +     * else U_INTERNAL_PROGRAM_ERROR.
  1.1086 +     */
  1.1087 +    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
  1.1088 +    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
  1.1089 +        table=sharedData->mbcs.fromUnicodeTable;
  1.1090 +        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
  1.1091 +        /* get the bytes and the length for the output */
  1.1092 +        if(outputType==MBCS_OUTPUT_2){
  1.1093 +            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
  1.1094 +            if(myValue<=0xff) {
  1.1095 +                length=1;
  1.1096 +            } else {
  1.1097 +                length=2;
  1.1098 +            }
  1.1099 +        } else /* outputType==MBCS_OUTPUT_3 */ {
  1.1100 +            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
  1.1101 +            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];  /* assemble the three stored bytes, first byte most significant */
  1.1102 +            if(myValue<=0xff) {
  1.1103 +                length=1;
  1.1104 +            } else if(myValue<=0xffff) {
  1.1105 +                length=2;
  1.1106 +            } else {
  1.1107 +                length=3;
  1.1108 +            }
  1.1109 +        }
  1.1110 +        /* is this code point assigned, or do we use fallbacks? */
  1.1111 +        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {  /* tests bit 16+(c&0xf) of the stage 2 entry */
  1.1112 +            /* assigned */
  1.1113 +            *value=myValue;
  1.1114 +            return length;
  1.1115 +        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
  1.1116 +            /*
  1.1117 +             * We allow a 0 byte output if the "assigned" bit is set for this entry.
  1.1118 +             * There is no way with this data structure for fallback output
  1.1119 +             * to be a zero byte.
  1.1120 +             */
  1.1121 +            *value=myValue;
  1.1122 +            return -length;  /* a negative length marks the result as a fallback */
  1.1123 +        }
  1.1124 +    }
  1.1125 +
  1.1126 +    cx=sharedData->mbcs.extIndexes;
  1.1127 +    if(cx!=NULL) {
  1.1128 +        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);  /* nothing usable in the base table: defer to the extension lookup */
  1.1129 +    }
  1.1130 +
  1.1131 +    /* unassigned */
  1.1132 +    return 0;
  1.1133 +}
  1.1134 +
  1.1135 +/* Looks up the single output byte for code point c. This inline function replicates code in the _MBCSSingleFromUChar32() function in ucnvmbcs.c;
  1.1136 + * any future change in the _MBCSSingleFromUChar32() function should be reflected here.
  1.1137 + * @param retval pointer to output byte
  1.1138 + * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
  1.1139 + */
  1.1140 +static inline int32_t
  1.1141 +MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
  1.1142 +                                       UChar32 c,
  1.1143 +                                       uint32_t* retval,
  1.1144 +                                       UBool useFallback)
  1.1145 +{
  1.1146 +    const uint16_t *table;
  1.1147 +    int32_t value;
  1.1148 +    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
  1.1149 +    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
  1.1150 +        return 0;
  1.1151 +    }
  1.1152 +    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
  1.1153 +    table=sharedData->mbcs.fromUnicodeTable;
  1.1154 +    /* get the byte for the output */
  1.1155 +    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
  1.1156 +    /* is this code point assigned, or do we use fallbacks? */
  1.1157 +    *retval=(uint32_t)(value&0xff);  /* the low byte is always stored, even when 0 (no mapping) is returned */
  1.1158 +    if(value>=0xf00) {
  1.1159 +        return 1;  /* roundtrip */
  1.1160 +    } else if(useFallback ? value>=0x800 : value>=0xc00) {  /* the threshold is lower when the caller asked for fallbacks */
  1.1161 +        return -1;  /* fallback taken */
  1.1162 +    } else {
  1.1163 +        return 0;  /* no mapping */
  1.1164 +    }
  1.1165 +}
  1.1166 +
  1.1167 +/*
  1.1168 + * Converts a strict EUC DBCS value (two bytes, each in A1..FE) to the
  1.1169 + * ISO 2022 range 21..7E by subtracting 0x80 from each byte.
  1.1170 + * Only the low 16 bits take part in the range check.
  1.1171 + * Returns 0 if the value is out of range.
  1.1172 + */
  1.1173 +static inline uint32_t
  1.1174 +_2022FromGR94DBCS(uint32_t value) {
  1.1175 +    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);  /* low 16 bits relative to A1A1 */
  1.1176 +    const uint8_t trailOffset = (uint8_t)(value - 0xa1);     /* low 8 bits relative to A1 */
  1.1177 +
  1.1178 +    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
  1.1179 +        return 0;  /* not valid for ISO 2022 */
  1.1180 +    }
  1.1181 +    return value - 0x8080;  /* shift down to 21..7e byte range */
  1.1182 +}
  1.1183 +
  1.1184 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
  1.1185 +/*
  1.1186 + * Disabled reverse of _2022FromGR94DBCS(): adds 0x8080 to the 2022 code point and returns the
  1.1187 + * result if each of its two bytes is in the range A1..FE. Otherwise it returns the 2022 code point
  1.1188 + * unchanged.
  1.1189 + */
  1.1190 +static inline uint32_t
  1.1191 +_2022ToGR94DBCS(uint32_t value) {
  1.1192 +    uint32_t returnValue = value + 0x8080;
  1.1193 +    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
  1.1194 +        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
  1.1195 +        return returnValue;
  1.1196 +    } else {
  1.1197 +        return value;
  1.1198 +    }
  1.1199 +}
  1.1200 +#endif
  1.1201 +
  1.1202 +#ifdef U_ENABLE_GENERIC_ISO_2022
  1.1203 +
  1.1204 +/**********************************************************************************
  1.1205 +*  ISO-2022 Converter
  1.1206 +*
  1.1207 +*
  1.1208 +*/
  1.1209 +
  1.1210 +static void
  1.1211 +T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
  1.1212 +                                                           UErrorCode* err){
  1.1213 +    const char* mySourceLimit, *realSourceLimit;
  1.1214 +    const char* sourceStart;
  1.1215 +    const UChar* myTargetStart;
  1.1216 +    UConverter* saveThis;
  1.1217 +    UConverterDataISO2022* myData;
  1.1218 +    int8_t length;
  1.1219 +
  1.1220 +    saveThis = args->converter;
  1.1221 +    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
  1.1222 +
  1.1223 +    realSourceLimit = args->sourceLimit;
  1.1224 +    while (args->source < realSourceLimit) {
  1.1225 +        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
  1.1226 +            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
  1.1227 +            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
  1.1228 +
  1.1229 +            if(args->source < mySourceLimit) {
  1.1230 +                if(myData->currentConverter==NULL) {
  1.1231 +                    myData->currentConverter = ucnv_open("ASCII",err);
  1.1232 +                    if(U_FAILURE(*err)){
  1.1233 +                        return;
  1.1234 +                    }
  1.1235 +
  1.1236 +                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
  1.1237 +                    saveThis->mode = UCNV_SO;
  1.1238 +                }
  1.1239 +
  1.1240 +                /* convert to before the ESC or until the end of the buffer */
  1.1241 +                myData->isFirstBuffer=FALSE;
  1.1242 +                sourceStart = args->source;
  1.1243 +                myTargetStart = args->target;
  1.1244 +                args->converter = myData->currentConverter;
  1.1245 +                ucnv_toUnicode(args->converter,
  1.1246 +                    &args->target,
  1.1247 +                    args->targetLimit,
  1.1248 +                    &args->source,
  1.1249 +                    mySourceLimit,
  1.1250 +                    args->offsets,
  1.1251 +                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
  1.1252 +                    err);
  1.1253 +                args->converter = saveThis;
  1.1254 +
  1.1255 +                if (*err == U_BUFFER_OVERFLOW_ERROR) {
  1.1256 +                    /* move the overflow buffer */
  1.1257 +                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
  1.1258 +                    myData->currentConverter->UCharErrorBufferLength = 0;
  1.1259 +                    if(length > 0) {
  1.1260 +                        uprv_memcpy(saveThis->UCharErrorBuffer,
  1.1261 +                                    myData->currentConverter->UCharErrorBuffer,
  1.1262 +                                    length*U_SIZEOF_UCHAR);
  1.1263 +                    }
  1.1264 +                    return;
  1.1265 +                }
  1.1266 +
  1.1267 +                /*
  1.1268 +                 * At least one of:
  1.1269 +                 * -Error while converting
  1.1270 +                 * -Done with entire buffer
  1.1271 +                 * -Need to write offsets or update the current offset
  1.1272 +                 *  (leave that up to the code in ucnv.c)
  1.1273 +                 *
  1.1274 +                 * or else we just stopped at an ESC byte and continue with changeState_2022()
  1.1275 +                 */
  1.1276 +                if (U_FAILURE(*err) ||
  1.1277 +                    (args->source == realSourceLimit) ||
  1.1278 +                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
  1.1279 +                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
  1.1280 +                ) {
  1.1281 +                    /* copy partial or error input for truncated detection and error handling */
  1.1282 +                    if(U_FAILURE(*err)) {
  1.1283 +                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
  1.1284 +                        if(length > 0) {
  1.1285 +                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
  1.1286 +                        }
  1.1287 +                    } else {
  1.1288 +                        length = saveThis->toULength = myData->currentConverter->toULength;
  1.1289 +                        if(length > 0) {
  1.1290 +                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
  1.1291 +                            if(args->source < mySourceLimit) {
  1.1292 +                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
  1.1293 +                            }
  1.1294 +                        }
  1.1295 +                    }
  1.1296 +                    return;
  1.1297 +                }
  1.1298 +            }
  1.1299 +        }
  1.1300 +
  1.1301 +        sourceStart = args->source;
  1.1302 +        changeState_2022(args->converter,
  1.1303 +               &(args->source),
  1.1304 +               realSourceLimit,
  1.1305 +               ISO_2022,
  1.1306 +               err);
  1.1307 +        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
  1.1308 +            /* let the ucnv.c code update its current offset */
  1.1309 +            return;
  1.1310 +        }
  1.1311 +    }
  1.1312 +}
  1.1313 +
  1.1314 +#endif
  1.1315 +
  1.1316 +/*
  1.1317 + * To Unicode Callback helper function
  1.1318 + */
  1.1319 +static void
  1.1320 +toUnicodeCallback(UConverter *cnv,
  1.1321 +                  const uint32_t sourceChar, const uint32_t targetUniChar,
  1.1322 +                  UErrorCode* err){
  1.1323 +    if(sourceChar>0xff){
  1.1324 +        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
  1.1325 +        cnv->toUBytes[1] = (uint8_t)sourceChar;
  1.1326 +        cnv->toULength = 2;
  1.1327 +    }
  1.1328 +    else{
  1.1329 +        cnv->toUBytes[0] =(char) sourceChar;
  1.1330 +        cnv->toULength = 1;
  1.1331 +    }
  1.1332 +
  1.1333 +    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
  1.1334 +        *err = U_INVALID_CHAR_FOUND;
  1.1335 +    }
  1.1336 +    else{
  1.1337 +        *err = U_ILLEGAL_CHAR_FOUND;
  1.1338 +    }
  1.1339 +}
  1.1340 +
  1.1341 +/**************************************ISO-2022-JP*************************************************/
  1.1342 +
  1.1343 +/************************************** IMPORTANT **************************************************
  1.1344 +* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
  1.1345 +* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
  1.1346 +* The converter iterates over each Unicode codepoint
  1.1347 +* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
  1.1348 +* processed one char at a time it would make sense to reduce the extra processing a canned converter
  1.1349 +* would do as far as possible.
  1.1350 +*
  1.1351 +* If the implementation of these macros or structure of sharedData struct change in the future, make
  1.1352 +* sure that ISO-2022 is also changed.
  1.1353 +***************************************************************************************************
  1.1354 +*/
  1.1355 +
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
  1.1382 +
/*
 * Preference order of JP charsets.
 * UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC() walks this array front
 * to back when it builds its choices[] list: after the current G0/G2
 * charsets, every charset that is still allowed by the version's charset
 * mask is appended in this order, so earlier entries are tried first.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
  1.1395 +
  1.1396 +/*
  1.1397 + * The escape sequences must be in order of the enum constants like JISX201  = 3,
  1.1398 + * not in order of jpCharsetPref[]!
  1.1399 + */
  1.1400 +static const char escSeqChars[][6] ={
  1.1401 +    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
  1.1402 +    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
  1.1403 +    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
  1.1404 +    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
  1.1405 +    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
  1.1406 +    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
  1.1407 +    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
  1.1408 +    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
  1.1409 +    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
  1.1410 +
  1.1411 +};
  1.1412 +static  const int8_t escSeqCharsLen[] ={
  1.1413 +    3, /* length of <ESC>(B  ASCII       */
  1.1414 +    3, /* length of <ESC>.A  ISO-8859-1  */
  1.1415 +    3, /* length of <ESC>.F  ISO-8859-7  */
  1.1416 +    3, /* length of <ESC>(J  JISX-201    */
  1.1417 +    3, /* length of <ESC>$B  JISX-208    */
  1.1418 +    4, /* length of <ESC>$(D JISX-212    */
  1.1419 +    3, /* length of <ESC>$A  GB2312      */
  1.1420 +    4, /* length of <ESC>$(C KSC5601     */
  1.1421 +    3  /* length of <ESC>(I  HWKANA_7BIT */
  1.1422 +};
  1.1423 +
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state increment the current state check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
  1.1440 +
  1.1441 +/* Map 00..7F to Unicode according to JIS X 0201. */
  1.1442 +static inline uint32_t
  1.1443 +jisx201ToU(uint32_t value) {
  1.1444 +    if(value < 0x5c) {
  1.1445 +        return value;
  1.1446 +    } else if(value == 0x5c) {
  1.1447 +        return 0xa5;
  1.1448 +    } else if(value == 0x7e) {
  1.1449 +        return 0x203e;
  1.1450 +    } else /* value <= 0x7f */ {
  1.1451 +        return value;
  1.1452 +    }
  1.1453 +}
  1.1454 +
  1.1455 +/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
  1.1456 +static inline uint32_t
  1.1457 +jisx201FromU(uint32_t value) {
  1.1458 +    if(value<=0x7f) {
  1.1459 +        if(value!=0x5c && value!=0x7e) {
  1.1460 +            return value;
  1.1461 +        }
  1.1462 +    } else if(value==0xa5) {
  1.1463 +        return 0x5c;
  1.1464 +    } else if(value==0x203e) {
  1.1465 +        return 0x7e;
  1.1466 +    }
  1.1467 +    return 0xfffe;
  1.1468 +}
  1.1469 +
  1.1470 +/*
  1.1471 + * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
  1.1472 + * to JIS X 0208, and convert it to a pair of 21..7E bytes.
  1.1473 + * Return 0 if the byte pair is out of range.
  1.1474 + */
  1.1475 +static inline uint32_t
  1.1476 +_2022FromSJIS(uint32_t value) {
  1.1477 +    uint8_t trail;
  1.1478 +
  1.1479 +    if(value > 0xEFFC) {
  1.1480 +        return 0;  /* beyond JIS X 0208 */
  1.1481 +    }
  1.1482 +
  1.1483 +    trail = (uint8_t)value;
  1.1484 +
  1.1485 +    value &= 0xff00;  /* lead byte */
  1.1486 +    if(value <= 0x9f00) {
  1.1487 +        value -= 0x7000;
  1.1488 +    } else /* 0xe000 <= value <= 0xef00 */ {
  1.1489 +        value -= 0xb000;
  1.1490 +    }
  1.1491 +    value <<= 1;
  1.1492 +
  1.1493 +    if(trail <= 0x9e) {
  1.1494 +        value -= 0x100;
  1.1495 +        if(trail <= 0x7e) {
  1.1496 +            value |= trail - 0x1f;
  1.1497 +        } else {
  1.1498 +            value |= trail - 0x20;
  1.1499 +        }
  1.1500 +    } else /* trail <= 0xfc */ {
  1.1501 +        value |= trail - 0x7e;
  1.1502 +    }
  1.1503 +    return value;
  1.1504 +}
  1.1505 +
  1.1506 +/*
  1.1507 + * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
  1.1508 + * If either byte is outside 21..7E make sure that the result is not valid
  1.1509 + * for Shift-JIS so that the converter catches it.
  1.1510 + * Some invalid byte values already turn into equally invalid Shift-JIS
  1.1511 + * byte values and need not be tested explicitly.
  1.1512 + */
  1.1513 +static inline void
  1.1514 +_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
  1.1515 +    if(c1&1) {
  1.1516 +        ++c1;
  1.1517 +        if(c2 <= 0x5f) {
  1.1518 +            c2 += 0x1f;
  1.1519 +        } else if(c2 <= 0x7e) {
  1.1520 +            c2 += 0x20;
  1.1521 +        } else {
  1.1522 +            c2 = 0;  /* invalid */
  1.1523 +        }
  1.1524 +    } else {
  1.1525 +        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
  1.1526 +            c2 += 0x7e;
  1.1527 +        } else {
  1.1528 +            c2 = 0;  /* invalid */
  1.1529 +        }
  1.1530 +    }
  1.1531 +    c1 >>= 1;
  1.1532 +    if(c1 <= 0x2f) {
  1.1533 +        c1 += 0x70;
  1.1534 +    } else if(c1 <= 0x3f) {
  1.1535 +        c1 += 0xb0;
  1.1536 +    } else {
  1.1537 +        c1 = 0;  /* invalid */
  1.1538 +    }
  1.1539 +    bytes[0] = (char)c1;
  1.1540 +    bytes[1] = (char)c2;
  1.1541 +}
  1.1542 +
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed with (code point - HWKANA_START); each entry is the
 * two-byte JIS X 0208 value (21..7E byte pair) that the JISX208 branch of the
 * from-Unicode code uses as a fallback result when fallbacks are enabled and
 * no other mapping has been found yet.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,  /* U+FF62 */
    0x2157,  /* U+FF63 */
    0x2122,  /* U+FF64 */
    0x2126,  /* U+FF65 */
    0x2572,  /* U+FF66 */
    0x2521,  /* U+FF67 */
    0x2523,  /* U+FF68 */
    0x2525,  /* U+FF69 */
    0x2527,  /* U+FF6A */
    0x2529,  /* U+FF6B */
    0x2563,  /* U+FF6C */
    0x2565,  /* U+FF6D */
    0x2567,  /* U+FF6E */
    0x2543,  /* U+FF6F */
    0x213C,  /* U+FF70 */
    0x2522,  /* U+FF71 */
    0x2524,  /* U+FF72 */
    0x2526,  /* U+FF73 */
    0x2528,  /* U+FF74 */
    0x252A,  /* U+FF75 */
    0x252B,  /* U+FF76 */
    0x252D,  /* U+FF77 */
    0x252F,  /* U+FF78 */
    0x2531,  /* U+FF79 */
    0x2533,  /* U+FF7A */
    0x2535,  /* U+FF7B */
    0x2537,  /* U+FF7C */
    0x2539,  /* U+FF7D */
    0x253B,  /* U+FF7E */
    0x253D,  /* U+FF7F */
    0x253F,  /* U+FF80 */
    0x2541,  /* U+FF81 */
    0x2544,  /* U+FF82 */
    0x2546,  /* U+FF83 */
    0x2548,  /* U+FF84 */
    0x254A,  /* U+FF85 */
    0x254B,  /* U+FF86 */
    0x254C,  /* U+FF87 */
    0x254D,  /* U+FF88 */
    0x254E,  /* U+FF89 */
    0x254F,  /* U+FF8A */
    0x2552,  /* U+FF8B */
    0x2555,  /* U+FF8C */
    0x2558,  /* U+FF8D */
    0x255B,  /* U+FF8E */
    0x255E,  /* U+FF8F */
    0x255F,  /* U+FF90 */
    0x2560,  /* U+FF91 */
    0x2561,  /* U+FF92 */
    0x2562,  /* U+FF93 */
    0x2564,  /* U+FF94 */
    0x2566,  /* U+FF95 */
    0x2568,  /* U+FF96 */
    0x2569,  /* U+FF97 */
    0x256A,  /* U+FF98 */
    0x256B,  /* U+FF99 */
    0x256C,  /* U+FF9A */
    0x256D,  /* U+FF9B */
    0x256F,  /* U+FF9C */
    0x2573,  /* U+FF9D */
    0x212B,  /* U+FF9E */
    0x212C   /* U+FF9F */
};
  1.1615 +
  1.1616 +static void
  1.1617 +UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
  1.1618 +    UConverter *cnv = args->converter;
  1.1619 +    UConverterDataISO2022 *converterData;
  1.1620 +    ISO2022State *pFromU2022State;
  1.1621 +    uint8_t *target = (uint8_t *) args->target;
  1.1622 +    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
  1.1623 +    const UChar* source = args->source;
  1.1624 +    const UChar* sourceLimit = args->sourceLimit;
  1.1625 +    int32_t* offsets = args->offsets;
  1.1626 +    UChar32 sourceChar;
  1.1627 +    char buffer[8];
  1.1628 +    int32_t len, outLen;
  1.1629 +    int8_t choices[10];
  1.1630 +    int32_t choiceCount;
  1.1631 +    uint32_t targetValue = 0;
  1.1632 +    UBool useFallback;
  1.1633 +
  1.1634 +    int32_t i;
  1.1635 +    int8_t cs, g;
  1.1636 +
  1.1637 +    /* set up the state */
  1.1638 +    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
  1.1639 +    pFromU2022State   = &converterData->fromU2022State;
  1.1640 +
  1.1641 +    choiceCount = 0;
  1.1642 +
  1.1643 +    /* check if the last codepoint of previous buffer was a lead surrogate*/
  1.1644 +    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
  1.1645 +        goto getTrail;
  1.1646 +    }
  1.1647 +
  1.1648 +    while(source < sourceLimit) {
  1.1649 +        if(target < targetLimit) {
  1.1650 +
  1.1651 +            sourceChar  = *(source++);
  1.1652 +            /*check if the char is a First surrogate*/
  1.1653 +            if(U16_IS_SURROGATE(sourceChar)) {
  1.1654 +                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  1.1655 +getTrail:
  1.1656 +                    /*look ahead to find the trail surrogate*/
  1.1657 +                    if(source < sourceLimit) {
  1.1658 +                        /* test the following code unit */
  1.1659 +                        UChar trail=(UChar) *source;
  1.1660 +                        if(U16_IS_TRAIL(trail)) {
  1.1661 +                            source++;
  1.1662 +                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  1.1663 +                            cnv->fromUChar32=0x00;
  1.1664 +                            /* convert this supplementary code point */
  1.1665 +                            /* exit this condition tree */
  1.1666 +                        } else {
  1.1667 +                            /* this is an unmatched lead code unit (1st surrogate) */
  1.1668 +                            /* callback(illegal) */
  1.1669 +                            *err=U_ILLEGAL_CHAR_FOUND;
  1.1670 +                            cnv->fromUChar32=sourceChar;
  1.1671 +                            break;
  1.1672 +                        }
  1.1673 +                    } else {
  1.1674 +                        /* no more input */
  1.1675 +                        cnv->fromUChar32=sourceChar;
  1.1676 +                        break;
  1.1677 +                    }
  1.1678 +                } else {
  1.1679 +                    /* this is an unmatched trail code unit (2nd surrogate) */
  1.1680 +                    /* callback(illegal) */
  1.1681 +                    *err=U_ILLEGAL_CHAR_FOUND;
  1.1682 +                    cnv->fromUChar32=sourceChar;
  1.1683 +                    break;
  1.1684 +                }
  1.1685 +            }
  1.1686 +
  1.1687 +            /* do not convert SO/SI/ESC */
  1.1688 +            if(IS_2022_CONTROL(sourceChar)) {
  1.1689 +                /* callback(illegal) */
  1.1690 +                *err=U_ILLEGAL_CHAR_FOUND;
  1.1691 +                cnv->fromUChar32=sourceChar;
  1.1692 +                break;
  1.1693 +            }
  1.1694 +
  1.1695 +            /* do the conversion */
  1.1696 +
  1.1697 +            if(choiceCount == 0) {
  1.1698 +                uint16_t csm;
  1.1699 +
  1.1700 +                /*
  1.1701 +                 * The csm variable keeps track of which charsets are allowed
  1.1702 +                 * and not used yet while building the choices[].
  1.1703 +                 */
  1.1704 +                csm = jpCharsetMasks[converterData->version];
  1.1705 +                choiceCount = 0;
  1.1706 +
  1.1707 +                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
  1.1708 +                if(converterData->version == 3 || converterData->version == 4) {
  1.1709 +                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
  1.1710 +                }
  1.1711 +                /* Do not try single-byte half-width Katakana for other versions. */
  1.1712 +                csm &= ~CSM(HWKANA_7BIT);
  1.1713 +
  1.1714 +                /* try the current G0 charset */
  1.1715 +                choices[choiceCount++] = cs = pFromU2022State->cs[0];
  1.1716 +                csm &= ~CSM(cs);
  1.1717 +
  1.1718 +                /* try the current G2 charset */
  1.1719 +                if((cs = pFromU2022State->cs[2]) != 0) {
  1.1720 +                    choices[choiceCount++] = cs;
  1.1721 +                    csm &= ~CSM(cs);
  1.1722 +                }
  1.1723 +
  1.1724 +                /* try all the other possible charsets */
  1.1725 +                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
  1.1726 +                    cs = (int8_t)jpCharsetPref[i];
  1.1727 +                    if(CSM(cs) & csm) {
  1.1728 +                        choices[choiceCount++] = cs;
  1.1729 +                        csm &= ~CSM(cs);
  1.1730 +                    }
  1.1731 +                }
  1.1732 +            }
  1.1733 +
  1.1734 +            cs = g = 0;
  1.1735 +            /*
  1.1736 +             * len==0: no mapping found yet
  1.1737 +             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
  1.1738 +             * len>0: found a roundtrip result, done
  1.1739 +             */
  1.1740 +            len = 0;
  1.1741 +            /*
  1.1742 +             * We will turn off useFallback after finding a fallback,
  1.1743 +             * but we still get fallbacks from PUA code points as usual.
  1.1744 +             * Therefore, we will also need to check that we don't overwrite
  1.1745 +             * an early fallback with a later one.
  1.1746 +             */
  1.1747 +            useFallback = cnv->useFallback;
  1.1748 +
  1.1749 +            for(i = 0; i < choiceCount && len <= 0; ++i) {
  1.1750 +                uint32_t value;
  1.1751 +                int32_t len2;
  1.1752 +                int8_t cs0 = choices[i];
  1.1753 +                switch(cs0) {
  1.1754 +                case ASCII:
  1.1755 +                    if(sourceChar <= 0x7f) {
  1.1756 +                        targetValue = (uint32_t)sourceChar;
  1.1757 +                        len = 1;
  1.1758 +                        cs = cs0;
  1.1759 +                        g = 0;
  1.1760 +                    }
  1.1761 +                    break;
  1.1762 +                case ISO8859_1:
  1.1763 +                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
  1.1764 +                        targetValue = (uint32_t)sourceChar - 0x80;
  1.1765 +                        len = 1;
  1.1766 +                        cs = cs0;
  1.1767 +                        g = 2;
  1.1768 +                    }
  1.1769 +                    break;
  1.1770 +                case HWKANA_7BIT:
  1.1771 +                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
  1.1772 +                        if(converterData->version==3) {
  1.1773 +                            /* JIS7: use G1 (SO) */
  1.1774 +                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
  1.1775 +                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
  1.1776 +                            len = 1;
  1.1777 +                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
  1.1778 +                            g = 1;
  1.1779 +                        } else if(converterData->version==4) {
  1.1780 +                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
  1.1781 +                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
  1.1782 +                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
  1.1783 +                            len = 1;
  1.1784 +
  1.1785 +                            cs = pFromU2022State->cs[0];
  1.1786 +                            if(IS_JP_DBCS(cs)) {
  1.1787 +                                /* switch from a DBCS charset to JISX201 */
  1.1788 +                                cs = (int8_t)JISX201;
  1.1789 +                            }
  1.1790 +                            /* else stay in the current G0 charset */
  1.1791 +                            g = 0;
  1.1792 +                        }
  1.1793 +                        /* else do not use HWKANA_7BIT with other versions */
  1.1794 +                    }
  1.1795 +                    break;
  1.1796 +                case JISX201:
  1.1797 +                    /* G0 SBCS */
  1.1798 +                    value = jisx201FromU(sourceChar);
  1.1799 +                    if(value <= 0x7f) {
  1.1800 +                        targetValue = value;
  1.1801 +                        len = 1;
  1.1802 +                        cs = cs0;
  1.1803 +                        g = 0;
  1.1804 +                        useFallback = FALSE;
  1.1805 +                    }
  1.1806 +                    break;
  1.1807 +                case JISX208:
  1.1808 +                    /* G0 DBCS from Shift-JIS table */
  1.1809 +                    len2 = MBCS_FROM_UCHAR32_ISO2022(
  1.1810 +                                converterData->myConverterArray[cs0],
  1.1811 +                                sourceChar, &value,
  1.1812 +                                useFallback, MBCS_OUTPUT_2);
  1.1813 +                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
  1.1814 +                        value = _2022FromSJIS(value);
  1.1815 +                        if(value != 0) {
  1.1816 +                            targetValue = value;
  1.1817 +                            len = len2;
  1.1818 +                            cs = cs0;
  1.1819 +                            g = 0;
  1.1820 +                            useFallback = FALSE;
  1.1821 +                        }
  1.1822 +                    } else if(len == 0 && useFallback &&
  1.1823 +                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
  1.1824 +                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
  1.1825 +                        len = -2;
  1.1826 +                        cs = cs0;
  1.1827 +                        g = 0;
  1.1828 +                        useFallback = FALSE;
  1.1829 +                    }
  1.1830 +                    break;
  1.1831 +                case ISO8859_7:
  1.1832 +                    /* G0 SBCS forced to 7-bit output */
  1.1833 +                    len2 = MBCS_SINGLE_FROM_UCHAR32(
  1.1834 +                                converterData->myConverterArray[cs0],
  1.1835 +                                sourceChar, &value,
  1.1836 +                                useFallback);
  1.1837 +                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
  1.1838 +                        targetValue = value - 0x80;
  1.1839 +                        len = len2;
  1.1840 +                        cs = cs0;
  1.1841 +                        g = 2;
  1.1842 +                        useFallback = FALSE;
  1.1843 +                    }
  1.1844 +                    break;
  1.1845 +                default:
  1.1846 +                    /* G0 DBCS */
  1.1847 +                    len2 = MBCS_FROM_UCHAR32_ISO2022(
  1.1848 +                                converterData->myConverterArray[cs0],
  1.1849 +                                sourceChar, &value,
  1.1850 +                                useFallback, MBCS_OUTPUT_2);
  1.1851 +                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
  1.1852 +                        if(cs0 == KSC5601) {
  1.1853 +                            /*
  1.1854 +                             * Check for valid bytes for the encoding scheme.
  1.1855 +                             * This is necessary because the sub-converter (windows-949)
  1.1856 +                             * has a broader encoding scheme than is valid for 2022.
  1.1857 +                             */
  1.1858 +                            value = _2022FromGR94DBCS(value);
  1.1859 +                            if(value == 0) {
  1.1860 +                                break;
  1.1861 +                            }
  1.1862 +                        }
  1.1863 +                        targetValue = value;
  1.1864 +                        len = len2;
  1.1865 +                        cs = cs0;
  1.1866 +                        g = 0;
  1.1867 +                        useFallback = FALSE;
  1.1868 +                    }
  1.1869 +                    break;
  1.1870 +                }
  1.1871 +            }
  1.1872 +
  1.1873 +            if(len != 0) {
  1.1874 +                if(len < 0) {
  1.1875 +                    len = -len;  /* fallback */
  1.1876 +                }
  1.1877 +                outLen = 0; /* count output bytes */
  1.1878 +
  1.1879 +                /* write SI if necessary (only for JIS7) */
  1.1880 +                if(pFromU2022State->g == 1 && g == 0) {
  1.1881 +                    buffer[outLen++] = UCNV_SI;
  1.1882 +                    pFromU2022State->g = 0;
  1.1883 +                }
  1.1884 +
  1.1885 +                /* write the designation sequence if necessary */
  1.1886 +                if(cs != pFromU2022State->cs[g]) {
  1.1887 +                    int32_t escLen = escSeqCharsLen[cs];
  1.1888 +                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
  1.1889 +                    outLen += escLen;
  1.1890 +                    pFromU2022State->cs[g] = cs;
  1.1891 +
  1.1892 +                    /* invalidate the choices[] */
  1.1893 +                    choiceCount = 0;
  1.1894 +                }
  1.1895 +
  1.1896 +                /* write the shift sequence if necessary */
  1.1897 +                if(g != pFromU2022State->g) {
  1.1898 +                    switch(g) {
  1.1899 +                    /* case 0 handled before writing escapes */
  1.1900 +                    case 1:
  1.1901 +                        buffer[outLen++] = UCNV_SO;
  1.1902 +                        pFromU2022State->g = 1;
  1.1903 +                        break;
  1.1904 +                    default: /* case 2 */
  1.1905 +                        buffer[outLen++] = 0x1b;
  1.1906 +                        buffer[outLen++] = 0x4e;
  1.1907 +                        break;
  1.1908 +                    /* no case 3: no SS3 in ISO-2022-JP-x */
  1.1909 +                    }
  1.1910 +                }
  1.1911 +
  1.1912 +                /* write the output bytes */
  1.1913 +                if(len == 1) {
  1.1914 +                    buffer[outLen++] = (char)targetValue;
  1.1915 +                } else /* len == 2 */ {
  1.1916 +                    buffer[outLen++] = (char)(targetValue >> 8);
  1.1917 +                    buffer[outLen++] = (char)targetValue;
  1.1918 +                }
  1.1919 +            } else {
  1.1920 +                /*
  1.1921 +                 * if we cannot find the character after checking all codepages
  1.1922 +                 * then this is an error
  1.1923 +                 */
  1.1924 +                *err = U_INVALID_CHAR_FOUND;
  1.1925 +                cnv->fromUChar32=sourceChar;
  1.1926 +                break;
  1.1927 +            }
  1.1928 +
  1.1929 +            if(sourceChar == CR || sourceChar == LF) {
  1.1930 +                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
  1.1931 +                pFromU2022State->cs[2] = 0;
  1.1932 +                choiceCount = 0;
  1.1933 +            }
  1.1934 +
  1.1935 +            /* output outLen>0 bytes in buffer[] */
  1.1936 +            if(outLen == 1) {
  1.1937 +                *target++ = buffer[0];
  1.1938 +                if(offsets) {
  1.1939 +                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
  1.1940 +                }
  1.1941 +            } else if(outLen == 2 && (target + 2) <= targetLimit) {
  1.1942 +                *target++ = buffer[0];
  1.1943 +                *target++ = buffer[1];
  1.1944 +                if(offsets) {
  1.1945 +                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
  1.1946 +                    *offsets++ = sourceIndex;
  1.1947 +                    *offsets++ = sourceIndex;
  1.1948 +                }
  1.1949 +            } else {
  1.1950 +                fromUWriteUInt8(
  1.1951 +                    cnv,
  1.1952 +                    buffer, outLen,
  1.1953 +                    &target, (const char *)targetLimit,
  1.1954 +                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
  1.1955 +                    err);
  1.1956 +                if(U_FAILURE(*err)) {
  1.1957 +                    break;
  1.1958 +                }
  1.1959 +            }
  1.1960 +        } /* end if(myTargetIndex<myTargetLength) */
  1.1961 +        else{
  1.1962 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.1963 +            break;
  1.1964 +        }
  1.1965 +
  1.1966 +    }/* end while(mySourceIndex<mySourceLength) */
  1.1967 +
  1.1968 +    /*
  1.1969 +     * the end of the input stream and detection of truncated input
  1.1970 +     * are handled by the framework, but for ISO-2022-JP conversion
  1.1971 +     * we need to be in ASCII mode at the very end
  1.1972 +     *
  1.1973 +     * conditions:
  1.1974 +     *   successful
  1.1975 +     *   in SO mode or not in ASCII mode
  1.1976 +     *   end of input and no truncated input
  1.1977 +     */
  1.1978 +    if( U_SUCCESS(*err) &&
  1.1979 +        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
  1.1980 +        args->flush && source>=sourceLimit && cnv->fromUChar32==0
  1.1981 +    ) {
  1.1982 +        int32_t sourceIndex;
  1.1983 +
  1.1984 +        outLen = 0;
  1.1985 +
  1.1986 +        if(pFromU2022State->g != 0) {
  1.1987 +            buffer[outLen++] = UCNV_SI;
  1.1988 +            pFromU2022State->g = 0;
  1.1989 +        }
  1.1990 +
  1.1991 +        if(pFromU2022State->cs[0] != ASCII) {
  1.1992 +            int32_t escLen = escSeqCharsLen[ASCII];
  1.1993 +            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
  1.1994 +            outLen += escLen;
  1.1995 +            pFromU2022State->cs[0] = (int8_t)ASCII;
  1.1996 +        }
  1.1997 +
  1.1998 +        /* get the source index of the last input character */
  1.1999 +        /*
  1.2000 +         * TODO this would be simpler and more reliable if we used a pair
  1.2001 +         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  1.2002 +         * so that we could simply use the prevSourceIndex here;
  1.2003 +         * this code gives an incorrect result for the rare case of an unmatched
  1.2004 +         * trail surrogate that is alone in the last buffer of the text stream
  1.2005 +         */
  1.2006 +        sourceIndex=(int32_t)(source-args->source);
  1.2007 +        if(sourceIndex>0) {
  1.2008 +            --sourceIndex;
  1.2009 +            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  1.2010 +                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  1.2011 +            ) {
  1.2012 +                --sourceIndex;
  1.2013 +            }
  1.2014 +        } else {
  1.2015 +            sourceIndex=-1;
  1.2016 +        }
  1.2017 +
  1.2018 +        fromUWriteUInt8(
  1.2019 +            cnv,
  1.2020 +            buffer, outLen,
  1.2021 +            &target, (const char *)targetLimit,
  1.2022 +            &offsets, sourceIndex,
  1.2023 +            err);
  1.2024 +    }
  1.2025 +
  1.2026 +    /*save the state and return */
  1.2027 +    args->source = source;
  1.2028 +    args->target = (char*)target;
  1.2029 +}
  1.2030 +
  1.2031 +/*************** to unicode *******************/
  1.2032 +
  1.2033 +static void
  1.2034 +UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  1.2035 +                                               UErrorCode* err){
              /*
               * Converts ISO-2022-JP family bytes from args->source into UTF-16
               * code units in args->target. When args->offsets is set, an offset
               * is stored for every UChar written to the target.
               *
               * State carried over from a previous call:
               *  - myData->key != 0: a partial escape sequence is pending, so the
               *    function resumes at the "escape" label;
               *  - converter->toULength == 1: the lead byte of a double-byte
               *    character is held in toUBytes[0], so the function resumes at
               *    the "getTrailByte" label.
               *
               * On return args->source and args->target are advanced. *err is set
               * to U_BUFFER_OVERFLOW_ERROR when the target is full; other errors
               * come from changeState_2022(), the empty-segment check or
               * toUnicodeCallback(), all of which are handed err.
               */
  1.2036 +    char tempBuf[2];
  1.2037 +    const char *mySource = (char *) args->source;
  1.2038 +    UChar *myTarget = args->target;
  1.2039 +    const char *mySourceLimit = args->sourceLimit;
  1.2040 +    uint32_t targetUniChar = 0x0000;
  1.2041 +    uint32_t mySourceChar = 0x0000;
  1.2042 +    uint32_t tmpSourceChar = 0x0000;
  1.2043 +    UConverterDataISO2022* myData;
  1.2044 +    ISO2022State *pToU2022State;
  1.2045 +    StateEnum cs;
  1.2046 +
  1.2047 +    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  1.2048 +    pToU2022State = &myData->toU2022State;
  1.2049 +
  1.2050 +    if(myData->key != 0) {
  1.2051 +        /* continue with a partial escape sequence */
  1.2052 +        goto escape;
  1.2053 +    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  1.2054 +        /*
  1.2055 +         * continue with a partial double-byte character:
  1.2056 +         * reload the saved lead byte and the current charset, then jump
  1.2057 +         * straight to the trail-byte handling (input and output room were
  1.2058 +         * both checked in the condition above)
  1.2059 +         */
  1.2055 +        mySourceChar = args->converter->toUBytes[0];
  1.2056 +        args->converter->toULength = 0;
  1.2057 +        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
  1.2058 +        targetUniChar = missingCharMarker;
  1.2059 +        goto getTrailByte;
  1.2060 +    }
  1.2061 +
  1.2062 +    while(mySource < mySourceLimit){
  1.2063 +
  1.2064 +        targetUniChar =missingCharMarker;
  1.2065 +
  1.2066 +        if(myTarget < args->targetLimit){
  1.2067 +
  1.2068 +            mySourceChar= (unsigned char) *mySource++;
  1.2069 +
  1.2070 +            switch(mySourceChar) {
  1.2071 +            case UCNV_SI:
  1.2072 +                if(myData->version==3) {
                          /* JIS7: shift back to G0 and read the next byte */
  1.2073 +                    pToU2022State->g=0;
  1.2074 +                    continue;
  1.2075 +                } else {
  1.2076 +                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
  1.2077 +                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
  1.2078 +                    break;
  1.2079 +                }
  1.2080 +
  1.2081 +            case UCNV_SO:
  1.2082 +                if(myData->version==3) {
  1.2083 +                    /* JIS7: switch to G1 half-width Katakana */
  1.2084 +                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
  1.2085 +                    pToU2022State->g=1;
  1.2086 +                    continue;
  1.2087 +                } else {
  1.2088 +                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
  1.2089 +                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
  1.2090 +                    break;
  1.2091 +                }
  1.2092 +
  1.2093 +            case ESC_2022:
                      /* un-read the ESC byte; presumably changeState_2022() expects to start on it - TODO confirm */
  1.2094 +                mySource--;
  1.2095 +escape:
  1.2096 +                {
  1.2097 +                    const char * mySourceBefore = mySource;
  1.2098 +                    int8_t toULengthBefore = args->converter->toULength;
  1.2099 +
  1.2100 +                    changeState_2022(args->converter,&(mySource),
  1.2101 +                        mySourceLimit, ISO_2022_JP,err);
  1.2102 +
  1.2103 +                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
  1.2104 +                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
  1.2105 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  1.2106 +                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                              /* error length = bytes pending before this call + bytes consumed by it */
  1.2107 +                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
  1.2108 +                    }
  1.2109 +                }
  1.2110 +
  1.2111 +                /* invalid or illegal escape sequence */
  1.2112 +                if(U_FAILURE(*err)){
  1.2113 +                    args->target = myTarget;
  1.2114 +                    args->source = mySource;
  1.2115 +                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
  1.2116 +                    return;
  1.2117 +                }
  1.2118 +                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
  1.2119 +                if(myData->key==0) {
  1.2120 +                    myData->isEmptySegment = TRUE;
  1.2121 +                }
  1.2122 +                continue;
  1.2123 +
  1.2124 +            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
  1.2125 +
  1.2126 +            case CR:
  1.2127 +                /*falls through*/
  1.2128 +            case LF:
  1.2129 +                /* automatically reset to single-byte mode */
  1.2130 +                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
  1.2131 +                    pToU2022State->cs[0] = (int8_t)ASCII;
  1.2132 +                }
  1.2133 +                pToU2022State->cs[2] = 0;
  1.2134 +                pToU2022State->g = 0;
  1.2135 +                /* falls through */
  1.2136 +            default:
  1.2137 +                /* convert one or two bytes */
  1.2138 +                myData->isEmptySegment = FALSE;
  1.2139 +                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                      /* the uint8_t subtraction wraps, so this single compare tests 0xa1 <= byte <= 0xdf */
  1.2140 +                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
  1.2141 +                    !IS_JP_DBCS(cs)
  1.2142 +                ) {
  1.2143 +                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
  1.2144 +                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
  1.2145 +
  1.2146 +                    /* return from a single-shift state to the previous one */
  1.2147 +                    if(pToU2022State->g >= 2) {
  1.2148 +                        pToU2022State->g=pToU2022State->prevG;
  1.2149 +                    }
  1.2150 +                } else switch(cs) {
  1.2151 +                case ASCII:
  1.2152 +                    if(mySourceChar <= 0x7f) {
  1.2153 +                        targetUniChar = mySourceChar;
  1.2154 +                    }
  1.2155 +                    break;
  1.2156 +                case ISO8859_1:
  1.2157 +                    if(mySourceChar <= 0x7f) {
  1.2158 +                        targetUniChar = mySourceChar + 0x80;
  1.2159 +                    }
  1.2160 +                    /* return from a single-shift state to the previous one */
  1.2161 +                    pToU2022State->g=pToU2022State->prevG;
  1.2162 +                    break;
  1.2163 +                case ISO8859_7:
  1.2164 +                    if(mySourceChar <= 0x7f) {
  1.2165 +                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
  1.2166 +                        targetUniChar =
  1.2167 +                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
  1.2168 +                                myData->myConverterArray[cs],
  1.2169 +                                mySourceChar + 0x80);
  1.2170 +                    }
  1.2171 +                    /* return from a single-shift state to the previous one */
  1.2172 +                    pToU2022State->g=pToU2022State->prevG;
  1.2173 +                    break;
  1.2174 +                case JISX201:
  1.2175 +                    if(mySourceChar <= 0x7f) {
  1.2176 +                        targetUniChar = jisx201ToU(mySourceChar);
  1.2177 +                    }
  1.2178 +                    break;
  1.2179 +                case HWKANA_7BIT:
  1.2180 +                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
  1.2181 +                        /* 7-bit halfwidth Katakana */
  1.2182 +                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
  1.2183 +                    }
  1.2184 +                    break;
  1.2185 +                default:
  1.2186 +                    /* G0 DBCS */
  1.2187 +                    if(mySource < mySourceLimit) {
  1.2188 +                        int leadIsOk, trailIsOk;
  1.2189 +                        uint8_t trailByte;
                              /* resume point for a lead byte saved by a previous call (see the function prologue) */
  1.2190 +getTrailByte:
  1.2191 +                        trailByte = (uint8_t)*mySource;
  1.2192 +                        /*
  1.2193 +                         * Ticket 5691: consistent illegal sequences:
  1.2194 +                         * - We include at least the first byte in the illegal sequence.
  1.2195 +                         * - If any of the non-initial bytes could be the start of a character,
  1.2196 +                         *   we stop the illegal sequence before the first one of those.
  1.2197 +                         *
  1.2198 +                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  1.2199 +                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  1.2200 +                         * Otherwise we convert or report the pair of bytes.
  1.2201 +                         */
  1.2202 +                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  1.2203 +                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  1.2204 +                        if (leadIsOk && trailIsOk) {
  1.2205 +                            ++mySource;
  1.2206 +                            tmpSourceChar = (mySourceChar << 8) | trailByte;
  1.2207 +                            if(cs == JISX208) {
                                      /* the JIS X 0208 table is looked up with the bytes that _2022ToSJIS() writes to tempBuf */
  1.2208 +                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
  1.2209 +                                mySourceChar = tmpSourceChar;
  1.2210 +                            } else {
  1.2211 +                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
  1.2212 +                                mySourceChar = tmpSourceChar;
  1.2213 +                                if (cs == KSC5601) {
  1.2214 +                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
  1.2215 +                                }
  1.2216 +                                tempBuf[0] = (char)(tmpSourceChar >> 8);
  1.2217 +                                tempBuf[1] = (char)(tmpSourceChar);
  1.2218 +                            }
  1.2219 +                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
  1.2220 +                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  1.2221 +                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  1.2222 +                            ++mySource;
  1.2223 +                            /* add another bit so that the code below writes 2 bytes in case of error */
  1.2224 +                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
  1.2225 +                        }
  1.2226 +                    } else {
                              /* input ends after the lead byte: save it in the converter and stop until more input arrives */
  1.2227 +                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  1.2228 +                        args->converter->toULength = 1;
  1.2229 +                        goto endloop;
  1.2230 +                    }
  1.2231 +                }  /* End of inner switch */
  1.2232 +                break;
  1.2233 +            }  /* End of outer switch */
                  /*
                   * Three outcomes: a value below 0xfffe is written as one UChar,
                   * a value above missingCharMarker is written as a surrogate
                   * pair, and the remaining values go to the error callback.
                   */
  1.2234 +            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
  1.2235 +                if(args->offsets){
                          /* offset of the first source byte: step back 1 byte for a single-byte character, else 2 */
  1.2236 +                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.2237 +                }
  1.2238 +                *(myTarget++)=(UChar)targetUniChar;
  1.2239 +            }
  1.2240 +            else if(targetUniChar > missingCharMarker){
  1.2241 +                /* disassemble the surrogate pair and write to output*/
  1.2242 +                targetUniChar-=0x0010000;
  1.2243 +                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
  1.2244 +                if(args->offsets){
  1.2245 +                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.2246 +                }
  1.2247 +                ++myTarget;
  1.2248 +                if(myTarget< args->targetLimit){
  1.2249 +                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
  1.2250 +                    if(args->offsets){
  1.2251 +                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.2252 +                    }
  1.2253 +                    ++myTarget;
  1.2254 +                }else{
                          /* no room for the trail surrogate: store it in the converter's UChar error buffer */
  1.2255 +                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
  1.2256 +                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
  1.2257 +                }
  1.2258 +
  1.2259 +            }
  1.2260 +            else{
  1.2261 +                /* Call the callback function*/
  1.2262 +                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  1.2263 +                break;
  1.2264 +            }
  1.2265 +        }
  1.2266 +        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
  1.2267 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.2268 +            break;
  1.2269 +        }
  1.2270 +    }
  1.2271 +endloop:
              /* publish the advanced positions back to the caller */
  1.2272 +    args->target = myTarget;
  1.2273 +    args->source = mySource;
  1.2274 +}
  1.2275 +
  1.2276 +
  1.2277 +/***************************************************************
  1.2278 +*   Rules for ISO-2022-KR encoding
  1.2279 +*   i) The KSC5601 designator sequence should appear only once in a file,
  1.2280 +*      at the beginning of a line before any KSC5601 characters. This usually
  1.2281 +*      means that it appears by itself on the first line of the file.
  1.2282 +*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
  1.2283 +*      and SI to shift into single-byte mode.
  1.2284 +*/
  1.2285 +static void
  1.2286 +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
              /*
               * Runs the fromUnicode conversion through the sub-converter stored
               * in the ISO-2022 data (currentConverter) by calling
               * ucnv_MBCSFromUnicodeWithOffsets() with args->converter swapped
               * temporarily. The pending code point (fromUChar32) and, on buffer
               * overflow, the overflow bytes are moved between the sub-converter
               * and the public converter so that the caller sees them on the
               * converter it passed in. args->converter is restored before
               * returning.
               */
  1.2287 +
  1.2288 +    UConverter* saveConv = args->converter;
  1.2289 +    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
  1.2290 +    args->converter=myConverterData->currentConverter;
  1.2291 +
              /* hand the pending code point to the sub-converter, then copy back what it leaves pending */
  1.2292 +    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
  1.2293 +    ucnv_MBCSFromUnicodeWithOffsets(args,err);
  1.2294 +    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
  1.2295 +
  1.2296 +    if(*err == U_BUFFER_OVERFLOW_ERROR) {
              /* move the overflow bytes to the public converter and empty the sub-converter's buffer */
  1.2297 +        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
  1.2298 +            uprv_memcpy(
  1.2299 +                saveConv->charErrorBuffer,
  1.2300 +                myConverterData->currentConverter->charErrorBuffer,
  1.2301 +                myConverterData->currentConverter->charErrorBufferLength);
  1.2302 +        }
  1.2303 +        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
  1.2304 +        myConverterData->currentConverter->charErrorBufferLength = 0;
  1.2305 +    }
  1.2306 +    args->converter=saveConv;
  1.2307 +}
  1.2308 +
  1.2309 +static void
  1.2310 +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
              /*
               * Converts UTF-16 from args->source into ISO-2022-KR bytes in
               * args->target, writing SO/SI whenever the output switches between
               * single-byte and double-byte characters. When args->offsets is
               * set, an offset is stored for every byte written to the target.
               *
               * State carried between calls:
               *  - converter->fromUnicodeStatus: non-zero while the output is in
               *    double-byte mode (read at the start, saved at the end);
               *  - converter->fromUChar32: a pending lead surrogate; when set,
               *    the function resumes at the "getTrail" label.
               *
               * Version 1 converters are delegated to the _IBM variant. On
               * return args->source and args->target are advanced and *err holds
               * the first overflow or illegal/unassigned-character error, if any.
               */
  1.2311 +
  1.2312 +    const UChar *source = args->source;
  1.2313 +    const UChar *sourceLimit = args->sourceLimit;
  1.2314 +    unsigned char *target = (unsigned char *) args->target;
  1.2315 +    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
  1.2316 +    int32_t* offsets = args->offsets;
  1.2317 +    uint32_t targetByteUnit = 0x0000;
  1.2318 +    UChar32 sourceChar = 0x0000;
  1.2319 +    UBool isTargetByteDBCS;
  1.2320 +    UBool oldIsTargetByteDBCS;
  1.2321 +    UConverterDataISO2022 *converterData;
  1.2322 +    UConverterSharedData* sharedData;
  1.2323 +    UBool useFallback;
  1.2324 +    int32_t length =0;
  1.2325 +
  1.2326 +    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
  1.2327 +    /* if the version is 1 then the user is requesting
  1.2328 +     * conversion with ibm-25546 pass the arguments to
  1.2329 +     * MBCS converter and return
  1.2330 +     */
  1.2331 +    if(converterData->version==1){
  1.2332 +        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
  1.2333 +        return;
  1.2334 +    }
  1.2335 +
  1.2336 +    /* initialize data */
  1.2337 +    sharedData = converterData->currentConverter->sharedData;
  1.2338 +    useFallback = args->converter->useFallback;
  1.2339 +    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
  1.2340 +    oldIsTargetByteDBCS = isTargetByteDBCS;
  1.2341 +
              /* NOTE(review): this re-reads the same value that was assigned a few lines above; it looks redundant */
  1.2342 +    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
              /* resume with a lead surrogate left pending by a previous call */
  1.2343 +    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
  1.2344 +        goto getTrail;
  1.2345 +    }
  1.2346 +    while(source < sourceLimit){
  1.2347 +
  1.2348 +        targetByteUnit = missingCharMarker;
  1.2349 +
  1.2350 +        if(target < (unsigned char*) args->targetLimit){
  1.2351 +            sourceChar = *source++;
  1.2352 +
  1.2353 +            /* do not convert SO/SI/ESC */
  1.2354 +            if(IS_2022_CONTROL(sourceChar)) {
  1.2355 +                /* callback(illegal) */
  1.2356 +                *err=U_ILLEGAL_CHAR_FOUND;
  1.2357 +                args->converter->fromUChar32=sourceChar;
  1.2358 +                break;
  1.2359 +            }
  1.2360 +
  1.2361 +            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
  1.2362 +            if(length < 0) {
  1.2363 +                length = -length;  /* fallback */
  1.2364 +            }
  1.2365 +            /* only DBCS or SBCS characters are expected*/
  1.2366 +            /* DB characters with high bit set to 1 are expected */
                  /*
                   * accepted results: one byte <= 0x7f, or two bytes with the value
                   * in 0xa1a1..0xfefe and the low byte in 0xa1..0xfe (the unsigned
                   * casts make each subtraction a single range test);
                   * anything else is treated as unmappable
                   */
  1.2367 +            if( length > 2 || length==0 ||
  1.2368 +                (length == 1 && targetByteUnit > 0x7f) ||
  1.2369 +                (length == 2 &&
  1.2370 +                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
  1.2371 +                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
  1.2372 +            ) {
  1.2373 +                targetByteUnit=missingCharMarker;
  1.2374 +            }
  1.2375 +            if (targetByteUnit != missingCharMarker){
  1.2376 +
  1.2377 +                oldIsTargetByteDBCS = isTargetByteDBCS;
  1.2378 +                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
  1.2379 +                  /* append the shift sequence */
  1.2380 +                if (oldIsTargetByteDBCS != isTargetByteDBCS ){
  1.2381 +
                          /* room for this one byte was checked at the top of the loop iteration */
  1.2382 +                    if (isTargetByteDBCS)
  1.2383 +                        *target++ = UCNV_SO;
  1.2384 +                    else
  1.2385 +                        *target++ = UCNV_SI;
  1.2386 +                    if(offsets)
  1.2387 +                        *(offsets++) = (int32_t)(source - args->source-1);
  1.2388 +                }
  1.2389 +                /* write the targetByteUnit to target; bytes that do not fit go to charErrorBuffer */
  1.2390 +                if(targetByteUnit <= 0x00FF){
  1.2391 +                    if( target < targetLimit){
  1.2392 +                        *(target++) = (unsigned char) targetByteUnit;
  1.2393 +                        if(offsets){
  1.2394 +                            *(offsets++) = (int32_t)(source - args->source-1);
  1.2395 +                        }
  1.2396 +
  1.2397 +                    }else{
  1.2398 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
  1.2399 +                        *err = U_BUFFER_OVERFLOW_ERROR;
  1.2400 +                    }
  1.2401 +                }else{
                          /* double-byte: 0x80 is subtracted from each byte before it is written */
  1.2402 +                    if(target < targetLimit){
  1.2403 +                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
  1.2404 +                        if(offsets){
  1.2405 +                            *(offsets++) = (int32_t)(source - args->source-1);
  1.2406 +                        }
  1.2407 +                        if(target < targetLimit){
  1.2408 +                            *(target++) =(unsigned char) (targetByteUnit -0x80);
  1.2409 +                            if(offsets){
  1.2410 +                                *(offsets++) = (int32_t)(source - args->source-1);
  1.2411 +                            }
  1.2412 +                        }else{
  1.2413 +                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
  1.2414 +                            *err = U_BUFFER_OVERFLOW_ERROR;
  1.2415 +                        }
  1.2416 +                    }else{
  1.2417 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
  1.2418 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
  1.2419 +                        *err = U_BUFFER_OVERFLOW_ERROR;
  1.2420 +                    }
  1.2421 +                }
  1.2422 +
  1.2423 +            }
  1.2424 +            else{
  1.2425 +                /* oops.. the code point is unassigned
  1.2426 +                 * set the error and reason
  1.2427 +                 */
  1.2428 +
  1.2429 +                /*check if the char is a First surrogate*/
  1.2430 +                if(U16_IS_SURROGATE(sourceChar)) {
  1.2431 +                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  1.2432 +getTrail:
  1.2433 +                        /*look ahead to find the trail surrogate*/
  1.2434 +                        if(source <  sourceLimit) {
  1.2435 +                            /* test the following code unit */
  1.2436 +                            UChar trail=(UChar) *source;
  1.2437 +                            if(U16_IS_TRAIL(trail)) {
  1.2438 +                                source++;
  1.2439 +                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  1.2440 +                                *err = U_INVALID_CHAR_FOUND;
  1.2441 +                                /* the assembled supplementary code point is reported as unassigned */
  1.2442 +                                /* exit this condition tree */
  1.2443 +                            } else {
  1.2444 +                                /* this is an unmatched lead code unit (1st surrogate) */
  1.2445 +                                /* callback(illegal) */
  1.2446 +                                *err=U_ILLEGAL_CHAR_FOUND;
  1.2447 +                            }
  1.2448 +                        } else {
  1.2449 +                            /* no more input: the lead surrogate is kept in fromUChar32 below */
  1.2450 +                            *err = U_ZERO_ERROR;
  1.2451 +                        }
  1.2452 +                    } else {
  1.2453 +                        /* this is an unmatched trail code unit (2nd surrogate) */
  1.2454 +                        /* callback(illegal) */
  1.2455 +                        *err=U_ILLEGAL_CHAR_FOUND;
  1.2456 +                    }
  1.2457 +                } else {
  1.2458 +                    /* callback(unassigned) for a BMP code point */
  1.2459 +                    *err = U_INVALID_CHAR_FOUND;
  1.2460 +                }
  1.2461 +
  1.2462 +                args->converter->fromUChar32=sourceChar;
  1.2463 +                break;
  1.2464 +            }
  1.2465 +        } /* end if(target < targetLimit) */
  1.2466 +        else{
  1.2467 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.2468 +            break;
  1.2469 +        }
  1.2470 +
  1.2471 +    }/* end while(source < sourceLimit) */
  1.2472 +
  1.2473 +    /*
  1.2474 +     * the end of the input stream and detection of truncated input
  1.2475 +     * are handled by the framework, but for ISO-2022-KR conversion
  1.2476 +     * we need to be in ASCII mode at the very end
  1.2477 +     *
  1.2478 +     * conditions:
  1.2479 +     *   successful
  1.2480 +     *   not in ASCII mode
  1.2481 +     *   end of input and no truncated input
  1.2482 +     */
  1.2483 +    if( U_SUCCESS(*err) &&
  1.2484 +        isTargetByteDBCS &&
  1.2485 +        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
  1.2486 +    ) {
  1.2487 +        int32_t sourceIndex;
  1.2488 +
  1.2489 +        /* we are switching to ASCII */
  1.2490 +        isTargetByteDBCS=FALSE;
  1.2491 +
  1.2492 +        /* get the source index of the last input character */
  1.2493 +        /*
  1.2494 +         * TODO this would be simpler and more reliable if we used a pair
  1.2495 +         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  1.2496 +         * so that we could simply use the prevSourceIndex here;
  1.2497 +         * this code gives an incorrect result for the rare case of an unmatched
  1.2498 +         * trail surrogate that is alone in the last buffer of the text stream
  1.2499 +         */
  1.2500 +        sourceIndex=(int32_t)(source-args->source);
  1.2501 +        if(sourceIndex>0) {
  1.2502 +            --sourceIndex;
  1.2503 +            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  1.2504 +                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  1.2505 +            ) {
  1.2506 +                --sourceIndex;
  1.2507 +            }
  1.2508 +        } else {
  1.2509 +            sourceIndex=-1;
  1.2510 +        }
  1.2511 +
              /* emit the closing shift-in byte, attributed to the last input character */
  1.2512 +        fromUWriteUInt8(
  1.2513 +            args->converter,
  1.2514 +            SHIFT_IN_STR, 1,
  1.2515 +            &target, (const char *)targetLimit,
  1.2516 +            &offsets, sourceIndex,
  1.2517 +            err);
  1.2518 +    }
  1.2519 +
  1.2520 +    /*save the state and return */
  1.2521 +    args->source = source;
  1.2522 +    args->target = (char*)target;
  1.2523 +    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
  1.2524 +}
  1.2525 +
  1.2526 +/************************ To Unicode ***************************************/
  1.2527 +
  1.2528 +static void
  1.2529 +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
  1.2530 +                                                            UErrorCode* err){
              /*
               * ISO-2022-KR to Unicode for the converter variant that
               * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC dispatches to when
               * the converter data's version is 1.
               *
               * The input is processed in runs: each run (up to the limit returned
               * by getEndOfBuffer_2022) is handed to ucnv_MBCSToUnicodeWithOffsets
               * using myData->currentConverter as a subconverter, and whatever
               * follows the run is passed to changeState_2022.
               *
               * args: public conversion arguments; source, target and offsets are
               *       advanced to reflect what was consumed and produced.
               * err:  in/out error code; the function stops as soon as it is a
               *       failure code.
               */
  1.2531 +    char const* sourceStart;
  1.2532 +    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  1.2533 +
  1.2534 +    UConverterToUnicodeArgs subArgs;
  1.2535 +    int32_t minArgsSize;
  1.2536 +
  1.2537 +    /* set up the subconverter arguments */
              /* copy no more than the caller's struct provides (args->size) and no more than subArgs can hold */
  1.2538 +    if(args->size<sizeof(UConverterToUnicodeArgs)) {
  1.2539 +        minArgsSize = args->size;
  1.2540 +    } else {
  1.2541 +        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
  1.2542 +    }
  1.2543 +
  1.2544 +    uprv_memcpy(&subArgs, args, minArgsSize);
  1.2545 +    subArgs.size = (uint16_t)minArgsSize;
  1.2546 +    subArgs.converter = myData->currentConverter;
  1.2547 +
  1.2548 +    /* remember the original start of the input for offsets */
  1.2549 +    sourceStart = args->source;
  1.2550 +
  1.2551 +    if(myData->key != 0) {
  1.2552 +        /* continue with a partial escape sequence */
                  /* jumps into the loop body below, skipping the subconverter call for this first pass */
  1.2553 +        goto escape;
  1.2554 +    }
  1.2555 +
  1.2556 +    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
  1.2557 +        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
                  /* NOTE(review): getEndOfBuffer_2022 receives &(args->source); whether it moves args->source is not visible here -- confirm */
  1.2558 +        subArgs.source = args->source;
  1.2559 +        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
  1.2560 +        if(subArgs.source != subArgs.sourceLimit) {
  1.2561 +            /*
  1.2562 +             * get the current partial byte sequence
  1.2563 +             *
  1.2564 +             * it needs to be moved between the public and the subconverter
  1.2565 +             * so that the conversion framework, which only sees the public
  1.2566 +             * converter, can handle truncated and illegal input etc.
  1.2567 +             */
  1.2568 +            if(args->converter->toULength > 0) {
  1.2569 +                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
  1.2570 +            }
  1.2571 +            subArgs.converter->toULength = args->converter->toULength;
  1.2572 +
  1.2573 +            /*
  1.2574 +             * Convert up to the end of the input, or to before the next escape character.
  1.2575 +             * Does not handle conversion extensions because the preToU[] state etc.
  1.2576 +             * is not copied.
  1.2577 +             */
  1.2578 +            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
  1.2579 +
  1.2580 +            if(args->offsets != NULL && sourceStart != args->source) {
  1.2581 +                /* update offsets to base them on the actual start of the input */
                          /*
                           * args->target/args->offsets still point at the start of this run's
                           * output; every non-negative offset written for the UChars in
                           * [args->target, subArgs.target) is shifted by the distance from
                           * the original input start to args->source.
                           */
  1.2582 +                int32_t *offsets = args->offsets;
  1.2583 +                UChar *target = args->target;
  1.2584 +                int32_t delta = (int32_t)(args->source - sourceStart);
  1.2585 +                while(target < subArgs.target) {
  1.2586 +                    if(*offsets >= 0) {
  1.2587 +                        *offsets += delta;
  1.2588 +                    }
  1.2589 +                    ++offsets;
  1.2590 +                    ++target;
  1.2591 +                }
  1.2592 +            }
                      /* publish the subconverter's progress through the public arguments */
  1.2593 +            args->source = subArgs.source;
  1.2594 +            args->target = subArgs.target;
  1.2595 +            args->offsets = subArgs.offsets;
  1.2596 +
  1.2597 +            /* copy input/error/overflow buffers */
  1.2598 +            if(subArgs.converter->toULength > 0) {
  1.2599 +                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
  1.2600 +            }
  1.2601 +            args->converter->toULength = subArgs.converter->toULength;
  1.2602 +
  1.2603 +            if(*err == U_BUFFER_OVERFLOW_ERROR) {
                          /* move pending overflow UChars to the public converter and clear them in the subconverter */
  1.2604 +                if(subArgs.converter->UCharErrorBufferLength > 0) {
  1.2605 +                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
  1.2606 +                                subArgs.converter->UCharErrorBufferLength);
  1.2607 +                }
  1.2608 +                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
  1.2609 +                subArgs.converter->UCharErrorBufferLength = 0;
  1.2610 +            }
  1.2611 +        }
  1.2612 +
                  /* stop on any error, or when the whole input has been consumed */
  1.2613 +        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
  1.2614 +            return;
  1.2615 +        }
  1.2616 +
  1.2617 +escape:
                  /* input remains after the converted run (or a partial escape sequence is pending): let changeState_2022 process it */
  1.2618 +        changeState_2022(args->converter,
  1.2619 +               &(args->source),
  1.2620 +               args->sourceLimit,
  1.2621 +               ISO_2022_KR,
  1.2622 +               err);
  1.2623 +    }
  1.2624 +}
  1.2625 +
  1.2626 +static void
  1.2627 +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  1.2628 +                                                            UErrorCode* err){
              /*
               * ISO-2022-KR to Unicode conversion with optional offsets.
               *
               * When the converter data's version is 1 the work is delegated to
               * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM. Otherwise the
               * input is read byte by byte: SI selects single-byte mode (g = 0),
               * SO selects double-byte mode (g = 1), ESC is handed to
               * changeState_2022, and all other bytes are looked up through
               * ucnv_MBCSSimpleGetNextUChar in the current converter's shared data.
               *
               * args: public conversion arguments; args->source and args->target
               *       are updated on every exit path of the non-delegating code.
               * err:  in/out error code; set here for an empty SO..SI segment and
               *       for a full target buffer, and by the helpers called.
               */
  1.2629 +    char tempBuf[2];
  1.2630 +    const char *mySource = ( char *) args->source;
  1.2631 +    UChar *myTarget = args->target;
  1.2632 +    const char *mySourceLimit = args->sourceLimit;
  1.2633 +    UChar32 targetUniChar = 0x0000;
  1.2634 +    UChar mySourceChar = 0x0000;
  1.2635 +    UConverterDataISO2022* myData;
  1.2636 +    UConverterSharedData* sharedData ;
  1.2637 +    UBool useFallback;
  1.2638 +
  1.2639 +    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  1.2640 +    if(myData->version==1){
  1.2641 +        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
  1.2642 +        return;
  1.2643 +    }
  1.2644 +
  1.2645 +    /* initialize state */
  1.2646 +    sharedData = myData->currentConverter->sharedData;
  1.2647 +    useFallback = args->converter->useFallback;
  1.2648 +
              /* resume work left over from the previous call; both gotos jump into the loop body below */
  1.2649 +    if(myData->key != 0) {
  1.2650 +        /* continue with a partial escape sequence */
  1.2651 +        goto escape;
  1.2652 +    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  1.2653 +        /* continue with a partial double-byte character */
                  /* the stored lead byte is taken back out of toUBytes; this jump bypasses the g == 1 test */
  1.2654 +        mySourceChar = args->converter->toUBytes[0];
  1.2655 +        args->converter->toULength = 0;
  1.2656 +        goto getTrailByte;
  1.2657 +    }
  1.2658 +
  1.2659 +    while(mySource< mySourceLimit){
  1.2660 +
  1.2661 +        if(myTarget < args->targetLimit){
  1.2662 +
  1.2663 +            mySourceChar= (unsigned char) *mySource++;
  1.2664 +
  1.2665 +            if(mySourceChar==UCNV_SI){
  1.2666 +                myData->toU2022State.g = 0;
                          /* an SI that directly follows SO (no character in between) is reported as an error */
  1.2667 +                if (myData->isEmptySegment) {
  1.2668 +                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
  1.2669 +                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  1.2670 +                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
  1.2671 +                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  1.2672 +                    args->converter->toULength = 1;
  1.2673 +                    args->target = myTarget;
  1.2674 +                    args->source = mySource;
  1.2675 +                    return;
  1.2676 +                }
  1.2677 +                /*consume the source */
  1.2678 +                continue;
  1.2679 +            }else if(mySourceChar==UCNV_SO){
  1.2680 +                myData->toU2022State.g = 1;
  1.2681 +                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
  1.2682 +                /*consume the source */
  1.2683 +                continue;
  1.2684 +            }else if(mySourceChar==ESC_2022){
                          /* back up so that changeState_2022 sees the ESC byte itself */
  1.2685 +                mySource--;
  1.2686 +escape:
  1.2687 +                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
  1.2688 +                changeState_2022(args->converter,&(mySource),
  1.2689 +                                mySourceLimit, ISO_2022_KR, err);
  1.2690 +                if(U_FAILURE(*err)){
  1.2691 +                    args->target = myTarget;
  1.2692 +                    args->source = mySource;
  1.2693 +                    return;
  1.2694 +                }
  1.2695 +                continue;
  1.2696 +            }
  1.2697 +
  1.2698 +            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
  1.2699 +            if(myData->toU2022State.g == 1) {
                          /* double-byte mode: a trail byte is needed to complete the character */
  1.2700 +                if(mySource < mySourceLimit) {
  1.2701 +                    int leadIsOk, trailIsOk;
  1.2702 +                    uint8_t trailByte;
  1.2703 +getTrailByte:
  1.2704 +                    targetUniChar = missingCharMarker;
                              /* peek at the trail byte; it is consumed only in the branches below that ++mySource */
  1.2705 +                    trailByte = (uint8_t)*mySource;
  1.2706 +                    /*
  1.2707 +                     * Ticket 5691: consistent illegal sequences:
  1.2708 +                     * - We include at least the first byte in the illegal sequence.
  1.2709 +                     * - If any of the non-initial bytes could be the start of a character,
  1.2710 +                     *   we stop the illegal sequence before the first one of those.
  1.2711 +                     *
  1.2712 +                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  1.2713 +                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  1.2714 +                     * Otherwise we convert or report the pair of bytes.
  1.2715 +                     */
                              /* each test checks 0x21..0x7e with one unsigned comparison: smaller values wrap around to large ones */
  1.2716 +                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  1.2717 +                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  1.2718 +                    if (leadIsOk && trailIsOk) {
  1.2719 +                        ++mySource;
                                  /* the lookup is done with 0x80 added to each byte; mySourceChar keeps the original pair */
  1.2720 +                        tempBuf[0] = (char)(mySourceChar + 0x80);
  1.2721 +                        tempBuf[1] = (char)(trailByte + 0x80);
  1.2722 +                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
  1.2723 +                        mySourceChar = (mySourceChar << 8) | trailByte;
  1.2724 +                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  1.2725 +                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  1.2726 +                        ++mySource;
  1.2727 +                        /* add another bit so that the code below writes 2 bytes in case of error */
                                  /*
                                   * NOTE(review): mySourceChar is declared UChar; if UChar is a 16-bit
                                   * type, the 0x10000 bit is lost in this assignment, which matters when
                                   * the lead byte is 0 -- confirm against toUnicodeCallback.
                                   */
  1.2728 +                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
  1.2729 +                    }
  1.2730 +                } else {
                              /* input ends after the lead byte: store it so that the next call can resume at getTrailByte */
  1.2731 +                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  1.2732 +                    args->converter->toULength = 1;
  1.2733 +                    break;
  1.2734 +                }
  1.2735 +            }
  1.2736 +            else if(mySourceChar <= 0x7f) {
                          /* single-byte mode: look up the byte that was just consumed */
  1.2737 +                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
  1.2738 +            } else {
  1.2739 +                targetUniChar = 0xffff;
  1.2740 +            }
                      /* results below 0xfffe are written out; 0xfffe and above go to toUnicodeCallback */
  1.2741 +            if(targetUniChar < 0xfffe){
  1.2742 +                if(args->offsets) {
                              /* offset of the character's first byte: 1 byte back for a single byte, 2 for a combined pair */
  1.2743 +                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.2744 +                }
  1.2745 +                *(myTarget++)=(UChar)targetUniChar;
  1.2746 +            }
  1.2747 +            else {
  1.2748 +                /* Call the callback function*/
  1.2749 +                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  1.2750 +                break;
  1.2751 +            }
  1.2752 +        }
  1.2753 +        else{
                      /* input remains but the target buffer is full */
  1.2754 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.2755 +            break;
  1.2756 +        }
  1.2757 +    }
              /* report how far the source and target were advanced */
  1.2758 +    args->target = myTarget;
  1.2759 +    args->source = mySource;
  1.2760 +}
  1.2761 +
  1.2762 +/*************************** END ISO2022-KR *********************************/
  1.2763 +
  1.2764 +/*************************** ISO-2022-CN *********************************
  1.2765 +*
  1.2766 +* Rules for ISO-2022-CN Encoding:
  1.2767 +* i)   The designator sequence must appear once on a line before any instance
  1.2768 +*      of character set it designates.
  1.2769 +* ii)  If two lines contain characters from the same character set, both lines
  1.2770 +*      must include the designator sequence.
  1.2771 +* iii) Once the designator sequence is known, a shifting sequence has to be found
  1.2772 +*      to invoke the designated character set.
  1.2773 +* iv)  All lines start in ASCII and end in ASCII.
  1.2774 +* v)   Four shifting sequences are employed for this purpose:
  1.2775 +*
  1.2776 +*      Sequence    ASCII Eq    Charsets
  1.2777 +*      ----------  -------    ---------
  1.2778 +*      SI           <SI>        US-ASCII
  1.2779 +*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
  1.2780 +*      SS2          <ESC>N      CNS-11643-1992 Plane 2
  1.2781 +*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
  1.2782 +*
  1.2783 +* vi)
  1.2784 +*      SOdesignator  : ESC "$" ")" finalchar_for_SO
  1.2785 +*      SS2designator : ESC "$" "*" finalchar_for_SS2
  1.2786 +*      SS3designator : ESC "$" "+" finalchar_for_SS3
  1.2787 +*
  1.2788 +*      ESC $ ) A       Indicates the bytes following SO are Chinese
  1.2789 +*       characters as defined in GB 2312-80, until
  1.2790 +*       another SOdesignation appears
  1.2791 +*
  1.2792 +*
  1.2793 +*      ESC $ ) E       Indicates the bytes following SO are as defined
  1.2794 +*       in ISO-IR-165 (for details, see section 2.1),
  1.2795 +*       until another SOdesignation appears
  1.2796 +*
  1.2797 +*      ESC $ ) G       Indicates the bytes following SO are as defined
  1.2798 +*       in CNS 11643-plane-1, until another
  1.2799 +*       SOdesignation appears
  1.2800 +*
  1.2801 +*      ESC $ * H       Indicates the two bytes immediately following
  1.2802 +*       SS2 is a Chinese character as defined in CNS
  1.2803 +*       11643-plane-2, until another SS2designation
  1.2804 +*       appears
  1.2805 +*       (Meaning <ESC>N must precede every 2 byte
  1.2806 +*        sequence.)
  1.2807 +*
  1.2808 +*      ESC $ + I       Indicates the immediate two bytes following SS3
  1.2809 +*       is a Chinese character as defined in CNS
  1.2810 +*       11643-plane-3, until another SS3designation
  1.2811 +*       appears
  1.2812 +*       (Meaning <ESC>O must precede every 2 byte
  1.2813 +*        sequence.)
  1.2814 +*
  1.2815 +*      ESC $ + J       Indicates the immediate two bytes following SS3
  1.2816 +*       is a Chinese character as defined in CNS
  1.2817 +*       11643-plane-4, until another SS3designation
  1.2818 +*       appears
  1.2819 +*       (In English: <ESC>O must precede every 2 byte
  1.2820 +*        sequence.)
  1.2821 +*
  1.2822 +*      ESC $ + K       Indicates the immediate two bytes following SS3
  1.2823 +*       is a Chinese character as defined in CNS
  1.2824 +*       11643-plane-5, until another SS3designation
  1.2825 +*       appears
  1.2826 +*
  1.2827 +*      ESC $ + L       Indicates the immediate two bytes following SS3
  1.2828 +*       is a Chinese character as defined in CNS
  1.2829 +*       11643-plane-6, until another SS3designation
  1.2830 +*       appears
  1.2831 +*
  1.2832 +*      ESC $ + M       Indicates the immediate two bytes following SS3
  1.2833 +*       is a Chinese character as defined in CNS
  1.2834 +*       11643-plane-7, until another SS3designation
  1.2835 +*       appears
  1.2836 +*
  1.2837 +*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
  1.2838 +*       has its own designation information before any Chinese characters
  1.2839 +*       appear
  1.2840 +*
  1.2841 +*/
  1.2842 +
  1.2843 +/* The following are defined this way to make the strings truly readonly */
          /* Each string is a 4-byte designator sequence: ESC '$' intermediate final (0x1B 0x24 ...). */
  1.2844 +static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
  1.2845 +static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
  1.2846 +static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
  1.2847 +static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
  1.2848 +static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
  1.2849 +static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
  1.2850 +static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
  1.2851 +static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
  1.2852 +static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
  1.2853 +
  1.2854 +/********************** ISO2022-CN Data **************************/
          /*
           * Designator sequences used by the ISO-2022-CN fromUnicode code, which
           * copies 4 bytes from the selected entry. That code indexes the table
           * with the charset id cs when cs < CNS_11643, and with
           * CNS_11643 + (cs - CNS_11643_1) for the CNS 11643 planes.
           * NOTE(review): the order must match the charset id enum, which is
           * defined outside this section -- confirm before reordering.
           */
  1.2855 +static const char* const escSeqCharsCN[10] ={
  1.2856 +        SHIFT_IN_STR,                   /* 0 ASCII */
  1.2857 +        GB_2312_80_STR,                 /* 1 GB2312_1 */
  1.2858 +        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
  1.2859 +        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
  1.2860 +        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
  1.2861 +        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
  1.2862 +        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
  1.2863 +        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
  1.2864 +        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
  1.2865 +        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
  1.2866 +};
  1.2867 +
  1.2868 +static void
  1.2869 +UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
  1.2870 +    UConverter *cnv = args->converter;
  1.2871 +    UConverterDataISO2022 *converterData;
  1.2872 +    ISO2022State *pFromU2022State;
  1.2873 +    uint8_t *target = (uint8_t *) args->target;
  1.2874 +    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
  1.2875 +    const UChar* source = args->source;
  1.2876 +    const UChar* sourceLimit = args->sourceLimit;
  1.2877 +    int32_t* offsets = args->offsets;
  1.2878 +    UChar32 sourceChar;
  1.2879 +    char buffer[8];
  1.2880 +    int32_t len;
  1.2881 +    int8_t choices[3];
  1.2882 +    int32_t choiceCount;
  1.2883 +    uint32_t targetValue = 0;
  1.2884 +    UBool useFallback;
  1.2885 +
  1.2886 +    /* set up the state */
  1.2887 +    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
  1.2888 +    pFromU2022State   = &converterData->fromU2022State;
  1.2889 +
  1.2890 +    choiceCount = 0;
  1.2891 +
  1.2892 +    /* check if the last codepoint of previous buffer was a lead surrogate*/
  1.2893 +    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
  1.2894 +        goto getTrail;
  1.2895 +    }
  1.2896 +
  1.2897 +    while( source < sourceLimit){
  1.2898 +        if(target < targetLimit){
  1.2899 +
  1.2900 +            sourceChar  = *(source++);
  1.2901 +            /*check if the char is a First surrogate*/
  1.2902 +             if(U16_IS_SURROGATE(sourceChar)) {
  1.2903 +                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  1.2904 +getTrail:
  1.2905 +                    /*look ahead to find the trail surrogate*/
  1.2906 +                    if(source < sourceLimit) {
  1.2907 +                        /* test the following code unit */
  1.2908 +                        UChar trail=(UChar) *source;
  1.2909 +                        if(U16_IS_TRAIL(trail)) {
  1.2910 +                            source++;
  1.2911 +                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  1.2912 +                            cnv->fromUChar32=0x00;
  1.2913 +                            /* convert this supplementary code point */
  1.2914 +                            /* exit this condition tree */
  1.2915 +                        } else {
  1.2916 +                            /* this is an unmatched lead code unit (1st surrogate) */
  1.2917 +                            /* callback(illegal) */
  1.2918 +                            *err=U_ILLEGAL_CHAR_FOUND;
  1.2919 +                            cnv->fromUChar32=sourceChar;
  1.2920 +                            break;
  1.2921 +                        }
  1.2922 +                    } else {
  1.2923 +                        /* no more input */
  1.2924 +                        cnv->fromUChar32=sourceChar;
  1.2925 +                        break;
  1.2926 +                    }
  1.2927 +                } else {
  1.2928 +                    /* this is an unmatched trail code unit (2nd surrogate) */
  1.2929 +                    /* callback(illegal) */
  1.2930 +                    *err=U_ILLEGAL_CHAR_FOUND;
  1.2931 +                    cnv->fromUChar32=sourceChar;
  1.2932 +                    break;
  1.2933 +                }
  1.2934 +            }
  1.2935 +
  1.2936 +            /* do the conversion */
  1.2937 +            if(sourceChar <= 0x007f ){
  1.2938 +                /* do not convert SO/SI/ESC */
  1.2939 +                if(IS_2022_CONTROL(sourceChar)) {
  1.2940 +                    /* callback(illegal) */
  1.2941 +                    *err=U_ILLEGAL_CHAR_FOUND;
  1.2942 +                    cnv->fromUChar32=sourceChar;
  1.2943 +                    break;
  1.2944 +                }
  1.2945 +
  1.2946 +                /* US-ASCII */
  1.2947 +                if(pFromU2022State->g == 0) {
  1.2948 +                    buffer[0] = (char)sourceChar;
  1.2949 +                    len = 1;
  1.2950 +                } else {
  1.2951 +                    buffer[0] = UCNV_SI;
  1.2952 +                    buffer[1] = (char)sourceChar;
  1.2953 +                    len = 2;
  1.2954 +                    pFromU2022State->g = 0;
  1.2955 +                    choiceCount = 0;
  1.2956 +                }
  1.2957 +                if(sourceChar == CR || sourceChar == LF) {
  1.2958 +                    /* reset the state at the end of a line */
  1.2959 +                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
  1.2960 +                    choiceCount = 0;
  1.2961 +                }
  1.2962 +            }
  1.2963 +            else{
  1.2964 +                /* convert U+0080..U+10ffff */
  1.2965 +                int32_t i;
  1.2966 +                int8_t cs, g;
  1.2967 +
  1.2968 +                if(choiceCount == 0) {
  1.2969 +                    /* try the current SO/G1 converter first */
  1.2970 +                    choices[0] = pFromU2022State->cs[1];
  1.2971 +
  1.2972 +                    /* default to GB2312_1 if none is designated yet */
  1.2973 +                    if(choices[0] == 0) {
  1.2974 +                        choices[0] = GB2312_1;
  1.2975 +                    }
  1.2976 +
  1.2977 +                    if(converterData->version == 0) {
  1.2978 +                        /* ISO-2022-CN */
  1.2979 +
  1.2980 +                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
  1.2981 +                        if(choices[0] == GB2312_1) {
  1.2982 +                            choices[1] = (int8_t)CNS_11643_1;
  1.2983 +                        } else {
  1.2984 +                            choices[1] = (int8_t)GB2312_1;
  1.2985 +                        }
  1.2986 +
  1.2987 +                        choiceCount = 2;
  1.2988 +                    } else if (converterData->version == 1) {
  1.2989 +                        /* ISO-2022-CN-EXT */
  1.2990 +
  1.2991 +                        /* try one of the other converters */
  1.2992 +                        switch(choices[0]) {
  1.2993 +                        case GB2312_1:
  1.2994 +                            choices[1] = (int8_t)CNS_11643_1;
  1.2995 +                            choices[2] = (int8_t)ISO_IR_165;
  1.2996 +                            break;
  1.2997 +                        case ISO_IR_165:
  1.2998 +                            choices[1] = (int8_t)GB2312_1;
  1.2999 +                            choices[2] = (int8_t)CNS_11643_1;
  1.3000 +                            break;
  1.3001 +                        default: /* CNS_11643_x */
  1.3002 +                            choices[1] = (int8_t)GB2312_1;
  1.3003 +                            choices[2] = (int8_t)ISO_IR_165;
  1.3004 +                            break;
  1.3005 +                        }
  1.3006 +
  1.3007 +                        choiceCount = 3;
  1.3008 +                    } else {
  1.3009 +                        choices[0] = (int8_t)CNS_11643_1;
  1.3010 +                        choices[1] = (int8_t)GB2312_1;
  1.3011 +                    }
  1.3012 +                }
  1.3013 +
  1.3014 +                cs = g = 0;
  1.3015 +                /*
  1.3016 +                 * len==0: no mapping found yet
  1.3017 +                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
  1.3018 +                 * len>0: found a roundtrip result, done
  1.3019 +                 */
  1.3020 +                len = 0;
  1.3021 +                /*
  1.3022 +                 * We will turn off useFallback after finding a fallback,
  1.3023 +                 * but we still get fallbacks from PUA code points as usual.
  1.3024 +                 * Therefore, we will also need to check that we don't overwrite
  1.3025 +                 * an early fallback with a later one.
  1.3026 +                 */
  1.3027 +                useFallback = cnv->useFallback;
  1.3028 +
  1.3029 +                for(i = 0; i < choiceCount && len <= 0; ++i) {
  1.3030 +                    int8_t cs0 = choices[i];
  1.3031 +                    if(cs0 > 0) {
  1.3032 +                        uint32_t value;
  1.3033 +                        int32_t len2;
  1.3034 +                        if(cs0 >= CNS_11643_0) {
  1.3035 +                            len2 = MBCS_FROM_UCHAR32_ISO2022(
  1.3036 +                                        converterData->myConverterArray[CNS_11643],
  1.3037 +                                        sourceChar,
  1.3038 +                                        &value,
  1.3039 +                                        useFallback,
  1.3040 +                                        MBCS_OUTPUT_3);
  1.3041 +                            if(len2 == 3 || (len2 == -3 && len == 0)) {
  1.3042 +                                targetValue = value;
  1.3043 +                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
  1.3044 +                                if(len2 >= 0) {
  1.3045 +                                    len = 2;
  1.3046 +                                } else {
  1.3047 +                                    len = -2;
  1.3048 +                                    useFallback = FALSE;
  1.3049 +                                }
  1.3050 +                                if(cs == CNS_11643_1) {
  1.3051 +                                    g = 1;
  1.3052 +                                } else if(cs == CNS_11643_2) {
  1.3053 +                                    g = 2;
  1.3054 +                                } else /* plane 3..7 */ if(converterData->version == 1) {
  1.3055 +                                    g = 3;
  1.3056 +                                } else {
  1.3057 +                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
  1.3058 +                                    len = 0;
  1.3059 +                                }
  1.3060 +                            }
  1.3061 +                        } else {
  1.3062 +                            /* GB2312_1 or ISO-IR-165 */
  1.3063 +                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
  1.3064 +                            len2 = MBCS_FROM_UCHAR32_ISO2022(
  1.3065 +                                        converterData->myConverterArray[cs0],
  1.3066 +                                        sourceChar,
  1.3067 +                                        &value,
  1.3068 +                                        useFallback,
  1.3069 +                                        MBCS_OUTPUT_2);
  1.3070 +                            if(len2 == 2 || (len2 == -2 && len == 0)) {
  1.3071 +                                targetValue = value;
  1.3072 +                                len = len2;
  1.3073 +                                cs = cs0;
  1.3074 +                                g = 1;
  1.3075 +                                useFallback = FALSE;
  1.3076 +                            }
  1.3077 +                        }
  1.3078 +                    }
  1.3079 +                }
  1.3080 +
  1.3081 +                if(len != 0) {
  1.3082 +                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
  1.3083 +
  1.3084 +                    /* write the designation sequence if necessary */
  1.3085 +                    if(cs != pFromU2022State->cs[g]) {
  1.3086 +                        if(cs < CNS_11643) {
  1.3087 +                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
  1.3088 +                        } else {
  1.3089 +                            U_ASSERT(cs >= CNS_11643_1);
  1.3090 +                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
  1.3091 +                        }
  1.3092 +                        len = 4;
  1.3093 +                        pFromU2022State->cs[g] = cs;
  1.3094 +                        if(g == 1) {
  1.3095 +                            /* changing the SO/G1 charset invalidates the choices[] */
  1.3096 +                            choiceCount = 0;
  1.3097 +                        }
  1.3098 +                    }
  1.3099 +
  1.3100 +                    /* write the shift sequence if necessary */
  1.3101 +                    if(g != pFromU2022State->g) {
  1.3102 +                        switch(g) {
  1.3103 +                        case 1:
  1.3104 +                            buffer[len++] = UCNV_SO;
  1.3105 +
  1.3106 +                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
  1.3107 +                            pFromU2022State->g = 1;
  1.3108 +                            break;
  1.3109 +                        case 2:
  1.3110 +                            buffer[len++] = 0x1b;
  1.3111 +                            buffer[len++] = 0x4e;
  1.3112 +                            break;
  1.3113 +                        default: /* case 3 */
  1.3114 +                            buffer[len++] = 0x1b;
  1.3115 +                            buffer[len++] = 0x4f;
  1.3116 +                            break;
  1.3117 +                        }
  1.3118 +                    }
  1.3119 +
  1.3120 +                    /* write the two output bytes */
  1.3121 +                    buffer[len++] = (char)(targetValue >> 8);
  1.3122 +                    buffer[len++] = (char)targetValue;
  1.3123 +                } else {
  1.3124 +                    /* if we cannot find the character after checking all codepages
  1.3125 +                     * then this is an error
  1.3126 +                     */
  1.3127 +                    *err = U_INVALID_CHAR_FOUND;
  1.3128 +                    cnv->fromUChar32=sourceChar;
  1.3129 +                    break;
  1.3130 +                }
  1.3131 +            }
  1.3132 +
  1.3133 +            /* output len>0 bytes in buffer[] */
  1.3134 +            if(len == 1) {
  1.3135 +                *target++ = buffer[0];
  1.3136 +                if(offsets) {
  1.3137 +                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
  1.3138 +                }
  1.3139 +            } else if(len == 2 && (target + 2) <= targetLimit) {
  1.3140 +                *target++ = buffer[0];
  1.3141 +                *target++ = buffer[1];
  1.3142 +                if(offsets) {
  1.3143 +                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
  1.3144 +                    *offsets++ = sourceIndex;
  1.3145 +                    *offsets++ = sourceIndex;
  1.3146 +                }
  1.3147 +            } else {
  1.3148 +                fromUWriteUInt8(
  1.3149 +                    cnv,
  1.3150 +                    buffer, len,
  1.3151 +                    &target, (const char *)targetLimit,
  1.3152 +                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
  1.3153 +                    err);
  1.3154 +                if(U_FAILURE(*err)) {
  1.3155 +                    break;
  1.3156 +                }
  1.3157 +            }
  1.3158 +        } /* end if(myTargetIndex<myTargetLength) */
  1.3159 +        else{
  1.3160 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.3161 +            break;
  1.3162 +        }
  1.3163 +
  1.3164 +    }/* end while(mySourceIndex<mySourceLength) */
  1.3165 +
  1.3166 +    /*
  1.3167 +     * the end of the input stream and detection of truncated input
  1.3168 +     * are handled by the framework, but for ISO-2022-CN conversion
  1.3169 +     * we need to be in ASCII mode at the very end
  1.3170 +     *
  1.3171 +     * conditions:
  1.3172 +     *   successful
  1.3173 +     *   not in ASCII mode
  1.3174 +     *   end of input and no truncated input
  1.3175 +     */
  1.3176 +    if( U_SUCCESS(*err) &&
  1.3177 +        pFromU2022State->g!=0 &&
  1.3178 +        args->flush && source>=sourceLimit && cnv->fromUChar32==0
  1.3179 +    ) {
  1.3180 +        int32_t sourceIndex;
  1.3181 +
  1.3182 +        /* we are switching to ASCII */
  1.3183 +        pFromU2022State->g=0;
  1.3184 +
  1.3185 +        /* get the source index of the last input character */
  1.3186 +        /*
  1.3187 +         * TODO this would be simpler and more reliable if we used a pair
  1.3188 +         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  1.3189 +         * so that we could simply use the prevSourceIndex here;
  1.3190 +         * this code gives an incorrect result for the rare case of an unmatched
  1.3191 +         * trail surrogate that is alone in the last buffer of the text stream
  1.3192 +         */
  1.3193 +        sourceIndex=(int32_t)(source-args->source);
  1.3194 +        if(sourceIndex>0) {
  1.3195 +            --sourceIndex;
  1.3196 +            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  1.3197 +                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  1.3198 +            ) {
  1.3199 +                --sourceIndex;
  1.3200 +            }
  1.3201 +        } else {
  1.3202 +            sourceIndex=-1;
  1.3203 +        }
  1.3204 +
  1.3205 +        fromUWriteUInt8(
  1.3206 +            cnv,
  1.3207 +            SHIFT_IN_STR, 1,
  1.3208 +            &target, (const char *)targetLimit,
  1.3209 +            &offsets, sourceIndex,
  1.3210 +            err);
  1.3211 +    }
  1.3212 +
  1.3213 +    /*save the state and return */
  1.3214 +    args->source = source;
  1.3215 +    args->target = (char*)target;
  1.3216 +}
  1.3217 +
  1.3218 +
  1.3219 +static void
  1.3220 +UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  1.3221 +                                               UErrorCode* err){
              /*
               * ISO-2022-CN toUnicode conversion.
               * Reads bytes from args->source up to args->sourceLimit, tracks the
               * SI/SO/escape state in myData->toU2022State, and writes UChars to
               * args->target up to args->targetLimit. When args->offsets is non-NULL
               * the source index of each output unit is recorded as well.
               * Every exit path stores the current read/write positions back into
               * args->source and args->target. Errors are reported through *err.
               */
  1.3222 +    char tempBuf[3];
  1.3223 +    const char *mySource = (char *) args->source;
  1.3224 +    UChar *myTarget = args->target;
  1.3225 +    const char *mySourceLimit = args->sourceLimit;
  1.3226 +    uint32_t targetUniChar = 0x0000;
  1.3227 +    uint32_t mySourceChar = 0x0000;
  1.3228 +    UConverterDataISO2022* myData;
  1.3229 +    ISO2022State *pToU2022State;
  1.3230 +
  1.3231 +    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  1.3232 +    pToU2022State = &myData->toU2022State;
  1.3233 +
              /* resume state left over from a previous call before entering the main loop */
  1.3234 +    if(myData->key != 0) {
  1.3235 +        /* continue with a partial escape sequence */
  1.3236 +        goto escape;
  1.3237 +    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  1.3238 +        /* continue with a partial double-byte character */
  1.3239 +        mySourceChar = args->converter->toUBytes[0];
  1.3240 +        args->converter->toULength = 0;
  1.3241 +        targetUniChar = missingCharMarker;
                  /* jumps into the default: case below; the locals declared there are all assigned after the label */
  1.3242 +        goto getTrailByte;
  1.3243 +    }
  1.3244 +
  1.3245 +    while(mySource < mySourceLimit){
  1.3246 +
  1.3247 +        targetUniChar =missingCharMarker;
  1.3248 +
  1.3249 +        if(myTarget < args->targetLimit){
  1.3250 +
  1.3251 +            mySourceChar= (unsigned char) *mySource++;
  1.3252 +
  1.3253 +            switch(mySourceChar){
  1.3254 +            case UCNV_SI:
                          /* shift in: select G0 */
  1.3255 +                pToU2022State->g=0;
  1.3256 +                if (myData->isEmptySegment) {
  1.3257 +                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                              /* SO was followed directly by SI: report the SI byte as an irregular sequence */
  1.3258 +                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  1.3259 +                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
  1.3260 +                    args->converter->toUBytes[0] = mySourceChar;
  1.3261 +                    args->converter->toULength = 1;
  1.3262 +                    args->target = myTarget;
  1.3263 +                    args->source = mySource;
  1.3264 +                    return;
  1.3265 +                }
  1.3266 +                continue;
  1.3267 +
  1.3268 +            case UCNV_SO:
                          /* shift out: select G1, but only if a G1 charset has been designated (cs[1] != 0) */
  1.3269 +                if(pToU2022State->cs[1] != 0) {
  1.3270 +                    pToU2022State->g=1;
  1.3271 +                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
  1.3272 +                    continue;
  1.3273 +                } else {
  1.3274 +                    /* illegal to have SO before a matching designator */
  1.3275 +                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                              /* targetUniChar is still missingCharMarker, so the code after the switch calls toUnicodeCallback */
  1.3276 +                    break;
  1.3277 +                }
  1.3278 +
  1.3279 +            case ESC_2022:
                          /* back up so that changeState_2022 starts at the ESC byte itself */
  1.3280 +                mySource--;
  1.3281 +escape:
  1.3282 +                {
  1.3283 +                    const char * mySourceBefore = mySource;
  1.3284 +                    int8_t toULengthBefore = args->converter->toULength;
  1.3285 +
  1.3286 +                    changeState_2022(args->converter,&(mySource),
  1.3287 +                        mySourceLimit, ISO_2022_CN,err);
  1.3288 +
  1.3289 +                    /* After SO there must be at least one character before a designator (designator error handled separately) */
  1.3290 +                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
  1.3291 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  1.3292 +                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                                  /* error length = bytes held before this call plus the bytes changeState_2022 consumed now */
  1.3293 +                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
  1.3294 +                    }
  1.3295 +                }
  1.3296 +
  1.3297 +                /* invalid or illegal escape sequence */
  1.3298 +                if(U_FAILURE(*err)){
  1.3299 +                    args->target = myTarget;
  1.3300 +                    args->source = mySource;
  1.3301 +                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
  1.3302 +                    return;
  1.3303 +                }
  1.3304 +                continue;
  1.3305 +
  1.3306 +            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
  1.3307 +
  1.3308 +            case CR:
  1.3309 +                /*falls through*/
  1.3310 +            case LF:
                          /* a line end clears the whole toUnicode state (shift state and designations), then the byte is converted normally */
  1.3311 +                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
  1.3312 +                /* falls through */
  1.3313 +            default:
  1.3314 +                /* convert one or two bytes */
  1.3315 +                myData->isEmptySegment = FALSE;
  1.3316 +                if(pToU2022State->g != 0) {
                              /* a G1/G2/G3 set is active: the current byte is the lead byte of a double-byte character */
  1.3317 +                    if(mySource < mySourceLimit) {
  1.3318 +                        UConverterSharedData *cnv;
  1.3319 +                        StateEnum tempState;
  1.3320 +                        int32_t tempBufLen;
  1.3321 +                        int leadIsOk, trailIsOk;
  1.3322 +                        uint8_t trailByte;
  1.3323 +getTrailByte:
                                  /* peek at the trail byte; it is consumed only in the branches below that ++mySource */
  1.3324 +                        trailByte = (uint8_t)*mySource;
  1.3325 +                        /*
  1.3326 +                         * Ticket 5691: consistent illegal sequences:
  1.3327 +                         * - We include at least the first byte in the illegal sequence.
  1.3328 +                         * - If any of the non-initial bytes could be the start of a character,
  1.3329 +                         *   we stop the illegal sequence before the first one of those.
  1.3330 +                         *
  1.3331 +                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  1.3332 +                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  1.3333 +                         * Otherwise we convert or report the pair of bytes.
  1.3334 +                         */
  1.3335 +                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  1.3336 +                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  1.3337 +                        if (leadIsOk && trailIsOk) {
  1.3338 +                            ++mySource;
  1.3339 +                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
  1.3340 +                            if(tempState >= CNS_11643_0) {
                                          /* CNS 11643 planes share one table: prefix the pair with a byte 0x80+(tempState-CNS_11643_0) and look up 3 bytes */
  1.3341 +                                cnv = myData->myConverterArray[CNS_11643];
  1.3342 +                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
  1.3343 +                                tempBuf[1] = (char) (mySourceChar);
  1.3344 +                                tempBuf[2] = (char) trailByte;
  1.3345 +                                tempBufLen = 3;
  1.3346 +
  1.3347 +                            }else{
                                          /* any other charset: look up the plain two-byte pair in that charset's own table */
  1.3348 +                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
  1.3349 +                                cnv = myData->myConverterArray[tempState];
  1.3350 +                                tempBuf[0] = (char) (mySourceChar);
  1.3351 +                                tempBuf[1] = (char) trailByte;
  1.3352 +                                tempBufLen = 2;
  1.3353 +                            }
  1.3354 +                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                      /* keep both bytes in mySourceChar for the offset computation and the error callback below */
  1.3355 +                            mySourceChar = (mySourceChar << 8) | trailByte;
  1.3356 +                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  1.3357 +                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  1.3358 +                            ++mySource;
  1.3359 +                            /* add another bit so that the code below writes 2 bytes in case of error */
  1.3360 +                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
  1.3361 +                        }
  1.3362 +                        if(pToU2022State->g>=2) {
  1.3363 +                            /* return from a single-shift state to the previous one */
  1.3364 +                            pToU2022State->g=pToU2022State->prevG;
  1.3365 +                        }
  1.3366 +                    } else {
                                  /* the lead byte is the last byte of this buffer: save it so the next call resumes at getTrailByte */
  1.3367 +                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  1.3368 +                        args->converter->toULength = 1;
  1.3369 +                        goto endloop;
  1.3370 +                    }
  1.3371 +                }
  1.3372 +                else{
                              /* G0: bytes up to 0x7f map to themselves; larger bytes keep missingCharMarker and go to the callback */
  1.3373 +                    if(mySourceChar <= 0x7f) {
  1.3374 +                        targetUniChar = (UChar) mySourceChar;
  1.3375 +                    }
  1.3376 +                }
  1.3377 +                break;
  1.3378 +            }
                      /* values below missingCharMarker-1 are written as a single UChar */
  1.3379 +            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
  1.3380 +                if(args->offsets){
                              /* the offset points at the first source byte of the character: 1 byte back for single-byte input, otherwise 2 */
  1.3381 +                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.3382 +                }
  1.3383 +                *(myTarget++)=(UChar)targetUniChar;
  1.3384 +            }
  1.3385 +            else if(targetUniChar > missingCharMarker){
  1.3386 +                /* disassemble the surrogate pair and write to output*/
  1.3387 +                targetUniChar-=0x0010000;
  1.3388 +                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
  1.3389 +                if(args->offsets){
  1.3390 +                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.3391 +                }
  1.3392 +                ++myTarget;
  1.3393 +                if(myTarget< args->targetLimit){
  1.3394 +                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
  1.3395 +                    if(args->offsets){
  1.3396 +                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  1.3397 +                    }
  1.3398 +                    ++myTarget;
  1.3399 +                }else{
                              /* no room left for the trail surrogate: park it in the converter's UCharErrorBuffer */
  1.3400 +                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
  1.3401 +                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
  1.3402 +                }
  1.3403 +
  1.3404 +            }
  1.3405 +            else{
                          /* targetUniChar equals missingCharMarker or missingCharMarker-1 here */
  1.3406 +                /* Call the callback function*/
  1.3407 +                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  1.3408 +                break;
  1.3409 +            }
  1.3410 +        }
  1.3411 +        else{
  1.3412 +            *err =U_BUFFER_OVERFLOW_ERROR;
  1.3413 +            break;
  1.3414 +        }
  1.3415 +    }
  1.3416 +endloop:
              /* hand the updated read/write positions back to the caller */
  1.3417 +    args->target = myTarget;
  1.3418 +    args->source = mySource;
  1.3419 +}
  1.3420 +
  1.3421 +static void
  1.3422 +_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
              /*
               * Writes the converter's substitution character for the locale variant
               * selected by myConverterData->locale[0] ('j', 'c' or 'k').
               * Any shift or escape bytes needed to make the substitution byte(s) valid
               * in the current fromUnicode state are emitted first, and that state is
               * updated to match.
               * All variants except 'k' with version != 0 assemble the bytes in a local
               * buffer and pass them to ucnv_cbFromUWriteBytes() with offsetIndex.
               * 'k' with version != 0 delegates to the sub-converter and returns early.
               */
  1.3423 +    UConverter *cnv = args->converter;
  1.3424 +    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
  1.3425 +    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
  1.3426 +    char *p, *subchar;
  1.3427 +    char buffer[8];
  1.3428 +    int32_t length;
  1.3429 +
  1.3430 +    subchar=(char *)cnv->subChars;
  1.3431 +    length=cnv->subCharLen; /* assume length==1 for most variants */
  1.3432 +
              /* p is the write cursor into buffer[]; (p - buffer) is the number of bytes to output */
  1.3433 +    p = buffer;
  1.3434 +    switch(myConverterData->locale[0]){
  1.3435 +    case 'j':
  1.3436 +        {
  1.3437 +            int8_t cs;
  1.3438 +
  1.3439 +            if(pFromU2022State->g == 1) {
  1.3440 +                /* JIS7: switch from G1 to G0 */
  1.3441 +                pFromU2022State->g = 0;
  1.3442 +                *p++ = UCNV_SI;
  1.3443 +            }
  1.3444 +
  1.3445 +            cs = pFromU2022State->cs[0];
  1.3446 +            if(cs != ASCII && cs != JISX201) {
  1.3447 +                /* not in ASCII or JIS X 0201: switch to ASCII */
  1.3448 +                pFromU2022State->cs[0] = (int8_t)ASCII;
                          /* the three bytes 0x1b 0x28 0x42 are the characters ESC ( B */
  1.3449 +                *p++ = '\x1b';
  1.3450 +                *p++ = '\x28';
  1.3451 +                *p++ = '\x42';
  1.3452 +            }
  1.3453 +
                      /* only the first substitution byte is written for this variant */
  1.3454 +            *p++ = subchar[0];
  1.3455 +            break;
  1.3456 +        }
  1.3457 +    case 'c':
  1.3458 +        if(pFromU2022State->g != 0) {
  1.3459 +            /* not in ASCII mode: switch to ASCII */
  1.3460 +            pFromU2022State->g = 0;
  1.3461 +            *p++ = UCNV_SI;
  1.3462 +        }
  1.3463 +        *p++ = subchar[0];
  1.3464 +        break;
  1.3465 +    case 'k':
  1.3466 +        if(myConverterData->version == 0) {
                      /* version 0: fromUnicodeStatus holds the SBCS (0) / DBCS (non-zero) mode flag */
  1.3467 +            if(length == 1) {
  1.3468 +                if((UBool)args->converter->fromUnicodeStatus) {
  1.3469 +                    /* in DBCS mode: switch to SBCS */
  1.3470 +                    args->converter->fromUnicodeStatus = 0;
  1.3471 +                    *p++ = UCNV_SI;
  1.3472 +                }
  1.3473 +                *p++ = subchar[0];
  1.3474 +            } else /* length == 2*/ {
                          /* NOTE(review): any length other than 1 is treated as 2 here - confirm no longer substitution string can be set */
  1.3475 +                if(!(UBool)args->converter->fromUnicodeStatus) {
  1.3476 +                    /* in SBCS mode: switch to DBCS */
  1.3477 +                    args->converter->fromUnicodeStatus = 1;
  1.3478 +                    *p++ = UCNV_SO;
  1.3479 +                }
  1.3480 +                *p++ = subchar[0];
  1.3481 +                *p++ = subchar[1];
  1.3482 +            }
  1.3483 +            break;
  1.3484 +        } else {
  1.3485 +            /* save the subconverter's substitution string */
  1.3486 +            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
  1.3487 +            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
  1.3488 +
  1.3489 +            /* set our substitution string into the subconverter */
  1.3490 +            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
  1.3491 +            myConverterData->currentConverter->subCharLen = (int8_t)length;
  1.3492 +
  1.3493 +            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
  1.3494 +            args->converter = myConverterData->currentConverter;
  1.3495 +            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
                      /* NOTE(review): a literal 0 is passed here instead of offsetIndex - confirm this is intended */
  1.3496 +            ucnv_cbFromUWriteSub(args, 0, err);
  1.3497 +            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
  1.3498 +            args->converter = cnv;
  1.3499 +
  1.3500 +            /* restore the subconverter's substitution string */
  1.3501 +            myConverterData->currentConverter->subChars = currentSubChars;
  1.3502 +            myConverterData->currentConverter->subCharLen = currentSubCharLen;
  1.3503 +
                      /* on overflow, move the bytes the subconverter buffered into this converter's charErrorBuffer */
  1.3504 +            if(*err == U_BUFFER_OVERFLOW_ERROR) {
  1.3505 +                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
  1.3506 +                    uprv_memcpy(
  1.3507 +                        cnv->charErrorBuffer,
  1.3508 +                        myConverterData->currentConverter->charErrorBuffer,
  1.3509 +                        myConverterData->currentConverter->charErrorBufferLength);
  1.3510 +                }
  1.3511 +                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
  1.3512 +                myConverterData->currentConverter->charErrorBufferLength = 0;
  1.3513 +            }
                      /* the subconverter already produced the output, so skip the ucnv_cbFromUWriteBytes() call below */
  1.3514 +            return;
  1.3515 +        }
  1.3516 +    default:
  1.3517 +        /* not expected */
                  /* p still equals buffer here, so the call below is made with a length of 0 */
  1.3518 +        break;
  1.3519 +    }
  1.3520 +    ucnv_cbFromUWriteBytes(args,
  1.3521 +                           buffer, (int32_t)(p - buffer),
  1.3522 +                           offsetIndex, err);
  1.3523 +}
  1.3524 +
  1.3525 +/*
  1.3526 + * Structure for cloning an ISO 2022 converter into a single memory block.
  1.3527 + * ucnv_safeClone() of the converter will align the entire cloneStruct,
  1.3528 + * and then ucnv_safeClone() of the sub-converter may additionally align
  1.3529 + * currentConverter inside the cloneStruct, for which we need the deadSpace
  1.3530 + * after currentConverter.
  1.3531 + * This is because UAlignedMemory may be larger than the actually
  1.3532 + * necessary alignment size for the platform.
  1.3533 + * The other cloneStruct fields will not be moved around,
  1.3534 + * and are aligned properly with cloneStruct's alignment.
  1.3535 + */
  1.3536 +struct cloneStruct
  1.3537 +{
  1.3538 +    UConverter cnv; /* the cloned converter; _ISO_2022_SafeClone() returns its address */
  1.3539 +    UConverter currentConverter; /* storage handed to ucnv_safeClone() for the sub-converter */
  1.3540 +    UAlignedMemory deadSpace; /* padding in case the sub-converter clone is shifted for alignment */
  1.3541 +    UConverterDataISO2022 mydata; /* copy of the source converter's extraInfo */
  1.3542 +};
  1.3543 +
  1.3544 +
  1.3545 +static UConverter *
  1.3546 +_ISO_2022_SafeClone(
  1.3547 +            const UConverter *cnv,
  1.3548 +            void *stackBuffer,
  1.3549 +            int32_t *pBufferSize,
  1.3550 +            UErrorCode *status)
  1.3551 +{
              /*
               * Completes the clone of an ISO 2022 converter inside stackBuffer, which
               * is treated as a struct cloneStruct.
               * If *pBufferSize is 0 the call is a size query: *pBufferSize is set to
               * sizeof(struct cloneStruct) and NULL is returned.
               * Otherwise the converter's private data is copied, the current
               * sub-converter (if any) is cloned into the same block, and every
               * non-NULL entry of myConverterArray gets its reference count
               * incremented because the clone shares those entries.
               * Returns the address of the cloned UConverter, or NULL if cloning the
               * sub-converter failed (*status then holds the failure).
               */
  1.3552 +    struct cloneStruct * localClone;
  1.3553 +    UConverterDataISO2022 *cnvData;
  1.3554 +    int32_t i, size;
  1.3555 +
  1.3556 +    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
  1.3557 +        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
  1.3558 +        return NULL;
  1.3559 +    }
  1.3560 +
  1.3561 +    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
  1.3562 +    localClone = (struct cloneStruct *)stackBuffer;
  1.3563 +
  1.3564 +    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  1.3565 +
  1.3566 +    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
  1.3567 +    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
              /* mark the extra data as living inside the clone's own memory block */
  1.3568 +    localClone->cnv.isExtraLocal = TRUE;
  1.3569 +
  1.3570 +    /* share the subconverters */
  1.3571 +
  1.3572 +    if(cnvData->currentConverter != NULL) {
  1.3573 +        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
                  /* the memcpy above copied the original's currentConverter pointer; replace it with a clone stored in this block */
  1.3574 +        localClone->mydata.currentConverter =
  1.3575 +            ucnv_safeClone(cnvData->currentConverter,
  1.3576 +                            &localClone->currentConverter,
  1.3577 +                            &size, status);
  1.3578 +        if(U_FAILURE(*status)) {
  1.3579 +            return NULL;
  1.3580 +        }
  1.3581 +    }
  1.3582 +
              /* the clone points at the same myConverterArray entries, so add one reference to each */
  1.3583 +    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
  1.3584 +        if(cnvData->myConverterArray[i] != NULL) {
  1.3585 +            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
  1.3586 +        }
  1.3587 +    }
  1.3588 +
  1.3589 +    return &localClone->cnv;
  1.3590 +}
  1.3591 +
  1.3592 +static void
  1.3593 +_ISO_2022_GetUnicodeSet(const UConverter *cnv,
  1.3594 +                    const USetAdder *sa,
  1.3595 +                    UConverterUnicodeSet which,
  1.3596 +                    UErrorCode *pErrorCode)
  1.3597 +{
              /*
               * Adds to the set behind sa the code points this ISO 2022 converter can convert.
               * Steps:
               *   1. add the hardcoded ranges for the locale variant (locale[0]);
               *   2. add the filtered set of every non-NULL entry in myConverterArray;
               *   3. remove SO (0x0e), SI (0x0f), ESC (0x1b) and the C1 range 0x80..0x9f.
               * Does nothing if *pErrorCode already indicates a failure.
               */
  1.3598 +    int32_t i;
  1.3599 +    UConverterDataISO2022* cnvData;
  1.3600 +
  1.3601 +    if (U_FAILURE(*pErrorCode)) {
  1.3602 +        return;
  1.3603 +    }
  1.3604 +#ifdef U_ENABLE_GENERIC_ISO_2022
  1.3605 +    if (cnv->sharedData == &_ISO2022Data) {
  1.3606 +        /* We use UTF-8 in this case */
                  /* all code points except the surrogate range 0xd800..0xdfff */
  1.3607 +        sa->addRange(sa->set, 0, 0xd7FF);
  1.3608 +        sa->addRange(sa->set, 0xE000, 0x10FFFF);
  1.3609 +        return;
  1.3610 +    }
  1.3611 +#endif
  1.3612 +
  1.3613 +    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
  1.3614 +
  1.3615 +    /* open a set and initialize it with code points that are algorithmically round-tripped */
  1.3616 +    switch(cnvData->locale[0]){
  1.3617 +    case 'j':
  1.3618 +        /* include JIS X 0201 which is hardcoded */
  1.3619 +        sa->add(sa->set, 0xa5);
  1.3620 +        sa->add(sa->set, 0x203e);
  1.3621 +        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
  1.3622 +            /* include Latin-1 for some variants of JP */
  1.3623 +            sa->addRange(sa->set, 0, 0xff);
  1.3624 +        } else {
  1.3625 +            /* include ASCII for JP */
  1.3626 +            sa->addRange(sa->set, 0, 0x7f);
  1.3627 +        }
  1.3628 +        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
  1.3629 +            /*
  1.3630 +             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
  1.3631 +             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
  1.3632 +             * use half-width Katakana.
  1.3633 +             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
  1.3634 +             * half-width Katakana via the ESC ( I sequence.
  1.3635 +             * However, we only emit (fromUnicode) half-width Katakana according to the
  1.3636 +             * definition of each variant.
  1.3637 +             *
  1.3638 +             * When including fallbacks,
  1.3639 +             * we need to include half-width Katakana Unicode code points for all JP variants because
  1.3640 +             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
  1.3641 +             */
  1.3642 +            /* include half-width Katakana for JP */
  1.3643 +            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
  1.3644 +        }
  1.3645 +        break;
  1.3646 +    case 'c':
  1.3647 +    case 'z':
  1.3648 +        /* include ASCII for CN */
  1.3649 +        sa->addRange(sa->set, 0, 0x7f);
  1.3650 +        break;
  1.3651 +    case 'k':
  1.3652 +        /* there is only one converter for KR, and it is not in the myConverterArray[] */
  1.3653 +        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
  1.3654 +                cnvData->currentConverter, sa, which, pErrorCode);
  1.3655 +        /* the loop over myConverterArray[] will simply not find another converter */
  1.3656 +        break;
  1.3657 +    default:
  1.3658 +        break;
  1.3659 +    }
  1.3660 +
  1.3661 +#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
  1.3662 +            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
  1.3663 +                cnvData->version==0 && i==CNS_11643
  1.3664 +            ) {
  1.3665 +                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
  1.3666 +                ucnv_MBCSGetUnicodeSetForBytes(
  1.3667 +                        cnvData->myConverterArray[i],
  1.3668 +                        sa, UCNV_ROUNDTRIP_SET,
  1.3669 +                        0, 0x81, 0x82,
  1.3670 +                        pErrorCode);
  1.3671 +            }
  1.3672 +#endif
  1.3673 +
              /* add each loaded sub-converter's set, restricted by a filter chosen from the locale, version and array index */
  1.3674 +    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
  1.3675 +        UConverterSetFilter filter;
  1.3676 +        if(cnvData->myConverterArray[i]!=NULL) {
  1.3677 +            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
  1.3678 +                cnvData->version==0 && i==CNS_11643
  1.3679 +            ) {
  1.3680 +                /*
  1.3681 +                 * Version-specific for CN:
  1.3682 +                 * CN version 0 does not map CNS planes 3..7 although
  1.3683 +                 * they are all available in the CNS conversion table;
  1.3684 +                 * CN version 1 (-EXT) does map them all.
  1.3685 +                 * The two versions create different Unicode sets.
  1.3686 +                 */
  1.3687 +                filter=UCNV_SET_FILTER_2022_CN;
  1.3688 +            } else if(cnvData->locale[0]=='j' && i==JISX208) {
  1.3689 +                /*
  1.3690 +                 * Only add code points that map to Shift-JIS codes
  1.3691 +                 * corresponding to JIS X 0208.
  1.3692 +                 */
  1.3693 +                filter=UCNV_SET_FILTER_SJIS;
  1.3694 +            } else if(i==KSC5601) {
  1.3695 +                /*
  1.3696 +                 * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
  1.3697 +                 * are broader than GR94.
  1.3698 +                 */
  1.3699 +                filter=UCNV_SET_FILTER_GR94DBCS;
  1.3700 +            } else {
  1.3701 +                filter=UCNV_SET_FILTER_NONE;
  1.3702 +            }
  1.3703 +            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
  1.3704 +        }
  1.3705 +    }
  1.3706 +
  1.3707 +    /*
  1.3708 +     * ISO 2022 converters must not convert SO/SI/ESC despite what
  1.3709 +     * sub-converters do by themselves.
  1.3710 +     * Remove these characters from the set.
  1.3711 +     */
  1.3712 +    sa->remove(sa->set, 0x0e);
  1.3713 +    sa->remove(sa->set, 0x0f);
  1.3714 +    sa->remove(sa->set, 0x1b);
  1.3715 +
  1.3716 +    /* ISO 2022 converters do not convert C1 controls either */
  1.3717 +    sa->removeRange(sa->set, 0x80, 0x9f);
  1.3718 +}
  1.3719 +
  1.3720 +static const UConverterImpl _ISO2022Impl={
              /*
               * Function table for the generic (no locale variant) ISO 2022 converter.
               * The initializer is positional; the slot names come from the
               * UConverterImpl declaration, which is outside this part of the file.
               * NOTE(review): confirm the slot order against that declaration before
               * inserting or removing entries.
               */
  1.3721 +    UCNV_ISO_2022,
  1.3722 +
  1.3723 +    NULL,
  1.3724 +    NULL,
  1.3725 +
  1.3726 +    _ISO2022Open,
  1.3727 +    _ISO2022Close,
  1.3728 +    _ISO2022Reset,
  1.3729 +
              /* the four conversion entry points exist only when the generic converter is compiled in; otherwise they are NULL */
  1.3730 +#ifdef U_ENABLE_GENERIC_ISO_2022
  1.3731 +    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
  1.3732 +    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
  1.3733 +    ucnv_fromUnicode_UTF8,
  1.3734 +    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  1.3735 +#else
  1.3736 +    NULL,
  1.3737 +    NULL,
  1.3738 +    NULL,
  1.3739 +    NULL,
  1.3740 +#endif
  1.3741 +    NULL,
  1.3742 +
  1.3743 +    NULL,
  1.3744 +    _ISO2022getName,
  1.3745 +    _ISO_2022_WriteSub,
  1.3746 +    _ISO_2022_SafeClone,
  1.3747 +    _ISO_2022_GetUnicodeSet,
  1.3748 +
  1.3749 +    NULL,
  1.3750 +    NULL
  1.3751 +};
  1.3752 +static const UConverterStaticData _ISO2022StaticData={
              /*
               * Static description of the generic "ISO_2022" converter.
               * Positional initializer; the field names are defined by
               * UConverterStaticData, which is declared outside this part of the file.
               * NOTE(review): the per-field notes below are inferred from the values
               * and the existing comments - confirm against the struct declaration.
               */
  1.3753 +    sizeof(UConverterStaticData),
  1.3754 +    "ISO_2022",
  1.3755 +    2022,
  1.3756 +    UCNV_IBM,
  1.3757 +    UCNV_ISO_2022,
  1.3758 +    1, /* presumably the minimum bytes per UChar - confirm */
  1.3759 +    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
  1.3760 +    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte(s) - confirm */
  1.3761 +    1, /* presumably the length of the substitution sequence above - confirm */
  1.3762 +    FALSE,
  1.3763 +    FALSE,
  1.3764 +    0,
  1.3765 +    0,
  1.3766 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.3767 +};
  1.3768 +const UConverterSharedData _ISO2022Data={
              /*
               * Shared-data record for the generic ISO 2022 converter; it ties
               * _ISO2022StaticData to the _ISO2022Impl function table.
               * _ISO_2022_GetUnicodeSet() compares cnv->sharedData against this object
               * when U_ENABLE_GENERIC_ISO_2022 is defined.
               * Positional initializer for UConverterSharedData (declared elsewhere).
               */
  1.3769 +    sizeof(UConverterSharedData),
  1.3770 +    ~((uint32_t) 0), /* all bits set; presumably a reference count that marks a static, never-freed record - confirm */
  1.3771 +    NULL,
  1.3772 +    NULL,
  1.3773 +    &_ISO2022StaticData,
  1.3774 +    FALSE,
  1.3775 +    &_ISO2022Impl,
  1.3776 +    0, UCNV_MBCS_TABLE_INITIALIZER
  1.3777 +};
  1.3778 +
  1.3779 +/*************JP****************/
  1.3780 +static const UConverterImpl _ISO2022JPImpl={
              /*
               * Function table for the ISO-2022-JP variants.
               * Open/close/reset, getName, writeSub, safeClone and getUnicodeSet are
               * the functions shared by all ISO 2022 tables in this file; only the
               * conversion functions are JP-specific.
               * Positional initializer for UConverterImpl (declared elsewhere).
               */
  1.3781 +    UCNV_ISO_2022,
  1.3782 +
  1.3783 +    NULL,
  1.3784 +    NULL,
  1.3785 +
  1.3786 +    _ISO2022Open,
  1.3787 +    _ISO2022Close,
  1.3788 +    _ISO2022Reset,
  1.3789 +
              /* the same function is listed twice per direction; presumably the plain and the with-offsets slots - confirm */
  1.3790 +    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  1.3791 +    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  1.3792 +    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  1.3793 +    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  1.3794 +    NULL,
  1.3795 +
  1.3796 +    NULL,
  1.3797 +    _ISO2022getName,
  1.3798 +    _ISO_2022_WriteSub,
  1.3799 +    _ISO_2022_SafeClone,
  1.3800 +    _ISO_2022_GetUnicodeSet,
  1.3801 +
  1.3802 +    NULL,
  1.3803 +    NULL
  1.3804 +};
  1.3805 +static const UConverterStaticData _ISO2022JPStaticData={
              /*
               * Static description of the "ISO_2022_JP" converter.
               * Positional initializer for UConverterStaticData (declared elsewhere).
               * It differs from _ISO2022StaticData in the name, in the third entry
               * (0 instead of 2022) and in the maximum bytes per UChar (6 instead of 3).
               */
  1.3806 +    sizeof(UConverterStaticData),
  1.3807 +    "ISO_2022_JP",
  1.3808 +    0,
  1.3809 +    UCNV_IBM,
  1.3810 +    UCNV_ISO_2022,
  1.3811 +    1, /* presumably the minimum bytes per UChar - confirm */
  1.3812 +    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
  1.3813 +    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte(s) - confirm */
  1.3814 +    1, /* presumably the length of the substitution sequence above - confirm */
  1.3815 +    FALSE,
  1.3816 +    FALSE,
  1.3817 +    0,
  1.3818 +    0,
  1.3819 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.3820 +};
  1.3821 +
  1.3822 +namespace {
  1.3823 +
          /*
           * Shared-data record for ISO-2022-JP; it ties _ISO2022JPStaticData to
           * the _ISO2022JPImpl function table.
           * The unnamed namespace keeps the object local to this translation unit.
           * Positional initializer for UConverterSharedData (declared elsewhere).
           */
  1.3824 +const UConverterSharedData _ISO2022JPData={
  1.3825 +    sizeof(UConverterSharedData),
  1.3826 +    ~((uint32_t) 0), /* all bits set; presumably a reference count that marks a static, never-freed record - confirm */
  1.3827 +    NULL,
  1.3828 +    NULL,
  1.3829 +    &_ISO2022JPStaticData,
  1.3830 +    FALSE,
  1.3831 +    &_ISO2022JPImpl,
  1.3832 +    0, UCNV_MBCS_TABLE_INITIALIZER
  1.3833 +};
  1.3834 +
  1.3835 +}  // namespace
  1.3836 +
  1.3837 +/************* KR ***************/
  1.3838 +static const UConverterImpl _ISO2022KRImpl={
              /*
               * Function table for the ISO-2022-KR variants.
               * Open/close/reset, getName, writeSub, safeClone and getUnicodeSet are
               * the functions shared by all ISO 2022 tables in this file; only the
               * conversion functions are KR-specific.
               * Positional initializer for UConverterImpl (declared elsewhere).
               */
  1.3839 +    UCNV_ISO_2022,
  1.3840 +
  1.3841 +    NULL,
  1.3842 +    NULL,
  1.3843 +
  1.3844 +    _ISO2022Open,
  1.3845 +    _ISO2022Close,
  1.3846 +    _ISO2022Reset,
  1.3847 +
              /* the same function is listed twice per direction; presumably the plain and the with-offsets slots - confirm */
  1.3848 +    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  1.3849 +    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  1.3850 +    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  1.3851 +    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  1.3852 +    NULL,
  1.3853 +
  1.3854 +    NULL,
  1.3855 +    _ISO2022getName,
  1.3856 +    _ISO_2022_WriteSub,
  1.3857 +    _ISO_2022_SafeClone,
  1.3858 +    _ISO_2022_GetUnicodeSet,
  1.3859 +
  1.3860 +    NULL,
  1.3861 +    NULL
  1.3862 +};
  1.3863 +static const UConverterStaticData _ISO2022KRStaticData={ /* static descriptor for the "ISO_2022_KR" converter */
  1.3864 +    sizeof(UConverterStaticData), /* size of this structure */
  1.3865 +    "ISO_2022_KR",
  1.3866 +    0,
  1.3867 +    UCNV_IBM,
  1.3868 +    UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022KRImpl */
  1.3869 +    1, /* NOTE(review): presumably the minimum bytes per UChar, paired with the maximum on the next line -- confirm */
  1.3870 +    3, /* max 3 bytes per UChar: SO+DBCS */
  1.3871 +    { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the substitution byte sequence (single byte 0x1a) -- confirm */
  1.3872 +    1, /* NOTE(review): presumably the length of the sequence above -- confirm */
  1.3873 +    FALSE,
  1.3874 +    FALSE,
  1.3875 +    0,
  1.3876 +    0,
  1.3877 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.3878 +};
  1.3879 +
  1.3880 +namespace {
  1.3881 +
  1.3882 +const UConverterSharedData _ISO2022KRData={ /* shared-data record pairing the KR static descriptor with the KR function table */
  1.3883 +    sizeof(UConverterSharedData), /* size of this structure */
  1.3884 +    ~((uint32_t) 0), /* all 32 bits set. NOTE(review): presumably a reference-count sentinel for statically allocated data -- confirm */
  1.3885 +    NULL,
  1.3886 +    NULL,
  1.3887 +    &_ISO2022KRStaticData, /* descriptor defined just above */
  1.3888 +    FALSE,
  1.3889 +    &_ISO2022KRImpl, /* function table defined above */
  1.3890 +    0, UCNV_MBCS_TABLE_INITIALIZER
  1.3891 +};
  1.3892 +
  1.3893 +}  // namespace
  1.3894 +
  1.3895 +/*************** CN ***************/
  1.3896 +static const UConverterImpl _ISO2022CNImpl={ /* function table for the ISO-2022-CN converter variant */
  1.3897 +
  1.3898 +    UCNV_ISO_2022, /* converter type constant; the JP and KR tables in this file use the same value */
  1.3899 +
  1.3900 +    NULL,
  1.3901 +    NULL,
  1.3902 +
  1.3903 +    _ISO2022Open, /* open/close/reset handlers are the same ones listed in the JP and KR tables */
  1.3904 +    _ISO2022Close,
  1.3905 +    _ISO2022Reset,
  1.3906 +
  1.3907 +    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* CN-specific; fills two consecutive slots. NOTE(review): presumably the plain and with-offsets toUnicode entries -- confirm against the UConverterImpl declaration */
  1.3908 +    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  1.3909 +    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* CN-specific; likewise listed twice. NOTE(review): presumably the plain and with-offsets fromUnicode entries -- confirm */
  1.3910 +    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  1.3911 +    NULL,
  1.3912 +
  1.3913 +    NULL,
  1.3914 +    _ISO2022getName, /* the remaining four handlers are shared with the JP and KR tables */
  1.3915 +    _ISO_2022_WriteSub,
  1.3916 +    _ISO_2022_SafeClone,
  1.3917 +    _ISO_2022_GetUnicodeSet,
  1.3918 +
  1.3919 +    NULL,
  1.3920 +    NULL
  1.3921 +};
  1.3922 +static const UConverterStaticData _ISO2022CNStaticData={ /* static descriptor for the "ISO_2022_CN" converter */
  1.3923 +    sizeof(UConverterStaticData), /* size of this structure */
  1.3924 +    "ISO_2022_CN",
  1.3925 +    0,
  1.3926 +    UCNV_IBM,
  1.3927 +    UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022CNImpl */
  1.3928 +    1, /* NOTE(review): presumably the minimum bytes per UChar, paired with the maximum on the next line -- confirm */
  1.3929 +    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
  1.3930 +    { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the substitution byte sequence (single byte 0x1a) -- confirm */
  1.3931 +    1, /* NOTE(review): presumably the length of the sequence above -- confirm */
  1.3932 +    FALSE,
  1.3933 +    FALSE,
  1.3934 +    0,
  1.3935 +    0,
  1.3936 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.3937 +};
  1.3938 +
  1.3939 +namespace {
  1.3940 +
  1.3941 +const UConverterSharedData _ISO2022CNData={ /* shared-data record pairing the CN static descriptor with the CN function table */
  1.3942 +    sizeof(UConverterSharedData), /* size of this structure */
  1.3943 +    ~((uint32_t) 0), /* all 32 bits set. NOTE(review): presumably a reference-count sentinel for statically allocated data -- confirm */
  1.3944 +    NULL,
  1.3945 +    NULL,
  1.3946 +    &_ISO2022CNStaticData, /* descriptor defined just above */
  1.3947 +    FALSE,
  1.3948 +    &_ISO2022CNImpl, /* function table defined above */
  1.3949 +    0, UCNV_MBCS_TABLE_INITIALIZER
  1.3950 +};
  1.3951 +
  1.3952 +}  // namespace
  1.3953 +
  1.3954 +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */

mercurial