1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnv2022.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,3951 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2000-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: ucnv2022.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2000feb03 1.15 +* created by: Markus W. Scherer 1.16 +* 1.17 +* Change history: 1.18 +* 1.19 +* 06/29/2000 helena Major rewrite of the callback APIs. 1.20 +* 08/08/2000 Ram Included support for ISO-2022-JP-2 1.21 +* Changed implementation of toUnicode 1.22 +* function 1.23 +* 08/21/2000 Ram Added support for ISO-2022-KR 1.24 +* 08/29/2000 Ram Seperated implementation of EBCDIC to 1.25 +* ucnvebdc.c 1.26 +* 09/20/2000 Ram Added support for ISO-2022-CN 1.27 +* Added implementations for getNextUChar() 1.28 +* for specific 2022 country variants. 1.29 +* 10/31/2000 Ram Implemented offsets logic functions 1.30 +*/ 1.31 + 1.32 +#include "unicode/utypes.h" 1.33 + 1.34 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 1.35 + 1.36 +#include "unicode/ucnv.h" 1.37 +#include "unicode/uset.h" 1.38 +#include "unicode/ucnv_err.h" 1.39 +#include "unicode/ucnv_cb.h" 1.40 +#include "unicode/utf16.h" 1.41 +#include "ucnv_imp.h" 1.42 +#include "ucnv_bld.h" 1.43 +#include "ucnv_cnv.h" 1.44 +#include "ucnvmbcs.h" 1.45 +#include "cstring.h" 1.46 +#include "cmemory.h" 1.47 +#include "uassert.h" 1.48 + 1.49 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.50 + 1.51 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.52 +/* 1.53 + * I am disabling the generic ISO-2022 converter after proposing to do so on 1.54 + * the icu mailing list two days ago. 1.55 + * 1.56 + * Reasons: 1.57 + * 1. 
It does not fully support the ISO-2022/ECMA-35 specification with all of 1.58 + * its designation sequences, single shifts with return to the previous state, 1.59 + * switch-with-no-return to UTF-16BE or similar, etc. 1.60 + * This is unlike the language-specific variants like ISO-2022-JP which 1.61 + * require a much smaller repertoire of ISO-2022 features. 1.62 + * These variants continue to be supported. 1.63 + * 2. I believe that no one is really using the generic ISO-2022 converter 1.64 + * but rather always one of the language-specific variants. 1.65 + * Note that ICU's generic ISO-2022 converter has always output one escape 1.66 + * sequence followed by UTF-8 for the whole stream. 1.67 + * 3. Switching between subcharsets is extremely slow, because each time 1.68 + * the previous converter is closed and a new one opened, 1.69 + * without any kind of caching, least-recently-used list, etc. 1.70 + * 4. The code is currently buggy, and given the above it does not seem 1.71 + * reasonable to spend the time on maintenance. 1.72 + * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 1.73 + * This means, for example, that when ISO-8859-7 is designated, the following 1.74 + * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 1.75 + * The ICU ISO-2022 converter does not handle this - and has no information 1.76 + * about which subconverter would have to be shifted vs. which is designed 1.77 + * for 7-bit ISO-2022. 1.78 + * 1.79 + * Markus Scherer 2003-dec-03 1.80 + */ 1.81 +#endif 1.82 + 1.83 +static const char SHIFT_IN_STR[] = "\x0F"; 1.84 +// static const char SHIFT_OUT_STR[] = "\x0E"; 1.85 + 1.86 +#define CR 0x0D 1.87 +#define LF 0x0A 1.88 +#define H_TAB 0x09 1.89 +#define V_TAB 0x0B 1.90 +#define SPACE 0x20 1.91 + 1.92 +enum { 1.93 + HWKANA_START=0xff61, 1.94 + HWKANA_END=0xff9f 1.95 +}; 1.96 + 1.97 +/* 1.98 + * 94-character sets with native byte values A1..FE are encoded in ISO 2022 1.99 + * as bytes 21..7E. 
(Subtract 0x80.) 1.100 + * 96-character sets with native byte values A0..FF are encoded in ISO 2022 1.101 + * as bytes 20..7F. (Subtract 0x80.) 1.102 + * Do not encode C1 control codes with native bytes 80..9F 1.103 + * as bytes 00..1F (C0 control codes). 1.104 + */ 1.105 +enum { 1.106 + GR94_START=0xa1, 1.107 + GR94_END=0xfe, 1.108 + GR96_START=0xa0, 1.109 + GR96_END=0xff 1.110 +}; 1.111 + 1.112 +/* 1.113 + * ISO 2022 control codes must not be converted from Unicode 1.114 + * because they would mess up the byte stream. 1.115 + * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 1.116 + * corresponding to SO, SI, and ESC. 1.117 + */ 1.118 +#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 1.119 + 1.120 +/* for ISO-2022-JP and -CN implementations */ 1.121 +typedef enum { 1.122 + /* shared values */ 1.123 + INVALID_STATE=-1, 1.124 + ASCII = 0, 1.125 + 1.126 + SS2_STATE=0x10, 1.127 + SS3_STATE, 1.128 + 1.129 + /* JP */ 1.130 + ISO8859_1 = 1 , 1.131 + ISO8859_7 = 2 , 1.132 + JISX201 = 3, 1.133 + JISX208 = 4, 1.134 + JISX212 = 5, 1.135 + GB2312 =6, 1.136 + KSC5601 =7, 1.137 + HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 1.138 + 1.139 + /* CN */ 1.140 + /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 1.141 + GB2312_1=1, 1.142 + ISO_IR_165=2, 1.143 + CNS_11643=3, 1.144 + 1.145 + /* 1.146 + * these are used in StateEnum and ISO2022State variables, 1.147 + * but CNS_11643 must be used to index into myConverterArray[] 1.148 + */ 1.149 + CNS_11643_0=0x20, 1.150 + CNS_11643_1, 1.151 + CNS_11643_2, 1.152 + CNS_11643_3, 1.153 + CNS_11643_4, 1.154 + CNS_11643_5, 1.155 + CNS_11643_6, 1.156 + CNS_11643_7 1.157 +} StateEnum; 1.158 + 1.159 +/* is the StateEnum charset value for a DBCS charset? 
*/ 1.160 +#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 1.161 + 1.162 +#define CSM(cs) ((uint16_t)1<<(cs)) 1.163 + 1.164 +/* 1.165 + * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 1.166 + * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 1.167 + * 1.168 + * Note: The converter uses some leniency: 1.169 + * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 1.170 + * all versions, not just JIS7 and JIS8. 1.171 + * - ICU does not distinguish between different versions of JIS X 0208. 1.172 + */ 1.173 +enum { MAX_JA_VERSION=4 }; 1.174 +static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 1.175 + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 1.176 + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 1.177 + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 1.178 + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 1.179 + CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 1.180 +}; 1.181 + 1.182 +typedef enum { 1.183 + ASCII1=0, 1.184 + LATIN1, 1.185 + SBCS, 1.186 + DBCS, 1.187 + MBCS, 1.188 + HWKANA 1.189 +}Cnv2022Type; 1.190 + 1.191 +typedef struct ISO2022State { 1.192 + int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 1.193 + int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 1.194 + int8_t prevG; /* g before single shift (SS2 or SS3) */ 1.195 +} ISO2022State; 1.196 + 1.197 +#define UCNV_OPTIONS_VERSION_MASK 0xf 1.198 +#define UCNV_2022_MAX_CONVERTERS 10 1.199 + 1.200 +typedef struct{ 1.201 + UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 1.202 + UConverter *currentConverter; 1.203 + Cnv2022Type currentType; 1.204 + ISO2022State toU2022State, 
fromU2022State; 1.205 + uint32_t key; 1.206 + uint32_t version; 1.207 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.208 + UBool isFirstBuffer; 1.209 +#endif 1.210 + UBool isEmptySegment; 1.211 + char name[30]; 1.212 + char locale[3]; 1.213 +}UConverterDataISO2022; 1.214 + 1.215 +/* Protos */ 1.216 +/* ISO-2022 ----------------------------------------------------------------- */ 1.217 + 1.218 +/*Forward declaration */ 1.219 +U_CFUNC void 1.220 +ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 1.221 + UErrorCode * err); 1.222 +U_CFUNC void 1.223 +ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 1.224 + UErrorCode * err); 1.225 + 1.226 +#define ESC_2022 0x1B /*ESC*/ 1.227 + 1.228 +typedef enum 1.229 +{ 1.230 + INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 1.231 + VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ 1.232 + VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 1.233 + VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 1.234 +} UCNV_TableStates_2022; 1.235 + 1.236 +/* 1.237 +* The way these state transition arrays work is: 1.238 +* ex : ESC$B is the sequence for JISX208 1.239 +* a) First Iteration: char is ESC 1.240 +* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 1.241 +* int x = normalize_esq_chars_2022[27] which is equal to 1 1.242 +* ii) Search for this value in escSeqStateTable_Key_2022[] 1.243 +* value of x is stored at escSeqStateTable_Key_2022[0] 1.244 +* iii) Save this index as offset 1.245 +* iv) Get state of this sequence from escSeqStateTable_Value_2022[] 1.246 +* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 1.247 +* b) Switch on this state and continue to next char 1.248 +* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 1.249 +* which is 
normalize_esq_chars_2022[36] == 4 1.250 +* ii) x is currently 1(from above) 1.251 +* x<<=5 -- x is now 32 1.252 +* x+=normalize_esq_chars_2022[36] 1.253 +* now x is 36 1.254 +* iii) Search for this value in escSeqStateTable_Key_2022[] 1.255 +* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 1.256 +* iv) Get state of this sequence from escSeqStateTable_Value_2022[] 1.257 +* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 1.258 +* c) Switch on this state and continue to next char 1.259 +* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 1.260 +* ii) x is currently 36 (from above) 1.261 +* x<<=5 -- x is now 1152 1.262 +* x+=normalize_esq_chars_2022[66] 1.263 +* now x is 1161 1.264 +* iii) Search for this value in escSeqStateTable_Key_2022[] 1.265 +* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 1.266 +* iv) Get state of this sequence from escSeqStateTable_Value_2022[24] 1.267 +* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 1.268 +* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 1.269 +*/ 1.270 + 1.271 + 1.272 +/*Below are the 3 arrays depicting a state transition table*/ 1.273 +static const int8_t normalize_esq_chars_2022[256] = { 1.274 +/* 0 1 2 3 4 5 6 7 8 9 */ 1.275 + 1.276 + 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.277 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.278 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 1.279 + ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 1.280 + ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 1.281 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.282 + ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 1.283 + ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 1.284 + ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.285 + ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.286 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.287 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.288 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.289 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.290 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.291 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.292 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 
,0 ,0 ,0 1.293 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.294 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.295 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.296 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.297 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.298 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.299 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.300 + ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 1.301 + ,0 ,0 ,0 ,0 ,0 ,0 1.302 +}; 1.303 + 1.304 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.305 +/* 1.306 + * When the generic ISO-2022 converter is completely removed, not just disabled 1.307 + * per #ifdef, then the following state table and the associated tables that are 1.308 + * dimensioned with MAX_STATES_2022 should be trimmed. 1.309 + * 1.310 + * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 1.311 + * the associated escape sequences starting with ESC ( B should be removed. 1.312 + * This includes the ones with key values 1097 and all of the ones above 1000000. 1.313 + * 1.314 + * For the latter, the tables can simply be truncated. 1.315 + * For the former, since the tables must be kept parallel, it is probably best 1.316 + * to simply duplicate an adjacent table cell, parallel in all tables. 1.317 + * 1.318 + * It may make sense to restructure the tables, especially by using small search 1.319 + * tables for the variants instead of indexing them parallel to the table here. 
1.320 + */ 1.321 +#endif 1.322 + 1.323 +#define MAX_STATES_2022 74 1.324 +static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 1.325 +/* 0 1 2 3 4 5 6 7 8 9 */ 1.326 + 1.327 + 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 1.328 + ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 1.329 + ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 1.330 + ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 1.331 + ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 1.332 + ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 1.333 + ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 1.334 + ,35947631 ,35947635 ,35947636 ,35947638 1.335 +}; 1.336 + 1.337 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.338 + 1.339 +static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 1.340 + /* 0 1 2 3 4 5 6 7 8 9 */ 1.341 + 1.342 + NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 1.343 + ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 1.344 + ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 1.345 + ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 1.346 + ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 1.347 + ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 1.348 + ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 1.349 + ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 1.350 +}; 1.351 + 1.352 +#endif 1.353 + 1.354 +static const int8_t 
escSeqStateTable_Value_2022[MAX_STATES_2022] = { 1.355 +/* 0 1 2 3 4 5 6 7 8 9 */ 1.356 + VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.357 + ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.358 + ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 1.359 + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.360 + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.361 + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.362 + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.363 + ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 1.364 +}; 1.365 + 1.366 + 1.367 +/* Type def for refactoring changeState_2022 code*/ 1.368 +typedef enum{ 1.369 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.370 + ISO_2022=0, 1.371 +#endif 1.372 + ISO_2022_JP=1, 1.373 + ISO_2022_KR=2, 1.374 + 
ISO_2022_CN=3 1.375 +} Variant2022; 1.376 + 1.377 +/*********** ISO 2022 Converter Protos ***********/ 1.378 +static void 1.379 +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 1.380 + 1.381 +static void 1.382 + _ISO2022Close(UConverter *converter); 1.383 + 1.384 +static void 1.385 +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 1.386 + 1.387 +static const char* 1.388 +_ISO2022getName(const UConverter* cnv); 1.389 + 1.390 +static void 1.391 +_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 1.392 + 1.393 +static UConverter * 1.394 +_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 1.395 + 1.396 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.397 +static void 1.398 +T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 1.399 +#endif 1.400 + 1.401 +namespace { 1.402 + 1.403 +/*const UConverterSharedData _ISO2022Data;*/ 1.404 +extern const UConverterSharedData _ISO2022JPData; 1.405 +extern const UConverterSharedData _ISO2022KRData; 1.406 +extern const UConverterSharedData _ISO2022CNData; 1.407 + 1.408 +} // namespace 1.409 + 1.410 +/*************** Converter implementations ******************/ 1.411 + 1.412 +/* The purpose of this function is to get around gcc compiler warnings. 
*/ 1.413 +static inline void 1.414 +fromUWriteUInt8(UConverter *cnv, 1.415 + const char *bytes, int32_t length, 1.416 + uint8_t **target, const char *targetLimit, 1.417 + int32_t **offsets, 1.418 + int32_t sourceIndex, 1.419 + UErrorCode *pErrorCode) 1.420 +{ 1.421 + char *targetChars = (char *)*target; 1.422 + ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 1.423 + offsets, sourceIndex, pErrorCode); 1.424 + *target = (uint8_t*)targetChars; 1.425 + 1.426 +} 1.427 + 1.428 +static inline void 1.429 +setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 1.430 + if(myConverterData->version == 1) { 1.431 + UConverter *cnv = myConverterData->currentConverter; 1.432 + 1.433 + cnv->toUnicodeStatus=0; /* offset */ 1.434 + cnv->mode=0; /* state */ 1.435 + cnv->toULength=0; /* byteIndex */ 1.436 + } 1.437 +} 1.438 + 1.439 +static inline void 1.440 +setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 1.441 + /* in ISO-2022-KR the designator sequence appears only once 1.442 + * in a file so we append it only once 1.443 + */ 1.444 + if( converter->charErrorBufferLength==0){ 1.445 + 1.446 + converter->charErrorBufferLength = 4; 1.447 + converter->charErrorBuffer[0] = 0x1b; 1.448 + converter->charErrorBuffer[1] = 0x24; 1.449 + converter->charErrorBuffer[2] = 0x29; 1.450 + converter->charErrorBuffer[3] = 0x43; 1.451 + } 1.452 + if(myConverterData->version == 1) { 1.453 + UConverter *cnv = myConverterData->currentConverter; 1.454 + 1.455 + cnv->fromUChar32=0; 1.456 + cnv->fromUnicodeStatus=1; /* prevLength */ 1.457 + } 1.458 +} 1.459 + 1.460 +static void 1.461 +_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 1.462 + 1.463 + char myLocale[6]={' ',' ',' ',' ',' ',' '}; 1.464 + 1.465 + cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 1.466 + if(cnv->extraInfo != NULL) { 1.467 + UConverterNamePieces stackPieces; 1.468 + UConverterLoadArgs 
stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 1.469 + UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 1.470 + uint32_t version; 1.471 + 1.472 + stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 1.473 + 1.474 + uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 1.475 + myConverterData->currentType = ASCII1; 1.476 + cnv->fromUnicodeStatus =FALSE; 1.477 + if(pArgs->locale){ 1.478 + uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 1.479 + } 1.480 + version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 1.481 + myConverterData->version = version; 1.482 + if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 1.483 + (myLocale[2]=='_' || myLocale[2]=='\0')) 1.484 + { 1.485 + size_t len=0; 1.486 + /* open the required converters and cache them */ 1.487 + if(version>MAX_JA_VERSION) { 1.488 + /* prevent indexing beyond jpCharsetMasks[] */ 1.489 + myConverterData->version = version = 0; 1.490 + } 1.491 + if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 1.492 + myConverterData->myConverterArray[ISO8859_7] = 1.493 + ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 1.494 + } 1.495 + myConverterData->myConverterArray[JISX208] = 1.496 + ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 1.497 + if(jpCharsetMasks[version]&CSM(JISX212)) { 1.498 + myConverterData->myConverterArray[JISX212] = 1.499 + ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 1.500 + } 1.501 + if(jpCharsetMasks[version]&CSM(GB2312)) { 1.502 + myConverterData->myConverterArray[GB2312] = 1.503 + ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 1.504 + } 1.505 + if(jpCharsetMasks[version]&CSM(KSC5601)) { 1.506 + myConverterData->myConverterArray[KSC5601] = 1.507 + ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 1.508 + } 1.509 + 1.510 + /* set the function pointers to appropriate funtions */ 1.511 + 
cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 1.512 + uprv_strcpy(myConverterData->locale,"ja"); 1.513 + 1.514 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 1.515 + len = uprv_strlen(myConverterData->name); 1.516 + myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 1.517 + myConverterData->name[len+1]='\0'; 1.518 + } 1.519 + else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 1.520 + (myLocale[2]=='_' || myLocale[2]=='\0')) 1.521 + { 1.522 + const char *cnvName; 1.523 + if(version==1) { 1.524 + cnvName="icu-internal-25546"; 1.525 + } else { 1.526 + cnvName="ibm-949"; 1.527 + myConverterData->version=version=0; 1.528 + } 1.529 + if(pArgs->onlyTestIsLoadable) { 1.530 + ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 1.531 + uprv_free(cnv->extraInfo); 1.532 + cnv->extraInfo=NULL; 1.533 + return; 1.534 + } else { 1.535 + myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 1.536 + if (U_FAILURE(*errorCode)) { 1.537 + _ISO2022Close(cnv); 1.538 + return; 1.539 + } 1.540 + 1.541 + if(version==1) { 1.542 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 1.543 + uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 1.544 + cnv->subCharLen = myConverterData->currentConverter->subCharLen; 1.545 + }else{ 1.546 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 1.547 + } 1.548 + 1.549 + /* initialize the state variables */ 1.550 + setInitialStateToUnicodeKR(cnv, myConverterData); 1.551 + setInitialStateFromUnicodeKR(cnv, myConverterData); 1.552 + 1.553 + /* set the function pointers to appropriate funtions */ 1.554 + cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 1.555 + uprv_strcpy(myConverterData->locale,"ko"); 1.556 + } 1.557 + } 1.558 + else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 1.559 + (myLocale[2]=='_' || myLocale[2]=='\0')) 1.560 
+ { 1.561 + 1.562 + /* open the required converters and cache them */ 1.563 + myConverterData->myConverterArray[GB2312_1] = 1.564 + ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); 1.565 + if(version==1) { 1.566 + myConverterData->myConverterArray[ISO_IR_165] = 1.567 + ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); 1.568 + } 1.569 + myConverterData->myConverterArray[CNS_11643] = 1.570 + ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); 1.571 + 1.572 + 1.573 + /* set the function pointers to appropriate funtions */ 1.574 + cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 1.575 + uprv_strcpy(myConverterData->locale,"cn"); 1.576 + 1.577 + if (version==0){ 1.578 + myConverterData->version = 0; 1.579 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 1.580 + }else if (version==1){ 1.581 + myConverterData->version = 1; 1.582 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 1.583 + }else { 1.584 + myConverterData->version = 2; 1.585 + (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 1.586 + } 1.587 + } 1.588 + else{ 1.589 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.590 + myConverterData->isFirstBuffer = TRUE; 1.591 + 1.592 + /* append the UTF-8 escape sequence */ 1.593 + cnv->charErrorBufferLength = 3; 1.594 + cnv->charErrorBuffer[0] = 0x1b; 1.595 + cnv->charErrorBuffer[1] = 0x25; 1.596 + cnv->charErrorBuffer[2] = 0x42; 1.597 + 1.598 + cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 1.599 + /* initialize the state variables */ 1.600 + uprv_strcpy(myConverterData->name,"ISO_2022"); 1.601 +#else 1.602 + *errorCode = U_UNSUPPORTED_ERROR; 1.603 + return; 1.604 +#endif 1.605 + } 1.606 + 1.607 + cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 1.608 + 1.609 + if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 1.610 + _ISO2022Close(cnv); 1.611 + } 1.612 + } else { 1.613 + *errorCode = 
U_MEMORY_ALLOCATION_ERROR; 1.614 + } 1.615 +} 1.616 + 1.617 + 1.618 +static void 1.619 +_ISO2022Close(UConverter *converter) { 1.620 + UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 1.621 + UConverterSharedData **array = myData->myConverterArray; 1.622 + int32_t i; 1.623 + 1.624 + if (converter->extraInfo != NULL) { 1.625 + /*close the array of converter pointers and free the memory*/ 1.626 + for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 1.627 + if(array[i]!=NULL) { 1.628 + ucnv_unloadSharedDataIfReady(array[i]); 1.629 + } 1.630 + } 1.631 + 1.632 + ucnv_close(myData->currentConverter); 1.633 + 1.634 + if(!converter->isExtraLocal){ 1.635 + uprv_free (converter->extraInfo); 1.636 + converter->extraInfo = NULL; 1.637 + } 1.638 + } 1.639 +} 1.640 + 1.641 +static void 1.642 +_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 1.643 + UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 1.644 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.645 + uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 1.646 + myConverterData->key = 0; 1.647 + myConverterData->isEmptySegment = FALSE; 1.648 + } 1.649 + if(choice!=UCNV_RESET_TO_UNICODE) { 1.650 + uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 1.651 + } 1.652 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.653 + if(myConverterData->locale[0] == 0){ 1.654 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.655 + myConverterData->isFirstBuffer = TRUE; 1.656 + myConverterData->key = 0; 1.657 + if (converter->mode == UCNV_SO){ 1.658 + ucnv_close (myConverterData->currentConverter); 1.659 + myConverterData->currentConverter=NULL; 1.660 + } 1.661 + converter->mode = UCNV_SI; 1.662 + } 1.663 + if(choice!=UCNV_RESET_TO_UNICODE) { 1.664 + /* re-append UTF-8 escape sequence */ 1.665 + converter->charErrorBufferLength = 3; 1.666 + converter->charErrorBuffer[0] = 0x1b; 1.667 + converter->charErrorBuffer[1] = 0x28; 1.668 + 
/* NOTE(review): the statements down to the first closing brace in column 0 are the
 * tail of a function whose beginning lies before this chunk; they are unchanged. */
            converter->charErrorBuffer[2] = 0x42;
        }
    }
    else
#endif
    {
        /* reset the state variables */
        if(myConverterData->locale[0] == 'k'){
            /* choice<=UCNV_RESET_TO_UNICODE: the toUnicode side is (also) reset */
            if(choice<=UCNV_RESET_TO_UNICODE) {
                setInitialStateToUnicodeKR(converter, myConverterData);
            }
            /* any choice other than "toUnicode only": the fromUnicode side is reset */
            if(choice!=UCNV_RESET_TO_UNICODE) {
                setInitialStateFromUnicodeKR(converter, myConverterData);
            }
        }
    }
}

/*
 * Returns the name stored in the converter's ISO-2022 private data
 * (cnv->extraInfo), or NULL if the converter has no private data.
 */
static const char*
_ISO2022getName(const UConverter* cnv){
    if(cnv->extraInfo){
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
        return myData->name;
    }
    return NULL;
}


/*************** to unicode *******************/
/****************************************************************************
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201
 * <ESC>$B  JISX-208
 * <ESC>$@  JISX-208
 * <ESC>$(D JISX-212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 *
 * The table below is indexed with the "offset" that getKey_2022() reports for a
 * complete escape sequence (see changeState_2022(), case ISO_2022_JP).
 * INVALID_STATE marks sequences that ISO-2022-JP does not support.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};

/*************** to unicode *******************/
/*
 * Same layout as nextStateToUnicodeJP[], for ISO-2022-CN: indexed with the
 * escape-sequence offset in changeState_2022(), case ISO_2022_CN.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
    ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};


/*
 * Advances the escape-sequence matcher by one byte.
 *
 * @param c      next byte of the (candidate) escape sequence
 * @param key    in: key accumulated from the preceding bytes (0 at the start);
 *               out: the new key on a match, 0 otherwise
 * @param offset out: index of the matching entry in escSeqStateTable_Key_2022[],
 *               0 if there is no match
 * @return the escSeqStateTable_Value_2022[] entry of the match, or INVALID_2022
 *
 * The byte is first normalized through normalize_esq_chars_2022[]; a result of 0
 * means the byte cannot occur in any escape sequence. The candidate key is
 * (*key << 5) + normalized value, which is then looked up by binary search.
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    int32_t togo;
    int32_t low = 0;
    int32_t hi = MAX_STATES_2022;
    int32_t oldmid=0;

    togo = normalize_esq_chars_2022[(uint8_t)c];
    if(togo == 0) {
        /* not a valid character anywhere in an escape sequence */
        *key = 0;
        *offset = 0;
        return INVALID_2022;
    }
    togo = (*key << 5) + togo;

    while (hi != low)  /*binary search*/{

        /* NOTE(review): "register" is deprecated since C++11 and removed in C++17. */
        register int32_t mid = (hi+low) >> 1; /*Finds median*/

        /*
         * low is set to mid (not mid+1) below, so the midpoint stops moving once
         * hi-low==1; this guard ends the search in that case.
         */
        if (mid == oldmid)
            break;

        if (escSeqStateTable_Key_2022[mid] > togo){
            hi = mid;
        }
        else if (escSeqStateTable_Key_2022[mid] < togo){
            low = mid;
        }
        else /*we found it*/{
            *key = togo;
            *offset = mid;
            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
        }
        oldmid = mid;

    }

    /* no table entry for this key: reset the matcher */
    *key = 0;
    *offset = 0;
    return INVALID_2022;
}

/*
 * Runs the escape-sequence state machine and applies the resulting charset change.
 *
 * Bytes are consumed from *source (up to sourceLimit), appended to
 * _this->toUBytes[] and fed to getKey_2022(), starting from the key saved in the
 * converter data, until the sequence is complete, invalid, or the input ends.
 *
 * @param _this       the ISO-2022 converter; its extraInfo is a UConverterDataISO2022
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the input
 * @param var         which ISO-2022 variant's rules to apply to a complete sequence
 * @param err         set to U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE
 *                    on failure; not modified when the sequence is merely incomplete
 *
 * On return:
 *  - incomplete sequence: the key is saved (non-zero) and the bytes stay in toUBytes[];
 *  - success: toULength is reset to 0;
 *  - illegal sequence: only the first byte is reported, the others are backed out;
 *  - unsupported sequence: toUCallbackReason is set to UCNV_UNASSIGNED.
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of sequence bytes that were already buffered before this call */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            /* getKey_2022() has already reset key to 0 */
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* remember the matcher state for a continuation in the next buffer */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        /* NOTE(review): myUConverter is not declared in this function; this
         * branch is only compiled when U_ENABLE_GENERIC_ISO_2022 is defined. */
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the locking-shift state (G0/G1) to return to */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* only allowed if this ISO-2022-JP version includes the charset */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected by version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these three are designated to G1 */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* designated to G2 */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* designated to G3 */
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            /* exactly one table offset is accepted for ISO-2022-KR */
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}

/*
 * Finds how far plain (non-escape) conversion may proceed.
 *
 * Without U_ENABLE_GENERIC_ISO_2022 this returns a pointer to the first ESC_2022
 * byte at or after *source, or sourceLimit if there is none.
 * *source itself is not modified.
 *
 * NOTE(review): the U_ENABLE_GENERIC_ISO_2022 branch reads "flush", but the
 * parameter name is commented out in the signature.
 */
/*for 2022 looks ahead in the stream
 *to determine the longest possible convertible
 *data stream
 */
static inline const char*
getEndOfBuffer_2022(const char** source,
                   const char* sourceLimit,
                   UBool /*flush*/){

    const char* mySource = *source;

#ifdef U_ENABLE_GENERIC_ISO_2022
    if (*source >= sourceLimit)
        return sourceLimit;

    do{

        if (*mySource == ESC_2022){
            int8_t i;
            int32_t key = 0;
            int32_t offset;
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;

            /* Kludge: I could not
            * figure out the reason for validating an escape sequence
            * twice - once here and once in changeState_2022().
            * is it possible to have an ESC character in a ISO2022
            * byte stream which is valid in a code page? Is it legal?
            */
            for (i=0;
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
            i++) {
                value = getKey_2022(*(mySource+i), &key, &offset);
            }
            /* the second operand is always true here (we are inside the ESC_2022 test) */
            if (value > 0 || *mySource==ESC_2022)
                return mySource;

            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
                return sourceLimit;
        }
    }while (++mySource < sourceLimit);

    return sourceLimit;
#else
    while(mySource < sourceLimit && *mySource != ESC_2022) {
        ++mySource;
    }
    return mySource;
#endif
}


/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSFromUChar32() function should be reflected here.
 *
 * Looks up code point c in the from-Unicode table of sharedData and, if that
 * yields nothing, in the extension table (sharedData->mbcs.extIndexes).
 *
 * @param sharedData  converter data whose mbcs tables are searched
 * @param c           code point to convert
 * @param value       out: the result bytes packed into one integer
 * @param useFallback whether fallback mappings may be used
 * @param outputType  MBCS_OUTPUT_2 selects the 2-byte lookup; any other value is
 *                    treated as MBCS_OUTPUT_3
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
 */
static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
                                         UChar32 c,
                                         uint32_t* value,
                                         UBool useFallback,
                                         int outputType)
{
    const int32_t *cx;
    const uint16_t *table;
    uint32_t stage2Entry;
    uint32_t myValue;
    int32_t length;
    const uint8_t *p;
    /*
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
     * Use internal version of ucnv_open() that verifies that the new structures are available,
     * else U_INTERNAL_PROGRAM_ERROR.
     */
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        table=sharedData->mbcs.fromUnicodeTable;
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
        /* get the bytes and the length for the output */
        if(outputType==MBCS_OUTPUT_2){
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            if(myValue<=0xff) {
                length=1;
            } else {
                length=2;
            }
        } else /* outputType==MBCS_OUTPUT_3 */ {
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            /* assemble the three stored bytes, most significant first */
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
            if(myValue<=0xff) {
                length=1;
            } else if(myValue<=0xffff) {
                length=2;
            } else {
                length=3;
            }
        }
        /* is this code point assigned, or do we use fallbacks? */
        /* NOTE(review): for (c&0xf)==15 this shifts a signed 1 left by 31 bits. */
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
            /* assigned */
            *value=myValue;
            return length;
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
            /*
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
             * There is no way with this data structure for fallback output
             * to be a zero byte.
             */
            *value=myValue;
            return -length;
        }
    }

    /* not found in the base table: try the extension table, if there is one */
    cx=sharedData->mbcs.extIndexes;
    if(cx!=NULL) {
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
    }

    /* unassigned */
    return 0;
}

/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
* @param sharedData converter data whose single-byte from-Unicode table is used
* @param c code point to convert
* @param retval pointer to output byte; not written when the function returns
*        early for a supplementary code point that the table does not cover
* @param useFallback selects the lower acceptance threshold for fallback results
* @return 1 roundtrip byte  0 no mapping  -1 fallback byte
*/
static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
                                       UChar32 c,
                                       uint32_t* retval,
                                       UBool useFallback)
{
    const uint16_t *table;
    int32_t value;
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return 0;
    }
    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->mbcs.fromUnicodeTable;
    /* get the byte for the output */
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
    /* is this code point assigned, or do we use fallbacks? */
    /* the low 8 bits are the output byte; the bits above them classify the mapping */
    *retval=(uint32_t)(value&0xff);
    if(value>=0xf00) {
        return 1;  /* roundtrip */
    } else if(useFallback ? value>=0x800 : value>=0xc00) {
        return -1;  /* fallback taken */
    } else {
        return 0;  /* no mapping */
    }
}

/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * The first comparison bounds the low 16 bits as a whole (A1A1..FEFE), the second
 * bounds the trail byte (A1..FE); both rely on unsigned wrap-around of the
 * narrowing casts to test a range with a single comparison.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}

#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that.
*/
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 * (This function is inside the disabled "#if 0" region and is not compiled.)
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode implementation of the generic ISO-2022 converter (only compiled when
 * U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * The input is processed in alternating steps:
 *  - while no escape sequence is pending (myData->key == 0), the bytes up to the
 *    position returned by getEndOfBuffer_2022() are converted with
 *    myData->currentConverter (opened as "ASCII" on first use);
 *  - then changeState_2022() consumes the escape sequence.
 * The function returns early on errors, on target overflow (after moving the
 * sub-converter's overflow buffer into this converter), and whenever offsets are
 * requested and some input or output was processed.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in the args */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a pending escape sequence */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending input in cnv->toUBytes[]/toULength (two bytes, high byte
 * first, if sourceChar>0xff; otherwise one byte) and sets the error code:
 * U_INVALID_CHAR_FOUND if targetUniChar equals missingCharMarker-1,
 * U_ILLEGAL_CHAR_FOUND otherwise.
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}

/**************************************ISO-2022-JP*************************************************/

/************************************** IMPORTANT **************************************************
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
* The converter iterates over each Unicode codepoint
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
* processed one char at a time it would make sense to reduce the extra processing a canned converter
* would do as far as possible.
*
* If the implementation of these macros or structure of sharedData struct change in the future, make
* sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*       JISX201, JISX208,JISX212 : new .cnv data files created
*       KSC5601 : alias to ibm-949 mapping table
*       GB2312 : alias to ibm-1386 mapping table
*       ISO-8859-1 : Algorithmically implemented as LATIN1 case
*       ISO-8859-7 : alias to ibm-9409 mapping table
*/

/*
 * Preference order of JP charsets.
 * The from-Unicode ISO-2022-JP code walks this list, in this order, to add the
 * charsets that are allowed for the converter version and were not already
 * chosen as the current G0/G2 charsets.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
*/
/* One NUL-terminated designator escape sequence per charset (at most 4 bytes each). */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* Byte length of the escape sequence at the same index in escSeqChars[]. */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/

/* Map 00..7F to Unicode according to JIS X 0201.
/*
 * jisx201ToU(): convert a JIS X 0201 Roman byte (00..7F) to Unicode.
 * Only two positions differ from ASCII: 5C becomes U+00A5 and 7E becomes U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * jisx201FromU(): convert a Unicode code point to a JIS X 0201 Roman byte (00..7F).
 * U+00A5 and U+203E take the places of 5C and 7E, so U+005C and U+007E themselves
 * have no mapping. Returns 0xfffe for everything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value == 0xa5) {
        return 0x5c;
    }
    if(value == 0x203e) {
        return 0x7e;
    }
    if(value > 0x7f || value == 0x5c || value == 0x7e) {
        return 0xfffe;
    }
    return value;
}

/*
 * _2022FromSJIS(): turn a valid Shift-JIS double-byte value into the pair of
 * 21..7E bytes used for JIS X 0208 in ISO 2022.
 * Values above 0xEFFC lie beyond JIS X 0208 and yield 0.
 *
 * Each Shift-JIS lead byte covers two JIS rows; the trail byte decides which of
 * the two rows is meant and which column within it.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* rebase the lead byte (81..9F or E0..EF) and double it: two rows per lead byte */
    result = value & 0xff00;
    result -= (result <= 0x9f00) ? 0x7000 : 0xb000;
    result <<= 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second row of the pair */
        return result | (trail - 0x7e);
    }

    /* trail 40..9E: first row of the pair; 7F is skipped in Shift-JIS */
    result -= 0x100;
    return result | (trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}

/*
 * _2022ToSJIS(): convert a pair of JIS X 0208 bytes (21..7E each) to Shift-JIS.
 * The two result bytes are stored in bytes[0] (lead) and bytes[1] (trail).
 *
 * If a byte is outside 21..7E the corresponding result byte is made invalid for
 * Shift-JIS (0) so that the converter rejects it; some invalid inputs already map
 * to equally invalid Shift-JIS bytes and are therefore not tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail = c2;

    if((lead & 1) == 0) {
        /* even row: second half of the Shift-JIS trail range */
        if((uint8_t)(trail - 0x21) <= ((0x7e) - 0x21)) {
            trail = (uint8_t)(trail + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* odd row: first half of the trail range, which skips 7F */
        lead = (uint8_t)(lead + 1);
        if(trail <= 0x5f) {
            trail = (uint8_t)(trail + 0x1f);
        } else if(trail <= 0x7e) {
            trail = (uint8_t)(trail + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    lead = (uint8_t)(lead >> 1);
    if(lead > 0x3f) {
        lead = 0; /* invalid */
    } else if(lead > 0x2f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = (uint8_t)(lead + 0x70);
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}

/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 */
1.1549 + */ /* hwkana_fb is indexed by (code point - HWKANA_START); the JISX208 case of UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC uses an entry as a two-byte fallback value (len = -2) for U+FF61..U+FF9F. */ 1.1550 +static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1.1551 + 0x2123, /* U+FF61 */ 1.1552 + 0x2156, 1.1553 + 0x2157, 1.1554 + 0x2122, 1.1555 + 0x2126, 1.1556 + 0x2572, 1.1557 + 0x2521, 1.1558 + 0x2523, 1.1559 + 0x2525, 1.1560 + 0x2527, 1.1561 + 0x2529, 1.1562 + 0x2563, 1.1563 + 0x2565, 1.1564 + 0x2567, 1.1565 + 0x2543, 1.1566 + 0x213C, /* U+FF70 */ 1.1567 + 0x2522, 1.1568 + 0x2524, 1.1569 + 0x2526, 1.1570 + 0x2528, 1.1571 + 0x252A, 1.1572 + 0x252B, 1.1573 + 0x252D, 1.1574 + 0x252F, 1.1575 + 0x2531, 1.1576 + 0x2533, 1.1577 + 0x2535, 1.1578 + 0x2537, 1.1579 + 0x2539, 1.1580 + 0x253B, 1.1581 + 0x253D, 1.1582 + 0x253F, /* U+FF80 */ 1.1583 + 0x2541, 1.1584 + 0x2544, 1.1585 + 0x2546, 1.1586 + 0x2548, 1.1587 + 0x254A, 1.1588 + 0x254B, 1.1589 + 0x254C, 1.1590 + 0x254D, 1.1591 + 0x254E, 1.1592 + 0x254F, 1.1593 + 0x2552, 1.1594 + 0x2555, 1.1595 + 0x2558, 1.1596 + 0x255B, 1.1597 + 0x255E, 1.1598 + 0x255F, /* U+FF90 */ 1.1599 + 0x2560, 1.1600 + 0x2561, 1.1601 + 0x2562, 1.1602 + 0x2564, 1.1603 + 0x2566, 1.1604 + 0x2568, 1.1605 + 0x2569, 1.1606 + 0x256A, 1.1607 + 0x256B, 1.1608 + 0x256C, 1.1609 + 0x256D, 1.1610 + 0x256F, 1.1611 + 0x2573, 1.1612 + 0x212B, 1.1613 + 0x212C /* U+FF9F */ 1.1614 +}; 1.1615 + /* Converts UTF-16 from args->source into ISO-2022-JP-family bytes at args->target. For each code point it tries the charsets listed in choices[] (rebuilt whenever choiceCount is 0), stops at the first roundtrip mapping and otherwise keeps the first fallback found, then writes SI, the designation escape and the SO or 0x1b 0x4e shift bytes as needed ahead of the character bytes. Errors are returned through *err with the offending code point saved in cnv->fromUChar32; a lead surrogate at the end of the input is carried over in cnv->fromUChar32 as well. With args->flush set and all input consumed, SI and/or the ASCII designation are appended so that the state ends at g==0 with cs[0]==ASCII. */ 1.1616 +static void 1.1617 +UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1.1618 + UConverter *cnv = args->converter; 1.1619 + UConverterDataISO2022 *converterData; 1.1620 + ISO2022State *pFromU2022State; 1.1621 + uint8_t *target = (uint8_t *) args->target; 1.1622 + const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1.1623 + const UChar* source = args->source; 1.1624 + const UChar* sourceLimit = args->sourceLimit; 1.1625 + int32_t* offsets = args->offsets; 1.1626 + UChar32 sourceChar; 1.1627 + char buffer[8]; 1.1628 + int32_t len, outLen; 1.1629 + int8_t choices[10]; 1.1630 + int32_t choiceCount; 1.1631 + uint32_t targetValue = 0; 1.1632 + UBool useFallback; 1.1633 + 1.1634 + 
int32_t i; 1.1635 + int8_t cs, g; 1.1636 + 1.1637 + /* set up the state */ 1.1638 + converterData = (UConverterDataISO2022*)cnv->extraInfo; 1.1639 + pFromU2022State = &converterData->fromU2022State; 1.1640 + 1.1641 + choiceCount = 0; 1.1642 + 1.1643 + /* check if the last codepoint of previous buffer was a lead surrogate*/ 1.1644 + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 1.1645 + goto getTrail; 1.1646 + } 1.1647 + 1.1648 + while(source < sourceLimit) { 1.1649 + if(target < targetLimit) { 1.1650 + 1.1651 + sourceChar = *(source++); 1.1652 + /*check if the char is a First surrogate*/ 1.1653 + if(U16_IS_SURROGATE(sourceChar)) { 1.1654 + if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1.1655 +getTrail: 1.1656 + /*look ahead to find the trail surrogate*/ 1.1657 + if(source < sourceLimit) { 1.1658 + /* test the following code unit */ 1.1659 + UChar trail=(UChar) *source; 1.1660 + if(U16_IS_TRAIL(trail)) { 1.1661 + source++; 1.1662 + sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1.1663 + cnv->fromUChar32=0x00; 1.1664 + /* convert this supplementary code point */ 1.1665 + /* exit this condition tree */ 1.1666 + } else { 1.1667 + /* this is an unmatched lead code unit (1st surrogate) */ 1.1668 + /* callback(illegal) */ 1.1669 + *err=U_ILLEGAL_CHAR_FOUND; 1.1670 + cnv->fromUChar32=sourceChar; 1.1671 + break; 1.1672 + } 1.1673 + } else { 1.1674 + /* no more input */ 1.1675 + cnv->fromUChar32=sourceChar; 1.1676 + break; 1.1677 + } 1.1678 + } else { 1.1679 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.1680 + /* callback(illegal) */ 1.1681 + *err=U_ILLEGAL_CHAR_FOUND; 1.1682 + cnv->fromUChar32=sourceChar; 1.1683 + break; 1.1684 + } 1.1685 + } 1.1686 + 1.1687 + /* do not convert SO/SI/ESC */ 1.1688 + if(IS_2022_CONTROL(sourceChar)) { 1.1689 + /* callback(illegal) */ 1.1690 + *err=U_ILLEGAL_CHAR_FOUND; 1.1691 + cnv->fromUChar32=sourceChar; 1.1692 + break; 1.1693 + } 1.1694 + 1.1695 + /* do the conversion */ 1.1696 + 1.1697 + if(choiceCount 
== 0) { 1.1698 + uint16_t csm; 1.1699 + 1.1700 + /* 1.1701 + * The csm variable keeps track of which charsets are allowed 1.1702 + * and not used yet while building the choices[]. 1.1703 + */ 1.1704 + csm = jpCharsetMasks[converterData->version]; 1.1705 + choiceCount = 0; 1.1706 + 1.1707 + /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1.1708 + if(converterData->version == 3 || converterData->version == 4) { 1.1709 + choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1.1710 + } 1.1711 + /* Do not try single-byte half-width Katakana for other versions. */ 1.1712 + csm &= ~CSM(HWKANA_7BIT); 1.1713 + 1.1714 + /* try the current G0 charset */ 1.1715 + choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1.1716 + csm &= ~CSM(cs); 1.1717 + 1.1718 + /* try the current G2 charset */ 1.1719 + if((cs = pFromU2022State->cs[2]) != 0) { 1.1720 + choices[choiceCount++] = cs; 1.1721 + csm &= ~CSM(cs); 1.1722 + } 1.1723 + 1.1724 + /* try all the other possible charsets */ 1.1725 + for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1.1726 + cs = (int8_t)jpCharsetPref[i]; 1.1727 + if(CSM(cs) & csm) { 1.1728 + choices[choiceCount++] = cs; 1.1729 + csm &= ~CSM(cs); 1.1730 + } 1.1731 + } 1.1732 + } 1.1733 + 1.1734 + cs = g = 0; 1.1735 + /* 1.1736 + * len==0: no mapping found yet 1.1737 + * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1.1738 + * len>0: found a roundtrip result, done 1.1739 + */ 1.1740 + len = 0; 1.1741 + /* 1.1742 + * We will turn off useFallback after finding a fallback, 1.1743 + * but we still get fallbacks from PUA code points as usual. 1.1744 + * Therefore, we will also need to check that we don't overwrite 1.1745 + * an early fallback with a later one. 
1.1746 + */ 1.1747 + useFallback = cnv->useFallback; 1.1748 + 1.1749 + for(i = 0; i < choiceCount && len <= 0; ++i) { 1.1750 + uint32_t value; 1.1751 + int32_t len2; 1.1752 + int8_t cs0 = choices[i]; 1.1753 + switch(cs0) { 1.1754 + case ASCII: 1.1755 + if(sourceChar <= 0x7f) { 1.1756 + targetValue = (uint32_t)sourceChar; 1.1757 + len = 1; 1.1758 + cs = cs0; 1.1759 + g = 0; 1.1760 + } 1.1761 + break; 1.1762 + case ISO8859_1: 1.1763 + if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1.1764 + targetValue = (uint32_t)sourceChar - 0x80; 1.1765 + len = 1; 1.1766 + cs = cs0; 1.1767 + g = 2; 1.1768 + } 1.1769 + break; 1.1770 + case HWKANA_7BIT: 1.1771 + if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1.1772 + if(converterData->version==3) { 1.1773 + /* JIS7: use G1 (SO) */ 1.1774 + /* Shift U+FF61..U+FF9F to bytes 21..5F. */ 1.1775 + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1.1776 + len = 1; 1.1777 + pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1.1778 + g = 1; 1.1779 + } else if(converterData->version==4) { 1.1780 + /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1.1781 + /* Shift U+FF61..U+FF9F to bytes A1..DF. 
*/ 1.1782 + targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1.1783 + len = 1; 1.1784 + 1.1785 + cs = pFromU2022State->cs[0]; 1.1786 + if(IS_JP_DBCS(cs)) { 1.1787 + /* switch from a DBCS charset to JISX201 */ 1.1788 + cs = (int8_t)JISX201; 1.1789 + } 1.1790 + /* else stay in the current G0 charset */ 1.1791 + g = 0; 1.1792 + } 1.1793 + /* else do not use HWKANA_7BIT with other versions */ 1.1794 + } 1.1795 + break; 1.1796 + case JISX201: 1.1797 + /* G0 SBCS */ 1.1798 + value = jisx201FromU(sourceChar); 1.1799 + if(value <= 0x7f) { 1.1800 + targetValue = value; 1.1801 + len = 1; 1.1802 + cs = cs0; 1.1803 + g = 0; 1.1804 + useFallback = FALSE; 1.1805 + } 1.1806 + break; 1.1807 + case JISX208: 1.1808 + /* G0 DBCS from Shift-JIS table */ 1.1809 + len2 = MBCS_FROM_UCHAR32_ISO2022( 1.1810 + converterData->myConverterArray[cs0], 1.1811 + sourceChar, &value, 1.1812 + useFallback, MBCS_OUTPUT_2); 1.1813 + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1.1814 + value = _2022FromSJIS(value); 1.1815 + if(value != 0) { 1.1816 + targetValue = value; 1.1817 + len = len2; 1.1818 + cs = cs0; 1.1819 + g = 0; 1.1820 + useFallback = FALSE; 1.1821 + } 1.1822 + } else if(len == 0 && useFallback && 1.1823 + (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1.1824 + targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1.1825 + len = -2; 1.1826 + cs = cs0; 1.1827 + g = 0; 1.1828 + useFallback = FALSE; 1.1829 + } 1.1830 + break; 1.1831 + case ISO8859_7: 1.1832 + /* G0 SBCS forced to 7-bit output */ 1.1833 + len2 = MBCS_SINGLE_FROM_UCHAR32( 1.1834 + converterData->myConverterArray[cs0], 1.1835 + sourceChar, &value, 1.1836 + useFallback); 1.1837 + if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1.1838 + targetValue = value - 0x80; 1.1839 + len = len2; 1.1840 + cs = cs0; 1.1841 + g = 2; 1.1842 + useFallback = FALSE; 1.1843 + } 1.1844 + break; 1.1845 + default: 1.1846 + /* G0 DBCS */ 1.1847 
+ len2 = MBCS_FROM_UCHAR32_ISO2022( 1.1848 + converterData->myConverterArray[cs0], 1.1849 + sourceChar, &value, 1.1850 + useFallback, MBCS_OUTPUT_2); 1.1851 + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1.1852 + if(cs0 == KSC5601) { 1.1853 + /* 1.1854 + * Check for valid bytes for the encoding scheme. 1.1855 + * This is necessary because the sub-converter (windows-949) 1.1856 + * has a broader encoding scheme than is valid for 2022. 1.1857 + */ 1.1858 + value = _2022FromGR94DBCS(value); 1.1859 + if(value == 0) { 1.1860 + break; 1.1861 + } 1.1862 + } 1.1863 + targetValue = value; 1.1864 + len = len2; 1.1865 + cs = cs0; 1.1866 + g = 0; 1.1867 + useFallback = FALSE; 1.1868 + } 1.1869 + break; 1.1870 + } 1.1871 + } 1.1872 + 1.1873 + if(len != 0) { 1.1874 + if(len < 0) { 1.1875 + len = -len; /* fallback */ 1.1876 + } 1.1877 + outLen = 0; /* count output bytes */ 1.1878 + 1.1879 + /* write SI if necessary (only for JIS7) */ 1.1880 + if(pFromU2022State->g == 1 && g == 0) { 1.1881 + buffer[outLen++] = UCNV_SI; 1.1882 + pFromU2022State->g = 0; 1.1883 + } 1.1884 + 1.1885 + /* write the designation sequence if necessary */ 1.1886 + if(cs != pFromU2022State->cs[g]) { 1.1887 + int32_t escLen = escSeqCharsLen[cs]; 1.1888 + uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1.1889 + outLen += escLen; 1.1890 + pFromU2022State->cs[g] = cs; 1.1891 + 1.1892 + /* invalidate the choices[] */ 1.1893 + choiceCount = 0; 1.1894 + } 1.1895 + 1.1896 + /* write the shift sequence if necessary */ 1.1897 + if(g != pFromU2022State->g) { 1.1898 + switch(g) { 1.1899 + /* case 0 handled before writing escapes */ 1.1900 + case 1: 1.1901 + buffer[outLen++] = UCNV_SO; 1.1902 + pFromU2022State->g = 1; 1.1903 + break; 1.1904 + default: /* case 2 */ 1.1905 + buffer[outLen++] = 0x1b; 1.1906 + buffer[outLen++] = 0x4e; 1.1907 + break; 1.1908 + /* no case 3: no SS3 in ISO-2022-JP-x */ 1.1909 + } 1.1910 + } 1.1911 + 1.1912 + /* write the output bytes */ 1.1913 + if(len 
== 1) { 1.1914 + buffer[outLen++] = (char)targetValue; 1.1915 + } else /* len == 2 */ { 1.1916 + buffer[outLen++] = (char)(targetValue >> 8); 1.1917 + buffer[outLen++] = (char)targetValue; 1.1918 + } 1.1919 + } else { 1.1920 + /* 1.1921 + * if we cannot find the character after checking all codepages 1.1922 + * then this is an error 1.1923 + */ 1.1924 + *err = U_INVALID_CHAR_FOUND; 1.1925 + cnv->fromUChar32=sourceChar; 1.1926 + break; 1.1927 + } 1.1928 + 1.1929 + if(sourceChar == CR || sourceChar == LF) { 1.1930 + /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1.1931 + pFromU2022State->cs[2] = 0; 1.1932 + choiceCount = 0; 1.1933 + } 1.1934 + 1.1935 + /* output outLen>0 bytes in buffer[] */ 1.1936 + if(outLen == 1) { 1.1937 + *target++ = buffer[0]; 1.1938 + if(offsets) { 1.1939 + *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1.1940 + } 1.1941 + } else if(outLen == 2 && (target + 2) <= targetLimit) { 1.1942 + *target++ = buffer[0]; 1.1943 + *target++ = buffer[1]; 1.1944 + if(offsets) { 1.1945 + int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1.1946 + *offsets++ = sourceIndex; 1.1947 + *offsets++ = sourceIndex; 1.1948 + } 1.1949 + } else { /* the two branches above write one or two bytes directly; every other case is handed to fromUWriteUInt8() together with targetLimit and err */ 1.1950 + fromUWriteUInt8( 1.1951 + cnv, 1.1952 + buffer, outLen, 1.1953 + &target, (const char *)targetLimit, 1.1954 + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1.1955 + err); 1.1956 + if(U_FAILURE(*err)) { 1.1957 + break; 1.1958 + } 1.1959 + } 1.1960 + } /* end if(myTargetIndex<myTargetLength) */ 1.1961 + else{ 1.1962 + *err =U_BUFFER_OVERFLOW_ERROR; 1.1963 + break; 1.1964 + } 1.1965 + 1.1966 + }/* end while(mySourceIndex<mySourceLength) */ 1.1967 + 1.1968 + /* 1.1969 + * the end of the input stream and detection of truncated input 1.1970 + * are handled by the framework, but for ISO-2022-JP conversion 1.1971 + * we need to be in ASCII mode at the very end 1.1972 + * 1.1973 + * conditions: 
1.1974 + * successful 1.1975 + * in SO mode or not in ASCII mode 1.1976 + * end of input and no truncated input 1.1977 + */ 1.1978 + if( U_SUCCESS(*err) && 1.1979 + (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 1.1980 + args->flush && source>=sourceLimit && cnv->fromUChar32==0 1.1981 + ) { 1.1982 + int32_t sourceIndex; 1.1983 + 1.1984 + outLen = 0; 1.1985 + 1.1986 + if(pFromU2022State->g != 0) { 1.1987 + buffer[outLen++] = UCNV_SI; 1.1988 + pFromU2022State->g = 0; 1.1989 + } 1.1990 + 1.1991 + if(pFromU2022State->cs[0] != ASCII) { 1.1992 + int32_t escLen = escSeqCharsLen[ASCII]; 1.1993 + uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); 1.1994 + outLen += escLen; 1.1995 + pFromU2022State->cs[0] = (int8_t)ASCII; 1.1996 + } 1.1997 + 1.1998 + /* get the source index of the last input character */ 1.1999 + /* 1.2000 + * TODO this would be simpler and more reliable if we used a pair 1.2001 + * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 1.2002 + * so that we could simply use the prevSourceIndex here; 1.2003 + * this code gives an incorrect result for the rare case of an unmatched 1.2004 + * trail surrogate that is alone in the last buffer of the text stream 1.2005 + */ 1.2006 + sourceIndex=(int32_t)(source-args->source); 1.2007 + if(sourceIndex>0) { 1.2008 + --sourceIndex; 1.2009 + if( U16_IS_TRAIL(args->source[sourceIndex]) && 1.2010 + (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 1.2011 + ) { 1.2012 + --sourceIndex; 1.2013 + } 1.2014 + } else { 1.2015 + sourceIndex=-1; 1.2016 + } 1.2017 + 1.2018 + fromUWriteUInt8( 1.2019 + cnv, 1.2020 + buffer, outLen, 1.2021 + &target, (const char *)targetLimit, 1.2022 + &offsets, sourceIndex, 1.2023 + err); 1.2024 + } 1.2025 + 1.2026 + /*save the state and return */ 1.2027 + args->source = source; 1.2028 + args->target = (char*)target; 1.2029 +} 1.2030 + 1.2031 +/*************** to unicode *******************/ 1.2032 + 1.2033 +static void 1.2034 
+UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 1.2035 + UErrorCode* err){ /* Converts ISO-2022-JP-family bytes from args->source into UTF-16 at args->target, tracking designations and shifts in myData->toU2022State. On entry it resumes either a partial escape sequence (myData->key != 0) or a pending double-byte lead byte (toULength == 1) left by the previous call; on exit args->source and args->target are advanced to the positions reached. */ 1.2036 + char tempBuf[2]; 1.2037 + const char *mySource = (char *) args->source; 1.2038 + UChar *myTarget = args->target; 1.2039 + const char *mySourceLimit = args->sourceLimit; 1.2040 + uint32_t targetUniChar = 0x0000; 1.2041 + uint32_t mySourceChar = 0x0000; 1.2042 + uint32_t tmpSourceChar = 0x0000; 1.2043 + UConverterDataISO2022* myData; 1.2044 + ISO2022State *pToU2022State; 1.2045 + StateEnum cs; 1.2046 + 1.2047 + myData=(UConverterDataISO2022*)(args->converter->extraInfo); 1.2048 + pToU2022State = &myData->toU2022State; 1.2049 + 1.2050 + if(myData->key != 0) { 1.2051 + /* continue with a partial escape sequence */ 1.2052 + goto escape; 1.2053 + } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 1.2054 + /* continue with a partial double-byte character */ 1.2055 + mySourceChar = args->converter->toUBytes[0]; 1.2056 + args->converter->toULength = 0; 1.2057 + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 1.2058 + targetUniChar = missingCharMarker; 1.2059 + goto getTrailByte; 1.2060 + } 1.2061 + 1.2062 + while(mySource < mySourceLimit){ 1.2063 + 1.2064 + targetUniChar =missingCharMarker; 1.2065 + 1.2066 + if(myTarget < args->targetLimit){ 1.2067 + 1.2068 + mySourceChar= (unsigned char) *mySource++; 1.2069 + 1.2070 + switch(mySourceChar) { 1.2071 + case UCNV_SI: 1.2072 + if(myData->version==3) { 1.2073 + pToU2022State->g=0; 1.2074 + continue; 1.2075 + } else { 1.2076 + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 1.2077 + myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 1.2078 + break; 1.2079 + } 1.2080 + 1.2081 + case UCNV_SO: 1.2082 + if(myData->version==3) { 1.2083 + /* JIS7: switch to G1 half-width Katakana */ 1.2084 + pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; 1.2085 + pToU2022State->g=1; 1.2086 + continue; 1.2087 + } else { 1.2088 + /* only JIS7 uses SI/SO, not 
ISO-2022-JP-x */ 1.2089 + myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 1.2090 + break; 1.2091 + } 1.2092 + 1.2093 + case ESC_2022: 1.2094 + mySource--; 1.2095 +escape: 1.2096 + { 1.2097 + const char * mySourceBefore = mySource; 1.2098 + int8_t toULengthBefore = args->converter->toULength; 1.2099 + 1.2100 + changeState_2022(args->converter,&(mySource), 1.2101 + mySourceLimit, ISO_2022_JP,err); 1.2102 + 1.2103 + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ 1.2104 + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 1.2105 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; 1.2106 + args->converter->toUCallbackReason = UCNV_IRREGULAR; 1.2107 + args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 1.2108 + } 1.2109 + } 1.2110 + 1.2111 + /* invalid or illegal escape sequence */ 1.2112 + if(U_FAILURE(*err)){ 1.2113 + args->target = myTarget; 1.2114 + args->source = mySource; 1.2115 + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 1.2116 + return; 1.2117 + } 1.2118 + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ 1.2119 + if(myData->key==0) { 1.2120 + myData->isEmptySegment = TRUE; 1.2121 + } 1.2122 + continue; 1.2123 + 1.2124 + /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ 1.2125 + 1.2126 + case CR: 1.2127 + /*falls through*/ 1.2128 + case LF: 1.2129 + /* automatically reset to single-byte mode */ 1.2130 + if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { 1.2131 + pToU2022State->cs[0] = (int8_t)ASCII; 1.2132 + } 1.2133 + pToU2022State->cs[2] = 0; 1.2134 + pToU2022State->g = 0; 1.2135 + /* falls through */ 1.2136 + default: 1.2137 + /* convert one or two bytes */ 1.2138 + myData->isEmptySegment = FALSE; 1.2139 + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 1.2140 + if( 
(uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && 1.2141 + !IS_JP_DBCS(cs) 1.2142 + ) { 1.2143 + /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ 1.2144 + targetUniChar = mySourceChar + (HWKANA_START - 0xa1); 1.2145 + 1.2146 + /* return from a single-shift state to the previous one */ 1.2147 + if(pToU2022State->g >= 2) { 1.2148 + pToU2022State->g=pToU2022State->prevG; 1.2149 + } 1.2150 + } else switch(cs) { 1.2151 + case ASCII: 1.2152 + if(mySourceChar <= 0x7f) { 1.2153 + targetUniChar = mySourceChar; 1.2154 + } 1.2155 + break; 1.2156 + case ISO8859_1: 1.2157 + if(mySourceChar <= 0x7f) { 1.2158 + targetUniChar = mySourceChar + 0x80; 1.2159 + } 1.2160 + /* return from a single-shift state to the previous one */ 1.2161 + pToU2022State->g=pToU2022State->prevG; 1.2162 + break; 1.2163 + case ISO8859_7: 1.2164 + if(mySourceChar <= 0x7f) { 1.2165 + /* convert mySourceChar+0x80 to use a normal 8-bit table */ 1.2166 + targetUniChar = 1.2167 + _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( 1.2168 + myData->myConverterArray[cs], 1.2169 + mySourceChar + 0x80); 1.2170 + } 1.2171 + /* return from a single-shift state to the previous one */ 1.2172 + pToU2022State->g=pToU2022State->prevG; 1.2173 + break; 1.2174 + case JISX201: 1.2175 + if(mySourceChar <= 0x7f) { 1.2176 + targetUniChar = jisx201ToU(mySourceChar); 1.2177 + } 1.2178 + break; 1.2179 + case HWKANA_7BIT: 1.2180 + if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { 1.2181 + /* 7-bit halfwidth Katakana */ 1.2182 + targetUniChar = mySourceChar + (HWKANA_START - 0x21); 1.2183 + } 1.2184 + break; 1.2185 + default: 1.2186 + /* G0 DBCS */ 1.2187 + if(mySource < mySourceLimit) { 1.2188 + int leadIsOk, trailIsOk; 1.2189 + uint8_t trailByte; 1.2190 +getTrailByte: 1.2191 + trailByte = (uint8_t)*mySource; 1.2192 + /* 1.2193 + * Ticket 5691: consistent illegal sequences: 1.2194 + * - We include at least the first byte in the illegal sequence. 
1.2195 + * - If any of the non-initial bytes could be the start of a character, 1.2196 + * we stop the illegal sequence before the first one of those. 1.2197 + * 1.2198 + * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 1.2199 + * an ESC/SO/SI, we report only the first byte as the illegal sequence. 1.2200 + * Otherwise we convert or report the pair of bytes. 1.2201 + */ 1.2202 + leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 1.2203 + trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 1.2204 + if (leadIsOk && trailIsOk) { 1.2205 + ++mySource; 1.2206 + tmpSourceChar = (mySourceChar << 8) | trailByte; 1.2207 + if(cs == JISX208) { 1.2208 + _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); 1.2209 + mySourceChar = tmpSourceChar; 1.2210 + } else { 1.2211 + /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ 1.2212 + mySourceChar = tmpSourceChar; 1.2213 + if (cs == KSC5601) { 1.2214 + tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ 1.2215 + } 1.2216 + tempBuf[0] = (char)(tmpSourceChar >> 8); 1.2217 + tempBuf[1] = (char)(tmpSourceChar); 1.2218 + } 1.2219 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); 1.2220 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1.2221 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1.2222 + ++mySource; 1.2223 + /* add another bit so that the code below writes 2 bytes in case of error */ 1.2224 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 1.2225 + } 1.2226 + } else { 1.2227 + args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1.2228 + args->converter->toULength = 1; 1.2229 + goto endloop; 1.2230 + } 1.2231 + } /* End of inner switch */ 1.2232 + break; 1.2233 + } /* End of outer switch */ 1.2234 + if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 1.2235 + if(args->offsets){ 1.2236 + /* offset of the first source byte of this character: step back 1 when mySourceChar holds a single byte (<= 0xff), otherwise 2 */ args->offsets[myTarget - args->target] = (int32_t)(mySource 
- args->source - (mySourceChar <= 0xff ? 1 : 2)); 1.2237 + } 1.2238 + *(myTarget++)=(UChar)targetUniChar; 1.2239 + } 1.2240 + else if(targetUniChar > missingCharMarker){ 1.2241 + /* disassemble the surrogate pair and write to output*/ 1.2242 + targetUniChar-=0x0010000; 1.2243 + *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 1.2244 + if(args->offsets){ 1.2245 + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 1.2246 + } 1.2247 + ++myTarget; 1.2248 + if(myTarget< args->targetLimit){ 1.2249 + *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 1.2250 + if(args->offsets){ 1.2251 + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 1.2252 + } 1.2253 + ++myTarget; 1.2254 + }else{ 1.2255 + args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 1.2256 + (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 1.2257 + } 1.2258 + 1.2259 + } 1.2260 + else{ 1.2261 + /* Call the callback function*/ 1.2262 + toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 1.2263 + break; 1.2264 + } 1.2265 + } 1.2266 + else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ 1.2267 + *err =U_BUFFER_OVERFLOW_ERROR; 1.2268 + break; 1.2269 + } 1.2270 + } 1.2271 +endloop: 1.2272 + args->target = myTarget; 1.2273 + args->source = mySource; 1.2274 +} 1.2275 + 1.2276 + 1.2277 +/*************************************************************** 1.2278 +* Rules for ISO-2022-KR encoding 1.2279 +* i) The KSC5601 designator sequence should appear only once in a file, 1.2280 +* at the beginning of a line before any KSC5601 characters. 
This usually 1.2281 +* means that it appears by itself on the first line of the file 1.2282 +* ii) There are only 2 shifting sequences SO to shift into double byte mode 1.2283 +* and SI to shift into single byte mode 1.2284 +*/ /* UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM: runs ucnv_MBCSFromUnicodeWithOffsets() with myConverterData->currentConverter temporarily installed as args->converter. The pending fromUChar32 is copied into the sub-converter before the call and copied back afterwards; on U_BUFFER_OVERFLOW_ERROR the sub-converter's charErrorBuffer contents and length are moved to the public converter. args->converter is restored before returning. */ 1.2285 +static void 1.2286 +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ 1.2287 + 1.2288 + UConverter* saveConv = args->converter; 1.2289 + UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; 1.2290 + args->converter=myConverterData->currentConverter; 1.2291 + 1.2292 + myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; 1.2293 + ucnv_MBCSFromUnicodeWithOffsets(args,err); 1.2294 + saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 1.2295 + 1.2296 + if(*err == U_BUFFER_OVERFLOW_ERROR) { 1.2297 + if(myConverterData->currentConverter->charErrorBufferLength > 0) { 1.2298 + uprv_memcpy( 1.2299 + saveConv->charErrorBuffer, 1.2300 + myConverterData->currentConverter->charErrorBuffer, 1.2301 + myConverterData->currentConverter->charErrorBufferLength); 1.2302 + } 1.2303 + saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 1.2304 + myConverterData->currentConverter->charErrorBufferLength = 0; 1.2305 + } 1.2306 + args->converter=saveConv; 1.2307 +} 1.2308 + 1.2309 +static void 1.2310 +UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 1.2311 + 1.2312 + const UChar *source = args->source; 1.2313 + const UChar *sourceLimit = args->sourceLimit; 1.2314 + unsigned char *target = (unsigned char *) args->target; 1.2315 + unsigned char *targetLimit = (unsigned char *) args->targetLimit; 1.2316 + int32_t* offsets = args->offsets; 1.2317 + uint32_t targetByteUnit = 0x0000; 1.2318 + UChar32 sourceChar = 0x0000; 1.2319 + UBool isTargetByteDBCS; 1.2320 + UBool oldIsTargetByteDBCS; 1.2321 + UConverterDataISO2022 *converterData; 1.2322 + 
UConverterSharedData* sharedData; 1.2323 + UBool useFallback; 1.2324 + int32_t length =0; /* Summary of the conversion below: each code unit is mapped with MBCS_FROM_UCHAR32_ISO2022(); single bytes <= 0x7f are written unchanged, double bytes in the A1A1..FEFE range (low byte A1..FE) are written with 0x80 subtracted from each byte, and SO / SI is written whenever the output switches between double-byte and single-byte mode. The current mode is read from and saved to converter->fromUnicodeStatus. */ 1.2325 + 1.2326 + converterData=(UConverterDataISO2022*)args->converter->extraInfo; 1.2327 + /* if the version is 1 then the user is requesting 1.2328 + * conversion with ibm-25546 pass the arguments to 1.2329 + * MBCS converter and return 1.2330 + */ 1.2331 + if(converterData->version==1){ 1.2332 + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 1.2333 + return; 1.2334 + } 1.2335 + 1.2336 + /* initialize data */ 1.2337 + sharedData = converterData->currentConverter->sharedData; 1.2338 + useFallback = args->converter->useFallback; 1.2339 + isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 1.2340 + oldIsTargetByteDBCS = isTargetByteDBCS; 1.2341 + 1.2342 + isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; /* NOTE(review): repeats the assignment made a few lines above; the value is unchanged */ 1.2343 + if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 1.2344 + goto getTrail; 1.2345 + } 1.2346 + while(source < sourceLimit){ 1.2347 + 1.2348 + targetByteUnit = missingCharMarker; 1.2349 + 1.2350 + if(target < (unsigned char*) args->targetLimit){ 1.2351 + sourceChar = *source++; 1.2352 + 1.2353 + /* do not convert SO/SI/ESC */ 1.2354 + if(IS_2022_CONTROL(sourceChar)) { 1.2355 + /* callback(illegal) */ 1.2356 + *err=U_ILLEGAL_CHAR_FOUND; 1.2357 + args->converter->fromUChar32=sourceChar; 1.2358 + break; 1.2359 + } 1.2360 + 1.2361 + length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 1.2362 + if(length < 0) { 1.2363 + length = -length; /* fallback */ 1.2364 + } 1.2365 + /* only DBCS or SBCS characters are expected*/ 1.2366 + /* DB characters with high bit set to 1 are expected */ 1.2367 + if( length > 2 || length==0 || 1.2368 + (length == 1 && targetByteUnit > 0x7f) || 1.2369 + (length == 2 && 1.2370 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 1.2371 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 1.2372 + ) { 1.2373 + 
targetByteUnit=missingCharMarker; 1.2374 + } 1.2375 + if (targetByteUnit != missingCharMarker){ 1.2376 + 1.2377 + oldIsTargetByteDBCS = isTargetByteDBCS; 1.2378 + isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 1.2379 + /* append the shift sequence */ 1.2380 + if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 1.2381 + 1.2382 + if (isTargetByteDBCS) 1.2383 + *target++ = UCNV_SO; 1.2384 + else 1.2385 + *target++ = UCNV_SI; 1.2386 + if(offsets) 1.2387 + *(offsets++) = (int32_t)(source - args->source-1); 1.2388 + } 1.2389 + /* write the targetUniChar to target */ 1.2390 + if(targetByteUnit <= 0x00FF){ 1.2391 + if( target < targetLimit){ 1.2392 + *(target++) = (unsigned char) targetByteUnit; 1.2393 + if(offsets){ 1.2394 + *(offsets++) = (int32_t)(source - args->source-1); 1.2395 + } 1.2396 + 1.2397 + }else{ 1.2398 + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 1.2399 + *err = U_BUFFER_OVERFLOW_ERROR; 1.2400 + } 1.2401 + }else{ 1.2402 + if(target < targetLimit){ 1.2403 + *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 1.2404 + if(offsets){ 1.2405 + *(offsets++) = (int32_t)(source - args->source-1); 1.2406 + } 1.2407 + if(target < targetLimit){ 1.2408 + *(target++) =(unsigned char) (targetByteUnit -0x80); 1.2409 + if(offsets){ 1.2410 + *(offsets++) = (int32_t)(source - args->source-1); 1.2411 + } 1.2412 + }else{ 1.2413 + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 1.2414 + *err = U_BUFFER_OVERFLOW_ERROR; 1.2415 + } 1.2416 + }else{ 1.2417 + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 1.2418 + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 1.2419 + *err = U_BUFFER_OVERFLOW_ERROR; 1.2420 + } 1.2421 + } 1.2422 + 1.2423 + } 1.2424 + else{ 1.2425 + /* oops.. 
the code point is unassigned 1.2426 + * set the error and reason 1.2427 + */ 1.2428 + 1.2429 + /*check if the char is a First surrogate*/ 1.2430 + if(U16_IS_SURROGATE(sourceChar)) { 1.2431 + if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1.2432 +getTrail: 1.2433 + /*look ahead to find the trail surrogate*/ 1.2434 + if(source < sourceLimit) { 1.2435 + /* test the following code unit */ 1.2436 + UChar trail=(UChar) *source; 1.2437 + if(U16_IS_TRAIL(trail)) { 1.2438 + source++; 1.2439 + sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1.2440 + *err = U_INVALID_CHAR_FOUND; 1.2441 + /* convert this surrogate code point */ 1.2442 + /* exit this condition tree */ 1.2443 + } else { 1.2444 + /* this is an unmatched lead code unit (1st surrogate) */ 1.2445 + /* callback(illegal) */ 1.2446 + *err=U_ILLEGAL_CHAR_FOUND; 1.2447 + } 1.2448 + } else { 1.2449 + /* no more input */ 1.2450 + *err = U_ZERO_ERROR; 1.2451 + } 1.2452 + } else { 1.2453 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.2454 + /* callback(illegal) */ 1.2455 + *err=U_ILLEGAL_CHAR_FOUND; 1.2456 + } 1.2457 + } else { 1.2458 + /* callback(unassigned) for a BMP code point */ 1.2459 + *err = U_INVALID_CHAR_FOUND; 1.2460 + } 1.2461 + 1.2462 + args->converter->fromUChar32=sourceChar; 1.2463 + break; 1.2464 + } 1.2465 + } /* end if(myTargetIndex<myTargetLength) */ 1.2466 + else{ 1.2467 + *err =U_BUFFER_OVERFLOW_ERROR; 1.2468 + break; 1.2469 + } 1.2470 + 1.2471 + }/* end while(mySourceIndex<mySourceLength) */ 1.2472 + 1.2473 + /* 1.2474 + * the end of the input stream and detection of truncated input 1.2475 + * are handled by the framework, but for ISO-2022-KR conversion 1.2476 + * we need to be in ASCII mode at the very end 1.2477 + * 1.2478 + * conditions: 1.2479 + * successful 1.2480 + * not in ASCII mode 1.2481 + * end of input and no truncated input 1.2482 + */ 1.2483 + if( U_SUCCESS(*err) && 1.2484 + isTargetByteDBCS && 1.2485 + args->flush && source>=sourceLimit && args->converter->fromUChar32==0 
1.2486 + ) { 1.2487 + int32_t sourceIndex; 1.2488 + 1.2489 + /* we are switching to ASCII */ 1.2490 + isTargetByteDBCS=FALSE; 1.2491 + 1.2492 + /* get the source index of the last input character */ 1.2493 + /* 1.2494 + * TODO this would be simpler and more reliable if we used a pair 1.2495 + * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 1.2496 + * so that we could simply use the prevSourceIndex here; 1.2497 + * this code gives an incorrect result for the rare case of an unmatched 1.2498 + * trail surrogate that is alone in the last buffer of the text stream 1.2499 + */ 1.2500 + sourceIndex=(int32_t)(source-args->source); 1.2501 + if(sourceIndex>0) { 1.2502 + --sourceIndex; 1.2503 + if( U16_IS_TRAIL(args->source[sourceIndex]) && 1.2504 + (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 1.2505 + ) { 1.2506 + --sourceIndex; 1.2507 + } 1.2508 + } else { 1.2509 + sourceIndex=-1; 1.2510 + } 1.2511 + 1.2512 + fromUWriteUInt8( 1.2513 + args->converter, 1.2514 + SHIFT_IN_STR, 1, 1.2515 + &target, (const char *)targetLimit, 1.2516 + &offsets, sourceIndex, 1.2517 + err); 1.2518 + } 1.2519 + 1.2520 + /*save the state and return */ 1.2521 + args->source = source; 1.2522 + args->target = (char*)target; 1.2523 + args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 1.2524 +} 1.2525 + 1.2526 +/************************ To Unicode ***************************************/ 1.2527 + 1.2528 +static void 1.2529 +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 1.2530 + UErrorCode* err){ 1.2531 + char const* sourceStart; 1.2532 + UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 1.2533 + 1.2534 + UConverterToUnicodeArgs subArgs; 1.2535 + int32_t minArgsSize; 1.2536 + 1.2537 + /* set up the subconverter arguments */ 1.2538 + if(args->size<sizeof(UConverterToUnicodeArgs)) { 1.2539 + minArgsSize = args->size; 1.2540 + } else { 1.2541 + minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 
1.2542 + } 1.2543 + 1.2544 + uprv_memcpy(&subArgs, args, minArgsSize); 1.2545 + subArgs.size = (uint16_t)minArgsSize; 1.2546 + subArgs.converter = myData->currentConverter; /* subArgs is a copy of args that targets the sub-converter. The loop below converts each run of bytes up to the limit returned by getEndOfBuffer_2022() with ucnv_MBCSToUnicodeWithOffsets(), copies source/target/offsets and the partial-input and overflow buffers back to the public converter, and then calls changeState_2022() for the escape sequence that follows. */ 1.2547 + 1.2548 + /* remember the original start of the input for offsets */ 1.2549 + sourceStart = args->source; 1.2550 + 1.2551 + if(myData->key != 0) { 1.2552 + /* continue with a partial escape sequence */ 1.2553 + goto escape; 1.2554 + } 1.2555 + 1.2556 + while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 1.2557 + /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1.2558 + subArgs.source = args->source; 1.2559 + subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 1.2560 + if(subArgs.source != subArgs.sourceLimit) { 1.2561 + /* 1.2562 + * get the current partial byte sequence 1.2563 + * 1.2564 + * it needs to be moved between the public and the subconverter 1.2565 + * so that the conversion framework, which only sees the public 1.2566 + * converter, can handle truncated and illegal input etc. 1.2567 + */ 1.2568 + if(args->converter->toULength > 0) { 1.2569 + uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 1.2570 + } 1.2571 + subArgs.converter->toULength = args->converter->toULength; 1.2572 + 1.2573 + /* 1.2574 + * Convert up to the end of the input, or to before the next escape character. 1.2575 + * Does not handle conversion extensions because the preToU[] state etc. 1.2576 + * is not copied. 
1.2577 + */ 1.2578 + ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 1.2579 + 1.2580 + if(args->offsets != NULL && sourceStart != args->source) { 1.2581 + /* update offsets to base them on the actual start of the input */ 1.2582 + int32_t *offsets = args->offsets; 1.2583 + UChar *target = args->target; 1.2584 + int32_t delta = (int32_t)(args->source - sourceStart); 1.2585 + while(target < subArgs.target) { 1.2586 + if(*offsets >= 0) { 1.2587 + *offsets += delta; 1.2588 + } 1.2589 + ++offsets; 1.2590 + ++target; 1.2591 + } 1.2592 + } 1.2593 + args->source = subArgs.source; 1.2594 + args->target = subArgs.target; 1.2595 + args->offsets = subArgs.offsets; 1.2596 + 1.2597 + /* copy input/error/overflow buffers */ 1.2598 + if(subArgs.converter->toULength > 0) { 1.2599 + uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 1.2600 + } 1.2601 + args->converter->toULength = subArgs.converter->toULength; 1.2602 + 1.2603 + if(*err == U_BUFFER_OVERFLOW_ERROR) { 1.2604 + if(subArgs.converter->UCharErrorBufferLength > 0) { 1.2605 + uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 1.2606 + subArgs.converter->UCharErrorBufferLength); 1.2607 + } 1.2608 + args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 1.2609 + subArgs.converter->UCharErrorBufferLength = 0; 1.2610 + } 1.2611 + } 1.2612 + 1.2613 + if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 1.2614 + return; 1.2615 + } 1.2616 + 1.2617 +escape: 1.2618 + changeState_2022(args->converter, 1.2619 + &(args->source), 1.2620 + args->sourceLimit, 1.2621 + ISO_2022_KR, 1.2622 + err); 1.2623 + } 1.2624 +} 1.2625 + 1.2626 +static void 1.2627 +UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 1.2628 + UErrorCode* err){ 1.2629 + char tempBuf[2]; 1.2630 + const char *mySource = ( char *) args->source; 1.2631 + UChar *myTarget = args->target; 1.2632 + const char *mySourceLimit = 
args->sourceLimit; 1.2633 + UChar32 targetUniChar = 0x0000; 1.2634 + UChar mySourceChar = 0x0000; 1.2635 + UConverterDataISO2022* myData; 1.2636 + UConverterSharedData* sharedData ; 1.2637 + UBool useFallback; 1.2638 + 1.2639 + myData=(UConverterDataISO2022*)(args->converter->extraInfo); 1.2640 + if(myData->version==1){ 1.2641 + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 1.2642 + return; 1.2643 + } 1.2644 + 1.2645 + /* initialize state */ 1.2646 + sharedData = myData->currentConverter->sharedData; 1.2647 + useFallback = args->converter->useFallback; 1.2648 + 1.2649 + if(myData->key != 0) { 1.2650 + /* continue with a partial escape sequence */ 1.2651 + goto escape; 1.2652 + } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 1.2653 + /* continue with a partial double-byte character */ 1.2654 + mySourceChar = args->converter->toUBytes[0]; 1.2655 + args->converter->toULength = 0; 1.2656 + goto getTrailByte; 1.2657 + } 1.2658 + 1.2659 + while(mySource< mySourceLimit){ 1.2660 + 1.2661 + if(myTarget < args->targetLimit){ 1.2662 + 1.2663 + mySourceChar= (unsigned char) *mySource++; 1.2664 + 1.2665 + if(mySourceChar==UCNV_SI){ 1.2666 + myData->toU2022State.g = 0; 1.2667 + if (myData->isEmptySegment) { 1.2668 + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 1.2669 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; 1.2670 + args->converter->toUCallbackReason = UCNV_IRREGULAR; 1.2671 + args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1.2672 + args->converter->toULength = 1; 1.2673 + args->target = myTarget; 1.2674 + args->source = mySource; 1.2675 + return; 1.2676 + } 1.2677 + /*consume the source */ 1.2678 + continue; 1.2679 + }else if(mySourceChar==UCNV_SO){ 1.2680 + myData->toU2022State.g = 1; 1.2681 + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 1.2682 + /*consume the source */ 1.2683 + continue; 1.2684 + }else 
if(mySourceChar==ESC_2022){ 1.2685 + mySource--; 1.2686 +escape: 1.2687 + myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 1.2688 + changeState_2022(args->converter,&(mySource), 1.2689 + mySourceLimit, ISO_2022_KR, err); 1.2690 + if(U_FAILURE(*err)){ 1.2691 + args->target = myTarget; 1.2692 + args->source = mySource; 1.2693 + return; 1.2694 + } 1.2695 + continue; 1.2696 + } 1.2697 + 1.2698 + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 1.2699 + if(myData->toU2022State.g == 1) { 1.2700 + if(mySource < mySourceLimit) { 1.2701 + int leadIsOk, trailIsOk; 1.2702 + uint8_t trailByte; 1.2703 +getTrailByte: 1.2704 + targetUniChar = missingCharMarker; 1.2705 + trailByte = (uint8_t)*mySource; 1.2706 + /* 1.2707 + * Ticket 5691: consistent illegal sequences: 1.2708 + * - We include at least the first byte in the illegal sequence. 1.2709 + * - If any of the non-initial bytes could be the start of a character, 1.2710 + * we stop the illegal sequence before the first one of those. 1.2711 + * 1.2712 + * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 1.2713 + * an ESC/SO/SI, we report only the first byte as the illegal sequence. 1.2714 + * Otherwise we convert or report the pair of bytes. 
1.2715 + */ 1.2716 + leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 1.2717 + trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 1.2718 + if (leadIsOk && trailIsOk) { 1.2719 + ++mySource; 1.2720 + tempBuf[0] = (char)(mySourceChar + 0x80); 1.2721 + tempBuf[1] = (char)(trailByte + 0x80); 1.2722 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 1.2723 + mySourceChar = (mySourceChar << 8) | trailByte; 1.2724 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1.2725 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1.2726 + ++mySource; 1.2727 + /* add another bit so that the code below writes 2 bytes in case of error */ 1.2728 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 1.2729 + } 1.2730 + } else { 1.2731 + args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1.2732 + args->converter->toULength = 1; 1.2733 + break; 1.2734 + } 1.2735 + } 1.2736 + else if(mySourceChar <= 0x7f) { 1.2737 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 1.2738 + } else { 1.2739 + targetUniChar = 0xffff; 1.2740 + } 1.2741 + if(targetUniChar < 0xfffe){ 1.2742 + if(args->offsets) { 1.2743 + args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 1.2744 + } 1.2745 + *(myTarget++)=(UChar)targetUniChar; 1.2746 + } 1.2747 + else { 1.2748 + /* Call the callback function*/ 1.2749 + toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 1.2750 + break; 1.2751 + } 1.2752 + } 1.2753 + else{ 1.2754 + *err =U_BUFFER_OVERFLOW_ERROR; 1.2755 + break; 1.2756 + } 1.2757 + } 1.2758 + args->target = myTarget; 1.2759 + args->source = mySource; 1.2760 +} 1.2761 + 1.2762 +/*************************** END ISO2022-KR *********************************/ 1.2763 + 1.2764 +/*************************** ISO-2022-CN ********************************* 1.2765 +* 1.2766 +* Rules for ISO-2022-CN Encoding: 1.2767 +* i) The designator sequence must appear once on a line before any instance 1.2768 +* of character set it designates. 1.2769 +* ii) If two lines contain characters from the same character set, both lines 1.2770 +* must include the designator sequence. 1.2771 +* iii) Once the designator sequence is known, a shifting sequence has to be found 1.2772 +* to invoke the shifting 1.2773 +* iv) All lines start in ASCII and end in ASCII. 
* v)   Four shifting sequences are employed for this purpose:
*
*      Sequence    ASCII Eq    Charsets
*      ----------  -------    ---------
*      SI           <SI>       US-ASCII
*      SO           <SO>       CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2          <ESC>N     CNS-11643-1992 Plane 2
*      SS3          <ESC>O     CNS-11643-1992 Planes 3-7
*
* vi)
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
*      SS2designator : ESC "$" "*" finalchar_for_SS2
*      SS3designator : ESC "$" "+" finalchar_for_SS3
*
*      ESC $ ) A       Indicates the bytes following SO are Chinese
*       characters as defined in GB 2312-80, until
*       another SOdesignation appears
*
*
*      ESC $ ) E       Indicates the bytes following SO are as defined
*       in ISO-IR-165 (for details, see section 2.1),
*       until another SOdesignation appears
*
*      ESC $ ) G       Indicates the bytes following SO are as defined
*       in CNS 11643-plane-1, until another
*       SOdesignation appears
*
*      ESC $ * H       Indicates the two bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must precede every 2 byte
*        sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must precede every 2 byte
*        sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must precede every 2 byte
*        sequence.)
1.2821 +* 1.2822 +* ESC $ + K Indicates the immediate two bytes following SS3 1.2823 +* is a Chinese character as defined in CNS 1.2824 +* 11643-plane-5, until another SS3designation 1.2825 +* appears 1.2826 +* 1.2827 +* ESC $ + L Indicates the immediate two bytes following SS3 1.2828 +* is a Chinese character as defined in CNS 1.2829 +* 11643-plane-6, until another SS3designation 1.2830 +* appears 1.2831 +* 1.2832 +* ESC $ + M Indicates the immediate two bytes following SS3 1.2833 +* is a Chinese character as defined in CNS 1.2834 +* 11643-plane-7, until another SS3designation 1.2835 +* appears 1.2836 +* 1.2837 +* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 1.2838 +* has its own designation information before any Chinese characters 1.2839 +* appear 1.2840 +* 1.2841 +*/ 1.2842 + 1.2843 +/* The following are defined this way to make the strings truly readonly */ 1.2844 +static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 1.2845 +static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 1.2846 +static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 1.2847 +static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 1.2848 +static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 1.2849 +static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 1.2850 +static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 1.2851 +static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 1.2852 +static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 1.2853 + 1.2854 +/********************** ISO2022-CN Data **************************/ 1.2855 +static const char* const escSeqCharsCN[10] ={ 1.2856 + SHIFT_IN_STR, /* 0 ASCII */ 1.2857 + GB_2312_80_STR, /* 1 GB2312_1 */ 1.2858 + ISO_IR_165_STR, /* 2 ISO_IR_165 */ 1.2859 + CNS_11643_1992_Plane_1_STR, 1.2860 + CNS_11643_1992_Plane_2_STR, 1.2861 + CNS_11643_1992_Plane_3_STR, 1.2862 + CNS_11643_1992_Plane_4_STR, 1.2863 + 
CNS_11643_1992_Plane_5_STR, 1.2864 + CNS_11643_1992_Plane_6_STR, 1.2865 + CNS_11643_1992_Plane_7_STR 1.2866 +}; 1.2867 + 1.2868 +static void 1.2869 +UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 1.2870 + UConverter *cnv = args->converter; 1.2871 + UConverterDataISO2022 *converterData; 1.2872 + ISO2022State *pFromU2022State; 1.2873 + uint8_t *target = (uint8_t *) args->target; 1.2874 + const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1.2875 + const UChar* source = args->source; 1.2876 + const UChar* sourceLimit = args->sourceLimit; 1.2877 + int32_t* offsets = args->offsets; 1.2878 + UChar32 sourceChar; 1.2879 + char buffer[8]; 1.2880 + int32_t len; 1.2881 + int8_t choices[3]; 1.2882 + int32_t choiceCount; 1.2883 + uint32_t targetValue = 0; 1.2884 + UBool useFallback; 1.2885 + 1.2886 + /* set up the state */ 1.2887 + converterData = (UConverterDataISO2022*)cnv->extraInfo; 1.2888 + pFromU2022State = &converterData->fromU2022State; 1.2889 + 1.2890 + choiceCount = 0; 1.2891 + 1.2892 + /* check if the last codepoint of previous buffer was a lead surrogate*/ 1.2893 + if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 1.2894 + goto getTrail; 1.2895 + } 1.2896 + 1.2897 + while( source < sourceLimit){ 1.2898 + if(target < targetLimit){ 1.2899 + 1.2900 + sourceChar = *(source++); 1.2901 + /*check if the char is a First surrogate*/ 1.2902 + if(U16_IS_SURROGATE(sourceChar)) { 1.2903 + if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1.2904 +getTrail: 1.2905 + /*look ahead to find the trail surrogate*/ 1.2906 + if(source < sourceLimit) { 1.2907 + /* test the following code unit */ 1.2908 + UChar trail=(UChar) *source; 1.2909 + if(U16_IS_TRAIL(trail)) { 1.2910 + source++; 1.2911 + sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1.2912 + cnv->fromUChar32=0x00; 1.2913 + /* convert this supplementary code point */ 1.2914 + /* exit this condition tree */ 1.2915 + } else { 1.2916 + /* this is an unmatched 
lead code unit (1st surrogate) */ 1.2917 + /* callback(illegal) */ 1.2918 + *err=U_ILLEGAL_CHAR_FOUND; 1.2919 + cnv->fromUChar32=sourceChar; 1.2920 + break; 1.2921 + } 1.2922 + } else { 1.2923 + /* no more input */ 1.2924 + cnv->fromUChar32=sourceChar; 1.2925 + break; 1.2926 + } 1.2927 + } else { 1.2928 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.2929 + /* callback(illegal) */ 1.2930 + *err=U_ILLEGAL_CHAR_FOUND; 1.2931 + cnv->fromUChar32=sourceChar; 1.2932 + break; 1.2933 + } 1.2934 + } 1.2935 + 1.2936 + /* do the conversion */ 1.2937 + if(sourceChar <= 0x007f ){ 1.2938 + /* do not convert SO/SI/ESC */ 1.2939 + if(IS_2022_CONTROL(sourceChar)) { 1.2940 + /* callback(illegal) */ 1.2941 + *err=U_ILLEGAL_CHAR_FOUND; 1.2942 + cnv->fromUChar32=sourceChar; 1.2943 + break; 1.2944 + } 1.2945 + 1.2946 + /* US-ASCII */ 1.2947 + if(pFromU2022State->g == 0) { 1.2948 + buffer[0] = (char)sourceChar; 1.2949 + len = 1; 1.2950 + } else { 1.2951 + buffer[0] = UCNV_SI; 1.2952 + buffer[1] = (char)sourceChar; 1.2953 + len = 2; 1.2954 + pFromU2022State->g = 0; 1.2955 + choiceCount = 0; 1.2956 + } 1.2957 + if(sourceChar == CR || sourceChar == LF) { 1.2958 + /* reset the state at the end of a line */ 1.2959 + uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 1.2960 + choiceCount = 0; 1.2961 + } 1.2962 + } 1.2963 + else{ 1.2964 + /* convert U+0080..U+10ffff */ 1.2965 + int32_t i; 1.2966 + int8_t cs, g; 1.2967 + 1.2968 + if(choiceCount == 0) { 1.2969 + /* try the current SO/G1 converter first */ 1.2970 + choices[0] = pFromU2022State->cs[1]; 1.2971 + 1.2972 + /* default to GB2312_1 if none is designated yet */ 1.2973 + if(choices[0] == 0) { 1.2974 + choices[0] = GB2312_1; 1.2975 + } 1.2976 + 1.2977 + if(converterData->version == 0) { 1.2978 + /* ISO-2022-CN */ 1.2979 + 1.2980 + /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 1.2981 + if(choices[0] == GB2312_1) { 1.2982 + choices[1] = (int8_t)CNS_11643_1; 1.2983 + } else { 1.2984 + 
choices[1] = (int8_t)GB2312_1; 1.2985 + } 1.2986 + 1.2987 + choiceCount = 2; 1.2988 + } else if (converterData->version == 1) { 1.2989 + /* ISO-2022-CN-EXT */ 1.2990 + 1.2991 + /* try one of the other converters */ 1.2992 + switch(choices[0]) { 1.2993 + case GB2312_1: 1.2994 + choices[1] = (int8_t)CNS_11643_1; 1.2995 + choices[2] = (int8_t)ISO_IR_165; 1.2996 + break; 1.2997 + case ISO_IR_165: 1.2998 + choices[1] = (int8_t)GB2312_1; 1.2999 + choices[2] = (int8_t)CNS_11643_1; 1.3000 + break; 1.3001 + default: /* CNS_11643_x */ 1.3002 + choices[1] = (int8_t)GB2312_1; 1.3003 + choices[2] = (int8_t)ISO_IR_165; 1.3004 + break; 1.3005 + } 1.3006 + 1.3007 + choiceCount = 3; 1.3008 + } else { 1.3009 + choices[0] = (int8_t)CNS_11643_1; 1.3010 + choices[1] = (int8_t)GB2312_1; 1.3011 + } 1.3012 + } 1.3013 + 1.3014 + cs = g = 0; 1.3015 + /* 1.3016 + * len==0: no mapping found yet 1.3017 + * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1.3018 + * len>0: found a roundtrip result, done 1.3019 + */ 1.3020 + len = 0; 1.3021 + /* 1.3022 + * We will turn off useFallback after finding a fallback, 1.3023 + * but we still get fallbacks from PUA code points as usual. 1.3024 + * Therefore, we will also need to check that we don't overwrite 1.3025 + * an early fallback with a later one. 
1.3026 + */ 1.3027 + useFallback = cnv->useFallback; 1.3028 + 1.3029 + for(i = 0; i < choiceCount && len <= 0; ++i) { 1.3030 + int8_t cs0 = choices[i]; 1.3031 + if(cs0 > 0) { 1.3032 + uint32_t value; 1.3033 + int32_t len2; 1.3034 + if(cs0 >= CNS_11643_0) { 1.3035 + len2 = MBCS_FROM_UCHAR32_ISO2022( 1.3036 + converterData->myConverterArray[CNS_11643], 1.3037 + sourceChar, 1.3038 + &value, 1.3039 + useFallback, 1.3040 + MBCS_OUTPUT_3); 1.3041 + if(len2 == 3 || (len2 == -3 && len == 0)) { 1.3042 + targetValue = value; 1.3043 + cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 1.3044 + if(len2 >= 0) { 1.3045 + len = 2; 1.3046 + } else { 1.3047 + len = -2; 1.3048 + useFallback = FALSE; 1.3049 + } 1.3050 + if(cs == CNS_11643_1) { 1.3051 + g = 1; 1.3052 + } else if(cs == CNS_11643_2) { 1.3053 + g = 2; 1.3054 + } else /* plane 3..7 */ if(converterData->version == 1) { 1.3055 + g = 3; 1.3056 + } else { 1.3057 + /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 1.3058 + len = 0; 1.3059 + } 1.3060 + } 1.3061 + } else { 1.3062 + /* GB2312_1 or ISO-IR-165 */ 1.3063 + U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 1.3064 + len2 = MBCS_FROM_UCHAR32_ISO2022( 1.3065 + converterData->myConverterArray[cs0], 1.3066 + sourceChar, 1.3067 + &value, 1.3068 + useFallback, 1.3069 + MBCS_OUTPUT_2); 1.3070 + if(len2 == 2 || (len2 == -2 && len == 0)) { 1.3071 + targetValue = value; 1.3072 + len = len2; 1.3073 + cs = cs0; 1.3074 + g = 1; 1.3075 + useFallback = FALSE; 1.3076 + } 1.3077 + } 1.3078 + } 1.3079 + } 1.3080 + 1.3081 + if(len != 0) { 1.3082 + len = 0; /* count output bytes; it must have been abs(len) == 2 */ 1.3083 + 1.3084 + /* write the designation sequence if necessary */ 1.3085 + if(cs != pFromU2022State->cs[g]) { 1.3086 + if(cs < CNS_11643) { 1.3087 + uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 1.3088 + } else { 1.3089 + U_ASSERT(cs >= CNS_11643_1); 1.3090 + uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 1.3091 + } 1.3092 + len = 4; 1.3093 + 
pFromU2022State->cs[g] = cs; 1.3094 + if(g == 1) { 1.3095 + /* changing the SO/G1 charset invalidates the choices[] */ 1.3096 + choiceCount = 0; 1.3097 + } 1.3098 + } 1.3099 + 1.3100 + /* write the shift sequence if necessary */ 1.3101 + if(g != pFromU2022State->g) { 1.3102 + switch(g) { 1.3103 + case 1: 1.3104 + buffer[len++] = UCNV_SO; 1.3105 + 1.3106 + /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 1.3107 + pFromU2022State->g = 1; 1.3108 + break; 1.3109 + case 2: 1.3110 + buffer[len++] = 0x1b; 1.3111 + buffer[len++] = 0x4e; 1.3112 + break; 1.3113 + default: /* case 3 */ 1.3114 + buffer[len++] = 0x1b; 1.3115 + buffer[len++] = 0x4f; 1.3116 + break; 1.3117 + } 1.3118 + } 1.3119 + 1.3120 + /* write the two output bytes */ 1.3121 + buffer[len++] = (char)(targetValue >> 8); 1.3122 + buffer[len++] = (char)targetValue; 1.3123 + } else { 1.3124 + /* if we cannot find the character after checking all codepages 1.3125 + * then this is an error 1.3126 + */ 1.3127 + *err = U_INVALID_CHAR_FOUND; 1.3128 + cnv->fromUChar32=sourceChar; 1.3129 + break; 1.3130 + } 1.3131 + } 1.3132 + 1.3133 + /* output len>0 bytes in buffer[] */ 1.3134 + if(len == 1) { 1.3135 + *target++ = buffer[0]; 1.3136 + if(offsets) { 1.3137 + *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1.3138 + } 1.3139 + } else if(len == 2 && (target + 2) <= targetLimit) { 1.3140 + *target++ = buffer[0]; 1.3141 + *target++ = buffer[1]; 1.3142 + if(offsets) { 1.3143 + int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1.3144 + *offsets++ = sourceIndex; 1.3145 + *offsets++ = sourceIndex; 1.3146 + } 1.3147 + } else { 1.3148 + fromUWriteUInt8( 1.3149 + cnv, 1.3150 + buffer, len, 1.3151 + &target, (const char *)targetLimit, 1.3152 + &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1.3153 + err); 1.3154 + if(U_FAILURE(*err)) { 1.3155 + break; 1.3156 + } 1.3157 + } 1.3158 + } /* end 
if(myTargetIndex<myTargetLength) */ 1.3159 + else{ 1.3160 + *err =U_BUFFER_OVERFLOW_ERROR; 1.3161 + break; 1.3162 + } 1.3163 + 1.3164 + }/* end while(mySourceIndex<mySourceLength) */ 1.3165 + 1.3166 + /* 1.3167 + * the end of the input stream and detection of truncated input 1.3168 + * are handled by the framework, but for ISO-2022-CN conversion 1.3169 + * we need to be in ASCII mode at the very end 1.3170 + * 1.3171 + * conditions: 1.3172 + * successful 1.3173 + * not in ASCII mode 1.3174 + * end of input and no truncated input 1.3175 + */ 1.3176 + if( U_SUCCESS(*err) && 1.3177 + pFromU2022State->g!=0 && 1.3178 + args->flush && source>=sourceLimit && cnv->fromUChar32==0 1.3179 + ) { 1.3180 + int32_t sourceIndex; 1.3181 + 1.3182 + /* we are switching to ASCII */ 1.3183 + pFromU2022State->g=0; 1.3184 + 1.3185 + /* get the source index of the last input character */ 1.3186 + /* 1.3187 + * TODO this would be simpler and more reliable if we used a pair 1.3188 + * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 1.3189 + * so that we could simply use the prevSourceIndex here; 1.3190 + * this code gives an incorrect result for the rare case of an unmatched 1.3191 + * trail surrogate that is alone in the last buffer of the text stream 1.3192 + */ 1.3193 + sourceIndex=(int32_t)(source-args->source); 1.3194 + if(sourceIndex>0) { 1.3195 + --sourceIndex; 1.3196 + if( U16_IS_TRAIL(args->source[sourceIndex]) && 1.3197 + (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 1.3198 + ) { 1.3199 + --sourceIndex; 1.3200 + } 1.3201 + } else { 1.3202 + sourceIndex=-1; 1.3203 + } 1.3204 + 1.3205 + fromUWriteUInt8( 1.3206 + cnv, 1.3207 + SHIFT_IN_STR, 1, 1.3208 + &target, (const char *)targetLimit, 1.3209 + &offsets, sourceIndex, 1.3210 + err); 1.3211 + } 1.3212 + 1.3213 + /*save the state and return */ 1.3214 + args->source = source; 1.3215 + args->target = (char*)target; 1.3216 +} 1.3217 + 1.3218 + 1.3219 +static void 1.3220 
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /*
     * ISO-2022-CN / ISO-2022-CN-EXT to Unicode, with offsets.
     *
     * Bytes are dispatched one at a time:
     *   SI      selects g=0; reported as an irregular escape if the segment is empty
     *   SO      selects g=1, but only if cs[1] has been designated
     *   ESC     handed to changeState_2022()
     *   CR, LF  reset the whole to-Unicode state, then are converted like any byte
     *   other   one byte when g==0, otherwise a two-byte character
     * A lead byte at the very end of the input is saved in toUBytes[0] and
     * resumed on the next call through the getTrailByte label.
     */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker, so the callback below is invoked */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* length = what was pending before plus the bytes consumed by this call */
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* end of line: forget all designations and shifts */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643: three-byte lookup, first byte is 0x80 + plane */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* lead byte at the end of the buffer: save it for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* g==0: bytes above 0x7f keep targetUniChar==missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}

/*
 * Write the substitution character for an ISO-2022 converter.
 *
 * Dispatches on the first letter of the converter data's locale:
 *   'j': emits SI if g==1 and ESC ( B if G0 is neither ASCII nor JISX201,
 *        then the first subchar byte
 *   'c': emits SI if g!=0, then the first subchar byte
 *   'k': version 0 emits SI or SO as needed to match a 1- or 2-byte subchar;
 *        other versions temporarily install our subchar in the current
 *        subconverter and let ucnv_cbFromUWriteSub() write it
 * The state fields are updated to match whatever shift bytes were emitted.
 *
 * args:        from-Unicode arguments of the public converter
 * offsetIndex: passed through to ucnv_cbFromUWriteBytes()
 * err:         in/out ICU error code
 */
static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
    char *p, *subchar;
    char buffer[8];
    int32_t length;

    subchar=(char *)cnv->subChars;
    length=cnv->subCharLen; /* assume length==1 for most variants */

    p = buffer;
    switch(myConverterData->locale[0]){
    case 'j':
        {
            int8_t cs;

            if(pFromU2022State->g == 1) {
                /* JIS7: switch from G1 to G0 */
                pFromU2022State->g = 0;
                *p++ = UCNV_SI;
            }

            cs = pFromU2022State->cs[0];
            if(cs != ASCII && cs != JISX201) {
                /* not in ASCII or JIS X 0201: switch to ASCII */
                pFromU2022State->cs[0] = (int8_t)ASCII;
                *p++ = '\x1b';
                *p++ = '\x28';
                *p++ = '\x42';
            }

            *p++ = subchar[0];
            break;
        }
    case 'c':
        if(pFromU2022State->g != 0) {
            /* not in ASCII mode: switch to ASCII */
            pFromU2022State->g = 0;
            *p++ = UCNV_SI;
        }
        *p++ = subchar[0];
        break;
    case 'k':
        if(myConverterData->version == 0) {
            if(length == 1) {
                if((UBool)args->converter->fromUnicodeStatus) {
                    /* in DBCS mode: switch to SBCS */
                    args->converter->fromUnicodeStatus = 0;
                    *p++ = UCNV_SI;
                }
                *p++ = subchar[0];
            } else /* length == 2*/ {
                if(!(UBool)args->converter->fromUnicodeStatus) {
                    /* in SBCS mode: switch to DBCS */
                    args->converter->fromUnicodeStatus = 1;
                    *p++ = UCNV_SO;
                }
                *p++ = subchar[0];
                *p++ = subchar[1];
            }
            break;
        } else {
            /* save the subconverter's substitution string */
            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;

            /* set our substitution string into the subconverter */
            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
            myConverterData->currentConverter->subCharLen = (int8_t)length;

            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
            args->converter = myConverterData->currentConverter;
            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
            ucnv_cbFromUWriteSub(args, 0, err);
            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
            args->converter = cnv;

            /* restore the subconverter's substitution string */
            myConverterData->currentConverter->subChars = currentSubChars;
            myConverterData->currentConverter->subCharLen = currentSubCharLen;

            if(*err == U_BUFFER_OVERFLOW_ERROR) {
                /* move the overflow bytes to the public converter */
                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
                    uprv_memcpy(
                        cnv->charErrorBuffer,
                        myConverterData->currentConverter->charErrorBuffer,
                        myConverterData->currentConverter->charErrorBufferLength);
                }
                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
                myConverterData->currentConverter->charErrorBufferLength = 0;
            }
            /* the subconverter already wrote the output; skip the write below */
            return;
        }
    default:
        /* not expected */
        break;
    }
    ucnv_cbFromUWriteBytes(args,
                           buffer, (int32_t)(p - buffer),
                           offsetIndex, err);
}

/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
1.3535 + */ 1.3536 +struct cloneStruct 1.3537 +{ 1.3538 + UConverter cnv; 1.3539 + UConverter currentConverter; 1.3540 + UAlignedMemory deadSpace; 1.3541 + UConverterDataISO2022 mydata; 1.3542 +}; 1.3543 + 1.3544 + 1.3545 +static UConverter * 1.3546 +_ISO_2022_SafeClone( 1.3547 + const UConverter *cnv, 1.3548 + void *stackBuffer, 1.3549 + int32_t *pBufferSize, 1.3550 + UErrorCode *status) 1.3551 +{ 1.3552 + struct cloneStruct * localClone; 1.3553 + UConverterDataISO2022 *cnvData; 1.3554 + int32_t i, size; 1.3555 + 1.3556 + if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 1.3557 + *pBufferSize = (int32_t)sizeof(struct cloneStruct); 1.3558 + return NULL; 1.3559 + } 1.3560 + 1.3561 + cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 1.3562 + localClone = (struct cloneStruct *)stackBuffer; 1.3563 + 1.3564 + /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1.3565 + 1.3566 + uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 1.3567 + localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 1.3568 + localClone->cnv.isExtraLocal = TRUE; 1.3569 + 1.3570 + /* share the subconverters */ 1.3571 + 1.3572 + if(cnvData->currentConverter != NULL) { 1.3573 + size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 1.3574 + localClone->mydata.currentConverter = 1.3575 + ucnv_safeClone(cnvData->currentConverter, 1.3576 + &localClone->currentConverter, 1.3577 + &size, status); 1.3578 + if(U_FAILURE(*status)) { 1.3579 + return NULL; 1.3580 + } 1.3581 + } 1.3582 + 1.3583 + for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 1.3584 + if(cnvData->myConverterArray[i] != NULL) { 1.3585 + ucnv_incrementRefCount(cnvData->myConverterArray[i]); 1.3586 + } 1.3587 + } 1.3588 + 1.3589 + return &localClone->cnv; 1.3590 +} 1.3591 + 1.3592 +static void 1.3593 +_ISO_2022_GetUnicodeSet(const UConverter *cnv, 1.3594 + const USetAdder *sa, 1.3595 + UConverterUnicodeSet 
which, 1.3596 + UErrorCode *pErrorCode) 1.3597 +{ 1.3598 + int32_t i; 1.3599 + UConverterDataISO2022* cnvData; 1.3600 + 1.3601 + if (U_FAILURE(*pErrorCode)) { 1.3602 + return; 1.3603 + } 1.3604 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.3605 + if (cnv->sharedData == &_ISO2022Data) { 1.3606 + /* We use UTF-8 in this case */ 1.3607 + sa->addRange(sa->set, 0, 0xd7FF); 1.3608 + sa->addRange(sa->set, 0xE000, 0x10FFFF); 1.3609 + return; 1.3610 + } 1.3611 +#endif 1.3612 + 1.3613 + cnvData = (UConverterDataISO2022*)cnv->extraInfo; 1.3614 + 1.3615 + /* open a set and initialize it with code points that are algorithmically round-tripped */ 1.3616 + switch(cnvData->locale[0]){ 1.3617 + case 'j': 1.3618 + /* include JIS X 0201 which is hardcoded */ 1.3619 + sa->add(sa->set, 0xa5); 1.3620 + sa->add(sa->set, 0x203e); 1.3621 + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 1.3622 + /* include Latin-1 for some variants of JP */ 1.3623 + sa->addRange(sa->set, 0, 0xff); 1.3624 + } else { 1.3625 + /* include ASCII for JP */ 1.3626 + sa->addRange(sa->set, 0, 0x7f); 1.3627 + } 1.3628 + if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 1.3629 + /* 1.3630 + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 1.3631 + * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 1.3632 + * use half-width Katakana. 1.3633 + * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 1.3634 + * half-width Katakana via the ESC ( I sequence. 1.3635 + * However, we only emit (fromUnicode) half-width Katakana according to the 1.3636 + * definition of each variant. 1.3637 + * 1.3638 + * When including fallbacks, 1.3639 + * we need to include half-width Katakana Unicode code points for all JP variants because 1.3640 + * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
1.3641 + */ 1.3642 + /* include half-width Katakana for JP */ 1.3643 + sa->addRange(sa->set, HWKANA_START, HWKANA_END); 1.3644 + } 1.3645 + break; 1.3646 + case 'c': 1.3647 + case 'z': 1.3648 + /* include ASCII for CN */ 1.3649 + sa->addRange(sa->set, 0, 0x7f); 1.3650 + break; 1.3651 + case 'k': 1.3652 + /* there is only one converter for KR, and it is not in the myConverterArray[] */ 1.3653 + cnvData->currentConverter->sharedData->impl->getUnicodeSet( 1.3654 + cnvData->currentConverter, sa, which, pErrorCode); 1.3655 + /* the loop over myConverterArray[] will simply not find another converter */ 1.3656 + break; 1.3657 + default: 1.3658 + break; 1.3659 + } 1.3660 + 1.3661 +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 1.3662 + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 1.3663 + cnvData->version==0 && i==CNS_11643 1.3664 + ) { 1.3665 + /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 1.3666 + ucnv_MBCSGetUnicodeSetForBytes( 1.3667 + cnvData->myConverterArray[i], 1.3668 + sa, UCNV_ROUNDTRIP_SET, 1.3669 + 0, 0x81, 0x82, 1.3670 + pErrorCode); 1.3671 + } 1.3672 +#endif 1.3673 + 1.3674 + for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 1.3675 + UConverterSetFilter filter; 1.3676 + if(cnvData->myConverterArray[i]!=NULL) { 1.3677 + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 1.3678 + cnvData->version==0 && i==CNS_11643 1.3679 + ) { 1.3680 + /* 1.3681 + * Version-specific for CN: 1.3682 + * CN version 0 does not map CNS planes 3..7 although 1.3683 + * they are all available in the CNS conversion table; 1.3684 + * CN version 1 (-EXT) does map them all. 1.3685 + * The two versions create different Unicode sets. 
1.3686 + */ 1.3687 + filter=UCNV_SET_FILTER_2022_CN; 1.3688 + } else if(cnvData->locale[0]=='j' && i==JISX208) { 1.3689 + /* 1.3690 + * Only add code points that map to Shift-JIS codes 1.3691 + * corresponding to JIS X 0208. 1.3692 + */ 1.3693 + filter=UCNV_SET_FILTER_SJIS; 1.3694 + } else if(i==KSC5601) { 1.3695 + /* 1.3696 + * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 1.3697 + * are broader than GR94. 1.3698 + */ 1.3699 + filter=UCNV_SET_FILTER_GR94DBCS; 1.3700 + } else { 1.3701 + filter=UCNV_SET_FILTER_NONE; 1.3702 + } 1.3703 + ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 1.3704 + } 1.3705 + } 1.3706 + 1.3707 + /* 1.3708 + * ISO 2022 converters must not convert SO/SI/ESC despite what 1.3709 + * sub-converters do by themselves. 1.3710 + * Remove these characters from the set. 1.3711 + */ 1.3712 + sa->remove(sa->set, 0x0e); 1.3713 + sa->remove(sa->set, 0x0f); 1.3714 + sa->remove(sa->set, 0x1b); 1.3715 + 1.3716 + /* ISO 2022 converters do not convert C1 controls either */ 1.3717 + sa->removeRange(sa->set, 0x80, 0x9f); 1.3718 +} 1.3719 + 1.3720 +static const UConverterImpl _ISO2022Impl={ 1.3721 + UCNV_ISO_2022, 1.3722 + 1.3723 + NULL, 1.3724 + NULL, 1.3725 + 1.3726 + _ISO2022Open, 1.3727 + _ISO2022Close, 1.3728 + _ISO2022Reset, 1.3729 + 1.3730 +#ifdef U_ENABLE_GENERIC_ISO_2022 1.3731 + T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 1.3732 + T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 1.3733 + ucnv_fromUnicode_UTF8, 1.3734 + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1.3735 +#else 1.3736 + NULL, 1.3737 + NULL, 1.3738 + NULL, 1.3739 + NULL, 1.3740 +#endif 1.3741 + NULL, 1.3742 + 1.3743 + NULL, 1.3744 + _ISO2022getName, 1.3745 + _ISO_2022_WriteSub, 1.3746 + _ISO_2022_SafeClone, 1.3747 + _ISO_2022_GetUnicodeSet, 1.3748 + 1.3749 + NULL, 1.3750 + NULL 1.3751 +}; 1.3752 +static const UConverterStaticData _ISO2022StaticData={ 1.3753 + sizeof(UConverterStaticData), 1.3754 + 
"ISO_2022", 1.3755 + 2022, 1.3756 + UCNV_IBM, 1.3757 + UCNV_ISO_2022, 1.3758 + 1, 1.3759 + 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 1.3760 + { 0x1a, 0, 0, 0 }, 1.3761 + 1, 1.3762 + FALSE, 1.3763 + FALSE, 1.3764 + 0, 1.3765 + 0, 1.3766 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.3767 +}; 1.3768 +const UConverterSharedData _ISO2022Data={ 1.3769 + sizeof(UConverterSharedData), 1.3770 + ~((uint32_t) 0), 1.3771 + NULL, 1.3772 + NULL, 1.3773 + &_ISO2022StaticData, 1.3774 + FALSE, 1.3775 + &_ISO2022Impl, 1.3776 + 0, UCNV_MBCS_TABLE_INITIALIZER 1.3777 +}; 1.3778 + 1.3779 +/*************JP****************/ 1.3780 +static const UConverterImpl _ISO2022JPImpl={ 1.3781 + UCNV_ISO_2022, 1.3782 + 1.3783 + NULL, 1.3784 + NULL, 1.3785 + 1.3786 + _ISO2022Open, 1.3787 + _ISO2022Close, 1.3788 + _ISO2022Reset, 1.3789 + 1.3790 + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 1.3791 + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 1.3792 + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 1.3793 + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 1.3794 + NULL, 1.3795 + 1.3796 + NULL, 1.3797 + _ISO2022getName, 1.3798 + _ISO_2022_WriteSub, 1.3799 + _ISO_2022_SafeClone, 1.3800 + _ISO_2022_GetUnicodeSet, 1.3801 + 1.3802 + NULL, 1.3803 + NULL 1.3804 +}; 1.3805 +static const UConverterStaticData _ISO2022JPStaticData={ 1.3806 + sizeof(UConverterStaticData), 1.3807 + "ISO_2022_JP", 1.3808 + 0, 1.3809 + UCNV_IBM, 1.3810 + UCNV_ISO_2022, 1.3811 + 1, 1.3812 + 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 1.3813 + { 0x1a, 0, 0, 0 }, 1.3814 + 1, 1.3815 + FALSE, 1.3816 + FALSE, 1.3817 + 0, 1.3818 + 0, 1.3819 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.3820 +}; 1.3821 + 1.3822 +namespace { 1.3823 + 1.3824 +const UConverterSharedData _ISO2022JPData={ 1.3825 + sizeof(UConverterSharedData), 1.3826 + ~((uint32_t) 0), 1.3827 + NULL, 1.3828 + NULL, 1.3829 + &_ISO2022JPStaticData, 1.3830 + FALSE, 1.3831 + &_ISO2022JPImpl, 
1.3832 + 0, UCNV_MBCS_TABLE_INITIALIZER 1.3833 +}; 1.3834 + 1.3835 +} // namespace 1.3836 + 1.3837 +/************* KR ***************/ 1.3838 +static const UConverterImpl _ISO2022KRImpl={ 1.3839 + UCNV_ISO_2022, 1.3840 + 1.3841 + NULL, 1.3842 + NULL, 1.3843 + 1.3844 + _ISO2022Open, 1.3845 + _ISO2022Close, 1.3846 + _ISO2022Reset, 1.3847 + 1.3848 + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 1.3849 + UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 1.3850 + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 1.3851 + UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 1.3852 + NULL, 1.3853 + 1.3854 + NULL, 1.3855 + _ISO2022getName, 1.3856 + _ISO_2022_WriteSub, 1.3857 + _ISO_2022_SafeClone, 1.3858 + _ISO_2022_GetUnicodeSet, 1.3859 + 1.3860 + NULL, 1.3861 + NULL 1.3862 +}; 1.3863 +static const UConverterStaticData _ISO2022KRStaticData={ 1.3864 + sizeof(UConverterStaticData), 1.3865 + "ISO_2022_KR", 1.3866 + 0, 1.3867 + UCNV_IBM, 1.3868 + UCNV_ISO_2022, 1.3869 + 1, 1.3870 + 3, /* max 3 bytes per UChar: SO+DBCS */ 1.3871 + { 0x1a, 0, 0, 0 }, 1.3872 + 1, 1.3873 + FALSE, 1.3874 + FALSE, 1.3875 + 0, 1.3876 + 0, 1.3877 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.3878 +}; 1.3879 + 1.3880 +namespace { 1.3881 + 1.3882 +const UConverterSharedData _ISO2022KRData={ 1.3883 + sizeof(UConverterSharedData), 1.3884 + ~((uint32_t) 0), 1.3885 + NULL, 1.3886 + NULL, 1.3887 + &_ISO2022KRStaticData, 1.3888 + FALSE, 1.3889 + &_ISO2022KRImpl, 1.3890 + 0, UCNV_MBCS_TABLE_INITIALIZER 1.3891 +}; 1.3892 + 1.3893 +} // namespace 1.3894 + 1.3895 +/*************** CN ***************/ 1.3896 +static const UConverterImpl _ISO2022CNImpl={ 1.3897 + 1.3898 + UCNV_ISO_2022, 1.3899 + 1.3900 + NULL, 1.3901 + NULL, 1.3902 + 1.3903 + _ISO2022Open, 1.3904 + _ISO2022Close, 1.3905 + _ISO2022Reset, 1.3906 + 1.3907 + UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 1.3908 + UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 1.3909 + UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 1.3910 + 
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 1.3911 + NULL, 1.3912 + 1.3913 + NULL, 1.3914 + _ISO2022getName, 1.3915 + _ISO_2022_WriteSub, 1.3916 + _ISO_2022_SafeClone, 1.3917 + _ISO_2022_GetUnicodeSet, 1.3918 + 1.3919 + NULL, 1.3920 + NULL 1.3921 +}; 1.3922 +static const UConverterStaticData _ISO2022CNStaticData={ 1.3923 + sizeof(UConverterStaticData), 1.3924 + "ISO_2022_CN", 1.3925 + 0, 1.3926 + UCNV_IBM, 1.3927 + UCNV_ISO_2022, 1.3928 + 1, 1.3929 + 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 1.3930 + { 0x1a, 0, 0, 0 }, 1.3931 + 1, 1.3932 + FALSE, 1.3933 + FALSE, 1.3934 + 0, 1.3935 + 0, 1.3936 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.3937 +}; 1.3938 + 1.3939 +namespace { 1.3940 + 1.3941 +const UConverterSharedData _ISO2022CNData={ 1.3942 + sizeof(UConverterSharedData), 1.3943 + ~((uint32_t) 0), 1.3944 + NULL, 1.3945 + NULL, 1.3946 + &_ISO2022CNStaticData, 1.3947 + FALSE, 1.3948 + &_ISO2022CNImpl, 1.3949 + 0, UCNV_MBCS_TABLE_INITIALIZER 1.3950 +}; 1.3951 + 1.3952 +} // namespace 1.3953 + 1.3954 +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */