intl/icu/source/common/ucnv2022.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2000-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucnv2022.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2000feb03
michael@0 12 * created by: Markus W. Scherer
michael@0 13 *
michael@0 14 * Change history:
michael@0 15 *
michael@0 16 * 06/29/2000 helena Major rewrite of the callback APIs.
michael@0 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
michael@0 18 * Changed implementation of toUnicode
michael@0 19 * function
michael@0 20 * 08/21/2000 Ram Added support for ISO-2022-KR
michael@0 21 * 08/29/2000 Ram Separated implementation of EBCDIC to
michael@0 22 * ucnvebdc.c
michael@0 23 * 09/20/2000 Ram Added support for ISO-2022-CN
michael@0 24 * Added implementations for getNextUChar()
michael@0 25 * for specific 2022 country variants.
michael@0 26 * 10/31/2000 Ram Implemented offsets logic functions
michael@0 27 */
michael@0 28
michael@0 29 #include "unicode/utypes.h"
michael@0 30
michael@0 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
michael@0 32
michael@0 33 #include "unicode/ucnv.h"
michael@0 34 #include "unicode/uset.h"
michael@0 35 #include "unicode/ucnv_err.h"
michael@0 36 #include "unicode/ucnv_cb.h"
michael@0 37 #include "unicode/utf16.h"
michael@0 38 #include "ucnv_imp.h"
michael@0 39 #include "ucnv_bld.h"
michael@0 40 #include "ucnv_cnv.h"
michael@0 41 #include "ucnvmbcs.h"
michael@0 42 #include "cstring.h"
michael@0 43 #include "cmemory.h"
michael@0 44 #include "uassert.h"
michael@0 45
/*
 * Number of elements of a true array (not a pointer), as int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever it is used (e.g. as an operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 47
michael@0 48 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 49 /*
michael@0 50 * I am disabling the generic ISO-2022 converter after proposing to do so on
michael@0 51 * the icu mailing list two days ago.
michael@0 52 *
michael@0 53 * Reasons:
michael@0 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
michael@0 55 * its designation sequences, single shifts with return to the previous state,
michael@0 56 * switch-with-no-return to UTF-16BE or similar, etc.
michael@0 57 * This is unlike the language-specific variants like ISO-2022-JP which
michael@0 58 * require a much smaller repertoire of ISO-2022 features.
michael@0 59 * These variants continue to be supported.
michael@0 60 * 2. I believe that no one is really using the generic ISO-2022 converter
michael@0 61 * but rather always one of the language-specific variants.
michael@0 62 * Note that ICU's generic ISO-2022 converter has always output one escape
michael@0 63 * sequence followed by UTF-8 for the whole stream.
michael@0 64 * 3. Switching between subcharsets is extremely slow, because each time
michael@0 65 * the previous converter is closed and a new one opened,
michael@0 66 * without any kind of caching, least-recently-used list, etc.
michael@0 67 * 4. The code is currently buggy, and given the above it does not seem
michael@0 68 * reasonable to spend the time on maintenance.
michael@0 69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
michael@0 70 * This means, for example, that when ISO-8859-7 is designated, the following
michael@0 71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
michael@0 72 * The ICU ISO-2022 converter does not handle this - and has no information
michael@0 73 * about which subconverter would have to be shifted vs. which is designed
michael@0 74 * for 7-bit ISO-2022.
michael@0 75 *
michael@0 76 * Markus Scherer 2003-dec-03
michael@0 77 */
michael@0 78 #endif
michael@0 79
/* Shift-In (SI) control byte as a NUL-terminated byte string. */
static const char SHIFT_IN_STR[] = { '\x0F', '\0' };
// static const char SHIFT_OUT_STR[] = "\x0E";

/* C0 control characters and SPACE, by code point. */
#define H_TAB 0x09
#define LF    0x0A
#define V_TAB 0x0B
#define CR    0x0D
#define SPACE 0x20

/* First and last code point of the halfwidth Katakana range. */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
michael@0 93
/*
 * Native (GR) byte ranges of 94- and 96-character sets.
 *   94-character sets: native A1..FE, encoded in ISO 2022 as 21..7E.
 *   96-character sets: native A0..FF, encoded in ISO 2022 as 20..7F.
 * In both cases the ISO 2022 byte is the native byte minus 0x80.
 * C1 control codes (native 80..9F) must not be encoded as 00..1F,
 * which are the C0 control codes.
 */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
michael@0 108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is written as a mask check so that only values 0..0x1f
 * ever reach the shift: a negative or otherwise out-of-range argument
 * yields false instead of being used as a shift count.
 * Note: the argument is evaluated twice; do not pass expressions with
 * side effects.
 */
#define IS_2022_CONTROL(c) ((((c)&~0x1f)==0) && (((uint32_t)1<<(c))&0x0800c000)!=0)
michael@0 116
michael@0 117 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the same small values; every
 * enumerator is spelled out with its value to make that overlap visible.
 */
typedef enum {
    /* shared by JP and CN */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,   /* Halfwidth Katakana 7 bit */

    /*
     * CN
     * These first values are fixed: they are the indexes into myConverterArray[].
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The per-plane values below appear in StateEnum and ISO2022State variables;
     * myConverterArray[] itself must still be indexed with CNS_11643.
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
michael@0 155
/* Is the StateEnum charset value one of the DBCS charsets (JISX208..KSC5601)? */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/*
 * Charset mask: the single bit that stands for charset number cs.
 * The expansion is fully parenthesized so that it can be combined with any
 * operator (e.g. CSM(x)+1) without the shift absorbing neighbouring operands.
 */
#define CSM(cs) ((uint16_t)1<<(cs))
michael@0 160
michael@0 161 /*
michael@0 162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
michael@0 163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
michael@0 164 *
michael@0 165 * Note: The converter uses some leniency:
michael@0 166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
michael@0 167 * all versions, not just JIS7 and JIS8.
michael@0 168 * - ICU does not distinguish between different versions of JIS X 0208.
michael@0 169 */
michael@0 170 enum { MAX_JA_VERSION=4 };
michael@0 171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
michael@0 172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
michael@0 173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
michael@0 174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
michael@0 175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
michael@0 176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
michael@0 177 };
michael@0 178
/* Type tag stored in UConverterDataISO2022::currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;

/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];   /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;       /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;   /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The low four bits of the converter options carry the version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Capacity of UConverterDataISO2022::myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10
michael@0 196
michael@0 197 typedef struct{
michael@0 198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
michael@0 199 UConverter *currentConverter;
michael@0 200 Cnv2022Type currentType;
michael@0 201 ISO2022State toU2022State, fromU2022State;
michael@0 202 uint32_t key;
michael@0 203 uint32_t version;
michael@0 204 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 205 UBool isFirstBuffer;
michael@0 206 #endif
michael@0 207 UBool isEmptySegment;
michael@0 208 char name[30];
michael@0 209 char locale[3];
michael@0 210 }UConverterDataISO2022;
michael@0 211
michael@0 212 /* Protos */
michael@0 213 /* ISO-2022 ----------------------------------------------------------------- */
michael@0 214
michael@0 215 /*Forward declaration */
michael@0 216 U_CFUNC void
michael@0 217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
michael@0 218 UErrorCode * err);
michael@0 219 U_CFUNC void
michael@0 220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
michael@0 221 UErrorCode * err);
michael@0 222
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes matched so far against the escape sequence table. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* a valid prefix of an escape sequence; more bytes are required */
    VALID_NON_TERMINAL_2022 = 0,
    /* a complete, valid escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one escape sequence, but further bytes might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
michael@0 232
michael@0 233 /*
michael@0 234 * The way these state transition arrays work is:
michael@0 235 * ex : ESC$B is the sequence for JISX208
michael@0 236 * a) First Iteration: char is ESC
michael@0 237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
michael@0 238 * int x = normalize_esq_chars_2022[27] which is equal to 1
michael@0 239 * ii) Search for this value in escSeqStateTable_Key_2022[]
michael@0 240 * value of x is stored at escSeqStateTable_Key_2022[0]
michael@0 241 * iii) Save this index as offset
michael@0 242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
michael@0 243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
michael@0 244 * b) Switch on this state and continue to next char
michael@0 245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
michael@0 246 * which is normalize_esq_chars_2022[36] == 4
michael@0 247 * ii) x is currently 1(from above)
michael@0 248 * x<<=5 -- x is now 32
michael@0 249 * x+=normalize_esq_chars_2022[36]
michael@0 250 * now x is 36
michael@0 251 * iii) Search for this value in escSeqStateTable_Key_2022[]
michael@0 252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
michael@0 253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
michael@0 254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
michael@0 255 * c) Switch on this state and continue to next char
michael@0 256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
michael@0 257 * ii) x is currently 36 (from above)
michael@0 258 * x<<=5 -- x is now 1152
michael@0 259 * x+=normalize_esq_chars_2022[66]
michael@0 260 * now x is 1161
michael@0 261 * iii) Search for this value in escSeqStateTable_Key_2022[]
michael@0 262 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
michael@0 263 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
michael@0 264 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
michael@0 265 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
michael@0 266 */
michael@0 267
michael@0 268
michael@0 269 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to the small code (1..29) used to build escape
 * sequence keys; 0 means the byte does not occur in any known sequence.
 * Laid out 16 entries per row: the row label is the high nibble of the
 * byte, the column header the low nibble.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
/* 0x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 2x */    0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 3x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */    5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 5x */    0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
/* 6x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 7x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 8x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 9x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ax */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Bx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Cx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Dx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ex */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Fx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
michael@0 300
michael@0 301 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 302 /*
michael@0 303 * When the generic ISO-2022 converter is completely removed, not just disabled
michael@0 304 * per #ifdef, then the following state table and the associated tables that are
michael@0 305 * dimensioned with MAX_STATES_2022 should be trimmed.
michael@0 306 *
michael@0 307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
michael@0 308 * the associated escape sequences starting with ESC ( B should be removed.
michael@0 309 * This includes the ones with key values 1097 and all of the ones above 1000000.
michael@0 310 *
michael@0 311 * For the latter, the tables can simply be truncated.
michael@0 312 * For the former, since the tables must be kept parallel, it is probably best
michael@0 313 * to simply duplicate an adjacent table cell, parallel in all tables.
michael@0 314 *
michael@0 315 * It may make sense to restructure the tables, especially by using small search
michael@0 316 * tables for the variants instead of indexing them parallel to the table here.
michael@0 317 */
michael@0 318 #endif
michael@0 319
#define MAX_STATES_2022 74

/*
 * Escape sequence keys in ascending order. A key is built by shifting the
 * running value left by 5 bits and adding the normalized code of each byte,
 * so the magnitude of a key reflects the number of bytes matched so far.
 * The entries are grouped by that length; the index of the first entry of
 * each row is noted on the right.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /* 1 byte */
    1,                                                              /*  0 */
    /* 2 bytes */
    34, 36, 39, 55, 57, 60, 61,                                     /*  1 */
    /* 3 bytes */
    1093, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104,     /*  8 */
    1105, 1106, 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179,     /* 18 */
    1254, 1257, 1768, 1773, 1957,                                   /* 28 */
    /* 4 bytes */
    35105, 36933, 36936, 36937, 36938, 36939, 36940, 36942, 36943,  /* 33 */
    36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644, 37646,  /* 42 */
    37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,  /* 51 */
    40139, 40140, 40141,                                            /* 60 */
    /* 5 bytes */
    1123363,                                                        /* 63 */
    /* 6 bytes */
    35947624, 35947625, 35947626, 35947627, 35947629, 35947630,     /* 64 */
    35947631, 35947635, 35947636, 35947638                          /* 70 */
};
michael@0 333
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name selected by each escape sequence; parallel to
 * escSeqStateTable_Key_2022[] (same index). NULL where no name is assigned.
 * Each line is annotated with its index and the key found at that index.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    NULL,                    /*  0: 1 */
    NULL,                    /*  1: 34 */
    NULL,                    /*  2: 36 */
    NULL,                    /*  3: 39 */
    NULL,                    /*  4: 55 */
    NULL,                    /*  5: 57 */
    NULL,                    /*  6: 60 */
    NULL,                    /*  7: 61 */
    "latin1",                /*  8: 1093 */
    "latin1",                /*  9: 1096 */
    "latin1",                /* 10: 1097 */
    "ibm-865",               /* 11: 1098 */
    "ibm-865",               /* 12: 1099 */
    "ibm-865",               /* 13: 1100 */
    "ibm-865",               /* 14: 1101 */
    "ibm-865",               /* 15: 1102 */
    "ibm-865",               /* 16: 1103 */
    "JISX0201",              /* 17: 1104 */
    "JISX0201",              /* 18: 1105 */
    "latin1",                /* 19: 1106 */
    "latin1",                /* 20: 1109 */
    NULL,                    /* 21: 1154 */
    "JISX-208",              /* 22: 1157 */
    "ibm-5478",              /* 23: 1160 */
    "JISX-208",              /* 24: 1161 */
    NULL,                    /* 25: 1176 */
    NULL,                    /* 26: 1178 */
    NULL,                    /* 27: 1179 */
    NULL,                    /* 28: 1254 */
    "UTF8",                  /* 29: 1257 */
    "ISO-8859-1",            /* 30: 1768 */
    "ISO-8859-7",            /* 31: 1773 */
    "JIS-X-208",             /* 32: 1957 */
    NULL,                    /* 33: 35105 */
    "ibm-955",               /* 34: 36933 */
    "ibm-367",               /* 35: 36936 */
    "ibm-952",               /* 36: 36937 */
    "ibm-949",               /* 37: 36938 */
    "JISX-212",              /* 38: 36939 */
    "ibm-1383",              /* 39: 36940 */
    "ibm-952",               /* 40: 36942 */
    "ibm-964",               /* 41: 36943 */
    "ibm-964",               /* 42: 36944 */
    "ibm-964",               /* 43: 36945 */
    "ibm-964",               /* 44: 36946 */
    "ibm-964",               /* 45: 36947 */
    "ibm-964",               /* 46: 36948 */
    "ibm-5478",              /* 47: 37640 */
    "ibm-949",               /* 48: 37642 */
    "ISO-IR-165",            /* 49: 37644 */
    "CNS-11643-1992,1",      /* 50: 37646 */
    "CNS-11643-1992,2",      /* 51: 37711 */
    "CNS-11643-1992,3",      /* 52: 37744 */
    "CNS-11643-1992,4",      /* 53: 37745 */
    "CNS-11643-1992,5",      /* 54: 37746 */
    "CNS-11643-1992,6",      /* 55: 37747 */
    "CNS-11643-1992,7",      /* 56: 37748 */
    "UTF16_PlatformEndian",  /* 57: 40133 */
    "UTF16_PlatformEndian",  /* 58: 40136 */
    "UTF16_PlatformEndian",  /* 59: 40138 */
    "UTF16_PlatformEndian",  /* 60: 40139 */
    "UTF16_PlatformEndian",  /* 61: 40140 */
    "UTF16_PlatformEndian",  /* 62: 40141 */
    NULL,                    /* 63: 1123363 */
    "latin1",                /* 64: 35947624 */
    "ibm-912",               /* 65: 35947625 */
    "ibm-913",               /* 66: 35947626 */
    "ibm-914",               /* 67: 35947627 */
    "ibm-813",               /* 68: 35947629 */
    "ibm-1089",              /* 69: 35947630 */
    "ibm-920",               /* 70: 35947631 */
    "ibm-915",               /* 71: 35947635 */
    "ibm-915",               /* 72: 35947636 */
    "latin1"                 /* 73: 35947638 */
};

#endif
michael@0 350
michael@0 351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
michael@0 352 /* 0 1 2 3 4 5 6 7 8 9 */
michael@0 353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
michael@0 361 };
michael@0 362
michael@0 363
/*
 * Identifies the ISO-2022 variant being processed (introduced when the
 * changeState_2022 code was refactored). The numeric values are fixed;
 * the generic variant only exists when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
michael@0 373
michael@0 374 /*********** ISO 2022 Converter Protos ***********/
michael@0 375 static void
michael@0 376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
michael@0 377
michael@0 378 static void
michael@0 379 _ISO2022Close(UConverter *converter);
michael@0 380
michael@0 381 static void
michael@0 382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
michael@0 383
michael@0 384 static const char*
michael@0 385 _ISO2022getName(const UConverter* cnv);
michael@0 386
michael@0 387 static void
michael@0 388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
michael@0 389
michael@0 390 static UConverter *
michael@0 391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
michael@0 392
michael@0 393 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 394 static void
michael@0 395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
michael@0 396 #endif
michael@0 397
michael@0 398 namespace {
michael@0 399
michael@0 400 /*const UConverterSharedData _ISO2022Data;*/
michael@0 401 extern const UConverterSharedData _ISO2022JPData;
michael@0 402 extern const UConverterSharedData _ISO2022KRData;
michael@0 403 extern const UConverterSharedData _ISO2022CNData;
michael@0 404
michael@0 405 } // namespace
michael@0 406
michael@0 407 /*************** Converter implementations ******************/
michael@0 408
michael@0 409 /* The purpose of this function is to get around gcc compiler warnings. */
michael@0 410 static inline void
michael@0 411 fromUWriteUInt8(UConverter *cnv,
michael@0 412 const char *bytes, int32_t length,
michael@0 413 uint8_t **target, const char *targetLimit,
michael@0 414 int32_t **offsets,
michael@0 415 int32_t sourceIndex,
michael@0 416 UErrorCode *pErrorCode)
michael@0 417 {
michael@0 418 char *targetChars = (char *)*target;
michael@0 419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
michael@0 420 offsets, sourceIndex, pErrorCode);
michael@0 421 *target = (uint8_t*)targetChars;
michael@0 422
michael@0 423 }
michael@0 424
michael@0 425 static inline void
michael@0 426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
michael@0 427 if(myConverterData->version == 1) {
michael@0 428 UConverter *cnv = myConverterData->currentConverter;
michael@0 429
michael@0 430 cnv->toUnicodeStatus=0; /* offset */
michael@0 431 cnv->mode=0; /* state */
michael@0 432 cnv->toULength=0; /* byteIndex */
michael@0 433 }
michael@0 434 }
michael@0 435
michael@0 436 static inline void
michael@0 437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
michael@0 438 /* in ISO-2022-KR the designator sequence appears only once
michael@0 439 * in a file so we append it only once
michael@0 440 */
michael@0 441 if( converter->charErrorBufferLength==0){
michael@0 442
michael@0 443 converter->charErrorBufferLength = 4;
michael@0 444 converter->charErrorBuffer[0] = 0x1b;
michael@0 445 converter->charErrorBuffer[1] = 0x24;
michael@0 446 converter->charErrorBuffer[2] = 0x29;
michael@0 447 converter->charErrorBuffer[3] = 0x43;
michael@0 448 }
michael@0 449 if(myConverterData->version == 1) {
michael@0 450 UConverter *cnv = myConverterData->currentConverter;
michael@0 451
michael@0 452 cnv->fromUChar32=0;
michael@0 453 cnv->fromUnicodeStatus=1; /* prevLength */
michael@0 454 }
michael@0 455 }
michael@0 456
michael@0 457 static void
michael@0 458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
michael@0 459
michael@0 460 char myLocale[6]={' ',' ',' ',' ',' ',' '};
michael@0 461
michael@0 462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
michael@0 463 if(cnv->extraInfo != NULL) {
michael@0 464 UConverterNamePieces stackPieces;
michael@0 465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
michael@0 466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
michael@0 467 uint32_t version;
michael@0 468
michael@0 469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
michael@0 470
michael@0 471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
michael@0 472 myConverterData->currentType = ASCII1;
michael@0 473 cnv->fromUnicodeStatus =FALSE;
michael@0 474 if(pArgs->locale){
michael@0 475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
michael@0 476 }
michael@0 477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
michael@0 478 myConverterData->version = version;
michael@0 479 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
michael@0 480 (myLocale[2]=='_' || myLocale[2]=='\0'))
michael@0 481 {
michael@0 482 size_t len=0;
michael@0 483 /* open the required converters and cache them */
michael@0 484 if(version>MAX_JA_VERSION) {
michael@0 485 /* prevent indexing beyond jpCharsetMasks[] */
michael@0 486 myConverterData->version = version = 0;
michael@0 487 }
michael@0 488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
michael@0 489 myConverterData->myConverterArray[ISO8859_7] =
michael@0 490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
michael@0 491 }
michael@0 492 myConverterData->myConverterArray[JISX208] =
michael@0 493 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
michael@0 494 if(jpCharsetMasks[version]&CSM(JISX212)) {
michael@0 495 myConverterData->myConverterArray[JISX212] =
michael@0 496 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
michael@0 497 }
michael@0 498 if(jpCharsetMasks[version]&CSM(GB2312)) {
michael@0 499 myConverterData->myConverterArray[GB2312] =
michael@0 500 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
michael@0 501 }
michael@0 502 if(jpCharsetMasks[version]&CSM(KSC5601)) {
michael@0 503 myConverterData->myConverterArray[KSC5601] =
michael@0 504 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
michael@0 505 }
michael@0 506
michael@0 507 /* set the function pointers to appropriate funtions */
michael@0 508 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
michael@0 509 uprv_strcpy(myConverterData->locale,"ja");
michael@0 510
michael@0 511 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
michael@0 512 len = uprv_strlen(myConverterData->name);
michael@0 513 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
michael@0 514 myConverterData->name[len+1]='\0';
michael@0 515 }
michael@0 516 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
michael@0 517 (myLocale[2]=='_' || myLocale[2]=='\0'))
michael@0 518 {
michael@0 519 const char *cnvName;
michael@0 520 if(version==1) {
michael@0 521 cnvName="icu-internal-25546";
michael@0 522 } else {
michael@0 523 cnvName="ibm-949";
michael@0 524 myConverterData->version=version=0;
michael@0 525 }
michael@0 526 if(pArgs->onlyTestIsLoadable) {
michael@0 527 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
michael@0 528 uprv_free(cnv->extraInfo);
michael@0 529 cnv->extraInfo=NULL;
michael@0 530 return;
michael@0 531 } else {
michael@0 532 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
michael@0 533 if (U_FAILURE(*errorCode)) {
michael@0 534 _ISO2022Close(cnv);
michael@0 535 return;
michael@0 536 }
michael@0 537
michael@0 538 if(version==1) {
michael@0 539 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
michael@0 540 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
michael@0 541 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
michael@0 542 }else{
michael@0 543 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
michael@0 544 }
michael@0 545
michael@0 546 /* initialize the state variables */
michael@0 547 setInitialStateToUnicodeKR(cnv, myConverterData);
michael@0 548 setInitialStateFromUnicodeKR(cnv, myConverterData);
michael@0 549
michael@0 550 /* set the function pointers to appropriate funtions */
michael@0 551 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
michael@0 552 uprv_strcpy(myConverterData->locale,"ko");
michael@0 553 }
michael@0 554 }
michael@0 555 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
michael@0 556 (myLocale[2]=='_' || myLocale[2]=='\0'))
michael@0 557 {
michael@0 558
michael@0 559 /* open the required converters and cache them */
michael@0 560 myConverterData->myConverterArray[GB2312_1] =
michael@0 561 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
michael@0 562 if(version==1) {
michael@0 563 myConverterData->myConverterArray[ISO_IR_165] =
michael@0 564 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
michael@0 565 }
michael@0 566 myConverterData->myConverterArray[CNS_11643] =
michael@0 567 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
michael@0 568
michael@0 569
michael@0 570 /* set the function pointers to appropriate funtions */
michael@0 571 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
michael@0 572 uprv_strcpy(myConverterData->locale,"cn");
michael@0 573
michael@0 574 if (version==0){
michael@0 575 myConverterData->version = 0;
michael@0 576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
michael@0 577 }else if (version==1){
michael@0 578 myConverterData->version = 1;
michael@0 579 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
michael@0 580 }else {
michael@0 581 myConverterData->version = 2;
michael@0 582 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
michael@0 583 }
michael@0 584 }
michael@0 585 else{
michael@0 586 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 587 myConverterData->isFirstBuffer = TRUE;
michael@0 588
michael@0 589 /* append the UTF-8 escape sequence */
michael@0 590 cnv->charErrorBufferLength = 3;
michael@0 591 cnv->charErrorBuffer[0] = 0x1b;
michael@0 592 cnv->charErrorBuffer[1] = 0x25;
michael@0 593 cnv->charErrorBuffer[2] = 0x42;
michael@0 594
michael@0 595 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
michael@0 596 /* initialize the state variables */
michael@0 597 uprv_strcpy(myConverterData->name,"ISO_2022");
michael@0 598 #else
michael@0 599 *errorCode = U_UNSUPPORTED_ERROR;
michael@0 600 return;
michael@0 601 #endif
michael@0 602 }
michael@0 603
michael@0 604 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
michael@0 605
michael@0 606 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
michael@0 607 _ISO2022Close(cnv);
michael@0 608 }
michael@0 609 } else {
michael@0 610 *errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 611 }
michael@0 612 }
michael@0 613
michael@0 614
michael@0 615 static void
michael@0 616 _ISO2022Close(UConverter *converter) {
michael@0 617 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
michael@0 618 UConverterSharedData **array = myData->myConverterArray;
michael@0 619 int32_t i;
michael@0 620
michael@0 621 if (converter->extraInfo != NULL) {
michael@0 622 /*close the array of converter pointers and free the memory*/
michael@0 623 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
michael@0 624 if(array[i]!=NULL) {
michael@0 625 ucnv_unloadSharedDataIfReady(array[i]);
michael@0 626 }
michael@0 627 }
michael@0 628
michael@0 629 ucnv_close(myData->currentConverter);
michael@0 630
michael@0 631 if(!converter->isExtraLocal){
michael@0 632 uprv_free (converter->extraInfo);
michael@0 633 converter->extraInfo = NULL;
michael@0 634 }
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638 static void
michael@0 639 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
michael@0 640 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
michael@0 641 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 642 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
michael@0 643 myConverterData->key = 0;
michael@0 644 myConverterData->isEmptySegment = FALSE;
michael@0 645 }
michael@0 646 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 647 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
michael@0 648 }
michael@0 649 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 650 if(myConverterData->locale[0] == 0){
michael@0 651 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 652 myConverterData->isFirstBuffer = TRUE;
michael@0 653 myConverterData->key = 0;
michael@0 654 if (converter->mode == UCNV_SO){
michael@0 655 ucnv_close (myConverterData->currentConverter);
michael@0 656 myConverterData->currentConverter=NULL;
michael@0 657 }
michael@0 658 converter->mode = UCNV_SI;
michael@0 659 }
michael@0 660 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 661 /* re-append UTF-8 escape sequence */
michael@0 662 converter->charErrorBufferLength = 3;
michael@0 663 converter->charErrorBuffer[0] = 0x1b;
michael@0 664 converter->charErrorBuffer[1] = 0x28;
michael@0 665 converter->charErrorBuffer[2] = 0x42;
michael@0 666 }
michael@0 667 }
michael@0 668 else
michael@0 669 #endif
michael@0 670 {
michael@0 671 /* reset the state variables */
michael@0 672 if(myConverterData->locale[0] == 'k'){
michael@0 673 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 674 setInitialStateToUnicodeKR(converter, myConverterData);
michael@0 675 }
michael@0 676 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 677 setInitialStateFromUnicodeKR(converter, myConverterData);
michael@0 678 }
michael@0 679 }
michael@0 680 }
michael@0 681 }
michael@0 682
michael@0 683 static const char*
michael@0 684 _ISO2022getName(const UConverter* cnv){
michael@0 685 if(cnv->extraInfo){
michael@0 686 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
michael@0 687 return myData->name;
michael@0 688 }
michael@0 689 return NULL;
michael@0 690 }
michael@0 691
michael@0 692
michael@0 693 /*************** to unicode *******************/
michael@0 694 /****************************************************************************
michael@0 695 * Recognized escape sequences are
michael@0 696 * <ESC>(B ASCII
michael@0 697 * <ESC>.A ISO-8859-1
michael@0 698 * <ESC>.F ISO-8859-7
michael@0 699 * <ESC>(J JISX-201
michael@0 700 * <ESC>(I JISX-201
michael@0 701 * <ESC>$B JISX-208
michael@0 702 * <ESC>$@ JISX-208
michael@0 703 * <ESC>$(D JISX-212
michael@0 704 * <ESC>$A GB2312
michael@0 705 * <ESC>$(C KSC5601
michael@0 706 */
/*
 * ISO-2022-JP lookup used by changeState_2022(): indexed by the table offset
 * that getKey_2022() reports for a recognized escape sequence, it yields the
 * charset/state value to switch to. INVALID_STATE marks sequences that the
 * JP variant reports as unsupported.
 */
michael@0 707 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
michael@0 708 /* 0 1 2 3 4 5 6 7 8 9 */
michael@0 709 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 710 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
michael@0 711 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 712 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
michael@0 713 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 714 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 715 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 716 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 717 };
michael@0 718
michael@0 719 /*************** to unicode *******************/
/*
 * ISO-2022-CN counterpart of nextStateToUnicodeJP: indexed by the table
 * offset from getKey_2022(), it yields the charset/state value that
 * changeState_2022() applies for the CN variant. INVALID_STATE marks
 * sequences that the CN variant reports as unsupported.
 */
michael@0 720 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
michael@0 721 /* 0 1 2 3 4 5 6 7 8 9 */
michael@0 722 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 724 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 725 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 726 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
michael@0 727 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 728 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 729 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
michael@0 730 };
michael@0 731
michael@0 732
michael@0 733 static UCNV_TableStates_2022
michael@0 734 getKey_2022(char c,int32_t* key,int32_t* offset){
michael@0 735 int32_t togo;
michael@0 736 int32_t low = 0;
michael@0 737 int32_t hi = MAX_STATES_2022;
michael@0 738 int32_t oldmid=0;
michael@0 739
michael@0 740 togo = normalize_esq_chars_2022[(uint8_t)c];
michael@0 741 if(togo == 0) {
michael@0 742 /* not a valid character anywhere in an escape sequence */
michael@0 743 *key = 0;
michael@0 744 *offset = 0;
michael@0 745 return INVALID_2022;
michael@0 746 }
michael@0 747 togo = (*key << 5) + togo;
michael@0 748
michael@0 749 while (hi != low) /*binary search*/{
michael@0 750
michael@0 751 register int32_t mid = (hi+low) >> 1; /*Finds median*/
michael@0 752
michael@0 753 if (mid == oldmid)
michael@0 754 break;
michael@0 755
michael@0 756 if (escSeqStateTable_Key_2022[mid] > togo){
michael@0 757 hi = mid;
michael@0 758 }
michael@0 759 else if (escSeqStateTable_Key_2022[mid] < togo){
michael@0 760 low = mid;
michael@0 761 }
michael@0 762 else /*we found it*/{
michael@0 763 *key = togo;
michael@0 764 *offset = mid;
michael@0 765 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
michael@0 766 }
michael@0 767 oldmid = mid;
michael@0 768
michael@0 769 }
michael@0 770
michael@0 771 *key = 0;
michael@0 772 *offset = 0;
michael@0 773 return INVALID_2022;
michael@0 774 }
michael@0 775
michael@0 776 /* Runs through a state machine to determine the escape sequence - codepage correspondence
michael@0 777 */
/*
 * Consumes the bytes of one escape sequence from *source and applies it to
 * the to-Unicode state of the converter.
 *
 * Each byte read is appended to _this->toUBytes and fed to getKey_2022().
 * Outcomes:
 *  - input ends while the sequence is still a valid prefix: the partial key is
 *    stored in the converter data and the function returns without an error;
 *  - the bytes do not form a known sequence: *err = U_ILLEGAL_ESCAPE_SEQUENCE
 *    and all but the first collected byte are backed out for reprocessing;
 *  - the sequence is complete: the variant-specific branch updates
 *    toU2022State (cs[] / g / prevG), or sets U_UNSUPPORTED_ESCAPE_SEQUENCE
 *    when the variant/version does not accept that sequence.
 * On success toULength is reset to 0.
 *
 * @param _this       converter whose extraInfo is a UConverterDataISO2022
 * @param source      in/out read position; advanced past the consumed bytes
 * @param sourceLimit end of the input
 * @param var         which ISO-2022 variant's rules to apply
 * @param err         receives U_ILLEGAL_ESCAPE_SEQUENCE or
 *                    U_UNSUPPORTED_ESCAPE_SEQUENCE on failure
 */
michael@0 778 static void
michael@0 779 changeState_2022(UConverter* _this,
michael@0 780 const char** source,
michael@0 781 const char* sourceLimit,
michael@0 782 Variant2022 var,
michael@0 783 UErrorCode* err){
michael@0 784 UCNV_TableStates_2022 value;
michael@0 785 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
michael@0 786 uint32_t key = myData2022->key;
michael@0 787 int32_t offset = 0;
/* bytes already collected from an earlier buffer; needed to decide how to back out */
michael@0 788 int8_t initialToULength = _this->toULength;
michael@0 789 char c;
michael@0 790
michael@0 791 value = VALID_NON_TERMINAL_2022;
michael@0 792 while (*source < sourceLimit) {
michael@0 793 c = *(*source)++;
michael@0 794 _this->toUBytes[_this->toULength++]=(uint8_t)c;
michael@0 795 value = getKey_2022(c,(int32_t *) &key, &offset);
michael@0 796
michael@0 797 switch (value){
michael@0 798
michael@0 799 case VALID_NON_TERMINAL_2022 :
michael@0 800 /* continue with the loop */
michael@0 801 break;
michael@0 802
michael@0 803 case VALID_TERMINAL_2022:
michael@0 804 key = 0;
michael@0 805 goto DONE;
michael@0 806
michael@0 807 case INVALID_2022:
michael@0 808 goto DONE;
michael@0 809
michael@0 810 case VALID_MAYBE_TERMINAL_2022:
michael@0 811 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 812 /* ESC ( B is ambiguous only for ISO_2022 itself */
michael@0 813 if(var == ISO_2022) {
michael@0 814 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
michael@0 815 _this->toULength = 0;
michael@0 816
michael@0 817 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
michael@0 818
michael@0 819 /* continue with the loop */
michael@0 820 value = VALID_NON_TERMINAL_2022;
michael@0 821 break;
michael@0 822 } else
michael@0 823 #endif
michael@0 824 {
michael@0 825 /* not ISO_2022 itself, finish here */
michael@0 826 value = VALID_TERMINAL_2022;
michael@0 827 key = 0;
michael@0 828 goto DONE;
michael@0 829 }
michael@0 830 }
michael@0 831 }
michael@0 832
michael@0 833 DONE:
/* persist the key: non-zero means "in the middle of an escape sequence" for the next call */
michael@0 834 myData2022->key = key;
michael@0 835
michael@0 836 if (value == VALID_NON_TERMINAL_2022) {
michael@0 837 /* indicate that the escape sequence is incomplete: key!=0 */
michael@0 838 return;
michael@0 839 } else if (value == INVALID_2022 ) {
michael@0 840 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 841 } else /* value == VALID_TERMINAL_2022 */ {
michael@0 842 switch(var){
michael@0 843 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 844 case ISO_2022:
michael@0 845 {
michael@0 846 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
michael@0 847 if(chosenConverterName == NULL) {
michael@0 848 /* SS2 or SS3 */
michael@0 849 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 850 _this->toUCallbackReason = UCNV_UNASSIGNED;
michael@0 851 return;
michael@0 852 }
michael@0 853
michael@0 854 _this->mode = UCNV_SI;
michael@0 855 ucnv_close(myData2022->currentConverter);
/* NOTE(review): myUConverter is not declared anywhere in this function - confirm it exists before enabling U_ENABLE_GENERIC_ISO_2022 */
michael@0 856 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
michael@0 857 if(U_SUCCESS(*err)) {
michael@0 858 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
michael@0 859 _this->mode = UCNV_SO;
michael@0 860 }
michael@0 861 break;
michael@0 862 }
michael@0 863 #endif
michael@0 864 case ISO_2022_JP:
michael@0 865 {
michael@0 866 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
michael@0 867 switch(tempState) {
michael@0 868 case INVALID_STATE:
michael@0 869 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 870 break;
michael@0 871 case SS2_STATE:
michael@0 872 if(myData2022->toU2022State.cs[2]!=0) {
/* remember the current G0/G1 selection before switching to G2 */
michael@0 873 if(myData2022->toU2022State.g<2) {
michael@0 874 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
michael@0 875 }
michael@0 876 myData2022->toU2022State.g=2;
michael@0 877 } else {
michael@0 878 /* illegal to have SS2 before a matching designator */
michael@0 879 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 880 }
michael@0 881 break;
michael@0 882 /* case SS3_STATE: not used in ISO-2022-JP-x */
michael@0 883 case ISO8859_1:
michael@0 884 case ISO8859_7:
/* the charset must be enabled in the mask for this converter version */
michael@0 885 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
michael@0 886 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 887 } else {
michael@0 888 /* G2 charset for SS2 */
michael@0 889 myData2022->toU2022State.cs[2]=(int8_t)tempState;
michael@0 890 }
michael@0 891 break;
michael@0 892 default:
michael@0 893 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
michael@0 894 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 895 } else {
michael@0 896 /* G0 charset */
michael@0 897 myData2022->toU2022State.cs[0]=(int8_t)tempState;
michael@0 898 }
michael@0 899 break;
michael@0 900 }
michael@0 901 }
michael@0 902 break;
michael@0 903 case ISO_2022_CN:
michael@0 904 {
michael@0 905 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
michael@0 906 switch(tempState) {
michael@0 907 case INVALID_STATE:
michael@0 908 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 909 break;
michael@0 910 case SS2_STATE:
michael@0 911 if(myData2022->toU2022State.cs[2]!=0) {
michael@0 912 if(myData2022->toU2022State.g<2) {
michael@0 913 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
michael@0 914 }
michael@0 915 myData2022->toU2022State.g=2;
michael@0 916 } else {
michael@0 917 /* illegal to have SS2 before a matching designator */
michael@0 918 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 919 }
michael@0 920 break;
michael@0 921 case SS3_STATE:
michael@0 922 if(myData2022->toU2022State.cs[3]!=0) {
michael@0 923 if(myData2022->toU2022State.g<2) {
michael@0 924 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
michael@0 925 }
michael@0 926 myData2022->toU2022State.g=3;
michael@0 927 } else {
michael@0 928 /* illegal to have SS3 before a matching designator */
michael@0 929 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 930 }
michael@0 931 break;
michael@0 932 case ISO_IR_165:
/* ISO-IR-165 is rejected by version 0 of the CN converter */
michael@0 933 if(myData2022->version==0) {
michael@0 934 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 935 break;
michael@0 936 }
michael@0 937 /*fall through*/
michael@0 938 case GB2312_1:
michael@0 939 /*fall through*/
michael@0 940 case CNS_11643_1:
michael@0 941 myData2022->toU2022State.cs[1]=(int8_t)tempState;
michael@0 942 break;
michael@0 943 case CNS_11643_2:
michael@0 944 myData2022->toU2022State.cs[2]=(int8_t)tempState;
michael@0 945 break;
michael@0 946 default:
michael@0 947 /* other CNS 11643 planes */
michael@0 948 if(myData2022->version==0) {
michael@0 949 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 950 } else {
michael@0 951 myData2022->toU2022State.cs[3]=(int8_t)tempState;
michael@0 952 }
michael@0 953 break;
michael@0 954 }
michael@0 955 }
michael@0 956 break;
michael@0 957 case ISO_2022_KR:
/* NOTE(review): 0x30 is presumably the table offset of the single KR designator - confirm against escSeqStateTable_Key_2022 */
michael@0 958 if(offset==0x30){
michael@0 959 /* nothing to be done, just accept this one escape sequence */
michael@0 960 } else {
michael@0 961 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
michael@0 962 }
michael@0 963 break;
michael@0 964
michael@0 965 default:
michael@0 966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 967 break;
michael@0 968 }
michael@0 969 }
michael@0 970 if(U_SUCCESS(*err)) {
michael@0 971 _this->toULength = 0;
michael@0 972 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
michael@0 973 if(_this->toULength>1) {
michael@0 974 /*
michael@0 975 * Ticket 5691: consistent illegal sequences:
michael@0 976 * - We include at least the first byte (ESC) in the illegal sequence.
michael@0 977 * - If any of the non-initial bytes could be the start of a character,
michael@0 978 * we stop the illegal sequence before the first one of those.
michael@0 979 * In escape sequences, all following bytes are "printable", that is,
michael@0 980 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
michael@0 981 * they are valid single/lead bytes.
michael@0 982 * For simplicity, we always only report the initial ESC byte as the
michael@0 983 * illegal sequence and back out all other bytes we looked at.
michael@0 984 */
michael@0 985 /* Back out some bytes. */
michael@0 986 int8_t backOutDistance=_this->toULength-1;
michael@0 987 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
michael@0 988 if(backOutDistance<=bytesFromThisBuffer) {
michael@0 989 /* same as initialToULength<=1 */
michael@0 990 *source-=backOutDistance;
michael@0 991 } else {
michael@0 992 /* Back out bytes from the previous buffer: Need to replay them. */
michael@0 993 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
michael@0 994 /* same as -(initialToULength-1) */
michael@0 995 /* preToULength is negative! */
michael@0 996 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
michael@0 997 *source-=bytesFromThisBuffer;
michael@0 998 }
/* only the first collected byte stays in the reported illegal sequence */
michael@0 999 _this->toULength=1;
michael@0 1000 }
michael@0 1001 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
michael@0 1002 _this->toUCallbackReason = UCNV_UNASSIGNED;
michael@0 1003 }
michael@0 1004 }
michael@0 1005
michael@0 1006 /* Checks the characters of the buffer against valid 2022 escape sequences.
michael@0 1007 * If they match, we return a pointer to the start of the sequence; otherwise
michael@0 1008 * we return sourceLimit.
michael@0 1009 */
michael@0 1010 /*for 2022 looks ahead in the stream
michael@0 1011 *to determine the longest possible convertible
michael@0 1012 *data stream
michael@0 1013 */
/*
 * Returns the end of the run of bytes, starting at *source, that can be
 * converted before the next escape sequence. *source itself is only read.
 *
 * In the default build (U_ENABLE_GENERIC_ISO_2022 not defined) this is a
 * plain scan: the result points at the first ESC_2022 byte, or equals
 * sourceLimit when there is none. The third parameter is unnamed and unused.
 *
 * NOTE(review): the U_ENABLE_GENERIC_ISO_2022 branch refers to `flush`, but
 * the parameter name is commented out in the signature, so that branch cannot
 * compile as written. Inside `if (*mySource == ESC_2022)` the test
 * `value > 0 || *mySource==ESC_2022` is also always true, which makes the
 * statement after it unreachable. Confirm the intent before enabling.
 */
michael@0 1014 static inline const char*
michael@0 1015 getEndOfBuffer_2022(const char** source,
michael@0 1016 const char* sourceLimit,
michael@0 1017 UBool /*flush*/){
michael@0 1018
michael@0 1019 const char* mySource = *source;
michael@0 1020
michael@0 1021 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 1022 if (*source >= sourceLimit)
michael@0 1023 return sourceLimit;
michael@0 1024
michael@0 1025 do{
michael@0 1026
michael@0 1027 if (*mySource == ESC_2022){
michael@0 1028 int8_t i;
michael@0 1029 int32_t key = 0;
michael@0 1030 int32_t offset;
michael@0 1031 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
michael@0 1032
michael@0 1033 /* Kludge: I could not
michael@0 1034 * figure out the reason for validating an escape sequence
michael@0 1035 * twice - once here and once in changeState_2022().
michael@0 1036 * is it possible to have an ESC character in a ISO2022
michael@0 1037 * byte stream which is valid in a code page? Is it legal?
michael@0 1038 */
michael@0 1039 for (i=0;
michael@0 1040 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
michael@0 1041 i++) {
michael@0 1042 value = getKey_2022(*(mySource+i), &key, &offset);
michael@0 1043 }
michael@0 1044 if (value > 0 || *mySource==ESC_2022)
michael@0 1045 return mySource;
michael@0 1046
michael@0 1047 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
michael@0 1048 return sourceLimit;
michael@0 1049 }
michael@0 1050 }while (++mySource < sourceLimit);
michael@0 1051
michael@0 1052 return sourceLimit;
michael@0 1053 #else
/* stop at the first ESC byte or at the end of the input, whichever comes first */
michael@0 1054 while(mySource < sourceLimit && *mySource != ESC_2022) {
michael@0 1055 ++mySource;
michael@0 1056 }
michael@0 1057 return mySource;
michael@0 1058 #endif
michael@0 1059 }
michael@0 1060
michael@0 1061
michael@0 1062 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
michael@0 1063 * any future change in _MBCSFromUChar32() function should be reflected here.
michael@0 1064 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
michael@0 1065 */
/*
 * Looks up the codepage bytes for code point c in sharedData's from-Unicode
 * tables and, when no result is taken from them, in its extension table.
 *
 * @param sharedData  converter data providing the mbcs tables
 * @param c           code point to convert
 * @param value       out: the result bytes packed into one integer
 * @param useFallback passed to FROM_U_USE_FALLBACK and to the extension lookup
 * @param outputType  MBCS_OUTPUT_2 selects the 2-byte table layout; any other
 *                    value is handled as the 3-byte layout
 * @return byte count for an assigned mapping, the negated byte count for a
 *         fallback mapping, the extension lookup's result when that is used,
 *         or 0 when there is no mapping
 */
michael@0 1066 static inline int32_t
michael@0 1067 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
michael@0 1068 UChar32 c,
michael@0 1069 uint32_t* value,
michael@0 1070 UBool useFallback,
michael@0 1071 int outputType)
michael@0 1072 {
michael@0 1073 const int32_t *cx;
michael@0 1074 const uint16_t *table;
michael@0 1075 uint32_t stage2Entry;
michael@0 1076 uint32_t myValue;
michael@0 1077 int32_t length;
michael@0 1078 const uint8_t *p;
michael@0 1079 /*
michael@0 1080 * TODO(markus): Use and require new, faster MBCS conversion table structures.
michael@0 1081 * Use internal version of ucnv_open() that verifies that the new structures are available,
michael@0 1082 * else U_INTERNAL_PROGRAM_ERROR.
michael@0 1083 */
michael@0 1084 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
michael@0 1085 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
michael@0 1086 table=sharedData->mbcs.fromUnicodeTable;
michael@0 1087 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
michael@0 1088 /* get the bytes and the length for the output */
michael@0 1089 if(outputType==MBCS_OUTPUT_2){
michael@0 1090 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
michael@0 1091 if(myValue<=0xff) {
michael@0 1092 length=1;
michael@0 1093 } else {
michael@0 1094 length=2;
michael@0 1095 }
michael@0 1096 } else /* outputType==MBCS_OUTPUT_3 */ {
michael@0 1097 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
/* assemble the three stored bytes, first byte most significant */
michael@0 1098 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
michael@0 1099 if(myValue<=0xff) {
michael@0 1100 length=1;
michael@0 1101 } else if(myValue<=0xffff) {
michael@0 1102 length=2;
michael@0 1103 } else {
michael@0 1104 length=3;
michael@0 1105 }
michael@0 1106 }
michael@0 1107 /* is this code point assigned, or do we use fallbacks? */
/* the flag tested is bit 16+(c&0xf) of the stage 2 entry */
michael@0 1108 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
michael@0 1109 /* assigned */
michael@0 1110 *value=myValue;
michael@0 1111 return length;
michael@0 1112 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
michael@0 1113 /*
michael@0 1114 * We allow a 0 byte output if the "assigned" bit is set for this entry.
michael@0 1115 * There is no way with this data structure for fallback output
michael@0 1116 * to be a zero byte.
michael@0 1117 */
michael@0 1118 *value=myValue;
michael@0 1119 return -length;
michael@0 1120 }
michael@0 1121 }
michael@0 1122
/* no result from the main table: try the extension table if there is one */
michael@0 1123 cx=sharedData->mbcs.extIndexes;
michael@0 1124 if(cx!=NULL) {
michael@0 1125 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
michael@0 1126 }
michael@0 1127
michael@0 1128 /* unassigned */
michael@0 1129 return 0;
michael@0 1130 }
michael@0 1131
michael@0 1132 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
michael@0 1133 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
michael@0 1134 * @param retval pointer to output byte
michael@0 1135 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
michael@0 1136 */
michael@0 1137 static inline int32_t
michael@0 1138 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
michael@0 1139 UChar32 c,
michael@0 1140 uint32_t* retval,
michael@0 1141 UBool useFallback)
michael@0 1142 {
michael@0 1143 const uint16_t *table;
michael@0 1144 int32_t value;
michael@0 1145 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
michael@0 1146 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
michael@0 1147 return 0;
michael@0 1148 }
michael@0 1149 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
michael@0 1150 table=sharedData->mbcs.fromUnicodeTable;
michael@0 1151 /* get the byte for the output */
michael@0 1152 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
michael@0 1153 /* is this code point assigned, or do we use fallbacks? */
michael@0 1154 *retval=(uint32_t)(value&0xff);
michael@0 1155 if(value>=0xf00) {
michael@0 1156 return 1; /* roundtrip */
michael@0 1157 } else if(useFallback ? value>=0x800 : value>=0xc00) {
michael@0 1158 return -1; /* fallback taken */
michael@0 1159 } else {
michael@0 1160 return 0; /* no mapping */
michael@0 1161 }
michael@0 1162 }
michael@0 1163
michael@0 1164 /*
michael@0 1165 * Check that the result is a 2-byte value with each byte in the range A1..FE
michael@0 1166 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
michael@0 1167 * to move it to the ISO 2022 range 21..7E.
michael@0 1168 * Return 0 if out of range.
michael@0 1169 */
/*
 * Converts a strict EUC double-byte value (both bytes in A1..FE) to the
 * ISO 2022 range 21..7E by subtracting 0x80 from each byte.
 *
 * The lead and trail bytes are checked on the full 32-bit value, so anything
 * wider than two bytes (for example 0x01A1A1) is rejected instead of being
 * accepted on the strength of its low 16 bits.
 *
 * @param value candidate double-byte code
 * @return value - 0x8080 when both bytes are in A1..FE, otherwise 0
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;    /* exceeds 0xff for values wider than two bytes */
    const uint32_t trail = value & 0xff;

    if (lead >= 0xa1 && lead <= 0xfe && trail >= 0xa1 && trail <= 0xfe) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    }
    return 0; /* not valid for ISO 2022 */
}
michael@0 1180
michael@0 1181 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
michael@0 1182 /*
michael@0 1183 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
michael@0 1184 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
michael@0 1185 * unchanged.
michael@0 1186 */
michael@0 1187 static inline uint32_t
michael@0 1188 _2022ToGR94DBCS(uint32_t value) {
michael@0 1189 uint32_t returnValue = value + 0x8080;
michael@0 1190 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
michael@0 1191 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
michael@0 1192 return returnValue;
michael@0 1193 } else {
michael@0 1194 return value;
michael@0 1195 }
michael@0 1196 }
michael@0 1197 #endif
michael@0 1198
michael@0 1199 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 1200
michael@0 1201 /**********************************************************************************
michael@0 1202 * ISO-2022 Converter
michael@0 1203 *
michael@0 1204 *
michael@0 1205 */
michael@0 1206
/*
 * to-Unicode driver for the generic ISO-2022 converter (only built when
 * U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Repeats until the source is exhausted or it has to return to the caller:
 *  1. when no escape sequence is pending (key == 0), converts the bytes up to
 *     the next escape sequence with the currently selected sub-converter
 *     (an "ASCII" converter is opened if none is selected yet);
 *  2. hands the following bytes to changeState_2022() with the ISO_2022 variant.
 * Overflow output and partial/invalid input recorded by the sub-converter are
 * copied into the outer converter before returning.
 *
 * @param args conversion arguments; source/target are advanced in place
 * @param err  in/out error code
 */
michael@0 1207 static void
michael@0 1208 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
michael@0 1209 UErrorCode* err){
michael@0 1210 const char* mySourceLimit, *realSourceLimit;
michael@0 1211 const char* sourceStart;
michael@0 1212 const UChar* myTargetStart;
michael@0 1213 UConverter* saveThis;
michael@0 1214 UConverterDataISO2022* myData;
michael@0 1215 int8_t length;
michael@0 1216
michael@0 1217 saveThis = args->converter;
michael@0 1218 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
michael@0 1219
michael@0 1220 realSourceLimit = args->sourceLimit;
michael@0 1221 while (args->source < realSourceLimit) {
michael@0 1222 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
michael@0 1223 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
michael@0 1224 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
michael@0 1225
michael@0 1226 if(args->source < mySourceLimit) {
michael@0 1227 if(myData->currentConverter==NULL) {
michael@0 1228 myData->currentConverter = ucnv_open("ASCII",err);
michael@0 1229 if(U_FAILURE(*err)){
michael@0 1230 return;
michael@0 1231 }
michael@0 1232
michael@0 1233 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
michael@0 1234 saveThis->mode = UCNV_SO;
michael@0 1235 }
michael@0 1236
michael@0 1237 /* convert to before the ESC or until the end of the buffer */
michael@0 1238 myData->isFirstBuffer=FALSE;
michael@0 1239 sourceStart = args->source;
michael@0 1240 myTargetStart = args->target;
/* temporarily swap in the sub-converter; the outer converter is restored right after the call */
michael@0 1241 args->converter = myData->currentConverter;
michael@0 1242 ucnv_toUnicode(args->converter,
michael@0 1243 &args->target,
michael@0 1244 args->targetLimit,
michael@0 1245 &args->source,
michael@0 1246 mySourceLimit,
michael@0 1247 args->offsets,
michael@0 1248 (UBool)(args->flush && mySourceLimit == realSourceLimit),
michael@0 1249 err);
michael@0 1250 args->converter = saveThis;
michael@0 1251
michael@0 1252 if (*err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 1253 /* move the overflow buffer */
michael@0 1254 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
michael@0 1255 myData->currentConverter->UCharErrorBufferLength = 0;
michael@0 1256 if(length > 0) {
michael@0 1257 uprv_memcpy(saveThis->UCharErrorBuffer,
michael@0 1258 myData->currentConverter->UCharErrorBuffer,
michael@0 1259 length*U_SIZEOF_UCHAR);
michael@0 1260 }
michael@0 1261 return;
michael@0 1262 }
michael@0 1263
michael@0 1264 /*
michael@0 1265 * At least one of:
michael@0 1266 * -Error while converting
michael@0 1267 * -Done with entire buffer
michael@0 1268 * -Need to write offsets or update the current offset
michael@0 1269 * (leave that up to the code in ucnv.c)
michael@0 1270 *
michael@0 1271 * or else we just stopped at an ESC byte and continue with changeState_2022()
michael@0 1272 */
/* NOTE(review): in the third operand && binds tighter than ||, i.e. (offsets!=NULL && moved) || (partial input before ESC) - confirm that grouping is intended */
michael@0 1273 if (U_FAILURE(*err) ||
michael@0 1274 (args->source == realSourceLimit) ||
michael@0 1275 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
michael@0 1276 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
michael@0 1277 ) {
michael@0 1278 /* copy partial or error input for truncated detection and error handling */
michael@0 1279 if(U_FAILURE(*err)) {
michael@0 1280 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
michael@0 1281 if(length > 0) {
michael@0 1282 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
michael@0 1283 }
michael@0 1284 } else {
michael@0 1285 length = saveThis->toULength = myData->currentConverter->toULength;
michael@0 1286 if(length > 0) {
michael@0 1287 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
michael@0 1288 if(args->source < mySourceLimit) {
michael@0 1289 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
michael@0 1290 }
michael@0 1291 }
michael@0 1292 }
michael@0 1293 return;
michael@0 1294 }
michael@0 1295 }
michael@0 1296 }
michael@0 1297
/* either an escape sequence was already pending or the source now points at one */
michael@0 1298 sourceStart = args->source;
michael@0 1299 changeState_2022(args->converter,
michael@0 1300 &(args->source),
michael@0 1301 realSourceLimit,
michael@0 1302 ISO_2022,
michael@0 1303 err);
michael@0 1304 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
michael@0 1305 /* let the ucnv.c code update its current offset */
michael@0 1306 return;
michael@0 1307 }
michael@0 1308 }
michael@0 1309 }
michael@0 1310
michael@0 1311 #endif
michael@0 1312
michael@0 1313 /*
michael@0 1314 * To Unicode Callback helper function
michael@0 1315 */
michael@0 1316 static void
michael@0 1317 toUnicodeCallback(UConverter *cnv,
michael@0 1318 const uint32_t sourceChar, const uint32_t targetUniChar,
michael@0 1319 UErrorCode* err){
michael@0 1320 if(sourceChar>0xff){
michael@0 1321 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
michael@0 1322 cnv->toUBytes[1] = (uint8_t)sourceChar;
michael@0 1323 cnv->toULength = 2;
michael@0 1324 }
michael@0 1325 else{
michael@0 1326 cnv->toUBytes[0] =(char) sourceChar;
michael@0 1327 cnv->toULength = 1;
michael@0 1328 }
michael@0 1329
michael@0 1330 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
michael@0 1331 *err = U_INVALID_CHAR_FOUND;
michael@0 1332 }
michael@0 1333 else{
michael@0 1334 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 1335 }
michael@0 1336 }
michael@0 1337
michael@0 1338 /**************************************ISO-2022-JP*************************************************/
michael@0 1339
michael@0 1340 /************************************** IMPORTANT **************************************************
michael@0 1341 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
michael@0 1342 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
michael@0 1343 * The converter iterates over each Unicode codepoint
michael@0 1344 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
michael@0 1345 * processed one char at a time it would make sense to reduce the extra processing a canned converter
michael@0 1346 * would do as far as possible.
michael@0 1347 *
michael@0 1348 * If the implementation of these macros or structure of sharedData struct change in the future, make
michael@0 1349 * sure that ISO-2022 is also changed.
michael@0 1350 ***************************************************************************************************
michael@0 1351 */
michael@0 1352
michael@0 1353 /***************************************************************************************************
michael@0 1354 * Rules for ISO-2022-jp encoding
michael@0 1355 * (i) Escape sequences must be fully contained within a line they should not
michael@0 1356 * span new lines or CRs
michael@0 1357 * (ii) If the last character on a line is represented by two bytes then an ASCII or
michael@0 1358 * JIS-Roman character escape sequence should follow before the line terminates
michael@0 1359 * (iii) If the first character on the line is represented by two bytes then a two
michael@0 1360 * byte character escape sequence should precede it
michael@0 1361 * (iv) If no escape sequence is encountered then the characters are ASCII
michael@0 1362 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
michael@0 1363 * and invoked with SS2 (ESC N).
michael@0 1364 * (vi) If there is any G0 designation in text, there must be a switch to
michael@0 1365 * ASCII or to JIS X 0201-Roman before a space character (but not
michael@0 1366 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
michael@0 1367 * characters such as tab or CRLF.
michael@0 1368 * (vii) Supported encodings:
michael@0 1369 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
michael@0 1370 *
michael@0 1371 * source : RFC-1554
michael@0 1372 *
michael@0 1373 * JISX201, JISX208,JISX212 : new .cnv data files created
michael@0 1374 * KSC5601 : alias to ibm-949 mapping table
michael@0 1375 * GB2312 : alias to ibm-1386 mapping table
michael@0 1376 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
michael@0 1377 * ISO-8859-7 : alias to ibm-9409 mapping table
michael@0 1378 */
michael@0 1379
michael@0 1380 /* preference order of JP charsets */
michael@0 1381 static const StateEnum jpCharsetPref[]={
michael@0 1382 ASCII,
michael@0 1383 JISX201,
michael@0 1384 ISO8859_1,
michael@0 1385 ISO8859_7,
michael@0 1386 JISX208,
michael@0 1387 JISX212,
michael@0 1388 GB2312,
michael@0 1389 KSC5601,
michael@0 1390 HWKANA_7BIT
michael@0 1391 };
michael@0 1392
/*
 * Designation escape sequences, one 6-byte row per charset.
 * Rows are indexed by the charset enum constant (e.g. JISX201 = 3),
 * NOT by the position of the charset in jpCharsetPref[].
 * Bytes after the sequence itself are zero.
 */
static const char escSeqChars[][6] = {
    { 0x1B, 0x28, 0x42 },           /* ESC ( B    ASCII       */
    { 0x1B, 0x2E, 0x41 },           /* ESC . A    ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },           /* ESC . F    ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },           /* ESC ( J    JISX-201    */
    { 0x1B, 0x24, 0x42 },           /* ESC $ B    JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },     /* ESC $ ( D  JISX-212    */
    { 0x1B, 0x24, 0x41 },           /* ESC $ A    GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },     /* ESC $ ( C  KSC5601     */
    { 0x1B, 0x28, 0x49 }            /* ESC ( I    HWKANA_7BIT */
};
/* Number of bytes in each escSeqChars[] row, in the same (enum) order. */
static const int8_t escSeqCharsLen[] = {
    3, 3, 3,    /* ASCII,    ISO-8859-1, ISO-8859-7  */
    3, 3, 4,    /* JISX-201, JISX-208,   JISX-212    */
    3, 4, 3     /* GB2312,   KSC5601,    HWKANA_7BIT */
};
michael@0 1420
michael@0 1421 /*
michael@0 1422 * The iteration over various code pages works this way:
michael@0 1423 * i) Get the currentState from myConverterData->currentState
michael@0 1424 * ii) Check if the character is mapped to a valid character in the currentState
michael@0 1425 * Yes -> a) set the initIterState to currentState
michael@0 1426 * b) remain in this state until an invalid character is found
michael@0 1427 * No -> a) go to the next code page and find the character
michael@0 1428 * iii) Before changing the state, increment the current state and check if the current state
michael@0 1429 * is equal to the initIterState
michael@0 1430 * Yes -> A character that cannot be represented in any of the supported encodings
michael@0 1431 * break and return a U_INVALID_CHARACTER error
michael@0 1432 * No -> Continue and find the character in next code page
michael@0 1433 *
michael@0 1434 *
michael@0 1435 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
michael@0 1436 */
michael@0 1437
/*
 * Map a JIS X 0201 byte 00..7F to Unicode.
 * Only two positions differ from the identity mapping:
 * 5C becomes U+00A5 and 7E becomes U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        /* every other value is passed through unchanged */
        return value;
    }
}
michael@0 1451
/*
 * Map a Unicode code point to a JIS X 0201 byte 00..7F.
 * Returns 0xfffe when the code point has no JIS X 0201 mapping;
 * that includes U+005C and U+007E, whose byte positions are taken
 * by U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these byte positions are occupied by the two cases above */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
michael@0 1466
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail
 * byte in bits 7..0) to the corresponding pair of JIS X 0208 bytes 21..7E.
 * Returns 0 when the pair lies beyond the JIS X 0208 range (> 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint8_t trail;
    int low;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* two lead-byte ranges: up to 0x9f00, and 0xe000..0xef00 */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    lead <<= 1;

    if(trail > 0x9e) {
        /* upper half of the trail range: keep the doubled lead as is */
        low = trail - 0x7e;
    } else {
        /* lower half: step the lead back by one row; 0x7f is skipped in Shift-JIS */
        lead -= 0x100;
        low = trail - ((trail <= 0x7e) ? 0x1f : 0x20);
    }
    return lead | low;
}
michael@0 1502
/*
 * Convert a pair of JIS X 0208 bytes (each expected in 21..7E) to Shift-JIS
 * and store the result in bytes[0] (lead) and bytes[1] (trail).
 * A byte that is out of range is turned into a value that is not valid
 * Shift-JIS (0 where tested explicitly), so that the converter rejects it.
 * Some invalid inputs already produce equally invalid Shift-JIS bytes
 * and are not tested separately.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail;

    if(c1 & 1) {
        /* odd first byte: maps to the lower half of the Shift-JIS trail range */
        lead = (uint8_t)(lead + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* even first byte: maps to the upper half of the Shift-JIS trail range */
        if(c2 >= 0x21 && c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
michael@0 1539
michael@0 1540 /*
michael@0 1541 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
michael@0 1542 * Katakana.
michael@0 1543 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
michael@0 1544 * because Shift-JIS roundtrips half-width Katakana to single bytes.
michael@0 1545 * These were the only fallbacks in ICU's jisx-208.ucm file.
michael@0 1546 */
michael@0 1547 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
michael@0 1548 0x2123, /* U+FF61 */
michael@0 1549 0x2156,
michael@0 1550 0x2157,
michael@0 1551 0x2122,
michael@0 1552 0x2126,
michael@0 1553 0x2572,
michael@0 1554 0x2521,
michael@0 1555 0x2523,
michael@0 1556 0x2525,
michael@0 1557 0x2527,
michael@0 1558 0x2529,
michael@0 1559 0x2563,
michael@0 1560 0x2565,
michael@0 1561 0x2567,
michael@0 1562 0x2543,
michael@0 1563 0x213C, /* U+FF70 */
michael@0 1564 0x2522,
michael@0 1565 0x2524,
michael@0 1566 0x2526,
michael@0 1567 0x2528,
michael@0 1568 0x252A,
michael@0 1569 0x252B,
michael@0 1570 0x252D,
michael@0 1571 0x252F,
michael@0 1572 0x2531,
michael@0 1573 0x2533,
michael@0 1574 0x2535,
michael@0 1575 0x2537,
michael@0 1576 0x2539,
michael@0 1577 0x253B,
michael@0 1578 0x253D,
michael@0 1579 0x253F, /* U+FF80 */
michael@0 1580 0x2541,
michael@0 1581 0x2544,
michael@0 1582 0x2546,
michael@0 1583 0x2548,
michael@0 1584 0x254A,
michael@0 1585 0x254B,
michael@0 1586 0x254C,
michael@0 1587 0x254D,
michael@0 1588 0x254E,
michael@0 1589 0x254F,
michael@0 1590 0x2552,
michael@0 1591 0x2555,
michael@0 1592 0x2558,
michael@0 1593 0x255B,
michael@0 1594 0x255E,
michael@0 1595 0x255F, /* U+FF90 */
michael@0 1596 0x2560,
michael@0 1597 0x2561,
michael@0 1598 0x2562,
michael@0 1599 0x2564,
michael@0 1600 0x2566,
michael@0 1601 0x2568,
michael@0 1602 0x2569,
michael@0 1603 0x256A,
michael@0 1604 0x256B,
michael@0 1605 0x256C,
michael@0 1606 0x256D,
michael@0 1607 0x256F,
michael@0 1608 0x2573,
michael@0 1609 0x212B,
michael@0 1610 0x212C /* U+FF9F */
michael@0 1611 };
michael@0 1612
/*
 * Convert UTF-16 text in [args->source, args->sourceLimit) to ISO-2022-JP-family
 * bytes written to [args->target, args->targetLimit).
 *
 * The encoder state (designated charsets cs[] and the active G set g) is read from
 * and written back to the UConverterDataISO2022 found in args->converter->extraInfo.
 * If args->offsets is not NULL, a source index is recorded for each output byte.
 *
 * For each code point the function builds (or reuses) a list of candidate charsets
 * in choices[], takes the first roundtrip mapping (or, failing that, the first
 * fallback), and emits SI / a designation escape / SO or ESC N as needed before the
 * character bytes.
 *
 * Values stored in *err by this function itself:
 *   U_ILLEGAL_CHAR_FOUND    - unmatched surrogate, or a character matched by IS_2022_CONTROL
 *   U_INVALID_CHAR_FOUND    - no candidate charset maps the code point
 *   U_BUFFER_OVERFLOW_ERROR - input remains but target == targetLimit
 * fromUWriteUInt8() also receives err and may set it (not visible here).
 * In the error cases the offending code point is left in cnv->fromUChar32.
 *
 * On return args->source and args->target are advanced past what was consumed/written.
 */
michael@0 1613 static void
michael@0 1614 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
michael@0 1615 UConverter *cnv = args->converter;
michael@0 1616 UConverterDataISO2022 *converterData;
michael@0 1617 ISO2022State *pFromU2022State;
michael@0 1618 uint8_t *target = (uint8_t *) args->target;
michael@0 1619 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
michael@0 1620 const UChar* source = args->source;
michael@0 1621 const UChar* sourceLimit = args->sourceLimit;
michael@0 1622 int32_t* offsets = args->offsets;
michael@0 1623 UChar32 sourceChar;
michael@0 1624 char buffer[8];
michael@0 1625 int32_t len, outLen;
michael@0 1626 int8_t choices[10];
michael@0 1627 int32_t choiceCount;
michael@0 1628 uint32_t targetValue = 0;
michael@0 1629 UBool useFallback;
michael@0 1630
michael@0 1631 int32_t i;
michael@0 1632 int8_t cs, g;
michael@0 1633
michael@0 1634 /* set up the state */
michael@0 1635 converterData = (UConverterDataISO2022*)cnv->extraInfo;
michael@0 1636 pFromU2022State = &converterData->fromU2022State;
michael@0 1637
/* choiceCount == 0 means choices[] must be rebuilt before the next lookup */
michael@0 1638 choiceCount = 0;
michael@0 1639
michael@0 1640 /* resume with a code point left pending in cnv->fromUChar32 by a previous call (e.g. a lead surrogate) */
michael@0 1641 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
michael@0 1642 goto getTrail;
michael@0 1643 }
michael@0 1644
michael@0 1645 while(source < sourceLimit) {
michael@0 1646 if(target < targetLimit) {
michael@0 1647
michael@0 1648 sourceChar = *(source++);
michael@0 1649 /* surrogate code unit: only a lead followed by a trail is accepted */
michael@0 1650 if(U16_IS_SURROGATE(sourceChar)) {
michael@0 1651 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
michael@0 1652 getTrail:
michael@0 1653 /*look ahead to find the trail surrogate*/
michael@0 1654 if(source < sourceLimit) {
michael@0 1655 /* test the following code unit */
michael@0 1656 UChar trail=(UChar) *source;
michael@0 1657 if(U16_IS_TRAIL(trail)) {
michael@0 1658 source++;
michael@0 1659 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
michael@0 1660 cnv->fromUChar32=0x00;
michael@0 1661 /* convert this supplementary code point */
michael@0 1662 /* exit this condition tree */
michael@0 1663 } else {
michael@0 1664 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 1665 /* callback(illegal) */
michael@0 1666 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 1667 cnv->fromUChar32=sourceChar;
michael@0 1668 break;
michael@0 1669 }
michael@0 1670 } else {
michael@0 1671 /* no more input: keep the lead surrogate pending for the next call */
michael@0 1672 cnv->fromUChar32=sourceChar;
michael@0 1673 break;
michael@0 1674 }
michael@0 1675 } else {
michael@0 1676 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 1677 /* callback(illegal) */
michael@0 1678 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 1679 cnv->fromUChar32=sourceChar;
michael@0 1680 break;
michael@0 1681 }
michael@0 1682 }
michael@0 1683
michael@0 1684 /* do not convert SO/SI/ESC */
michael@0 1685 if(IS_2022_CONTROL(sourceChar)) {
michael@0 1686 /* callback(illegal) */
michael@0 1687 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 1688 cnv->fromUChar32=sourceChar;
michael@0 1689 break;
michael@0 1690 }
michael@0 1691
michael@0 1692 /* do the conversion */
michael@0 1693
michael@0 1694 if(choiceCount == 0) {
michael@0 1695 uint16_t csm;
michael@0 1696
michael@0 1697 /*
michael@0 1698 * The csm variable keeps track of which charsets are allowed
michael@0 1699 * and not used yet while building the choices[].
michael@0 1700 */
michael@0 1701 csm = jpCharsetMasks[converterData->version];
michael@0 1702 choiceCount = 0;
michael@0 1703
michael@0 1704 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
michael@0 1705 if(converterData->version == 3 || converterData->version == 4) {
michael@0 1706 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
michael@0 1707 }
michael@0 1708 /* Do not try single-byte half-width Katakana for other versions. */
michael@0 1709 csm &= ~CSM(HWKANA_7BIT);
michael@0 1710
michael@0 1711 /* try the current G0 charset */
michael@0 1712 choices[choiceCount++] = cs = pFromU2022State->cs[0];
michael@0 1713 csm &= ~CSM(cs);
michael@0 1714
michael@0 1715 /* try the current G2 charset */
michael@0 1716 if((cs = pFromU2022State->cs[2]) != 0) {
michael@0 1717 choices[choiceCount++] = cs;
michael@0 1718 csm &= ~CSM(cs);
michael@0 1719 }
michael@0 1720
michael@0 1721 /* try all the other possible charsets */
michael@0 1722 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
michael@0 1723 cs = (int8_t)jpCharsetPref[i];
michael@0 1724 if(CSM(cs) & csm) {
michael@0 1725 choices[choiceCount++] = cs;
michael@0 1726 csm &= ~CSM(cs);
michael@0 1727 }
michael@0 1728 }
michael@0 1729 }
michael@0 1730
michael@0 1731 cs = g = 0;
michael@0 1732 /*
michael@0 1733 * len==0: no mapping found yet
michael@0 1734 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
michael@0 1735 * len>0: found a roundtrip result, done
michael@0 1736 */
michael@0 1737 len = 0;
michael@0 1738 /*
michael@0 1739 * We will turn off useFallback after finding a fallback,
michael@0 1740 * but we still get fallbacks from PUA code points as usual.
michael@0 1741 * Therefore, we will also need to check that we don't overwrite
michael@0 1742 * an early fallback with a later one.
michael@0 1743 */
michael@0 1744 useFallback = cnv->useFallback;
michael@0 1745
michael@0 1746 for(i = 0; i < choiceCount && len <= 0; ++i) {
michael@0 1747 uint32_t value;
michael@0 1748 int32_t len2;
michael@0 1749 int8_t cs0 = choices[i];
michael@0 1750 switch(cs0) {
michael@0 1751 case ASCII:
michael@0 1752 if(sourceChar <= 0x7f) {
michael@0 1753 targetValue = (uint32_t)sourceChar;
michael@0 1754 len = 1;
michael@0 1755 cs = cs0;
michael@0 1756 g = 0;
michael@0 1757 }
michael@0 1758 break;
michael@0 1759 case ISO8859_1:
michael@0 1760 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
michael@0 1761 targetValue = (uint32_t)sourceChar - 0x80;
michael@0 1762 len = 1;
michael@0 1763 cs = cs0;
michael@0 1764 g = 2;
michael@0 1765 }
michael@0 1766 break;
michael@0 1767 case HWKANA_7BIT:
michael@0 1768 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
michael@0 1769 if(converterData->version==3) {
michael@0 1770 /* JIS7: use G1 (SO) */
michael@0 1771 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
michael@0 1772 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
michael@0 1773 len = 1;
michael@0 1774 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
michael@0 1775 g = 1;
michael@0 1776 } else if(converterData->version==4) {
michael@0 1777 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
michael@0 1778 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
michael@0 1779 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
michael@0 1780 len = 1;
michael@0 1781
michael@0 1782 cs = pFromU2022State->cs[0];
michael@0 1783 if(IS_JP_DBCS(cs)) {
michael@0 1784 /* switch from a DBCS charset to JISX201 */
michael@0 1785 cs = (int8_t)JISX201;
michael@0 1786 }
michael@0 1787 /* else stay in the current G0 charset */
michael@0 1788 g = 0;
michael@0 1789 }
michael@0 1790 /* else do not use HWKANA_7BIT with other versions */
michael@0 1791 }
michael@0 1792 break;
michael@0 1793 case JISX201:
michael@0 1794 /* G0 SBCS */
michael@0 1795 value = jisx201FromU(sourceChar);
michael@0 1796 if(value <= 0x7f) {
michael@0 1797 targetValue = value;
michael@0 1798 len = 1;
michael@0 1799 cs = cs0;
michael@0 1800 g = 0;
michael@0 1801 useFallback = FALSE;
michael@0 1802 }
michael@0 1803 break;
michael@0 1804 case JISX208:
michael@0 1805 /* G0 DBCS from Shift-JIS table */
michael@0 1806 len2 = MBCS_FROM_UCHAR32_ISO2022(
michael@0 1807 converterData->myConverterArray[cs0],
michael@0 1808 sourceChar, &value,
michael@0 1809 useFallback, MBCS_OUTPUT_2);
michael@0 1810 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
michael@0 1811 value = _2022FromSJIS(value);
michael@0 1812 if(value != 0) {
michael@0 1813 targetValue = value;
michael@0 1814 len = len2;
michael@0 1815 cs = cs0;
michael@0 1816 g = 0;
michael@0 1817 useFallback = FALSE;
michael@0 1818 }
michael@0 1819 } else if(len == 0 && useFallback &&
michael@0 1820 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
/* hardcoded half-width -> full-width Katakana fallback, recorded as a fallback (len < 0) */
michael@0 1821 targetValue = hwkana_fb[sourceChar - HWKANA_START];
michael@0 1822 len = -2;
michael@0 1823 cs = cs0;
michael@0 1824 g = 0;
michael@0 1825 useFallback = FALSE;
michael@0 1826 }
michael@0 1827 break;
michael@0 1828 case ISO8859_7:
michael@0 1829 /* G2 SBCS (g = 2 below) forced to 7-bit output */
michael@0 1830 len2 = MBCS_SINGLE_FROM_UCHAR32(
michael@0 1831 converterData->myConverterArray[cs0],
michael@0 1832 sourceChar, &value,
michael@0 1833 useFallback);
michael@0 1834 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
michael@0 1835 targetValue = value - 0x80;
michael@0 1836 len = len2;
michael@0 1837 cs = cs0;
michael@0 1838 g = 2;
michael@0 1839 useFallback = FALSE;
michael@0 1840 }
michael@0 1841 break;
michael@0 1842 default:
michael@0 1843 /* G0 DBCS */
michael@0 1844 len2 = MBCS_FROM_UCHAR32_ISO2022(
michael@0 1845 converterData->myConverterArray[cs0],
michael@0 1846 sourceChar, &value,
michael@0 1847 useFallback, MBCS_OUTPUT_2);
michael@0 1848 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
michael@0 1849 if(cs0 == KSC5601) {
michael@0 1850 /*
michael@0 1851 * Check for valid bytes for the encoding scheme.
michael@0 1852 * This is necessary because the sub-converter (windows-949)
michael@0 1853 * has a broader encoding scheme than is valid for 2022.
michael@0 1854 */
michael@0 1855 value = _2022FromGR94DBCS(value);
michael@0 1856 if(value == 0) {
michael@0 1857 break;
michael@0 1858 }
michael@0 1859 }
michael@0 1860 targetValue = value;
michael@0 1861 len = len2;
michael@0 1862 cs = cs0;
michael@0 1863 g = 0;
michael@0 1864 useFallback = FALSE;
michael@0 1865 }
michael@0 1866 break;
michael@0 1867 }
michael@0 1868 }
michael@0 1869
michael@0 1870 if(len != 0) {
michael@0 1871 if(len < 0) {
michael@0 1872 len = -len; /* fallback */
michael@0 1873 }
michael@0 1874 outLen = 0; /* count output bytes */
michael@0 1875
michael@0 1876 /* write SI if necessary (only for JIS7) */
michael@0 1877 if(pFromU2022State->g == 1 && g == 0) {
michael@0 1878 buffer[outLen++] = UCNV_SI;
michael@0 1879 pFromU2022State->g = 0;
michael@0 1880 }
michael@0 1881
michael@0 1882 /* write the designation sequence if necessary */
michael@0 1883 if(cs != pFromU2022State->cs[g]) {
michael@0 1884 int32_t escLen = escSeqCharsLen[cs];
michael@0 1885 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
michael@0 1886 outLen += escLen;
michael@0 1887 pFromU2022State->cs[g] = cs;
michael@0 1888
michael@0 1889 /* invalidate the choices[] */
michael@0 1890 choiceCount = 0;
michael@0 1891 }
michael@0 1892
michael@0 1893 /* write the shift sequence if necessary */
michael@0 1894 if(g != pFromU2022State->g) {
michael@0 1895 switch(g) {
michael@0 1896 /* case 0 handled before writing escapes */
michael@0 1897 case 1:
michael@0 1898 buffer[outLen++] = UCNV_SO;
michael@0 1899 pFromU2022State->g = 1;
michael@0 1900 break;
michael@0 1901 default: /* case 2 */
/* ESC N (SS2): pFromU2022State->g is deliberately left unchanged here */
michael@0 1902 buffer[outLen++] = 0x1b;
michael@0 1903 buffer[outLen++] = 0x4e;
michael@0 1904 break;
michael@0 1905 /* no case 3: no SS3 in ISO-2022-JP-x */
michael@0 1906 }
michael@0 1907 }
michael@0 1908
michael@0 1909 /* write the output bytes */
michael@0 1910 if(len == 1) {
michael@0 1911 buffer[outLen++] = (char)targetValue;
michael@0 1912 } else /* len == 2 */ {
michael@0 1913 buffer[outLen++] = (char)(targetValue >> 8);
michael@0 1914 buffer[outLen++] = (char)targetValue;
michael@0 1915 }
michael@0 1916 } else {
michael@0 1917 /*
michael@0 1918 * if we cannot find the character after checking all codepages
michael@0 1919 * then this is an error
michael@0 1920 */
michael@0 1921 *err = U_INVALID_CHAR_FOUND;
michael@0 1922 cnv->fromUChar32=sourceChar;
michael@0 1923 break;
michael@0 1924 }
michael@0 1925
michael@0 1926 if(sourceChar == CR || sourceChar == LF) {
michael@0 1927 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
michael@0 1928 pFromU2022State->cs[2] = 0;
michael@0 1929 choiceCount = 0;
michael@0 1930 }
michael@0 1931
michael@0 1932 /* output outLen>0 bytes in buffer[] */
/* fast paths for 1 or 2 bytes; one byte always fits because target < targetLimit was checked above */
michael@0 1933 if(outLen == 1) {
michael@0 1934 *target++ = buffer[0];
michael@0 1935 if(offsets) {
michael@0 1936 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
michael@0 1937 }
michael@0 1938 } else if(outLen == 2 && (target + 2) <= targetLimit) {
michael@0 1939 *target++ = buffer[0];
michael@0 1940 *target++ = buffer[1];
michael@0 1941 if(offsets) {
michael@0 1942 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
michael@0 1943 *offsets++ = sourceIndex;
michael@0 1944 *offsets++ = sourceIndex;
michael@0 1945 }
michael@0 1946 } else {
/* general case: delegate to fromUWriteUInt8(), which is given the target limit and err */
michael@0 1947 fromUWriteUInt8(
michael@0 1948 cnv,
michael@0 1949 buffer, outLen,
michael@0 1950 &target, (const char *)targetLimit,
michael@0 1951 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
michael@0 1952 err);
michael@0 1953 if(U_FAILURE(*err)) {
michael@0 1954 break;
michael@0 1955 }
michael@0 1956 }
michael@0 1957 } /* end if(target < targetLimit) */
michael@0 1958 else{
michael@0 1959 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 1960 break;
michael@0 1961 }
michael@0 1962
michael@0 1963 }/* end while(source < sourceLimit) */
michael@0 1964
michael@0 1965 /*
michael@0 1966 * the end of the input stream and detection of truncated input
michael@0 1967 * are handled by the framework, but for ISO-2022-JP conversion
michael@0 1968 * we need to be in ASCII mode at the very end
michael@0 1969 *
michael@0 1970 * conditions:
michael@0 1971 * successful
michael@0 1972 * in SO mode or not in ASCII mode
michael@0 1973 * end of input and no truncated input
michael@0 1974 */
michael@0 1975 if( U_SUCCESS(*err) &&
michael@0 1976 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
michael@0 1977 args->flush && source>=sourceLimit && cnv->fromUChar32==0
michael@0 1978 ) {
michael@0 1979 int32_t sourceIndex;
michael@0 1980
michael@0 1981 outLen = 0;
michael@0 1982
michael@0 1983 if(pFromU2022State->g != 0) {
michael@0 1984 buffer[outLen++] = UCNV_SI;
michael@0 1985 pFromU2022State->g = 0;
michael@0 1986 }
michael@0 1987
michael@0 1988 if(pFromU2022State->cs[0] != ASCII) {
michael@0 1989 int32_t escLen = escSeqCharsLen[ASCII];
michael@0 1990 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
michael@0 1991 outLen += escLen;
michael@0 1992 pFromU2022State->cs[0] = (int8_t)ASCII;
michael@0 1993 }
michael@0 1994
michael@0 1995 /* get the source index of the last input character */
michael@0 1996 /*
michael@0 1997 * TODO this would be simpler and more reliable if we used a pair
michael@0 1998 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
michael@0 1999 * so that we could simply use the prevSourceIndex here;
michael@0 2000 * this code gives an incorrect result for the rare case of an unmatched
michael@0 2001 * trail surrogate that is alone in the last buffer of the text stream
michael@0 2002 */
michael@0 2003 sourceIndex=(int32_t)(source-args->source);
michael@0 2004 if(sourceIndex>0) {
michael@0 2005 --sourceIndex;
michael@0 2006 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
michael@0 2007 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
michael@0 2008 ) {
michael@0 2009 --sourceIndex;
michael@0 2010 }
michael@0 2011 } else {
michael@0 2012 sourceIndex=-1;
michael@0 2013 }
michael@0 2014
michael@0 2015 fromUWriteUInt8(
michael@0 2016 cnv,
michael@0 2017 buffer, outLen,
michael@0 2018 &target, (const char *)targetLimit,
michael@0 2019 &offsets, sourceIndex,
michael@0 2020 err);
michael@0 2021 }
michael@0 2022
michael@0 2023 /*save the state and return */
michael@0 2024 args->source = source;
michael@0 2025 args->target = (char*)target;
michael@0 2026 }
michael@0 2027
michael@0 2028 /*************** to unicode *******************/
michael@0 2029
michael@0 2030 static void
michael@0 2031 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
michael@0 2032 UErrorCode* err){
michael@0 2033 char tempBuf[2];
michael@0 2034 const char *mySource = (char *) args->source;
michael@0 2035 UChar *myTarget = args->target;
michael@0 2036 const char *mySourceLimit = args->sourceLimit;
michael@0 2037 uint32_t targetUniChar = 0x0000;
michael@0 2038 uint32_t mySourceChar = 0x0000;
michael@0 2039 uint32_t tmpSourceChar = 0x0000;
michael@0 2040 UConverterDataISO2022* myData;
michael@0 2041 ISO2022State *pToU2022State;
michael@0 2042 StateEnum cs;
michael@0 2043
michael@0 2044 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
michael@0 2045 pToU2022State = &myData->toU2022State;
michael@0 2046
michael@0 2047 if(myData->key != 0) {
michael@0 2048 /* continue with a partial escape sequence */
michael@0 2049 goto escape;
michael@0 2050 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
michael@0 2051 /* continue with a partial double-byte character */
michael@0 2052 mySourceChar = args->converter->toUBytes[0];
michael@0 2053 args->converter->toULength = 0;
michael@0 2054 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
michael@0 2055 targetUniChar = missingCharMarker;
michael@0 2056 goto getTrailByte;
michael@0 2057 }
michael@0 2058
michael@0 2059 while(mySource < mySourceLimit){
michael@0 2060
michael@0 2061 targetUniChar =missingCharMarker;
michael@0 2062
michael@0 2063 if(myTarget < args->targetLimit){
michael@0 2064
michael@0 2065 mySourceChar= (unsigned char) *mySource++;
michael@0 2066
michael@0 2067 switch(mySourceChar) {
michael@0 2068 case UCNV_SI:
michael@0 2069 if(myData->version==3) {
michael@0 2070 pToU2022State->g=0;
michael@0 2071 continue;
michael@0 2072 } else {
michael@0 2073 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
michael@0 2074 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
michael@0 2075 break;
michael@0 2076 }
michael@0 2077
michael@0 2078 case UCNV_SO:
michael@0 2079 if(myData->version==3) {
michael@0 2080 /* JIS7: switch to G1 half-width Katakana */
michael@0 2081 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
michael@0 2082 pToU2022State->g=1;
michael@0 2083 continue;
michael@0 2084 } else {
michael@0 2085 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
michael@0 2086 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
michael@0 2087 break;
michael@0 2088 }
michael@0 2089
michael@0 2090 case ESC_2022:
michael@0 2091 mySource--;
michael@0 2092 escape:
michael@0 2093 {
michael@0 2094 const char * mySourceBefore = mySource;
michael@0 2095 int8_t toULengthBefore = args->converter->toULength;
michael@0 2096
michael@0 2097 changeState_2022(args->converter,&(mySource),
michael@0 2098 mySourceLimit, ISO_2022_JP,err);
michael@0 2099
michael@0 2100 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
michael@0 2101 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
michael@0 2102 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 2103 args->converter->toUCallbackReason = UCNV_IRREGULAR;
michael@0 2104 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
michael@0 2105 }
michael@0 2106 }
michael@0 2107
michael@0 2108 /* invalid or illegal escape sequence */
michael@0 2109 if(U_FAILURE(*err)){
michael@0 2110 args->target = myTarget;
michael@0 2111 args->source = mySource;
michael@0 2112 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
michael@0 2113 return;
michael@0 2114 }
michael@0 2115 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
michael@0 2116 if(myData->key==0) {
michael@0 2117 myData->isEmptySegment = TRUE;
michael@0 2118 }
michael@0 2119 continue;
michael@0 2120
michael@0 2121 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
michael@0 2122
michael@0 2123 case CR:
michael@0 2124 /*falls through*/
michael@0 2125 case LF:
michael@0 2126 /* automatically reset to single-byte mode */
michael@0 2127 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
michael@0 2128 pToU2022State->cs[0] = (int8_t)ASCII;
michael@0 2129 }
michael@0 2130 pToU2022State->cs[2] = 0;
michael@0 2131 pToU2022State->g = 0;
michael@0 2132 /* falls through */
michael@0 2133 default:
michael@0 2134 /* convert one or two bytes */
michael@0 2135 myData->isEmptySegment = FALSE;
michael@0 2136 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
michael@0 2137 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
michael@0 2138 !IS_JP_DBCS(cs)
michael@0 2139 ) {
michael@0 2140 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
michael@0 2141 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
michael@0 2142
michael@0 2143 /* return from a single-shift state to the previous one */
michael@0 2144 if(pToU2022State->g >= 2) {
michael@0 2145 pToU2022State->g=pToU2022State->prevG;
michael@0 2146 }
michael@0 2147 } else switch(cs) {
michael@0 2148 case ASCII:
michael@0 2149 if(mySourceChar <= 0x7f) {
michael@0 2150 targetUniChar = mySourceChar;
michael@0 2151 }
michael@0 2152 break;
michael@0 2153 case ISO8859_1:
michael@0 2154 if(mySourceChar <= 0x7f) {
michael@0 2155 targetUniChar = mySourceChar + 0x80;
michael@0 2156 }
michael@0 2157 /* return from a single-shift state to the previous one */
michael@0 2158 pToU2022State->g=pToU2022State->prevG;
michael@0 2159 break;
michael@0 2160 case ISO8859_7:
michael@0 2161 if(mySourceChar <= 0x7f) {
michael@0 2162 /* convert mySourceChar+0x80 to use a normal 8-bit table */
michael@0 2163 targetUniChar =
michael@0 2164 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
michael@0 2165 myData->myConverterArray[cs],
michael@0 2166 mySourceChar + 0x80);
michael@0 2167 }
michael@0 2168 /* return from a single-shift state to the previous one */
michael@0 2169 pToU2022State->g=pToU2022State->prevG;
michael@0 2170 break;
michael@0 2171 case JISX201:
michael@0 2172 if(mySourceChar <= 0x7f) {
michael@0 2173 targetUniChar = jisx201ToU(mySourceChar);
michael@0 2174 }
michael@0 2175 break;
michael@0 2176 case HWKANA_7BIT:
michael@0 2177 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
michael@0 2178 /* 7-bit halfwidth Katakana */
michael@0 2179 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
michael@0 2180 }
michael@0 2181 break;
michael@0 2182 default:
michael@0 2183 /* G0 DBCS */
michael@0 2184 if(mySource < mySourceLimit) {
michael@0 2185 int leadIsOk, trailIsOk;
michael@0 2186 uint8_t trailByte;
michael@0 2187 getTrailByte:
michael@0 2188 trailByte = (uint8_t)*mySource;
michael@0 2189 /*
michael@0 2190 * Ticket 5691: consistent illegal sequences:
michael@0 2191 * - We include at least the first byte in the illegal sequence.
michael@0 2192 * - If any of the non-initial bytes could be the start of a character,
michael@0 2193 * we stop the illegal sequence before the first one of those.
michael@0 2194 *
michael@0 2195 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
michael@0 2196 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
michael@0 2197 * Otherwise we convert or report the pair of bytes.
michael@0 2198 */
michael@0 2199 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
michael@0 2200 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
michael@0 2201 if (leadIsOk && trailIsOk) {
michael@0 2202 ++mySource;
michael@0 2203 tmpSourceChar = (mySourceChar << 8) | trailByte;
michael@0 2204 if(cs == JISX208) {
michael@0 2205 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
michael@0 2206 mySourceChar = tmpSourceChar;
michael@0 2207 } else {
michael@0 2208 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
michael@0 2209 mySourceChar = tmpSourceChar;
michael@0 2210 if (cs == KSC5601) {
michael@0 2211 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
michael@0 2212 }
michael@0 2213 tempBuf[0] = (char)(tmpSourceChar >> 8);
michael@0 2214 tempBuf[1] = (char)(tmpSourceChar);
michael@0 2215 }
michael@0 2216 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
michael@0 2217 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
michael@0 2218 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
michael@0 2219 ++mySource;
michael@0 2220 /* add another bit so that the code below writes 2 bytes in case of error */
michael@0 2221 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
michael@0 2222 }
michael@0 2223 } else {
michael@0 2224 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
michael@0 2225 args->converter->toULength = 1;
michael@0 2226 goto endloop;
michael@0 2227 }
michael@0 2228 } /* End of inner switch */
michael@0 2229 break;
michael@0 2230 } /* End of outer switch */
michael@0 2231 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
michael@0 2232 if(args->offsets){
michael@0 2233 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 2234 }
michael@0 2235 *(myTarget++)=(UChar)targetUniChar;
michael@0 2236 }
michael@0 2237 else if(targetUniChar > missingCharMarker){
michael@0 2238 /* disassemble the surrogate pair and write to output*/
michael@0 2239 targetUniChar-=0x0010000;
michael@0 2240 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
michael@0 2241 if(args->offsets){
michael@0 2242 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 2243 }
michael@0 2244 ++myTarget;
michael@0 2245 if(myTarget< args->targetLimit){
michael@0 2246 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
michael@0 2247 if(args->offsets){
michael@0 2248 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 2249 }
michael@0 2250 ++myTarget;
michael@0 2251 }else{
michael@0 2252 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
michael@0 2253 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
michael@0 2254 }
michael@0 2255
michael@0 2256 }
michael@0 2257 else{
michael@0 2258 /* Call the callback function*/
michael@0 2259 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
michael@0 2260 break;
michael@0 2261 }
michael@0 2262 }
michael@0 2263 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
michael@0 2264 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 2265 break;
michael@0 2266 }
michael@0 2267 }
michael@0 2268 endloop:
michael@0 2269 args->target = myTarget;
michael@0 2270 args->source = mySource;
michael@0 2271 }
michael@0 2272
michael@0 2273
michael@0 2274 /***************************************************************
michael@0 2275 * Rules for ISO-2022-KR encoding
michael@0 2276 * i) The KSC5601 designator sequence should appear only once in a file,
michael@0 2277 * at the beginning of a line before any KSC5601 characters. This usually
michael@0 2278 * means that it appears by itself on the first line of the file
michael@0 2279 * ii) There are only 2 shifting sequences SO to shift into double byte mode
michael@0 2280 * and SI to shift into single byte mode
michael@0 2281 */
michael@0 2282 static void
michael@0 2283 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
michael@0 2284 /* Version-1 (ibm-25546) path: run the whole conversion through the sub-converter in currentConverter, which is temporarily swapped into args->converter. */
michael@0 2285 UConverter* saveConv = args->converter;
michael@0 2286 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
michael@0 2287 args->converter=myConverterData->currentConverter;
michael@0 2288 /* lend the pending code point (fromUChar32) to the sub-converter for the call and copy it back afterwards */
michael@0 2289 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
michael@0 2290 ucnv_MBCSFromUnicodeWithOffsets(args,err);
michael@0 2291 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
michael@0 2292 /* on overflow, move the bytes buffered in the sub-converter's charErrorBuffer to the public converter and clear the sub-converter's count */
michael@0 2293 if(*err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 2294 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
michael@0 2295 uprv_memcpy(
michael@0 2296 saveConv->charErrorBuffer,
michael@0 2297 myConverterData->currentConverter->charErrorBuffer,
michael@0 2298 myConverterData->currentConverter->charErrorBufferLength);
michael@0 2299 }
michael@0 2300 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
michael@0 2301 myConverterData->currentConverter->charErrorBufferLength = 0;
michael@0 2302 }
michael@0 2303 args->converter=saveConv; /* restore the caller's converter */
michael@0 2304 }
michael@0 2305
michael@0 2306 static void
michael@0 2307 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
michael@0 2308 /* Convert UTF-16 input to ISO-2022-KR bytes: single bytes are written as-is, double bytes are written with 0x80 subtracted from each byte, and SO/SI is emitted whenever the width changes. One source offset is recorded per output byte when args->offsets is set. */
michael@0 2309 const UChar *source = args->source;
michael@0 2310 const UChar *sourceLimit = args->sourceLimit;
michael@0 2311 unsigned char *target = (unsigned char *) args->target;
michael@0 2312 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
michael@0 2313 int32_t* offsets = args->offsets;
michael@0 2314 uint32_t targetByteUnit = 0x0000;
michael@0 2315 UChar32 sourceChar = 0x0000;
michael@0 2316 UBool isTargetByteDBCS;
michael@0 2317 UBool oldIsTargetByteDBCS;
michael@0 2318 UConverterDataISO2022 *converterData;
michael@0 2319 UConverterSharedData* sharedData;
michael@0 2320 UBool useFallback;
michael@0 2321 int32_t length =0;
michael@0 2322
michael@0 2323 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
michael@0 2324 /* if the version is 1 then the user is requesting
michael@0 2325 * conversion with ibm-25546 pass the arguments to
michael@0 2326 * MBCS converter and return
michael@0 2327 */
michael@0 2328 if(converterData->version==1){
michael@0 2329 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
michael@0 2330 return;
michael@0 2331 }
michael@0 2332
michael@0 2333 /* initialize data */
michael@0 2334 sharedData = converterData->currentConverter->sharedData;
michael@0 2335 useFallback = args->converter->useFallback;
michael@0 2336 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
michael@0 2337 oldIsTargetByteDBCS = isTargetByteDBCS;
michael@0 2338 /* NOTE(review): the assignment below repeats the read of fromUnicodeStatus made above; it looks redundant */
michael@0 2339 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
michael@0 2340 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
michael@0 2341 goto getTrail; /* resume with the code point left pending by an earlier call (presumably a lead surrogate) */
michael@0 2342 }
michael@0 2343 while(source < sourceLimit){
michael@0 2344
michael@0 2345 targetByteUnit = missingCharMarker;
michael@0 2346
michael@0 2347 if(target < (unsigned char*) args->targetLimit){
michael@0 2348 sourceChar = *source++;
michael@0 2349
michael@0 2350 /* do not convert SO/SI/ESC */
michael@0 2351 if(IS_2022_CONTROL(sourceChar)) {
michael@0 2352 /* callback(illegal) */
michael@0 2353 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2354 args->converter->fromUChar32=sourceChar;
michael@0 2355 break;
michael@0 2356 }
michael@0 2357
michael@0 2358 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
michael@0 2359 if(length < 0) {
michael@0 2360 length = -length; /* fallback */
michael@0 2361 }
michael@0 2362 /* only DBCS or SBCS characters are expected*/
michael@0 2363 /* DB characters with high bit set to 1 are expected */
michael@0 2364 if( length > 2 || length==0 ||
michael@0 2365 (length == 1 && targetByteUnit > 0x7f) ||
michael@0 2366 (length == 2 &&
michael@0 2367 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
michael@0 2368 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
michael@0 2369 ) {
michael@0 2370 targetByteUnit=missingCharMarker;
michael@0 2371 }
michael@0 2372 if (targetByteUnit != missingCharMarker){
michael@0 2373 /* remember the previous width so that a change can be detected */
michael@0 2374 oldIsTargetByteDBCS = isTargetByteDBCS;
michael@0 2375 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
michael@0 2376 /* append the shift sequence */
michael@0 2377 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
michael@0 2378 /* the target < targetLimit check at the top of the loop leaves room for this one shift byte */
michael@0 2379 if (isTargetByteDBCS)
michael@0 2380 *target++ = UCNV_SO;
michael@0 2381 else
michael@0 2382 *target++ = UCNV_SI;
michael@0 2383 if(offsets)
michael@0 2384 *(offsets++) = (int32_t)(source - args->source-1);
michael@0 2385 }
michael@0 2386 /* write the targetByteUnit to target; bytes that do not fit go to charErrorBuffer */
michael@0 2387 if(targetByteUnit <= 0x00FF){
michael@0 2388 if( target < targetLimit){
michael@0 2389 *(target++) = (unsigned char) targetByteUnit;
michael@0 2390 if(offsets){
michael@0 2391 *(offsets++) = (int32_t)(source - args->source-1);
michael@0 2392 }
michael@0 2393
michael@0 2394 }else{
michael@0 2395 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
michael@0 2396 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 2397 }
michael@0 2398 }else{
michael@0 2399 if(target < targetLimit){
michael@0 2400 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
michael@0 2401 if(offsets){
michael@0 2402 *(offsets++) = (int32_t)(source - args->source-1);
michael@0 2403 }
michael@0 2404 if(target < targetLimit){
michael@0 2405 *(target++) =(unsigned char) (targetByteUnit -0x80);
michael@0 2406 if(offsets){
michael@0 2407 *(offsets++) = (int32_t)(source - args->source-1);
michael@0 2408 }
michael@0 2409 }else{
michael@0 2410 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
michael@0 2411 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 2412 }
michael@0 2413 }else{
michael@0 2414 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
michael@0 2415 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
michael@0 2416 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 2417 }
michael@0 2418 }
michael@0 2419
michael@0 2420 }
michael@0 2421 else{
michael@0 2422 /* oops.. the code point is unassigned
michael@0 2423 * set the error and reason
michael@0 2424 */
michael@0 2425
michael@0 2426 /*check if the char is a First surrogate*/
michael@0 2427 if(U16_IS_SURROGATE(sourceChar)) {
michael@0 2428 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
michael@0 2429 getTrail:
michael@0 2430 /*look ahead to find the trail surrogate*/
michael@0 2431 if(source < sourceLimit) {
michael@0 2432 /* test the following code unit */
michael@0 2433 UChar trail=(UChar) *source;
michael@0 2434 if(U16_IS_TRAIL(trail)) {
michael@0 2435 source++;
michael@0 2436 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
michael@0 2437 *err = U_INVALID_CHAR_FOUND;
michael@0 2438 /* the assembled supplementary code point is not converted here: */
michael@0 2439 /* it is reported as unassigned via fromUChar32 below */
michael@0 2440 } else {
michael@0 2441 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 2442 /* callback(illegal) */
michael@0 2443 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2444 }
michael@0 2445 } else {
michael@0 2446 /* no more input: the lead surrogate stays pending in fromUChar32 (stored below) */
michael@0 2447 *err = U_ZERO_ERROR;
michael@0 2448 }
michael@0 2449 } else {
michael@0 2450 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 2451 /* callback(illegal) */
michael@0 2452 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2453 }
michael@0 2454 } else {
michael@0 2455 /* callback(unassigned) for a BMP code point */
michael@0 2456 *err = U_INVALID_CHAR_FOUND;
michael@0 2457 }
michael@0 2458
michael@0 2459 args->converter->fromUChar32=sourceChar;
michael@0 2460 break;
michael@0 2461 }
michael@0 2462 } /* end if(target < targetLimit) */
michael@0 2463 else{
michael@0 2464 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 2465 break;
michael@0 2466 }
michael@0 2467
michael@0 2468 }/* end while(source < sourceLimit) */
michael@0 2469
michael@0 2470 /*
michael@0 2471 * the end of the input stream and detection of truncated input
michael@0 2472 * are handled by the framework, but for ISO-2022-KR conversion
michael@0 2473 * we need to be in ASCII mode at the very end
michael@0 2474 *
michael@0 2475 * conditions:
michael@0 2476 * successful
michael@0 2477 * not in ASCII mode
michael@0 2478 * end of input and no truncated input
michael@0 2479 */
michael@0 2480 if( U_SUCCESS(*err) &&
michael@0 2481 isTargetByteDBCS &&
michael@0 2482 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
michael@0 2483 ) {
michael@0 2484 int32_t sourceIndex;
michael@0 2485
michael@0 2486 /* we are switching to ASCII */
michael@0 2487 isTargetByteDBCS=FALSE;
michael@0 2488
michael@0 2489 /* get the source index of the last input character */
michael@0 2490 /*
michael@0 2491 * TODO this would be simpler and more reliable if we used a pair
michael@0 2492 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
michael@0 2493 * so that we could simply use the prevSourceIndex here;
michael@0 2494 * this code gives an incorrect result for the rare case of an unmatched
michael@0 2495 * trail surrogate that is alone in the last buffer of the text stream
michael@0 2496 */
michael@0 2497 sourceIndex=(int32_t)(source-args->source);
michael@0 2498 if(sourceIndex>0) {
michael@0 2499 --sourceIndex;
michael@0 2500 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
michael@0 2501 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
michael@0 2502 ) {
michael@0 2503 --sourceIndex;
michael@0 2504 }
michael@0 2505 } else {
michael@0 2506 sourceIndex=-1;
michael@0 2507 }
michael@0 2508
michael@0 2509 fromUWriteUInt8(
michael@0 2510 args->converter,
michael@0 2511 SHIFT_IN_STR, 1,
michael@0 2512 &target, (const char *)targetLimit,
michael@0 2513 &offsets, sourceIndex,
michael@0 2514 err);
michael@0 2515 }
michael@0 2516
michael@0 2517 /*save the state and return */
michael@0 2518 args->source = source;
michael@0 2519 args->target = (char*)target;
michael@0 2520 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
michael@0 2521 }
michael@0 2522
michael@0 2523 /************************ To Unicode ***************************************/
michael@0 2524
michael@0 2525 static void
michael@0 2526 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
michael@0 2527 UErrorCode* err){
michael@0 2528 char const* sourceStart;
michael@0 2529 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
michael@0 2530 /* Version-1 path: convert each run of input bytes with the sub-converter (currentConverter), then let changeState_2022() process what follows the run. */
michael@0 2531 UConverterToUnicodeArgs subArgs;
michael@0 2532 int32_t minArgsSize;
michael@0 2533
michael@0 2534 /* set up the subconverter arguments; copy no more than the smaller of the two struct sizes */
michael@0 2535 if(args->size<sizeof(UConverterToUnicodeArgs)) {
michael@0 2536 minArgsSize = args->size;
michael@0 2537 } else {
michael@0 2538 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
michael@0 2539 }
michael@0 2540
michael@0 2541 uprv_memcpy(&subArgs, args, minArgsSize);
michael@0 2542 subArgs.size = (uint16_t)minArgsSize;
michael@0 2543 subArgs.converter = myData->currentConverter;
michael@0 2544
michael@0 2545 /* remember the original start of the input for offsets */
michael@0 2546 sourceStart = args->source;
michael@0 2547
michael@0 2548 if(myData->key != 0) {
michael@0 2549 /* continue with a partial escape sequence */
michael@0 2550 goto escape;
michael@0 2551 }
michael@0 2552
michael@0 2553 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
michael@0 2554 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
michael@0 2555 subArgs.source = args->source;
michael@0 2556 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
michael@0 2557 if(subArgs.source != subArgs.sourceLimit) {
michael@0 2558 /*
michael@0 2559 * get the current partial byte sequence
michael@0 2560 *
michael@0 2561 * it needs to be moved between the public and the subconverter
michael@0 2562 * so that the conversion framework, which only sees the public
michael@0 2563 * converter, can handle truncated and illegal input etc.
michael@0 2564 */
michael@0 2565 if(args->converter->toULength > 0) {
michael@0 2566 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
michael@0 2567 }
michael@0 2568 subArgs.converter->toULength = args->converter->toULength;
michael@0 2569
michael@0 2570 /*
michael@0 2571 * Convert up to the end of the input, or to before the next escape character.
michael@0 2572 * Does not handle conversion extensions because the preToU[] state etc.
michael@0 2573 * is not copied.
michael@0 2574 */
michael@0 2575 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
michael@0 2576
michael@0 2577 if(args->offsets != NULL && sourceStart != args->source) {
michael@0 2578 /* update offsets to base them on the actual start of the input */
michael@0 2579 int32_t *offsets = args->offsets;
michael@0 2580 UChar *target = args->target;
michael@0 2581 int32_t delta = (int32_t)(args->source - sourceStart);
michael@0 2582 while(target < subArgs.target) {
michael@0 2583 if(*offsets >= 0) {
michael@0 2584 *offsets += delta;
michael@0 2585 }
michael@0 2586 ++offsets;
michael@0 2587 ++target;
michael@0 2588 }
michael@0 2589 }
michael@0 2590 args->source = subArgs.source;
michael@0 2591 args->target = subArgs.target;
michael@0 2592 args->offsets = subArgs.offsets;
michael@0 2593
michael@0 2594 /* copy input/error/overflow buffers */
michael@0 2595 if(subArgs.converter->toULength > 0) {
michael@0 2596 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
michael@0 2597 }
michael@0 2598 args->converter->toULength = subArgs.converter->toULength;
michael@0 2599
michael@0 2600 if(*err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 2601 if(subArgs.converter->UCharErrorBufferLength > 0) {
michael@0 2602 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
michael@0 2603 subArgs.converter->UCharErrorBufferLength);
michael@0 2604 }
michael@0 2605 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
michael@0 2606 subArgs.converter->UCharErrorBufferLength = 0;
michael@0 2607 }
michael@0 2608 }
michael@0 2609 /* stop on error or when all input is consumed; otherwise args->source is presumably at an escape character */
michael@0 2610 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
michael@0 2611 return;
michael@0 2612 }
michael@0 2613
michael@0 2614 escape:
michael@0 2615 changeState_2022(args->converter,
michael@0 2616 &(args->source),
michael@0 2617 args->sourceLimit,
michael@0 2618 ISO_2022_KR,
michael@0 2619 err);
michael@0 2620 }
michael@0 2621 }
michael@0 2622
michael@0 2623 static void
michael@0 2624 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
michael@0 2625 UErrorCode* err){
michael@0 2626 char tempBuf[2];
michael@0 2627 const char *mySource = ( char *) args->source;
michael@0 2628 UChar *myTarget = args->target;
michael@0 2629 const char *mySourceLimit = args->sourceLimit;
michael@0 2630 UChar32 targetUniChar = 0x0000;
michael@0 2631 uint32_t mySourceChar = 0x0000; /* must be wider than 16 bits: the 0x10000 "two illegal bytes" marker set below would be truncated in a UChar */
michael@0 2632 UConverterDataISO2022* myData;
michael@0 2633 UConverterSharedData* sharedData ;
michael@0 2634 UBool useFallback;
michael@0 2635 /* Convert ISO-2022-KR bytes to UTF-16: SO/SI switch between double-byte (g==1) and single-byte (g==0) mode, ESC is handed to changeState_2022(), and one source offset per output unit is stored when args->offsets is set. */
michael@0 2636 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
michael@0 2637 if(myData->version==1){
michael@0 2638 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
michael@0 2639 return;
michael@0 2640 }
michael@0 2641
michael@0 2642 /* initialize state */
michael@0 2643 sharedData = myData->currentConverter->sharedData;
michael@0 2644 useFallback = args->converter->useFallback;
michael@0 2645
michael@0 2646 if(myData->key != 0) {
michael@0 2647 /* continue with a partial escape sequence */
michael@0 2648 goto escape;
michael@0 2649 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
michael@0 2650 /* continue with a partial double-byte character */
michael@0 2651 mySourceChar = args->converter->toUBytes[0];
michael@0 2652 args->converter->toULength = 0;
michael@0 2653 goto getTrailByte;
michael@0 2654 }
michael@0 2655
michael@0 2656 while(mySource< mySourceLimit){
michael@0 2657
michael@0 2658 if(myTarget < args->targetLimit){
michael@0 2659
michael@0 2660 mySourceChar= (unsigned char) *mySource++;
michael@0 2661
michael@0 2662 if(mySourceChar==UCNV_SI){
michael@0 2663 myData->toU2022State.g = 0;
michael@0 2664 if (myData->isEmptySegment) {
michael@0 2665 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
michael@0 2666 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 2667 args->converter->toUCallbackReason = UCNV_IRREGULAR;
michael@0 2668 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
michael@0 2669 args->converter->toULength = 1;
michael@0 2670 args->target = myTarget;
michael@0 2671 args->source = mySource;
michael@0 2672 return;
michael@0 2673 }
michael@0 2674 /*consume the source */
michael@0 2675 continue;
michael@0 2676 }else if(mySourceChar==UCNV_SO){
michael@0 2677 myData->toU2022State.g = 1;
michael@0 2678 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
michael@0 2679 /*consume the source */
michael@0 2680 continue;
michael@0 2681 }else if(mySourceChar==ESC_2022){
michael@0 2682 mySource--;
michael@0 2683 escape:
michael@0 2684 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
michael@0 2685 changeState_2022(args->converter,&(mySource),
michael@0 2686 mySourceLimit, ISO_2022_KR, err);
michael@0 2687 if(U_FAILURE(*err)){
michael@0 2688 args->target = myTarget;
michael@0 2689 args->source = mySource;
michael@0 2690 return;
michael@0 2691 }
michael@0 2692 continue;
michael@0 2693 }
michael@0 2694
michael@0 2695 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
michael@0 2696 if(myData->toU2022State.g == 1) {
michael@0 2697 if(mySource < mySourceLimit) {
michael@0 2698 int leadIsOk, trailIsOk;
michael@0 2699 uint8_t trailByte;
michael@0 2700 getTrailByte:
michael@0 2701 targetUniChar = missingCharMarker;
michael@0 2702 trailByte = (uint8_t)*mySource;
michael@0 2703 /*
michael@0 2704 * Ticket 5691: consistent illegal sequences:
michael@0 2705 * - We include at least the first byte in the illegal sequence.
michael@0 2706 * - If any of the non-initial bytes could be the start of a character,
michael@0 2707 * we stop the illegal sequence before the first one of those.
michael@0 2708 *
michael@0 2709 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
michael@0 2710 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
michael@0 2711 * Otherwise we convert or report the pair of bytes.
michael@0 2712 */
michael@0 2713 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
michael@0 2714 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
michael@0 2715 if (leadIsOk && trailIsOk) {
michael@0 2716 ++mySource;
michael@0 2717 tempBuf[0] = (char)(mySourceChar + 0x80);
michael@0 2718 tempBuf[1] = (char)(trailByte + 0x80);
michael@0 2719 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
michael@0 2720 mySourceChar = (mySourceChar << 8) | trailByte;
michael@0 2721 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
michael@0 2722 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
michael@0 2723 ++mySource;
michael@0 2724 /* add another bit so that the code below writes 2 bytes in case of error (needed when the lead byte is 0x00) */
michael@0 2725 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
michael@0 2726 }
michael@0 2727 } else {
michael@0 2728 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
michael@0 2729 args->converter->toULength = 1;
michael@0 2730 break;
michael@0 2731 }
michael@0 2732 }
michael@0 2733 else if(mySourceChar <= 0x7f) {
michael@0 2734 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
michael@0 2735 } else {
michael@0 2736 targetUniChar = 0xffff;
michael@0 2737 }
michael@0 2738 if(targetUniChar < 0xfffe){
michael@0 2739 if(args->offsets) {
michael@0 2740 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 2741 }
michael@0 2742 *(myTarget++)=(UChar)targetUniChar;
michael@0 2743 }
michael@0 2744 else {
michael@0 2745 /* Call the callback function*/
michael@0 2746 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
michael@0 2747 break;
michael@0 2748 }
michael@0 2749 }
michael@0 2750 else{
michael@0 2751 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 2752 break;
michael@0 2753 }
michael@0 2754 }
michael@0 2755 args->target = myTarget;
michael@0 2756 args->source = mySource;
michael@0 2757 }
michael@0 2758
michael@0 2759 /*************************** END ISO2022-KR *********************************/
michael@0 2760
michael@0 2761 /*************************** ISO-2022-CN *********************************
michael@0 2762 *
michael@0 2763 * Rules for ISO-2022-CN Encoding:
michael@0 2764 * i) The designator sequence must appear once on a line before any instance
michael@0 2765 * of character set it designates.
michael@0 2766 * ii) If two lines contain characters from the same character set, both lines
michael@0 2767 * must include the designator sequence.
michael@0 2768 * iii) Once the designator sequence is known, a shifting sequence has to be found
michael@0 2769 * to invoke the shifting
michael@0 2770 * iv) All lines start in ASCII and end in ASCII.
michael@0 2771 * v) Four shifting sequences are employed for this purpose:
michael@0 2772 *
michael@0 2773 * Sequence ASCII Eq Charsets
michael@0 2774 * ---------- ------- ---------
michael@0 2775 * SI <SI> US-ASCII
michael@0 2776 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
michael@0 2777 * SS2 <ESC>N CNS-11643-1992 Plane 2
michael@0 2778 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
michael@0 2779 *
michael@0 2780 * vi)
michael@0 2781 * SOdesignator : ESC "$" ")" finalchar_for_SO
michael@0 2782 * SS2designator : ESC "$" "*" finalchar_for_SS2
michael@0 2783 * SS3designator : ESC "$" "+" finalchar_for_SS3
michael@0 2784 *
michael@0 2785 * ESC $ ) A Indicates the bytes following SO are Chinese
michael@0 2786 * characters as defined in GB 2312-80, until
michael@0 2787 * another SOdesignation appears
michael@0 2788 *
michael@0 2789 *
michael@0 2790 * ESC $ ) E Indicates the bytes following SO are as defined
michael@0 2791 * in ISO-IR-165 (for details, see section 2.1),
michael@0 2792 * until another SOdesignation appears
michael@0 2793 *
michael@0 2794 * ESC $ ) G Indicates the bytes following SO are as defined
michael@0 2795 * in CNS 11643-plane-1, until another
michael@0 2796 * SOdesignation appears
michael@0 2797 *
michael@0 2798 * ESC $ * H Indicates the two bytes immediately following
michael@0 2799 * SS2 is a Chinese character as defined in CNS
michael@0 2800 * 11643-plane-2, until another SS2designation
michael@0 2801 * appears
michael@0 2802 * (Meaning <ESC>N must precede every 2 byte
michael@0 2803 * sequence.)
michael@0 2804 *
michael@0 2805 * ESC $ + I Indicates the immediate two bytes following SS3
michael@0 2806 * is a Chinese character as defined in CNS
michael@0 2807 * 11643-plane-3, until another SS3designation
michael@0 2808 * appears
michael@0 2809 * (Meaning <ESC>O must precede every 2 byte
michael@0 2810 * sequence.)
michael@0 2811 *
michael@0 2812 * ESC $ + J Indicates the immediate two bytes following SS3
michael@0 2813 * is a Chinese character as defined in CNS
michael@0 2814 * 11643-plane-4, until another SS3designation
michael@0 2815 * appears
michael@0 2816 * (In English: <ESC>O must precede every 2 byte
michael@0 2817 * sequence.)
michael@0 2818 *
michael@0 2819 * ESC $ + K Indicates the immediate two bytes following SS3
michael@0 2820 * is a Chinese character as defined in CNS
michael@0 2821 * 11643-plane-5, until another SS3designation
michael@0 2822 * appears
michael@0 2823 *
michael@0 2824 * ESC $ + L Indicates the immediate two bytes following SS3
michael@0 2825 * is a Chinese character as defined in CNS
michael@0 2826 * 11643-plane-6, until another SS3designation
michael@0 2827 * appears
michael@0 2828 *
michael@0 2829 * ESC $ + M Indicates the immediate two bytes following SS3
michael@0 2830 * is a Chinese character as defined in CNS
michael@0 2831 * 11643-plane-7, until another SS3designation
michael@0 2832 * appears
michael@0 2833 *
michael@0 2834 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
michael@0 2835 * has its own designation information before any Chinese characters
michael@0 2836 * appear
michael@0 2837 *
michael@0 2838 */
michael@0 2839
michael@0 2840 /* The following are defined this way to make the strings truly readonly */
michael@0 2841 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
michael@0 2842 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
michael@0 2843 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
michael@0 2844 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
michael@0 2845 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
michael@0 2846 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
michael@0 2847 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
michael@0 2848 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
michael@0 2849 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
michael@0 2850
michael@0 2851 /********************** ISO2022-CN Data: designator/shift string for each table index **************************/
michael@0 2852 static const char* const escSeqCharsCN[10] ={
michael@0 2853 SHIFT_IN_STR, /* 0 ASCII */
michael@0 2854 GB_2312_80_STR, /* 1 GB2312_1 */
michael@0 2855 ISO_IR_165_STR, /* 2 ISO_IR_165 */
michael@0 2856 CNS_11643_1992_Plane_1_STR, /* 3 CNS 11643-1992 plane 1 */
michael@0 2857 CNS_11643_1992_Plane_2_STR, /* 4 CNS 11643-1992 plane 2 */
michael@0 2858 CNS_11643_1992_Plane_3_STR, /* 5 CNS 11643-1992 plane 3 */
michael@0 2859 CNS_11643_1992_Plane_4_STR, /* 6 CNS 11643-1992 plane 4 */
michael@0 2860 CNS_11643_1992_Plane_5_STR, /* 7 CNS 11643-1992 plane 5 */
michael@0 2861 CNS_11643_1992_Plane_6_STR, /* 8 CNS 11643-1992 plane 6 */
michael@0 2862 CNS_11643_1992_Plane_7_STR /* 9 CNS 11643-1992 plane 7 */
michael@0 2863 };
michael@0 2864
michael@0 2865 static void
michael@0 2866 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
michael@0 2867 UConverter *cnv = args->converter;
michael@0 2868 UConverterDataISO2022 *converterData;
michael@0 2869 ISO2022State *pFromU2022State;
michael@0 2870 uint8_t *target = (uint8_t *) args->target;
michael@0 2871 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
michael@0 2872 const UChar* source = args->source;
michael@0 2873 const UChar* sourceLimit = args->sourceLimit;
michael@0 2874 int32_t* offsets = args->offsets;
michael@0 2875 UChar32 sourceChar;
michael@0 2876 char buffer[8];
michael@0 2877 int32_t len;
michael@0 2878 int8_t choices[3];
michael@0 2879 int32_t choiceCount;
michael@0 2880 uint32_t targetValue = 0;
michael@0 2881 UBool useFallback;
michael@0 2882
michael@0 2883 /* set up the state */
michael@0 2884 converterData = (UConverterDataISO2022*)cnv->extraInfo;
michael@0 2885 pFromU2022State = &converterData->fromU2022State;
michael@0 2886
michael@0 2887 choiceCount = 0;
michael@0 2888
michael@0 2889 /* check if the last codepoint of previous buffer was a lead surrogate*/
michael@0 2890 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
michael@0 2891 goto getTrail;
michael@0 2892 }
michael@0 2893
michael@0 2894 while( source < sourceLimit){
michael@0 2895 if(target < targetLimit){
michael@0 2896
michael@0 2897 sourceChar = *(source++);
michael@0 2898 /*check if the char is a First surrogate*/
michael@0 2899 if(U16_IS_SURROGATE(sourceChar)) {
michael@0 2900 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
michael@0 2901 getTrail:
michael@0 2902 /*look ahead to find the trail surrogate*/
michael@0 2903 if(source < sourceLimit) {
michael@0 2904 /* test the following code unit */
michael@0 2905 UChar trail=(UChar) *source;
michael@0 2906 if(U16_IS_TRAIL(trail)) {
michael@0 2907 source++;
michael@0 2908 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
michael@0 2909 cnv->fromUChar32=0x00;
michael@0 2910 /* convert this supplementary code point */
michael@0 2911 /* exit this condition tree */
michael@0 2912 } else {
michael@0 2913 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 2914 /* callback(illegal) */
michael@0 2915 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2916 cnv->fromUChar32=sourceChar;
michael@0 2917 break;
michael@0 2918 }
michael@0 2919 } else {
michael@0 2920 /* no more input */
michael@0 2921 cnv->fromUChar32=sourceChar;
michael@0 2922 break;
michael@0 2923 }
michael@0 2924 } else {
michael@0 2925 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 2926 /* callback(illegal) */
michael@0 2927 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2928 cnv->fromUChar32=sourceChar;
michael@0 2929 break;
michael@0 2930 }
michael@0 2931 }
michael@0 2932
michael@0 2933 /* do the conversion */
michael@0 2934 if(sourceChar <= 0x007f ){
michael@0 2935 /* do not convert SO/SI/ESC */
michael@0 2936 if(IS_2022_CONTROL(sourceChar)) {
michael@0 2937 /* callback(illegal) */
michael@0 2938 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 2939 cnv->fromUChar32=sourceChar;
michael@0 2940 break;
michael@0 2941 }
michael@0 2942
michael@0 2943 /* US-ASCII */
michael@0 2944 if(pFromU2022State->g == 0) {
michael@0 2945 buffer[0] = (char)sourceChar;
michael@0 2946 len = 1;
michael@0 2947 } else {
michael@0 2948 buffer[0] = UCNV_SI;
michael@0 2949 buffer[1] = (char)sourceChar;
michael@0 2950 len = 2;
michael@0 2951 pFromU2022State->g = 0;
michael@0 2952 choiceCount = 0;
michael@0 2953 }
michael@0 2954 if(sourceChar == CR || sourceChar == LF) {
michael@0 2955 /* reset the state at the end of a line */
michael@0 2956 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
michael@0 2957 choiceCount = 0;
michael@0 2958 }
michael@0 2959 }
michael@0 2960 else{
michael@0 2961 /* convert U+0080..U+10ffff */
michael@0 2962 int32_t i;
michael@0 2963 int8_t cs, g;
michael@0 2964
michael@0 2965 if(choiceCount == 0) {
michael@0 2966 /* try the current SO/G1 converter first */
michael@0 2967 choices[0] = pFromU2022State->cs[1];
michael@0 2968
michael@0 2969 /* default to GB2312_1 if none is designated yet */
michael@0 2970 if(choices[0] == 0) {
michael@0 2971 choices[0] = GB2312_1;
michael@0 2972 }
michael@0 2973
michael@0 2974 if(converterData->version == 0) {
michael@0 2975 /* ISO-2022-CN */
michael@0 2976
michael@0 2977 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
michael@0 2978 if(choices[0] == GB2312_1) {
michael@0 2979 choices[1] = (int8_t)CNS_11643_1;
michael@0 2980 } else {
michael@0 2981 choices[1] = (int8_t)GB2312_1;
michael@0 2982 }
michael@0 2983
michael@0 2984 choiceCount = 2;
michael@0 2985 } else if (converterData->version == 1) {
michael@0 2986 /* ISO-2022-CN-EXT */
michael@0 2987
michael@0 2988 /* try one of the other converters */
michael@0 2989 switch(choices[0]) {
michael@0 2990 case GB2312_1:
michael@0 2991 choices[1] = (int8_t)CNS_11643_1;
michael@0 2992 choices[2] = (int8_t)ISO_IR_165;
michael@0 2993 break;
michael@0 2994 case ISO_IR_165:
michael@0 2995 choices[1] = (int8_t)GB2312_1;
michael@0 2996 choices[2] = (int8_t)CNS_11643_1;
michael@0 2997 break;
michael@0 2998 default: /* CNS_11643_x */
michael@0 2999 choices[1] = (int8_t)GB2312_1;
michael@0 3000 choices[2] = (int8_t)ISO_IR_165;
michael@0 3001 break;
michael@0 3002 }
michael@0 3003
michael@0 3004 choiceCount = 3;
michael@0 3005 } else {
michael@0 3006 choices[0] = (int8_t)CNS_11643_1;
michael@0 3007 choices[1] = (int8_t)GB2312_1;
michael@0 3008 }
michael@0 3009 }
michael@0 3010
michael@0 3011 cs = g = 0;
michael@0 3012 /*
michael@0 3013 * len==0: no mapping found yet
michael@0 3014 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
michael@0 3015 * len>0: found a roundtrip result, done
michael@0 3016 */
michael@0 3017 len = 0;
michael@0 3018 /*
michael@0 3019 * We will turn off useFallback after finding a fallback,
michael@0 3020 * but we still get fallbacks from PUA code points as usual.
michael@0 3021 * Therefore, we will also need to check that we don't overwrite
michael@0 3022 * an early fallback with a later one.
michael@0 3023 */
michael@0 3024 useFallback = cnv->useFallback;
michael@0 3025
michael@0 3026 for(i = 0; i < choiceCount && len <= 0; ++i) {
michael@0 3027 int8_t cs0 = choices[i];
michael@0 3028 if(cs0 > 0) {
michael@0 3029 uint32_t value;
michael@0 3030 int32_t len2;
michael@0 3031 if(cs0 >= CNS_11643_0) {
michael@0 3032 len2 = MBCS_FROM_UCHAR32_ISO2022(
michael@0 3033 converterData->myConverterArray[CNS_11643],
michael@0 3034 sourceChar,
michael@0 3035 &value,
michael@0 3036 useFallback,
michael@0 3037 MBCS_OUTPUT_3);
michael@0 3038 if(len2 == 3 || (len2 == -3 && len == 0)) {
michael@0 3039 targetValue = value;
michael@0 3040 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
michael@0 3041 if(len2 >= 0) {
michael@0 3042 len = 2;
michael@0 3043 } else {
michael@0 3044 len = -2;
michael@0 3045 useFallback = FALSE;
michael@0 3046 }
michael@0 3047 if(cs == CNS_11643_1) {
michael@0 3048 g = 1;
michael@0 3049 } else if(cs == CNS_11643_2) {
michael@0 3050 g = 2;
michael@0 3051 } else /* plane 3..7 */ if(converterData->version == 1) {
michael@0 3052 g = 3;
michael@0 3053 } else {
michael@0 3054 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
michael@0 3055 len = 0;
michael@0 3056 }
michael@0 3057 }
michael@0 3058 } else {
michael@0 3059 /* GB2312_1 or ISO-IR-165 */
michael@0 3060 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
michael@0 3061 len2 = MBCS_FROM_UCHAR32_ISO2022(
michael@0 3062 converterData->myConverterArray[cs0],
michael@0 3063 sourceChar,
michael@0 3064 &value,
michael@0 3065 useFallback,
michael@0 3066 MBCS_OUTPUT_2);
michael@0 3067 if(len2 == 2 || (len2 == -2 && len == 0)) {
michael@0 3068 targetValue = value;
michael@0 3069 len = len2;
michael@0 3070 cs = cs0;
michael@0 3071 g = 1;
michael@0 3072 useFallback = FALSE;
michael@0 3073 }
michael@0 3074 }
michael@0 3075 }
michael@0 3076 }
michael@0 3077
michael@0 3078 if(len != 0) {
michael@0 3079 len = 0; /* count output bytes; it must have been abs(len) == 2 */
michael@0 3080
michael@0 3081 /* write the designation sequence if necessary */
michael@0 3082 if(cs != pFromU2022State->cs[g]) {
michael@0 3083 if(cs < CNS_11643) {
michael@0 3084 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
michael@0 3085 } else {
michael@0 3086 U_ASSERT(cs >= CNS_11643_1);
michael@0 3087 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
michael@0 3088 }
michael@0 3089 len = 4;
michael@0 3090 pFromU2022State->cs[g] = cs;
michael@0 3091 if(g == 1) {
michael@0 3092 /* changing the SO/G1 charset invalidates the choices[] */
michael@0 3093 choiceCount = 0;
michael@0 3094 }
michael@0 3095 }
michael@0 3096
michael@0 3097 /* write the shift sequence if necessary */
michael@0 3098 if(g != pFromU2022State->g) {
michael@0 3099 switch(g) {
michael@0 3100 case 1:
michael@0 3101 buffer[len++] = UCNV_SO;
michael@0 3102
michael@0 3103 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
michael@0 3104 pFromU2022State->g = 1;
michael@0 3105 break;
michael@0 3106 case 2:
michael@0 3107 buffer[len++] = 0x1b;
michael@0 3108 buffer[len++] = 0x4e;
michael@0 3109 break;
michael@0 3110 default: /* case 3 */
michael@0 3111 buffer[len++] = 0x1b;
michael@0 3112 buffer[len++] = 0x4f;
michael@0 3113 break;
michael@0 3114 }
michael@0 3115 }
michael@0 3116
michael@0 3117 /* write the two output bytes */
michael@0 3118 buffer[len++] = (char)(targetValue >> 8);
michael@0 3119 buffer[len++] = (char)targetValue;
michael@0 3120 } else {
michael@0 3121 /* if we cannot find the character after checking all codepages
michael@0 3122 * then this is an error
michael@0 3123 */
michael@0 3124 *err = U_INVALID_CHAR_FOUND;
michael@0 3125 cnv->fromUChar32=sourceChar;
michael@0 3126 break;
michael@0 3127 }
michael@0 3128 }
michael@0 3129
michael@0 3130 /* output len>0 bytes in buffer[] */
michael@0 3131 if(len == 1) {
michael@0 3132 *target++ = buffer[0];
michael@0 3133 if(offsets) {
michael@0 3134 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
michael@0 3135 }
michael@0 3136 } else if(len == 2 && (target + 2) <= targetLimit) {
michael@0 3137 *target++ = buffer[0];
michael@0 3138 *target++ = buffer[1];
michael@0 3139 if(offsets) {
michael@0 3140 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
michael@0 3141 *offsets++ = sourceIndex;
michael@0 3142 *offsets++ = sourceIndex;
michael@0 3143 }
michael@0 3144 } else {
michael@0 3145 fromUWriteUInt8(
michael@0 3146 cnv,
michael@0 3147 buffer, len,
michael@0 3148 &target, (const char *)targetLimit,
michael@0 3149 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
michael@0 3150 err);
michael@0 3151 if(U_FAILURE(*err)) {
michael@0 3152 break;
michael@0 3153 }
michael@0 3154 }
michael@0 3155 } /* end if(myTargetIndex<myTargetLength) */
michael@0 3156 else{
michael@0 3157 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 3158 break;
michael@0 3159 }
michael@0 3160
michael@0 3161 }/* end while(mySourceIndex<mySourceLength) */
michael@0 3162
michael@0 3163 /*
michael@0 3164 * the end of the input stream and detection of truncated input
michael@0 3165 * are handled by the framework, but for ISO-2022-CN conversion
michael@0 3166 * we need to be in ASCII mode at the very end
michael@0 3167 *
michael@0 3168 * conditions:
michael@0 3169 * successful
michael@0 3170 * not in ASCII mode
michael@0 3171 * end of input and no truncated input
michael@0 3172 */
michael@0 3173 if( U_SUCCESS(*err) &&
michael@0 3174 pFromU2022State->g!=0 &&
michael@0 3175 args->flush && source>=sourceLimit && cnv->fromUChar32==0
michael@0 3176 ) {
michael@0 3177 int32_t sourceIndex;
michael@0 3178
michael@0 3179 /* we are switching to ASCII */
michael@0 3180 pFromU2022State->g=0;
michael@0 3181
michael@0 3182 /* get the source index of the last input character */
michael@0 3183 /*
michael@0 3184 * TODO this would be simpler and more reliable if we used a pair
michael@0 3185 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
michael@0 3186 * so that we could simply use the prevSourceIndex here;
michael@0 3187 * this code gives an incorrect result for the rare case of an unmatched
michael@0 3188 * trail surrogate that is alone in the last buffer of the text stream
michael@0 3189 */
michael@0 3190 sourceIndex=(int32_t)(source-args->source);
michael@0 3191 if(sourceIndex>0) {
michael@0 3192 --sourceIndex;
michael@0 3193 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
michael@0 3194 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
michael@0 3195 ) {
michael@0 3196 --sourceIndex;
michael@0 3197 }
michael@0 3198 } else {
michael@0 3199 sourceIndex=-1;
michael@0 3200 }
michael@0 3201
michael@0 3202 fromUWriteUInt8(
michael@0 3203 cnv,
michael@0 3204 SHIFT_IN_STR, 1,
michael@0 3205 &target, (const char *)targetLimit,
michael@0 3206 &offsets, sourceIndex,
michael@0 3207 err);
michael@0 3208 }
michael@0 3209
michael@0 3210 /*save the state and return */
michael@0 3211 args->source = source;
michael@0 3212 args->target = (char*)target;
michael@0 3213 }
michael@0 3214
michael@0 3215
michael@0 3216 static void
michael@0 3217 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
michael@0 3218 UErrorCode* err){
michael@0 3219 char tempBuf[3];
michael@0 3220 const char *mySource = (char *) args->source;
michael@0 3221 UChar *myTarget = args->target;
michael@0 3222 const char *mySourceLimit = args->sourceLimit;
michael@0 3223 uint32_t targetUniChar = 0x0000;
michael@0 3224 uint32_t mySourceChar = 0x0000;
michael@0 3225 UConverterDataISO2022* myData;
michael@0 3226 ISO2022State *pToU2022State;
michael@0 3227
michael@0 3228 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
michael@0 3229 pToU2022State = &myData->toU2022State;
michael@0 3230
michael@0 3231 if(myData->key != 0) {
michael@0 3232 /* continue with a partial escape sequence */
michael@0 3233 goto escape;
michael@0 3234 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
michael@0 3235 /* continue with a partial double-byte character */
michael@0 3236 mySourceChar = args->converter->toUBytes[0];
michael@0 3237 args->converter->toULength = 0;
michael@0 3238 targetUniChar = missingCharMarker;
michael@0 3239 goto getTrailByte;
michael@0 3240 }
michael@0 3241
michael@0 3242 while(mySource < mySourceLimit){
michael@0 3243
michael@0 3244 targetUniChar =missingCharMarker;
michael@0 3245
michael@0 3246 if(myTarget < args->targetLimit){
michael@0 3247
michael@0 3248 mySourceChar= (unsigned char) *mySource++;
michael@0 3249
michael@0 3250 switch(mySourceChar){
michael@0 3251 case UCNV_SI:
michael@0 3252 pToU2022State->g=0;
michael@0 3253 if (myData->isEmptySegment) {
michael@0 3254 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
michael@0 3255 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 3256 args->converter->toUCallbackReason = UCNV_IRREGULAR;
michael@0 3257 args->converter->toUBytes[0] = mySourceChar;
michael@0 3258 args->converter->toULength = 1;
michael@0 3259 args->target = myTarget;
michael@0 3260 args->source = mySource;
michael@0 3261 return;
michael@0 3262 }
michael@0 3263 continue;
michael@0 3264
michael@0 3265 case UCNV_SO:
michael@0 3266 if(pToU2022State->cs[1] != 0) {
michael@0 3267 pToU2022State->g=1;
michael@0 3268 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
michael@0 3269 continue;
michael@0 3270 } else {
michael@0 3271 /* illegal to have SO before a matching designator */
michael@0 3272 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
michael@0 3273 break;
michael@0 3274 }
michael@0 3275
michael@0 3276 case ESC_2022:
michael@0 3277 mySource--;
michael@0 3278 escape:
michael@0 3279 {
michael@0 3280 const char * mySourceBefore = mySource;
michael@0 3281 int8_t toULengthBefore = args->converter->toULength;
michael@0 3282
michael@0 3283 changeState_2022(args->converter,&(mySource),
michael@0 3284 mySourceLimit, ISO_2022_CN,err);
michael@0 3285
michael@0 3286 /* After SO there must be at least one character before a designator (designator error handled separately) */
michael@0 3287 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
michael@0 3288 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 3289 args->converter->toUCallbackReason = UCNV_IRREGULAR;
michael@0 3290 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
michael@0 3291 }
michael@0 3292 }
michael@0 3293
michael@0 3294 /* invalid or illegal escape sequence */
michael@0 3295 if(U_FAILURE(*err)){
michael@0 3296 args->target = myTarget;
michael@0 3297 args->source = mySource;
michael@0 3298 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
michael@0 3299 return;
michael@0 3300 }
michael@0 3301 continue;
michael@0 3302
michael@0 3303 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
michael@0 3304
michael@0 3305 case CR:
michael@0 3306 /*falls through*/
michael@0 3307 case LF:
michael@0 3308 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
michael@0 3309 /* falls through */
michael@0 3310 default:
michael@0 3311 /* convert one or two bytes */
michael@0 3312 myData->isEmptySegment = FALSE;
michael@0 3313 if(pToU2022State->g != 0) {
michael@0 3314 if(mySource < mySourceLimit) {
michael@0 3315 UConverterSharedData *cnv;
michael@0 3316 StateEnum tempState;
michael@0 3317 int32_t tempBufLen;
michael@0 3318 int leadIsOk, trailIsOk;
michael@0 3319 uint8_t trailByte;
michael@0 3320 getTrailByte:
michael@0 3321 trailByte = (uint8_t)*mySource;
michael@0 3322 /*
michael@0 3323 * Ticket 5691: consistent illegal sequences:
michael@0 3324 * - We include at least the first byte in the illegal sequence.
michael@0 3325 * - If any of the non-initial bytes could be the start of a character,
michael@0 3326 * we stop the illegal sequence before the first one of those.
michael@0 3327 *
michael@0 3328 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
michael@0 3329 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
michael@0 3330 * Otherwise we convert or report the pair of bytes.
michael@0 3331 */
michael@0 3332 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
michael@0 3333 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
michael@0 3334 if (leadIsOk && trailIsOk) {
michael@0 3335 ++mySource;
michael@0 3336 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
michael@0 3337 if(tempState >= CNS_11643_0) {
michael@0 3338 cnv = myData->myConverterArray[CNS_11643];
michael@0 3339 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
michael@0 3340 tempBuf[1] = (char) (mySourceChar);
michael@0 3341 tempBuf[2] = (char) trailByte;
michael@0 3342 tempBufLen = 3;
michael@0 3343
michael@0 3344 }else{
michael@0 3345 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
michael@0 3346 cnv = myData->myConverterArray[tempState];
michael@0 3347 tempBuf[0] = (char) (mySourceChar);
michael@0 3348 tempBuf[1] = (char) trailByte;
michael@0 3349 tempBufLen = 2;
michael@0 3350 }
michael@0 3351 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
michael@0 3352 mySourceChar = (mySourceChar << 8) | trailByte;
michael@0 3353 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
michael@0 3354 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
michael@0 3355 ++mySource;
michael@0 3356 /* add another bit so that the code below writes 2 bytes in case of error */
michael@0 3357 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
michael@0 3358 }
michael@0 3359 if(pToU2022State->g>=2) {
michael@0 3360 /* return from a single-shift state to the previous one */
michael@0 3361 pToU2022State->g=pToU2022State->prevG;
michael@0 3362 }
michael@0 3363 } else {
michael@0 3364 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
michael@0 3365 args->converter->toULength = 1;
michael@0 3366 goto endloop;
michael@0 3367 }
michael@0 3368 }
michael@0 3369 else{
michael@0 3370 if(mySourceChar <= 0x7f) {
michael@0 3371 targetUniChar = (UChar) mySourceChar;
michael@0 3372 }
michael@0 3373 }
michael@0 3374 break;
michael@0 3375 }
michael@0 3376 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
michael@0 3377 if(args->offsets){
michael@0 3378 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 3379 }
michael@0 3380 *(myTarget++)=(UChar)targetUniChar;
michael@0 3381 }
michael@0 3382 else if(targetUniChar > missingCharMarker){
michael@0 3383 /* disassemble the surrogate pair and write to output*/
michael@0 3384 targetUniChar-=0x0010000;
michael@0 3385 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
michael@0 3386 if(args->offsets){
michael@0 3387 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 3388 }
michael@0 3389 ++myTarget;
michael@0 3390 if(myTarget< args->targetLimit){
michael@0 3391 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
michael@0 3392 if(args->offsets){
michael@0 3393 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
michael@0 3394 }
michael@0 3395 ++myTarget;
michael@0 3396 }else{
michael@0 3397 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
michael@0 3398 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
michael@0 3399 }
michael@0 3400
michael@0 3401 }
michael@0 3402 else{
michael@0 3403 /* Call the callback function*/
michael@0 3404 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
michael@0 3405 break;
michael@0 3406 }
michael@0 3407 }
michael@0 3408 else{
michael@0 3409 *err =U_BUFFER_OVERFLOW_ERROR;
michael@0 3410 break;
michael@0 3411 }
michael@0 3412 }
michael@0 3413 endloop:
michael@0 3414 args->target = myTarget;
michael@0 3415 args->source = mySource;
michael@0 3416 }
michael@0 3417
michael@0 3418 static void
michael@0 3419 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
michael@0 3420 UConverter *cnv = args->converter;
michael@0 3421 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
michael@0 3422 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
michael@0 3423 char *p, *subchar;
michael@0 3424 char buffer[8];
michael@0 3425 int32_t length;
michael@0 3426
michael@0 3427 subchar=(char *)cnv->subChars;
michael@0 3428 length=cnv->subCharLen; /* assume length==1 for most variants */
michael@0 3429
michael@0 3430 p = buffer;
michael@0 3431 switch(myConverterData->locale[0]){
michael@0 3432 case 'j':
michael@0 3433 {
michael@0 3434 int8_t cs;
michael@0 3435
michael@0 3436 if(pFromU2022State->g == 1) {
michael@0 3437 /* JIS7: switch from G1 to G0 */
michael@0 3438 pFromU2022State->g = 0;
michael@0 3439 *p++ = UCNV_SI;
michael@0 3440 }
michael@0 3441
michael@0 3442 cs = pFromU2022State->cs[0];
michael@0 3443 if(cs != ASCII && cs != JISX201) {
michael@0 3444 /* not in ASCII or JIS X 0201: switch to ASCII */
michael@0 3445 pFromU2022State->cs[0] = (int8_t)ASCII;
michael@0 3446 *p++ = '\x1b';
michael@0 3447 *p++ = '\x28';
michael@0 3448 *p++ = '\x42';
michael@0 3449 }
michael@0 3450
michael@0 3451 *p++ = subchar[0];
michael@0 3452 break;
michael@0 3453 }
michael@0 3454 case 'c':
michael@0 3455 if(pFromU2022State->g != 0) {
michael@0 3456 /* not in ASCII mode: switch to ASCII */
michael@0 3457 pFromU2022State->g = 0;
michael@0 3458 *p++ = UCNV_SI;
michael@0 3459 }
michael@0 3460 *p++ = subchar[0];
michael@0 3461 break;
michael@0 3462 case 'k':
michael@0 3463 if(myConverterData->version == 0) {
michael@0 3464 if(length == 1) {
michael@0 3465 if((UBool)args->converter->fromUnicodeStatus) {
michael@0 3466 /* in DBCS mode: switch to SBCS */
michael@0 3467 args->converter->fromUnicodeStatus = 0;
michael@0 3468 *p++ = UCNV_SI;
michael@0 3469 }
michael@0 3470 *p++ = subchar[0];
michael@0 3471 } else /* length == 2*/ {
michael@0 3472 if(!(UBool)args->converter->fromUnicodeStatus) {
michael@0 3473 /* in SBCS mode: switch to DBCS */
michael@0 3474 args->converter->fromUnicodeStatus = 1;
michael@0 3475 *p++ = UCNV_SO;
michael@0 3476 }
michael@0 3477 *p++ = subchar[0];
michael@0 3478 *p++ = subchar[1];
michael@0 3479 }
michael@0 3480 break;
michael@0 3481 } else {
michael@0 3482 /* save the subconverter's substitution string */
michael@0 3483 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
michael@0 3484 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
michael@0 3485
michael@0 3486 /* set our substitution string into the subconverter */
michael@0 3487 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
michael@0 3488 myConverterData->currentConverter->subCharLen = (int8_t)length;
michael@0 3489
michael@0 3490 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
michael@0 3491 args->converter = myConverterData->currentConverter;
michael@0 3492 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
michael@0 3493 ucnv_cbFromUWriteSub(args, 0, err);
michael@0 3494 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
michael@0 3495 args->converter = cnv;
michael@0 3496
michael@0 3497 /* restore the subconverter's substitution string */
michael@0 3498 myConverterData->currentConverter->subChars = currentSubChars;
michael@0 3499 myConverterData->currentConverter->subCharLen = currentSubCharLen;
michael@0 3500
michael@0 3501 if(*err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 3502 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
michael@0 3503 uprv_memcpy(
michael@0 3504 cnv->charErrorBuffer,
michael@0 3505 myConverterData->currentConverter->charErrorBuffer,
michael@0 3506 myConverterData->currentConverter->charErrorBufferLength);
michael@0 3507 }
michael@0 3508 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
michael@0 3509 myConverterData->currentConverter->charErrorBufferLength = 0;
michael@0 3510 }
michael@0 3511 return;
michael@0 3512 }
michael@0 3513 default:
michael@0 3514 /* not expected */
michael@0 3515 break;
michael@0 3516 }
michael@0 3517 ucnv_cbFromUWriteBytes(args,
michael@0 3518 buffer, (int32_t)(p - buffer),
michael@0 3519 offsetIndex, err);
michael@0 3520 }
michael@0 3521
michael@0 3522 /*
michael@0 3523 * Structure for cloning an ISO 2022 converter into a single memory block.
michael@0 3524 * ucnv_safeClone() of the converter will align the entire cloneStruct,
michael@0 3525 * and then ucnv_safeClone() of the sub-converter may additionally align
michael@0 3526 * currentConverter inside the cloneStruct, for which we need the deadSpace
michael@0 3527 * after currentConverter.
michael@0 3528 * This is because UAlignedMemory may be larger than the actually
michael@0 3529 * necessary alignment size for the platform.
michael@0 3530 * The other cloneStruct fields will not be moved around,
michael@0 3531 * and are aligned properly with cloneStruct's alignment.
michael@0 3532 */
michael@0 3533 struct cloneStruct
michael@0 3534 {
michael@0 3535 UConverter cnv;
michael@0 3536 UConverter currentConverter;
michael@0 3537 UAlignedMemory deadSpace;
michael@0 3538 UConverterDataISO2022 mydata;
michael@0 3539 };
michael@0 3540
michael@0 3541
michael@0 3542 static UConverter *
michael@0 3543 _ISO_2022_SafeClone(
michael@0 3544 const UConverter *cnv,
michael@0 3545 void *stackBuffer,
michael@0 3546 int32_t *pBufferSize,
michael@0 3547 UErrorCode *status)
michael@0 3548 {
michael@0 3549 struct cloneStruct * localClone;
michael@0 3550 UConverterDataISO2022 *cnvData;
michael@0 3551 int32_t i, size;
michael@0 3552
michael@0 3553 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
michael@0 3554 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
michael@0 3555 return NULL;
michael@0 3556 }
michael@0 3557
michael@0 3558 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
michael@0 3559 localClone = (struct cloneStruct *)stackBuffer;
michael@0 3560
michael@0 3561 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
michael@0 3562
michael@0 3563 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
michael@0 3564 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
michael@0 3565 localClone->cnv.isExtraLocal = TRUE;
michael@0 3566
michael@0 3567 /* share the subconverters */
michael@0 3568
michael@0 3569 if(cnvData->currentConverter != NULL) {
michael@0 3570 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
michael@0 3571 localClone->mydata.currentConverter =
michael@0 3572 ucnv_safeClone(cnvData->currentConverter,
michael@0 3573 &localClone->currentConverter,
michael@0 3574 &size, status);
michael@0 3575 if(U_FAILURE(*status)) {
michael@0 3576 return NULL;
michael@0 3577 }
michael@0 3578 }
michael@0 3579
michael@0 3580 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
michael@0 3581 if(cnvData->myConverterArray[i] != NULL) {
michael@0 3582 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
michael@0 3583 }
michael@0 3584 }
michael@0 3585
michael@0 3586 return &localClone->cnv;
michael@0 3587 }
michael@0 3588
michael@0 3589 static void
michael@0 3590 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
michael@0 3591 const USetAdder *sa,
michael@0 3592 UConverterUnicodeSet which,
michael@0 3593 UErrorCode *pErrorCode)
michael@0 3594 {
michael@0 3595 int32_t i;
michael@0 3596 UConverterDataISO2022* cnvData;
michael@0 3597
michael@0 3598 if (U_FAILURE(*pErrorCode)) {
michael@0 3599 return;
michael@0 3600 }
michael@0 3601 #ifdef U_ENABLE_GENERIC_ISO_2022
michael@0 3602 if (cnv->sharedData == &_ISO2022Data) {
michael@0 3603 /* We use UTF-8 in this case */
michael@0 3604 sa->addRange(sa->set, 0, 0xd7FF);
michael@0 3605 sa->addRange(sa->set, 0xE000, 0x10FFFF);
michael@0 3606 return;
michael@0 3607 }
michael@0 3608 #endif
michael@0 3609
michael@0 3610 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
michael@0 3611
michael@0 3612 /* open a set and initialize it with code points that are algorithmically round-tripped */
michael@0 3613 switch(cnvData->locale[0]){
michael@0 3614 case 'j':
michael@0 3615 /* include JIS X 0201 which is hardcoded */
michael@0 3616 sa->add(sa->set, 0xa5);
michael@0 3617 sa->add(sa->set, 0x203e);
michael@0 3618 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
michael@0 3619 /* include Latin-1 for some variants of JP */
michael@0 3620 sa->addRange(sa->set, 0, 0xff);
michael@0 3621 } else {
michael@0 3622 /* include ASCII for JP */
michael@0 3623 sa->addRange(sa->set, 0, 0x7f);
michael@0 3624 }
michael@0 3625 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
michael@0 3626 /*
michael@0 3627 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
michael@0 3628 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
michael@0 3629 * use half-width Katakana.
michael@0 3630 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
michael@0 3631 * half-width Katakana via the ESC ( I sequence.
michael@0 3632 * However, we only emit (fromUnicode) half-width Katakana according to the
michael@0 3633 * definition of each variant.
michael@0 3634 *
michael@0 3635 * When including fallbacks,
michael@0 3636 * we need to include half-width Katakana Unicode code points for all JP variants because
michael@0 3637 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
michael@0 3638 */
michael@0 3639 /* include half-width Katakana for JP */
michael@0 3640 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
michael@0 3641 }
michael@0 3642 break;
michael@0 3643 case 'c':
michael@0 3644 case 'z':
michael@0 3645 /* include ASCII for CN */
michael@0 3646 sa->addRange(sa->set, 0, 0x7f);
michael@0 3647 break;
michael@0 3648 case 'k':
michael@0 3649 /* there is only one converter for KR, and it is not in the myConverterArray[] */
michael@0 3650 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
michael@0 3651 cnvData->currentConverter, sa, which, pErrorCode);
michael@0 3652 /* the loop over myConverterArray[] will simply not find another converter */
michael@0 3653 break;
michael@0 3654 default:
michael@0 3655 break;
michael@0 3656 }
michael@0 3657
michael@0 3658 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
michael@0 3659 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
michael@0 3660 cnvData->version==0 && i==CNS_11643
michael@0 3661 ) {
michael@0 3662 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
michael@0 3663 ucnv_MBCSGetUnicodeSetForBytes(
michael@0 3664 cnvData->myConverterArray[i],
michael@0 3665 sa, UCNV_ROUNDTRIP_SET,
michael@0 3666 0, 0x81, 0x82,
michael@0 3667 pErrorCode);
michael@0 3668 }
michael@0 3669 #endif
michael@0 3670
michael@0 3671 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
michael@0 3672 UConverterSetFilter filter;
michael@0 3673 if(cnvData->myConverterArray[i]!=NULL) {
michael@0 3674 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
michael@0 3675 cnvData->version==0 && i==CNS_11643
michael@0 3676 ) {
michael@0 3677 /*
michael@0 3678 * Version-specific for CN:
michael@0 3679 * CN version 0 does not map CNS planes 3..7 although
michael@0 3680 * they are all available in the CNS conversion table;
michael@0 3681 * CN version 1 (-EXT) does map them all.
michael@0 3682 * The two versions create different Unicode sets.
michael@0 3683 */
michael@0 3684 filter=UCNV_SET_FILTER_2022_CN;
michael@0 3685 } else if(cnvData->locale[0]=='j' && i==JISX208) {
michael@0 3686 /*
michael@0 3687 * Only add code points that map to Shift-JIS codes
michael@0 3688 * corresponding to JIS X 0208.
michael@0 3689 */
michael@0 3690 filter=UCNV_SET_FILTER_SJIS;
michael@0 3691 } else if(i==KSC5601) {
michael@0 3692 /*
michael@0 3693 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
michael@0 3694 * are broader than GR94.
michael@0 3695 */
michael@0 3696 filter=UCNV_SET_FILTER_GR94DBCS;
michael@0 3697 } else {
michael@0 3698 filter=UCNV_SET_FILTER_NONE;
michael@0 3699 }
michael@0 3700 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
michael@0 3701 }
michael@0 3702 }
michael@0 3703
michael@0 3704 /*
michael@0 3705 * ISO 2022 converters must not convert SO/SI/ESC despite what
michael@0 3706 * sub-converters do by themselves.
michael@0 3707 * Remove these characters from the set.
michael@0 3708 */
michael@0 3709 sa->remove(sa->set, 0x0e);
michael@0 3710 sa->remove(sa->set, 0x0f);
michael@0 3711 sa->remove(sa->set, 0x1b);
michael@0 3712
michael@0 3713 /* ISO 2022 converters do not convert C1 controls either */
michael@0 3714 sa->removeRange(sa->set, 0x80, 0x9f);
michael@0 3715 }
michael@0 3716
michael@0 3717 static const UConverterImpl _ISO2022Impl={ /* Function table for the generic ISO-2022 converter. Entries are positional; slot names in the comments below assume ICU's UConverterImpl layout, which is declared outside this chunk -- TODO confirm against ucnv_cnv.h. */
michael@0 3718 UCNV_ISO_2022, /* presumably the converter type tag; the same constant appears in _ISO2022StaticData */
michael@0 3719
michael@0 3720 NULL, /* two slots left empty (presumably load/unload hooks -- confirm) */
michael@0 3721 NULL,
michael@0 3722
michael@0 3723 _ISO2022Open, /* open/close/reset entries; the JP, KR and CN tables in this file list the same three functions */
michael@0 3724 _ISO2022Close,
michael@0 3725 _ISO2022Reset,
michael@0 3726
michael@0 3727 #ifdef U_ENABLE_GENERIC_ISO_2022 /* the four conversion slots are populated only when this macro is defined */
michael@0 3728 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* the same function fills both of the first two slots (presumably toUnicode with and without offsets) */
michael@0 3729 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
michael@0 3730 ucnv_fromUnicode_UTF8, /* the next two slots use the UTF-8 fromUnicode routines, plain and _OFFSETS_LOGIC */
michael@0 3731 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
michael@0 3732 #else /* otherwise all four conversion slots are NULL */
michael@0 3733 NULL,
michael@0 3734 NULL,
michael@0 3735 NULL,
michael@0 3736 NULL,
michael@0 3737 #endif
michael@0 3738 NULL, /* slot left empty in every ISO-2022 table in this file (presumably getNextUChar -- confirm) */
michael@0 3739
michael@0 3740 NULL, /* slot left empty (presumably getStarters -- confirm) */
michael@0 3741 _ISO2022getName, /* name / write-substitution / safe-clone / Unicode-set entries, shared with the JP, KR and CN tables */
michael@0 3742 _ISO_2022_WriteSub,
michael@0 3743 _ISO_2022_SafeClone,
michael@0 3744 _ISO_2022_GetUnicodeSet,
michael@0 3745
michael@0 3746 NULL, /* last two slots left empty (presumably the direct UTF-8 conversion hooks -- confirm) */
michael@0 3747 NULL
michael@0 3748 };
michael@0 3749 static const UConverterStaticData _ISO2022StaticData={ /* Static description of the generic ISO-2022 converter. Positional initializer; field names in the comments below assume ICU's UConverterStaticData layout, declared outside this chunk -- TODO confirm. */
michael@0 3750 sizeof(UConverterStaticData), /* size of this structure */
michael@0 3751 "ISO_2022", /* converter name string */
michael@0 3752 2022, /* presumably the codepage number; the JP/KR/CN variants in this file use 0 here */
michael@0 3753 UCNV_IBM, /* presumably the platform identifier */
michael@0 3754 UCNV_ISO_2022, /* same constant as the first entry of _ISO2022Impl */
michael@0 3755 1, /* presumably the minimum byte count per UChar, paired with the maximum on the next line */
michael@0 3756 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
michael@0 3757 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence: 0x1a, zero-padded */
michael@0 3758 1, /* presumably the length of the substitution sequence above */
michael@0 3759 FALSE, /* two boolean flags, both off (presumably the to-/from-Unicode fallback indicators -- confirm) */
michael@0 3760 FALSE,
michael@0 3761 0, /* two further numeric fields left at zero; their meaning is not visible in this chunk */
michael@0 3762 0,
michael@0 3763 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 3764 };
michael@0 3765 const UConverterSharedData _ISO2022Data={ /* Shared-data record tying the generic ISO-2022 static description to its function table. Unlike the JP/KR/CN records below it is not wrapped in an anonymous namespace, presumably so other files can refer to it -- confirm. */
michael@0 3766 sizeof(UConverterSharedData), /* size of this structure */
michael@0 3767 ~((uint32_t) 0), /* all bits set; presumably a reference count pinned at its maximum -- TODO confirm */
michael@0 3768 NULL, /* two pointer fields left unset */
michael@0 3769 NULL,
michael@0 3770 &_ISO2022StaticData, /* static description defined above */
michael@0 3771 FALSE, /* boolean flag left off; meaning not visible in this chunk */
michael@0 3772 &_ISO2022Impl, /* function table defined above */
michael@0 3773 0, UCNV_MBCS_TABLE_INITIALIZER /* a zero field followed by the MBCS table initializer macro */
michael@0 3774 };
michael@0 3775
michael@0 3776 /*************JP****************/
michael@0 3777 static const UConverterImpl _ISO2022JPImpl={ /* Function table for the ISO-2022-JP converter. Entries are positional; slot names in the comments below assume ICU's UConverterImpl layout, declared outside this chunk -- TODO confirm. */
michael@0 3778 UCNV_ISO_2022, /* same type constant as the generic ISO-2022 table */
michael@0 3779
michael@0 3780 NULL, /* two slots left empty (presumably load/unload hooks -- confirm) */
michael@0 3781 NULL,
michael@0 3782
michael@0 3783 _ISO2022Open, /* open/close/reset entries shared by all ISO-2022 tables in this file */
michael@0 3784 _ISO2022Close,
michael@0 3785 _ISO2022Reset,
michael@0 3786
michael@0 3787 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* one JP to-Unicode function fills two consecutive slots */
michael@0 3788 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
michael@0 3789 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* likewise one JP from-Unicode function fills the next two slots */
michael@0 3790 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
michael@0 3791 NULL, /* slot left empty (presumably getNextUChar -- confirm) */
michael@0 3792
michael@0 3793 NULL, /* slot left empty (presumably getStarters -- confirm) */
michael@0 3794 _ISO2022getName, /* name / write-substitution / safe-clone / Unicode-set entries shared by all ISO-2022 tables in this file */
michael@0 3795 _ISO_2022_WriteSub,
michael@0 3796 _ISO_2022_SafeClone,
michael@0 3797 _ISO_2022_GetUnicodeSet,
michael@0 3798
michael@0 3799 NULL, /* last two slots left empty */
michael@0 3800 NULL
michael@0 3801 };
michael@0 3802 static const UConverterStaticData _ISO2022JPStaticData={ /* Static description of the ISO-2022-JP converter. Positional initializer; field names in the comments below assume ICU's UConverterStaticData layout, declared outside this chunk -- TODO confirm. */
michael@0 3803 sizeof(UConverterStaticData), /* size of this structure */
michael@0 3804 "ISO_2022_JP", /* converter name string */
michael@0 3805 0, /* presumably the codepage number; 0 here, whereas the generic ISO_2022 entry uses 2022 */
michael@0 3806 UCNV_IBM, /* presumably the platform identifier */
michael@0 3807 UCNV_ISO_2022, /* same constant as the first entry of _ISO2022JPImpl */
michael@0 3808 1, /* presumably the minimum byte count per UChar, paired with the maximum on the next line */
michael@0 3809 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
michael@0 3810 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence: 0x1a, zero-padded */
michael@0 3811 1, /* presumably the length of the substitution sequence above */
michael@0 3812 FALSE, /* two boolean flags, both off (presumably the to-/from-Unicode fallback indicators -- confirm) */
michael@0 3813 FALSE,
michael@0 3814 0, /* two further numeric fields left at zero; their meaning is not visible in this chunk */
michael@0 3815 0,
michael@0 3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 3817 };
michael@0 3818
michael@0 3819 namespace { /* anonymous namespace: _ISO2022JPData is visible only inside this translation unit */
michael@0 3820
michael@0 3821 const UConverterSharedData _ISO2022JPData={ /* Shared-data record tying the ISO-2022-JP static description to its function table. */
michael@0 3822 sizeof(UConverterSharedData), /* size of this structure */
michael@0 3823 ~((uint32_t) 0), /* all bits set; presumably a reference count pinned at its maximum -- TODO confirm */
michael@0 3824 NULL, /* two pointer fields left unset */
michael@0 3825 NULL,
michael@0 3826 &_ISO2022JPStaticData, /* static description defined above */
michael@0 3827 FALSE, /* boolean flag left off; meaning not visible in this chunk */
michael@0 3828 &_ISO2022JPImpl, /* function table defined above */
michael@0 3829 0, UCNV_MBCS_TABLE_INITIALIZER /* a zero field followed by the MBCS table initializer macro */
michael@0 3830 };
michael@0 3831
michael@0 3832 } // namespace
michael@0 3833
michael@0 3834 /************* KR ***************/
michael@0 3835 static const UConverterImpl _ISO2022KRImpl={ /* Function table for the ISO-2022-KR converter. Entries are positional; slot names in the comments below assume ICU's UConverterImpl layout, declared outside this chunk -- TODO confirm. */
michael@0 3836 UCNV_ISO_2022, /* same type constant as the generic ISO-2022 table */
michael@0 3837
michael@0 3838 NULL, /* two slots left empty (presumably load/unload hooks -- confirm) */
michael@0 3839 NULL,
michael@0 3840
michael@0 3841 _ISO2022Open, /* open/close/reset entries shared by all ISO-2022 tables in this file */
michael@0 3842 _ISO2022Close,
michael@0 3843 _ISO2022Reset,
michael@0 3844
michael@0 3845 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* one KR to-Unicode function fills two consecutive slots */
michael@0 3846 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
michael@0 3847 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* likewise one KR from-Unicode function fills the next two slots */
michael@0 3848 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
michael@0 3849 NULL, /* slot left empty (presumably getNextUChar -- confirm) */
michael@0 3850
michael@0 3851 NULL, /* slot left empty (presumably getStarters -- confirm) */
michael@0 3852 _ISO2022getName, /* name / write-substitution / safe-clone / Unicode-set entries shared by all ISO-2022 tables in this file */
michael@0 3853 _ISO_2022_WriteSub,
michael@0 3854 _ISO_2022_SafeClone,
michael@0 3855 _ISO_2022_GetUnicodeSet,
michael@0 3856
michael@0 3857 NULL, /* last two slots left empty */
michael@0 3858 NULL
michael@0 3859 };
michael@0 3860 static const UConverterStaticData _ISO2022KRStaticData={ /* Static description of the ISO-2022-KR converter. Positional initializer; field names in the comments below assume ICU's UConverterStaticData layout, declared outside this chunk -- TODO confirm. */
michael@0 3861 sizeof(UConverterStaticData), /* size of this structure */
michael@0 3862 "ISO_2022_KR", /* converter name string */
michael@0 3863 0, /* presumably the codepage number; 0 here, whereas the generic ISO_2022 entry uses 2022 */
michael@0 3864 UCNV_IBM, /* presumably the platform identifier */
michael@0 3865 UCNV_ISO_2022, /* same constant as the first entry of _ISO2022KRImpl */
michael@0 3866 1, /* presumably the minimum byte count per UChar, paired with the maximum on the next line */
michael@0 3867 3, /* max 3 bytes per UChar: SO+DBCS */
michael@0 3868 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence: 0x1a, zero-padded */
michael@0 3869 1, /* presumably the length of the substitution sequence above */
michael@0 3870 FALSE, /* two boolean flags, both off (presumably the to-/from-Unicode fallback indicators -- confirm) */
michael@0 3871 FALSE,
michael@0 3872 0, /* two further numeric fields left at zero; their meaning is not visible in this chunk */
michael@0 3873 0,
michael@0 3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 3875 };
michael@0 3876
michael@0 3877 namespace { /* anonymous namespace: _ISO2022KRData is visible only inside this translation unit */
michael@0 3878
michael@0 3879 const UConverterSharedData _ISO2022KRData={ /* Shared-data record tying the ISO-2022-KR static description to its function table. */
michael@0 3880 sizeof(UConverterSharedData), /* size of this structure */
michael@0 3881 ~((uint32_t) 0), /* all bits set; presumably a reference count pinned at its maximum -- TODO confirm */
michael@0 3882 NULL, /* two pointer fields left unset */
michael@0 3883 NULL,
michael@0 3884 &_ISO2022KRStaticData, /* static description defined above */
michael@0 3885 FALSE, /* boolean flag left off; meaning not visible in this chunk */
michael@0 3886 &_ISO2022KRImpl, /* function table defined above */
michael@0 3887 0, UCNV_MBCS_TABLE_INITIALIZER /* a zero field followed by the MBCS table initializer macro */
michael@0 3888 };
michael@0 3889
michael@0 3890 } // namespace
michael@0 3891
michael@0 3892 /*************** CN ***************/
michael@0 3893 static const UConverterImpl _ISO2022CNImpl={ /* Function table for the ISO-2022-CN converter. Entries are positional; slot names in the comments below assume ICU's UConverterImpl layout, declared outside this chunk -- TODO confirm. */
michael@0 3894
michael@0 3895 UCNV_ISO_2022, /* same type constant as the generic ISO-2022 table */
michael@0 3896
michael@0 3897 NULL, /* two slots left empty (presumably load/unload hooks -- confirm) */
michael@0 3898 NULL,
michael@0 3899
michael@0 3900 _ISO2022Open, /* open/close/reset entries shared by all ISO-2022 tables in this file */
michael@0 3901 _ISO2022Close,
michael@0 3902 _ISO2022Reset,
michael@0 3903
michael@0 3904 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* one CN to-Unicode function fills two consecutive slots */
michael@0 3905 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
michael@0 3906 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* likewise one CN from-Unicode function fills the next two slots */
michael@0 3907 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
michael@0 3908 NULL, /* slot left empty (presumably getNextUChar -- confirm) */
michael@0 3909
michael@0 3910 NULL, /* slot left empty (presumably getStarters -- confirm) */
michael@0 3911 _ISO2022getName, /* name / write-substitution / safe-clone / Unicode-set entries shared by all ISO-2022 tables in this file */
michael@0 3912 _ISO_2022_WriteSub,
michael@0 3913 _ISO_2022_SafeClone,
michael@0 3914 _ISO_2022_GetUnicodeSet,
michael@0 3915
michael@0 3916 NULL, /* last two slots left empty */
michael@0 3917 NULL
michael@0 3918 };
michael@0 3919 static const UConverterStaticData _ISO2022CNStaticData={ /* Static description of the ISO-2022-CN converter. Positional initializer; field names in the comments below assume ICU's UConverterStaticData layout, declared outside this chunk -- TODO confirm. */
michael@0 3920 sizeof(UConverterStaticData), /* size of this structure */
michael@0 3921 "ISO_2022_CN", /* converter name string */
michael@0 3922 0, /* presumably the codepage number; 0 here, whereas the generic ISO_2022 entry uses 2022 */
michael@0 3923 UCNV_IBM, /* presumably the platform identifier */
michael@0 3924 UCNV_ISO_2022, /* same constant as the first entry of _ISO2022CNImpl */
michael@0 3925 1, /* presumably the minimum byte count per UChar, paired with the maximum on the next line */
michael@0 3926 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
michael@0 3927 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence: 0x1a, zero-padded */
michael@0 3928 1, /* presumably the length of the substitution sequence above */
michael@0 3929 FALSE, /* two boolean flags, both off (presumably the to-/from-Unicode fallback indicators -- confirm) */
michael@0 3930 FALSE,
michael@0 3931 0, /* two further numeric fields left at zero; their meaning is not visible in this chunk */
michael@0 3932 0,
michael@0 3933 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 3934 };
michael@0 3935
michael@0 3936 namespace { /* anonymous namespace: _ISO2022CNData is visible only inside this translation unit */
michael@0 3937
michael@0 3938 const UConverterSharedData _ISO2022CNData={ /* Shared-data record tying the ISO-2022-CN static description to its function table. */
michael@0 3939 sizeof(UConverterSharedData), /* size of this structure */
michael@0 3940 ~((uint32_t) 0), /* all bits set; presumably a reference count pinned at its maximum -- TODO confirm */
michael@0 3941 NULL, /* two pointer fields left unset */
michael@0 3942 NULL,
michael@0 3943 &_ISO2022CNStaticData, /* static description defined above */
michael@0 3944 FALSE, /* boolean flag left off; meaning not visible in this chunk */
michael@0 3945 &_ISO2022CNImpl, /* function table defined above */
michael@0 3946 0, UCNV_MBCS_TABLE_INITIALIZER /* a zero field followed by the MBCS table initializer macro */
michael@0 3947 };
michael@0 3948
michael@0 3949 } // namespace
michael@0 3950
michael@0 3951 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */

mercurial