Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2000-2012, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: ucnv2022.cpp |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2000feb03 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | * |
michael@0 | 14 | * Change history: |
michael@0 | 15 | * |
michael@0 | 16 | * 06/29/2000 helena Major rewrite of the callback APIs. |
michael@0 | 17 | * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
michael@0 | 18 | * Changed implementation of toUnicode |
michael@0 | 19 | * function |
michael@0 | 20 | * 08/21/2000 Ram Added support for ISO-2022-KR |
michael@0 | 21 | * 08/29/2000 Ram Separated implementation of EBCDIC to |
michael@0 | 22 | * ucnvebdc.c |
michael@0 | 23 | * 09/20/2000 Ram Added support for ISO-2022-CN |
michael@0 | 24 | * Added implementations for getNextUChar() |
michael@0 | 25 | * for specific 2022 country variants. |
michael@0 | 26 | * 10/31/2000 Ram Implemented offsets logic functions |
michael@0 | 27 | */ |
michael@0 | 28 | |
michael@0 | 29 | #include "unicode/utypes.h" |
michael@0 | 30 | |
michael@0 | 31 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
michael@0 | 32 | |
michael@0 | 33 | #include "unicode/ucnv.h" |
michael@0 | 34 | #include "unicode/uset.h" |
michael@0 | 35 | #include "unicode/ucnv_err.h" |
michael@0 | 36 | #include "unicode/ucnv_cb.h" |
michael@0 | 37 | #include "unicode/utf16.h" |
michael@0 | 38 | #include "ucnv_imp.h" |
michael@0 | 39 | #include "ucnv_bld.h" |
michael@0 | 40 | #include "ucnv_cnv.h" |
michael@0 | 41 | #include "ucnvmbcs.h" |
michael@0 | 42 | #include "cstring.h" |
michael@0 | 43 | #include "cmemory.h" |
michael@0 | 44 | #include "uassert.h" |
michael@0 | 45 | |
michael@0 | 46 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) /* element count of an array, cast to int32_t; only meaningful for a true array, not a pointer */ |
michael@0 | 47 | |
michael@0 | 48 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 49 | /* |
michael@0 | 50 | * I am disabling the generic ISO-2022 converter after proposing to do so on |
michael@0 | 51 | * the icu mailing list two days ago. |
michael@0 | 52 | * |
michael@0 | 53 | * Reasons: |
michael@0 | 54 | * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
michael@0 | 55 | * its designation sequences, single shifts with return to the previous state, |
michael@0 | 56 | * switch-with-no-return to UTF-16BE or similar, etc. |
michael@0 | 57 | * This is unlike the language-specific variants like ISO-2022-JP which |
michael@0 | 58 | * require a much smaller repertoire of ISO-2022 features. |
michael@0 | 59 | * These variants continue to be supported. |
michael@0 | 60 | * 2. I believe that no one is really using the generic ISO-2022 converter |
michael@0 | 61 | * but rather always one of the language-specific variants. |
michael@0 | 62 | * Note that ICU's generic ISO-2022 converter has always output one escape |
michael@0 | 63 | * sequence followed by UTF-8 for the whole stream. |
michael@0 | 64 | * 3. Switching between subcharsets is extremely slow, because each time |
michael@0 | 65 | * the previous converter is closed and a new one opened, |
michael@0 | 66 | * without any kind of caching, least-recently-used list, etc. |
michael@0 | 67 | * 4. The code is currently buggy, and given the above it does not seem |
michael@0 | 68 | * reasonable to spend the time on maintenance. |
michael@0 | 69 | * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
michael@0 | 70 | * This means, for example, that when ISO-8859-7 is designated, the following |
michael@0 | 71 | * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
michael@0 | 72 | * The ICU ISO-2022 converter does not handle this - and has no information |
michael@0 | 73 | * about which subconverter would have to be shifted vs. which is designed |
michael@0 | 74 | * for 7-bit ISO-2022. |
michael@0 | 75 | * |
michael@0 | 76 | * Markus Scherer 2003-dec-03 |
michael@0 | 77 | */ |
michael@0 | 78 | #endif |
michael@0 | 79 | |
michael@0 | 80 | static const char SHIFT_IN_STR[] = "\x0F"; /* the single byte 0x0F (SI) as a NUL-terminated string */ |
michael@0 | 81 | // static const char SHIFT_OUT_STR[] = "\x0E"; |
michael@0 | 82 | /* byte values of the ASCII control and whitespace characters named below */ |
michael@0 | 83 | #define CR 0x0D |
michael@0 | 84 | #define LF 0x0A |
michael@0 | 85 | #define H_TAB 0x09 |
michael@0 | 86 | #define V_TAB 0x0B |
michael@0 | 87 | #define SPACE 0x20 |
michael@0 | 88 | |
michael@0 | 89 | enum { /* first and last code point of the halfwidth Katakana range, U+FF61..U+FF9F */ |
michael@0 | 90 | HWKANA_START=0xff61, |
michael@0 | 91 | HWKANA_END=0xff9f |
michael@0 | 92 | }; |
michael@0 | 93 | |
michael@0 | 94 | /* |
michael@0 | 95 | * 94-character sets with native byte values A1..FE are encoded in ISO 2022 |
michael@0 | 96 | * as bytes 21..7E. (Subtract 0x80.) |
michael@0 | 97 | * 96-character sets with native byte values A0..FF are encoded in ISO 2022 |
michael@0 | 98 | * as bytes 20..7F. (Subtract 0x80.) |
michael@0 | 99 | * Do not encode C1 control codes with native bytes 80..9F |
michael@0 | 100 | * as bytes 00..1F (C0 control codes). |
michael@0 | 101 | */ |
michael@0 | 102 | enum { /* native (8-bit) byte ranges of 94- and 96-character sets, as described in the comment above */ |
michael@0 | 103 | GR94_START=0xa1, /* ISO 2022 byte 0x21 after subtracting 0x80 */ |
michael@0 | 104 | GR94_END=0xfe, /* ISO 2022 byte 0x7e after subtracting 0x80 */ |
michael@0 | 105 | GR96_START=0xa0, /* ISO 2022 byte 0x20 after subtracting 0x80 */ |
michael@0 | 106 | GR96_END=0xff /* ISO 2022 byte 0x7f after subtracting 0x80 */ |
michael@0 | 107 | }; |
michael@0 | 108 | |
michael@0 | 109 | /* |
michael@0 | 110 | * ISO 2022 control codes must not be converted from Unicode |
michael@0 | 111 | * because they would mess up the byte stream. |
michael@0 | 112 | * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b |
michael@0 | 113 | * corresponding to SO, SI, and ESC. The (c)<0x20 test is evaluated first, so the shift count is always below 32. Note that the macro evaluates c twice. |
michael@0 | 114 | */ |
michael@0 | 115 | #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) |
michael@0 | 116 | |
michael@0 | 117 | /* for ISO-2022-JP and -CN implementations; the JP and CN constants share the small values 1..3, so a value is only meaningful for the variant that is in use */ |
michael@0 | 118 | typedef enum { |
michael@0 | 119 | /* shared values */ |
michael@0 | 120 | INVALID_STATE=-1, |
michael@0 | 121 | ASCII = 0, |
michael@0 | 122 | |
michael@0 | 123 | SS2_STATE=0x10, |
michael@0 | 124 | SS3_STATE, /* 0x11, the value following SS2_STATE */ |
michael@0 | 125 | |
michael@0 | 126 | /* JP: these values index myConverterArray[] in _ISO2022Open and are the bit positions used by CSM() in jpCharsetMasks[] */ |
michael@0 | 127 | ISO8859_1 = 1 , |
michael@0 | 128 | ISO8859_7 = 2 , |
michael@0 | 129 | JISX201 = 3, |
michael@0 | 130 | JISX208 = 4, |
michael@0 | 131 | JISX212 = 5, |
michael@0 | 132 | GB2312 =6, |
michael@0 | 133 | KSC5601 =7, |
michael@0 | 134 | HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ |
michael@0 | 135 | |
michael@0 | 136 | /* CN: numerically the same as ISO8859_1, ISO8859_7 and JISX201 above */ |
michael@0 | 137 | /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ |
michael@0 | 138 | GB2312_1=1, |
michael@0 | 139 | ISO_IR_165=2, |
michael@0 | 140 | CNS_11643=3, |
michael@0 | 141 | |
michael@0 | 142 | /* |
michael@0 | 143 | * these are used in StateEnum and ISO2022State variables, |
michael@0 | 144 | * but CNS_11643 must be used to index into myConverterArray[] |
michael@0 | 145 | */ |
michael@0 | 146 | CNS_11643_0=0x20, |
michael@0 | 147 | CNS_11643_1, |
michael@0 | 148 | CNS_11643_2, |
michael@0 | 149 | CNS_11643_3, |
michael@0 | 150 | CNS_11643_4, |
michael@0 | 151 | CNS_11643_5, |
michael@0 | 152 | CNS_11643_6, |
michael@0 | 153 | CNS_11643_7 |
michael@0 | 154 | } StateEnum; |
michael@0 | 155 | |
michael@0 | 156 | /* is the StateEnum charset value for a DBCS charset? True for JISX208..KSC5601 (values 4..7); the argument is evaluated twice. */ |
michael@0 | 157 | #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) |
michael@0 | 158 | /* CSM = charset mask: a uint16_t with only bit number cs set; used to build and test the entries of jpCharsetMasks[] */ |
michael@0 | 159 | #define CSM(cs) ((uint16_t)1<<(cs)) |
michael@0 | 160 | |
michael@0 | 161 | /* |
michael@0 | 162 | * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence |
michael@0 | 163 | * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x |
michael@0 | 164 | * |
michael@0 | 165 | * Note: The converter uses some leniency: |
michael@0 | 166 | * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
michael@0 | 167 | * all versions, not just JIS7 and JIS8. |
michael@0 | 168 | * - ICU does not distinguish between different versions of JIS X 0208. |
michael@0 | 169 | */ |
michael@0 | 170 | enum { MAX_JA_VERSION=4 }; /* highest valid index into jpCharsetMasks[]; _ISO2022Open resets larger versions to 0 */ |
michael@0 | 171 | static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ /* one CSM() bit set per charset, indexed by version */ |
michael@0 | 172 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), /* version 0 */ |
michael@0 | 173 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), /* version 1: adds JISX212 */ |
michael@0 | 174 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 2: adds GB2312, KSC5601, ISO8859_1, ISO8859_7 */ |
michael@0 | 175 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 3: same mask as version 2 */ |
michael@0 | 176 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) /* version 4: same mask as version 2 */ |
michael@0 | 177 | }; |
michael@0 | 178 | |
michael@0 | 179 | typedef enum { /* type tag stored in UConverterDataISO2022.currentType; _ISO2022Open initialises it to ASCII1 */ |
michael@0 | 180 | ASCII1=0, |
michael@0 | 181 | LATIN1, |
michael@0 | 182 | SBCS, |
michael@0 | 183 | DBCS, |
michael@0 | 184 | MBCS, |
michael@0 | 185 | HWKANA |
michael@0 | 186 | }Cnv2022Type; |
michael@0 | 187 | |
michael@0 | 188 | typedef struct ISO2022State { /* designation and shift state; UConverterDataISO2022 holds one instance per conversion direction */ |
michael@0 | 189 | int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ |
michael@0 | 190 | int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ |
michael@0 | 191 | int8_t prevG; /* g before single shift (SS2 or SS3) */ |
michael@0 | 192 | } ISO2022State; |
michael@0 | 193 | |
michael@0 | 194 | #define UCNV_OPTIONS_VERSION_MASK 0xf /* _ISO2022Open takes the variant version from the low 4 bits of pArgs->options */ |
michael@0 | 195 | #define UCNV_2022_MAX_CONVERTERS 10 /* number of slots in UConverterDataISO2022.myConverterArray[] */ |
michael@0 | 196 | |
michael@0 | 197 | typedef struct{ /* per-converter state; allocated and zero-filled by _ISO2022Open and stored in cnv->extraInfo */ |
michael@0 | 198 | UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* shared data of the sub-charset converters, indexed by StateEnum values (JP and CN variants) */ |
michael@0 | 199 | UConverter *currentConverter; /* opened with ucnv_open() for the KR variant in _ISO2022Open */ |
michael@0 | 200 | Cnv2022Type currentType; |
michael@0 | 201 | ISO2022State toU2022State, fromU2022State; /* separate state for the toUnicode and fromUnicode directions */ |
michael@0 | 202 | uint32_t key; /* NOTE(review): presumably the escape-sequence key being accumulated (see the state-table comment below); not used in this part of the file - confirm */ |
michael@0 | 203 | uint32_t version; /* pArgs->options & UCNV_OPTIONS_VERSION_MASK, normalised per variant in _ISO2022Open */ |
michael@0 | 204 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 205 | UBool isFirstBuffer; /* set to TRUE by _ISO2022Open for the generic converter */ |
michael@0 | 206 | #endif |
michael@0 | 207 | UBool isEmptySegment; /* NOTE(review): meaning not visible in this part of the file - confirm against the toUnicode code */ |
michael@0 | 208 | char name[30]; /* e.g. "ISO_2022,locale=ja,version=0" (28 characters plus NUL) */ |
michael@0 | 209 | char locale[3]; /* "ja", "ko" or "cn", as written by _ISO2022Open */ |
michael@0 | 210 | }UConverterDataISO2022; |
michael@0 | 211 | |
michael@0 | 212 | /* Protos */ |
michael@0 | 213 | /* ISO-2022 ----------------------------------------------------------------- */ |
michael@0 | 214 | |
michael@0 | 215 | /*Forward declaration of two UTF-8 fromUnicode functions that are not defined in the visible part of this file. NOTE(review): presumably only needed by the generic ISO-2022 converter - confirm. */ |
michael@0 | 216 | U_CFUNC void |
michael@0 | 217 | ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
michael@0 | 218 | UErrorCode * err); |
michael@0 | 219 | U_CFUNC void |
michael@0 | 220 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
michael@0 | 221 | UErrorCode * err); |
michael@0 | 222 | |
michael@0 | 223 | #define ESC_2022 0x1B /*ESC*/ |
michael@0 | 224 | /* classification of a (partial) escape sequence; these are the values stored in escSeqStateTable_Value_2022[] */ |
michael@0 | 225 | typedef enum |
michael@0 | 226 | { |
michael@0 | 227 | INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ |
michael@0 | 228 | VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ |
michael@0 | 229 | VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ |
michael@0 | 230 | VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ |
michael@0 | 231 | } UCNV_TableStates_2022; |
michael@0 | 232 | |
michael@0 | 233 | /* |
michael@0 | 234 | * The way these state transition arrays work is: |
michael@0 | 235 | * ex : ESC$B is the sequence for JISX208 |
michael@0 | 236 | * a) First Iteration: char is ESC |
michael@0 | 237 | * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index |
michael@0 | 238 | * int x = normalize_esq_chars_2022[27] which is equal to 1 |
michael@0 | 239 | * ii) Search for this value in escSeqStateTable_Key_2022[] |
michael@0 | 240 | * value of x is stored at escSeqStateTable_Key_2022[0] |
michael@0 | 241 | * iii) Save this index as offset |
michael@0 | 242 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
michael@0 | 243 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
michael@0 | 244 | * b) Switch on this state and continue to next char |
michael@0 | 245 | * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index |
michael@0 | 246 | * which is normalize_esq_chars_2022[36] == 4 |
michael@0 | 247 | * ii) x is currently 1(from above) |
michael@0 | 248 | * x<<=5 -- x is now 32 |
michael@0 | 249 | * x+=normalize_esq_chars_2022[36] |
michael@0 | 250 | * now x is 36 |
michael@0 | 251 | * iii) Search for this value in escSeqStateTable_Key_2022[] |
michael@0 | 252 | * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 |
michael@0 | 253 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
michael@0 | 254 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
michael@0 | 255 | * c) Switch on this state and continue to next char |
michael@0 | 256 | * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index |
michael@0 | 257 | * ii) x is currently 36 (from above) |
michael@0 | 258 | * x<<=5 -- x is now 1152 |
michael@0 | 259 | * x+=normalize_esq_chars_2022[66] |
michael@0 | 260 | * now x is 1161 |
michael@0 | 261 | * iii) Search for this value in escSeqStateTable_Key_2022[] |
michael@0 | 262 | * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 |
michael@0 | 263 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
michael@0 | 264 | * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
michael@0 | 265 | * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 |
michael@0 | 266 | */ |
michael@0 | 267 | |
michael@0 | 268 | |
michael@0 | 269 | /*Below are the 3 arrays depicting a state transition table*/ |
michael@0 | 270 | static const int8_t normalize_esq_chars_2022[256] = { |
michael@0 | 271 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 272 | /* indexed by byte value, ten entries per row; maps a byte to a code 1..29 that is folded into the key as key=(key<<5)+code, e.g. ESC (27) -> 1, '$' (36) -> 4, 'B' (66) -> 9; 0 marks bytes with no assigned code */ |
michael@0 | 273 | 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 274 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 275 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 |
michael@0 | 276 | ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 |
michael@0 | 277 | ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 |
michael@0 | 278 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 279 | ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 |
michael@0 | 280 | ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 |
michael@0 | 281 | ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 282 | ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 283 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 284 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 285 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 286 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 287 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 288 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 289 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 290 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 291 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 292 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 293 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 294 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 295 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 296 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 297 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 298 | ,0 ,0 ,0 ,0 ,0 ,0 |
michael@0 | 299 | }; |
michael@0 | 300 | |
michael@0 | 301 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 302 | /* |
michael@0 | 303 | * When the generic ISO-2022 converter is completely removed, not just disabled |
michael@0 | 304 | * per #ifdef, then the following state table and the associated tables that are |
michael@0 | 305 | * dimensioned with MAX_STATES_2022 should be trimmed. |
michael@0 | 306 | * |
michael@0 | 307 | * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
michael@0 | 308 | * the associated escape sequences starting with ESC ( B should be removed. |
michael@0 | 309 | * This includes the ones with key values 1097 and all of the ones above 1000000. |
michael@0 | 310 | * |
michael@0 | 311 | * For the latter, the tables can simply be truncated. |
michael@0 | 312 | * For the former, since the tables must be kept parallel, it is probably best |
michael@0 | 313 | * to simply duplicate an adjacent table cell, parallel in all tables. |
michael@0 | 314 | * |
michael@0 | 315 | * It may make sense to restructure the tables, especially by using small search |
michael@0 | 316 | * tables for the variants instead of indexing them parallel to the table here. |
michael@0 | 317 | */ |
michael@0 | 318 | #endif |
michael@0 | 319 | |
michael@0 | 320 | #define MAX_STATES_2022 74 /* entry count shared by the parallel Key, Result and Value tables */ |
michael@0 | 321 | static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { |
michael@0 | 322 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 323 | /* keys of all recognised (partial) escape sequences, listed in ascending order; each key packs the normalize_esq_chars_2022[] codes of the sequence bytes, 5 bits per byte, first byte most significant */ |
michael@0 | 324 | 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 |
michael@0 | 325 | ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 |
michael@0 | 326 | ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 |
michael@0 | 327 | ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 |
michael@0 | 328 | ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 |
michael@0 | 329 | ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 |
michael@0 | 330 | ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 |
michael@0 | 331 | ,35947631 ,35947635 ,35947636 ,35947638 |
michael@0 | 332 | }; |
michael@0 | 333 | |
michael@0 | 334 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 335 | |
michael@0 | 336 | static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { |
michael@0 | 337 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 338 | /* converter name for the key at the same index in escSeqStateTable_Key_2022[]; NULL where no converter name is associated with the key; only compiled with U_ENABLE_GENERIC_ISO_2022 */ |
michael@0 | 339 | NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" |
michael@0 | 340 | ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" |
michael@0 | 341 | ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" |
michael@0 | 342 | ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" |
michael@0 | 343 | ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" |
michael@0 | 344 | ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" |
michael@0 | 345 | ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" |
michael@0 | 346 | ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" |
michael@0 | 347 | }; |
michael@0 | 348 | |
michael@0 | 349 | #endif |
michael@0 | 350 | |
michael@0 | 351 | static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { /* UCNV_TableStates_2022 value for the key at the same index in escSeqStateTable_Key_2022[] */ |
michael@0 | 352 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 353 | VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 354 | ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 355 | ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 356 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 357 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 358 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 359 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 360 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
michael@0 | 361 | }; |
michael@0 | 362 | |
michael@0 | 363 | |
michael@0 | 364 | /* Type def for refactoring changeState_2022 code: identifies the ISO-2022 variant; the generic ISO_2022 value exists only when U_ENABLE_GENERIC_ISO_2022 is defined */ |
michael@0 | 365 | typedef enum{ |
michael@0 | 366 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 367 | ISO_2022=0, |
michael@0 | 368 | #endif |
michael@0 | 369 | ISO_2022_JP=1, |
michael@0 | 370 | ISO_2022_KR=2, |
michael@0 | 371 | ISO_2022_CN=3 |
michael@0 | 372 | } Variant2022; |
michael@0 | 373 | |
michael@0 | 374 | /*********** ISO 2022 Converter Protos ***********/ |
michael@0 | 375 | static void |
michael@0 | 376 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
michael@0 | 377 | /* declared ahead of its definition because _ISO2022Open calls it on its failure paths */ |
michael@0 | 378 | static void |
michael@0 | 379 | _ISO2022Close(UConverter *converter); |
michael@0 | 380 | |
michael@0 | 381 | static void |
michael@0 | 382 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
michael@0 | 383 | |
michael@0 | 384 | static const char* |
michael@0 | 385 | _ISO2022getName(const UConverter* cnv); |
michael@0 | 386 | |
michael@0 | 387 | static void |
michael@0 | 388 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
michael@0 | 389 | |
michael@0 | 390 | static UConverter * |
michael@0 | 391 | _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
michael@0 | 392 | /* the generic toUnicode function below is only declared when the generic converter is enabled */ |
michael@0 | 393 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 394 | static void |
michael@0 | 395 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
michael@0 | 396 | #endif |
michael@0 | 397 | |
michael@0 | 398 | namespace { |
michael@0 | 399 | /* declarations of the per-variant shared data objects whose addresses _ISO2022Open stores in cnv->sharedData; their definitions are not in the visible part of this file */ |
michael@0 | 400 | /*const UConverterSharedData _ISO2022Data;*/ |
michael@0 | 401 | extern const UConverterSharedData _ISO2022JPData; |
michael@0 | 402 | extern const UConverterSharedData _ISO2022KRData; |
michael@0 | 403 | extern const UConverterSharedData _ISO2022CNData; |
michael@0 | 404 | |
michael@0 | 405 | } // namespace |
michael@0 | 406 | |
michael@0 | 407 | /*************** Converter implementations ******************/ |
michael@0 | 408 | |
michael@0 | 409 | /* The purpose of this function is to get around gcc compiler warnings. It wraps ucnv_fromUWriteBytes() for callers whose target pointer is uint8_t* rather than char*: the pointer is copied into a char* local, passed by address together with the other arguments unchanged, and the resulting pointer value is stored back into *target. */ |
michael@0 | 410 | static inline void |
michael@0 | 411 | fromUWriteUInt8(UConverter *cnv, |
michael@0 | 412 | const char *bytes, int32_t length, |
michael@0 | 413 | uint8_t **target, const char *targetLimit, |
michael@0 | 414 | int32_t **offsets, |
michael@0 | 415 | int32_t sourceIndex, |
michael@0 | 416 | UErrorCode *pErrorCode) |
michael@0 | 417 | { |
michael@0 | 418 | char *targetChars = (char *)*target; |
michael@0 | 419 | ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
michael@0 | 420 | offsets, sourceIndex, pErrorCode); |
michael@0 | 421 | *target = (uint8_t*)targetChars; /* hand the pointer value left by the callee back to the caller */ |
michael@0 | 422 | |
michael@0 | 423 | } |
michael@0 | 424 | /* Resets the toUnicode state for ISO-2022-KR. Only version 1 has anything to reset: three fields of the embedded currentConverter are set to 0. The first parameter is unused. */ |
michael@0 | 425 | static inline void |
michael@0 | 426 | setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ |
michael@0 | 427 | if(myConverterData->version == 1) { |
michael@0 | 428 | UConverter *cnv = myConverterData->currentConverter; |
michael@0 | 429 | /* the trailing comments name the role each generic field plays here */ |
michael@0 | 430 | cnv->toUnicodeStatus=0; /* offset */ |
michael@0 | 431 | cnv->mode=0; /* state */ |
michael@0 | 432 | cnv->toULength=0; /* byteIndex */ |
michael@0 | 433 | } |
michael@0 | 434 | } |
michael@0 | 435 | /* Resets the fromUnicode state for ISO-2022-KR: places the 4-byte designator in converter->charErrorBuffer if that buffer is empty and, for version 1, resets two fields of the embedded currentConverter. */ |
michael@0 | 436 | static inline void |
michael@0 | 437 | setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
michael@0 | 438 | /* in ISO-2022-KR the designator sequence appears only once |
michael@0 | 439 | * in a file so we append it only once |
michael@0 | 440 | */ |
michael@0 | 441 | if( converter->charErrorBufferLength==0){ |
michael@0 | 442 | /* bytes 1B 24 29 43 are ESC $ ) C; NOTE(review): presumably the error buffer is flushed ahead of the first converted output - confirm */ |
michael@0 | 443 | converter->charErrorBufferLength = 4; |
michael@0 | 444 | converter->charErrorBuffer[0] = 0x1b; |
michael@0 | 445 | converter->charErrorBuffer[1] = 0x24; |
michael@0 | 446 | converter->charErrorBuffer[2] = 0x29; |
michael@0 | 447 | converter->charErrorBuffer[3] = 0x43; |
michael@0 | 448 | } |
michael@0 | 449 | if(myConverterData->version == 1) { |
michael@0 | 450 | UConverter *cnv = myConverterData->currentConverter; |
michael@0 | 451 | /* version 1 keeps its fromUnicode state in the embedded converter */ |
michael@0 | 452 | cnv->fromUChar32=0; |
michael@0 | 453 | cnv->fromUnicodeStatus=1; /* prevLength */ |
michael@0 | 454 | } |
michael@0 | 455 | } |
michael@0 | 456 | /* Open callback for the ISO-2022 family. Allocates and zero-fills the UConverterDataISO2022 kept in cnv->extraInfo, then configures the JP, KR or CN variant selected by the first letters of pArgs->locale and by the version in the low 4 bits of pArgs->options. Errors are reported through *errorCode: U_MEMORY_ALLOCATION_ERROR if the allocation fails, U_UNSUPPORTED_ERROR for any other locale when U_ENABLE_GENERIC_ISO_2022 is not defined, or whatever the sub-converter loading calls report. */ |
michael@0 | 457 | static void |
michael@0 | 458 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
michael@0 | 459 | /* filled with spaces so that a missing locale matches none of the variant tests below */ |
michael@0 | 460 | char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
michael@0 | 461 | |
michael@0 | 462 | cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
michael@0 | 463 | if(cnv->extraInfo != NULL) { |
michael@0 | 464 | UConverterNamePieces stackPieces; |
michael@0 | 465 | UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
michael@0 | 466 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
michael@0 | 467 | uint32_t version; |
michael@0 | 468 | /* sub-converter loads inherit the caller's test-only flag */ |
michael@0 | 469 | stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
michael@0 | 470 | |
michael@0 | 471 | uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
michael@0 | 472 | myConverterData->currentType = ASCII1; |
michael@0 | 473 | cnv->fromUnicodeStatus =FALSE; |
michael@0 | 474 | if(pArgs->locale){ |
michael@0 | 475 | uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); /* may leave myLocale without a terminating NUL; only myLocale[0..2] are read below */ |
michael@0 | 476 | } |
michael@0 | 477 | version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
michael@0 | 478 | myConverterData->version = version; |
michael@0 | 479 | if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
michael@0 | 480 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
michael@0 | 481 | { |
michael@0 | 482 | size_t len=0; |
michael@0 | 483 | /* open the required converters and cache them */ |
michael@0 | 484 | if(version>MAX_JA_VERSION) { |
michael@0 | 485 | /* prevent indexing beyond jpCharsetMasks[] */ |
michael@0 | 486 | myConverterData->version = version = 0; |
michael@0 | 487 | } |
michael@0 | 488 | if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
michael@0 | 489 | myConverterData->myConverterArray[ISO8859_7] = |
michael@0 | 490 | ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); |
michael@0 | 491 | } |
michael@0 | 492 | myConverterData->myConverterArray[JISX208] = |
michael@0 | 493 | ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); /* loaded for every version, unlike the mask-guarded charsets */ |
michael@0 | 494 | if(jpCharsetMasks[version]&CSM(JISX212)) { |
michael@0 | 495 | myConverterData->myConverterArray[JISX212] = |
michael@0 | 496 | ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); |
michael@0 | 497 | } |
michael@0 | 498 | if(jpCharsetMasks[version]&CSM(GB2312)) { |
michael@0 | 499 | myConverterData->myConverterArray[GB2312] = |
michael@0 | 500 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ |
michael@0 | 501 | } |
michael@0 | 502 | if(jpCharsetMasks[version]&CSM(KSC5601)) { |
michael@0 | 503 | myConverterData->myConverterArray[KSC5601] = |
michael@0 | 504 | ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); |
michael@0 | 505 | } |
michael@0 | 506 | /* *errorCode is not checked between the loads above; it is examined once after the variant branches */ |
michael@0 | 507 | /* set the function pointers to appropriate functions */ |
michael@0 | 508 | cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
michael@0 | 509 | uprv_strcpy(myConverterData->locale,"ja"); |
michael@0 | 510 | /* name is the fixed prefix followed by the single version digit (0..4) */ |
michael@0 | 511 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
michael@0 | 512 | len = uprv_strlen(myConverterData->name); |
michael@0 | 513 | myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
michael@0 | 514 | myConverterData->name[len+1]='\0'; |
michael@0 | 515 | } |
michael@0 | 516 | else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
michael@0 | 517 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
michael@0 | 518 | { |
michael@0 | 519 | const char *cnvName; |
michael@0 | 520 | if(version==1) { |
michael@0 | 521 | cnvName="icu-internal-25546"; |
michael@0 | 522 | } else { |
michael@0 | 523 | cnvName="ibm-949"; |
michael@0 | 524 | myConverterData->version=version=0; /* every version other than 1 is treated as version 0 */ |
michael@0 | 525 | } |
michael@0 | 526 | if(pArgs->onlyTestIsLoadable) { |
michael@0 | 527 | ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ |
michael@0 | 528 | uprv_free(cnv->extraInfo); |
michael@0 | 529 | cnv->extraInfo=NULL; |
michael@0 | 530 | return; |
michael@0 | 531 | } else { |
michael@0 | 532 | myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
michael@0 | 533 | if (U_FAILURE(*errorCode)) { |
michael@0 | 534 | _ISO2022Close(cnv); |
michael@0 | 535 | return; |
michael@0 | 536 | } |
michael@0 | 537 | /* version 1 also takes over the substitution bytes of the embedded converter */ |
michael@0 | 538 | if(version==1) { |
michael@0 | 539 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
michael@0 | 540 | uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); |
michael@0 | 541 | cnv->subCharLen = myConverterData->currentConverter->subCharLen; |
michael@0 | 542 | }else{ |
michael@0 | 543 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); |
michael@0 | 544 | } |
michael@0 | 545 | |
michael@0 | 546 | /* initialize the state variables */ |
michael@0 | 547 | setInitialStateToUnicodeKR(cnv, myConverterData); |
michael@0 | 548 | setInitialStateFromUnicodeKR(cnv, myConverterData); |
michael@0 | 549 | |
michael@0 | 550 | /* set the function pointers to appropriate functions */ |
michael@0 | 551 | cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
michael@0 | 552 | uprv_strcpy(myConverterData->locale,"ko"); |
michael@0 | 553 | } |
michael@0 | 554 | } |
michael@0 | 555 | else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
michael@0 | 556 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
michael@0 | 557 | { |
michael@0 | 558 | /* ISO_IR_165 is loaded only for version 1; the other two charsets are loaded for every version */ |
michael@0 | 559 | /* open the required converters and cache them */ |
michael@0 | 560 | myConverterData->myConverterArray[GB2312_1] = |
michael@0 | 561 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); |
michael@0 | 562 | if(version==1) { |
michael@0 | 563 | myConverterData->myConverterArray[ISO_IR_165] = |
michael@0 | 564 | ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); |
michael@0 | 565 | } |
michael@0 | 566 | myConverterData->myConverterArray[CNS_11643] = |
michael@0 | 567 | ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); |
michael@0 | 568 | |
michael@0 | 569 | |
michael@0 | 570 | /* set the function pointers to appropriate functions */ |
michael@0 | 571 | cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
michael@0 | 572 | uprv_strcpy(myConverterData->locale,"cn"); |
michael@0 | 573 | /* any version other than 0 or 1 is stored and named as version 2 */ |
michael@0 | 574 | if (version==0){ |
michael@0 | 575 | myConverterData->version = 0; |
michael@0 | 576 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
michael@0 | 577 | }else if (version==1){ |
michael@0 | 578 | myConverterData->version = 1; |
michael@0 | 579 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
michael@0 | 580 | }else { |
michael@0 | 581 | myConverterData->version = 2; |
michael@0 | 582 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); |
michael@0 | 583 | } |
michael@0 | 584 | } |
michael@0 | 585 | else{ |
michael@0 | 586 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 587 | myConverterData->isFirstBuffer = TRUE; |
michael@0 | 588 | |
michael@0 | 589 | /* append the UTF-8 escape sequence */ |
michael@0 | 590 | cnv->charErrorBufferLength = 3; |
michael@0 | 591 | cnv->charErrorBuffer[0] = 0x1b; |
michael@0 | 592 | cnv->charErrorBuffer[1] = 0x25; |
michael@0 | 593 | cnv->charErrorBuffer[2] = 0x42; |
michael@0 | 594 | |
michael@0 | 595 | cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
michael@0 | 596 | /* initialize the state variables */ |
michael@0 | 597 | uprv_strcpy(myConverterData->name,"ISO_2022"); |
michael@0 | 598 | #else |
michael@0 | 599 | *errorCode = U_UNSUPPORTED_ERROR; |
michael@0 | 600 | return; /* NOTE(review): unlike the KR test-only path above, this return leaves cnv->extraInfo allocated; presumably the caller closes the converter on failure - confirm, especially for onlyTestIsLoadable */ |
michael@0 | 601 | #endif |
michael@0 | 602 | } |
michael@0 | 603 | /* reached by the JP and CN branches, by the KR branch after a successful open, and by the generic branch when it is enabled */ |
michael@0 | 604 | cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
michael@0 | 605 | /* a failed load, or a test-only open, hands cleanup to _ISO2022Close */ |
michael@0 | 606 | if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
michael@0 | 607 | _ISO2022Close(cnv); |
michael@0 | 608 | } |
michael@0 | 609 | } else { |
michael@0 | 610 | *errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 611 | } |
michael@0 | 612 | } |
michael@0 | 613 | |
michael@0 | 614 | /* Releases what an ISO-2022 converter holds in extraInfo: unloads every non-NULL entry of myConverterArray, closes currentConverter, and frees extraInfo unless isExtraLocal is set. Does nothing when extraInfo is NULL. */ |
michael@0 | 615 | static void |
michael@0 | 616 | _ISO2022Close(UConverter *converter) { |
michael@0 | 617 | UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
michael@0 | 618 | UConverterSharedData **array = NULL; /* assigned only after the NULL check below */ |
michael@0 | 619 | int32_t i; |
michael@0 | 620 | |
michael@0 | 621 | if (converter->extraInfo != NULL) { |
michael@0 | 622 | array = myData->myConverterArray; /* myData is known to be non-NULL here; never form a member address from NULL */ |
michael@0 | 623 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { /* unload every cached shared-data entry */ |
michael@0 | 624 | if(array[i]!=NULL) { |
michael@0 | 625 | ucnv_unloadSharedDataIfReady(array[i]); |
michael@0 | 626 | } |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | ucnv_close(myData->currentConverter); |
michael@0 | 630 | |
michael@0 | 631 | if(!converter->isExtraLocal){ |
michael@0 | 632 | uprv_free (converter->extraInfo); |
michael@0 | 633 | converter->extraInfo = NULL; |
michael@0 | 634 | } |
michael@0 | 635 | } |
michael@0 | 636 | } |
michael@0 | 637 | /* Resets converter state held in extraInfo: the to-Unicode side when choice<=UCNV_RESET_TO_UNICODE, the from-Unicode side when choice!=UCNV_RESET_TO_UNICODE. Locale "k..." additionally re-runs the KR initial-state helpers. */ |
michael@0 | 638 | static void |
michael@0 | 639 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
michael@0 | 640 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); |
michael@0 | 641 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 642 | uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
michael@0 | 643 | myConverterData->key = 0; |
michael@0 | 644 | myConverterData->isEmptySegment = FALSE; |
michael@0 | 645 | } |
michael@0 | 646 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 647 | uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
michael@0 | 648 | } |
michael@0 | 649 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 650 | if(myConverterData->locale[0] == 0){ |
michael@0 | 651 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 652 | myConverterData->isFirstBuffer = TRUE; |
michael@0 | 653 | myConverterData->key = 0; |
michael@0 | 654 | if (converter->mode == UCNV_SO){ |
michael@0 | 655 | ucnv_close (myConverterData->currentConverter); |
michael@0 | 656 | myConverterData->currentConverter=NULL; |
michael@0 | 657 | } |
michael@0 | 658 | converter->mode = UCNV_SI; |
michael@0 | 659 | } |
michael@0 | 660 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 661 | /* re-append the escape sequence; NOTE(review): the bytes written here are ESC ( B (0x28) while _ISO2022Open writes ESC % B (0x25) under the same "UTF-8 escape sequence" comment -- confirm which is intended */ |
michael@0 | 662 | converter->charErrorBufferLength = 3; |
michael@0 | 663 | converter->charErrorBuffer[0] = 0x1b; |
michael@0 | 664 | converter->charErrorBuffer[1] = 0x28; |
michael@0 | 665 | converter->charErrorBuffer[2] = 0x42; |
michael@0 | 666 | } |
michael@0 | 667 | } |
michael@0 | 668 | else |
michael@0 | 669 | #endif |
michael@0 | 670 | { |
michael@0 | 671 | /* reset the state variables; only a locale starting with 'k' has extra per-direction initial state here */ |
michael@0 | 672 | if(myConverterData->locale[0] == 'k'){ |
michael@0 | 673 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 674 | setInitialStateToUnicodeKR(converter, myConverterData); |
michael@0 | 675 | } |
michael@0 | 676 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 677 | setInitialStateFromUnicodeKR(converter, myConverterData); |
michael@0 | 678 | } |
michael@0 | 679 | } |
michael@0 | 680 | } |
michael@0 | 681 | } |
michael@0 | 682 | /* Returns the name string stored in the converter's extraInfo, or NULL when extraInfo is not set. */ |
michael@0 | 683 | static const char* |
michael@0 | 684 | _ISO2022getName(const UConverter* cnv){ |
michael@0 | 685 | if(cnv->extraInfo){ |
michael@0 | 686 | UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
michael@0 | 687 | return myData->name; |
michael@0 | 688 | } |
michael@0 | 689 | return NULL; |
michael@0 | 690 | } |
michael@0 | 691 | |
michael@0 | 692 | |
michael@0 | 693 | /*************** to unicode *******************/ |
michael@0 | 694 | /**************************************************************************** |
michael@0 | 695 | * Recognized escape sequences are |
michael@0 | 696 | * <ESC>(B ASCII |
michael@0 | 697 | * <ESC>.A ISO-8859-1 |
michael@0 | 698 | * <ESC>.F ISO-8859-7 |
michael@0 | 699 | * <ESC>(J JISX-201 |
michael@0 | 700 | * <ESC>(I JISX-201 |
michael@0 | 701 | * <ESC>$B JISX-208 |
michael@0 | 702 | * <ESC>$@ JISX-208 |
michael@0 | 703 | * <ESC>$(D JISX-212 |
michael@0 | 704 | * <ESC>$A GB2312 |
michael@0 | 705 | * <ESC>$(C KSC5601 |
michael@0 | 706 | * The table below is indexed by the offset that getKey_2022() reports; changeState_2022() reads it for the ISO_2022_JP variant. */ |
michael@0 | 707 | static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
michael@0 | 708 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 709 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 710 | ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE |
michael@0 | 711 | ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 712 | ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE |
michael@0 | 713 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 714 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 715 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 716 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 717 | }; |
michael@0 | 718 | |
michael@0 | 719 | /* to unicode: indexed by the offset that getKey_2022() reports; changeState_2022() reads it for the ISO_2022_CN variant */ |
michael@0 | 720 | static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
michael@0 | 721 | /* 0 1 2 3 4 5 6 7 8 9 */ |
michael@0 | 722 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 723 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 724 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 725 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 726 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
michael@0 | 727 | ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 728 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 729 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
michael@0 | 730 | }; |
michael@0 | 731 | |
michael@0 | 732 | /* Advances the escape-sequence key by one byte: folds c into *key as (*key<<5)+normalized(c) and binary-searches escSeqStateTable_Key_2022. On a match stores the new key and the matching index in *key / *offset and returns the table state; otherwise zeroes both and returns INVALID_2022. */ |
michael@0 | 733 | static UCNV_TableStates_2022 |
michael@0 | 734 | getKey_2022(char c,int32_t* key,int32_t* offset){ |
michael@0 | 735 | int32_t togo; |
michael@0 | 736 | int32_t low = 0; |
michael@0 | 737 | int32_t hi = MAX_STATES_2022; |
michael@0 | 738 | int32_t oldmid=0; |
michael@0 | 739 | |
michael@0 | 740 | togo = normalize_esq_chars_2022[(uint8_t)c]; |
michael@0 | 741 | if(togo == 0) { |
michael@0 | 742 | /* not a valid character anywhere in an escape sequence */ |
michael@0 | 743 | *key = 0; |
michael@0 | 744 | *offset = 0; |
michael@0 | 745 | return INVALID_2022; |
michael@0 | 746 | } |
michael@0 | 747 | togo = (*key << 5) + togo; |
michael@0 | 748 | |
michael@0 | 749 | while (hi != low) /*binary search*/{ |
michael@0 | 750 | |
michael@0 | 751 | int32_t mid = (hi+low) >> 1; /*Finds median; 'register' dropped, it is no longer valid in C++17*/ |
michael@0 | 752 | |
michael@0 | 753 | if (mid == oldmid) /* no progress since the last probe: the key is not in the table */ |
michael@0 | 754 | break; |
michael@0 | 755 | |
michael@0 | 756 | if (escSeqStateTable_Key_2022[mid] > togo){ |
michael@0 | 757 | hi = mid; |
michael@0 | 758 | } |
michael@0 | 759 | else if (escSeqStateTable_Key_2022[mid] < togo){ |
michael@0 | 760 | low = mid; |
michael@0 | 761 | } |
michael@0 | 762 | else /*we found it*/{ |
michael@0 | 763 | *key = togo; |
michael@0 | 764 | *offset = mid; |
michael@0 | 765 | return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
michael@0 | 766 | } |
michael@0 | 767 | oldmid = mid; |
michael@0 | 768 | |
michael@0 | 769 | } |
michael@0 | 770 | |
michael@0 | 771 | *key = 0; |
michael@0 | 772 | *offset = 0; |
michael@0 | 773 | return INVALID_2022; |
michael@0 | 774 | } |
michael@0 | 775 | |
michael@0 | 776 | /* Runs the escape-sequence state machine: feeds bytes from *source to getKey_2022() (recording them in toUBytes) until the sequence is complete, invalid, or the input ends, then applies the result for variant var. |
michael@0 | 777 | * An incomplete sequence returns with key!=0 saved in the converter data; failures are reported through *err as U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE. */ |
michael@0 | 778 | static void |
michael@0 | 779 | changeState_2022(UConverter* _this, |
michael@0 | 780 | const char** source, |
michael@0 | 781 | const char* sourceLimit, |
michael@0 | 782 | Variant2022 var, |
michael@0 | 783 | UErrorCode* err){ |
michael@0 | 784 | UCNV_TableStates_2022 value; |
michael@0 | 785 | UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); |
michael@0 | 786 | uint32_t key = myData2022->key; |
michael@0 | 787 | int32_t offset = 0; |
michael@0 | 788 | int8_t initialToULength = _this->toULength; /* toULength on entry; used below to tell how many bytes came from this buffer */ |
michael@0 | 789 | char c; |
michael@0 | 790 | |
michael@0 | 791 | value = VALID_NON_TERMINAL_2022; |
michael@0 | 792 | while (*source < sourceLimit) { |
michael@0 | 793 | c = *(*source)++; |
michael@0 | 794 | _this->toUBytes[_this->toULength++]=(uint8_t)c; |
michael@0 | 795 | value = getKey_2022(c,(int32_t *) &key, &offset); |
michael@0 | 796 | |
michael@0 | 797 | switch (value){ |
michael@0 | 798 | |
michael@0 | 799 | case VALID_NON_TERMINAL_2022 : |
michael@0 | 800 | /* continue with the loop */ |
michael@0 | 801 | break; |
michael@0 | 802 | |
michael@0 | 803 | case VALID_TERMINAL_2022: |
michael@0 | 804 | key = 0; |
michael@0 | 805 | goto DONE; |
michael@0 | 806 | |
michael@0 | 807 | case INVALID_2022: |
michael@0 | 808 | goto DONE; |
michael@0 | 809 | |
michael@0 | 810 | case VALID_MAYBE_TERMINAL_2022: |
michael@0 | 811 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 812 | /* ESC ( B is ambiguous only for ISO_2022 itself */ |
michael@0 | 813 | if(var == ISO_2022) { |
michael@0 | 814 | /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ |
michael@0 | 815 | _this->toULength = 0; |
michael@0 | 816 | |
michael@0 | 817 | /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
michael@0 | 818 | |
michael@0 | 819 | /* continue with the loop */ |
michael@0 | 820 | value = VALID_NON_TERMINAL_2022; |
michael@0 | 821 | break; |
michael@0 | 822 | } else |
michael@0 | 823 | #endif |
michael@0 | 824 | { |
michael@0 | 825 | /* not ISO_2022 itself, finish here */ |
michael@0 | 826 | value = VALID_TERMINAL_2022; |
michael@0 | 827 | key = 0; |
michael@0 | 828 | goto DONE; |
michael@0 | 829 | } |
michael@0 | 830 | } |
michael@0 | 831 | } |
michael@0 | 832 | |
michael@0 | 833 | DONE: |
michael@0 | 834 | myData2022->key = key; |
michael@0 | 835 | |
michael@0 | 836 | if (value == VALID_NON_TERMINAL_2022) { |
michael@0 | 837 | /* indicate that the escape sequence is incomplete: key!=0 */ |
michael@0 | 838 | return; |
michael@0 | 839 | } else if (value == INVALID_2022 ) { |
michael@0 | 840 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 841 | } else /* value == VALID_TERMINAL_2022 */ { |
michael@0 | 842 | switch(var){ |
michael@0 | 843 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 844 | case ISO_2022: |
michael@0 | 845 | { |
michael@0 | 846 | const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; |
michael@0 | 847 | if(chosenConverterName == NULL) { |
michael@0 | 848 | /* SS2 or SS3 */ |
michael@0 | 849 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 850 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
michael@0 | 851 | return; |
michael@0 | 852 | } |
michael@0 | 853 | |
michael@0 | 854 | _this->mode = UCNV_SI; |
michael@0 | 855 | ucnv_close(myData2022->currentConverter); |
michael@0 | 856 | myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); |
michael@0 | 857 | if(U_SUCCESS(*err)) { |
michael@0 | 858 | myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
michael@0 | 859 | _this->mode = UCNV_SO; |
michael@0 | 860 | } |
michael@0 | 861 | break; |
michael@0 | 862 | } |
michael@0 | 863 | #endif |
michael@0 | 864 | case ISO_2022_JP: |
michael@0 | 865 | { |
michael@0 | 866 | StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
michael@0 | 867 | switch(tempState) { |
michael@0 | 868 | case INVALID_STATE: |
michael@0 | 869 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 870 | break; |
michael@0 | 871 | case SS2_STATE: |
michael@0 | 872 | if(myData2022->toU2022State.cs[2]!=0) { |
michael@0 | 873 | if(myData2022->toU2022State.g<2) { |
michael@0 | 874 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
michael@0 | 875 | } |
michael@0 | 876 | myData2022->toU2022State.g=2; |
michael@0 | 877 | } else { |
michael@0 | 878 | /* illegal to have SS2 before a matching designator */ |
michael@0 | 879 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 880 | } |
michael@0 | 881 | break; |
michael@0 | 882 | /* case SS3_STATE: not used in ISO-2022-JP-x */ |
michael@0 | 883 | case ISO8859_1: |
michael@0 | 884 | case ISO8859_7: |
michael@0 | 885 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
michael@0 | 886 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 887 | } else { |
michael@0 | 888 | /* G2 charset for SS2 */ |
michael@0 | 889 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
michael@0 | 890 | } |
michael@0 | 891 | break; |
michael@0 | 892 | default: |
michael@0 | 893 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
michael@0 | 894 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 895 | } else { |
michael@0 | 896 | /* G0 charset */ |
michael@0 | 897 | myData2022->toU2022State.cs[0]=(int8_t)tempState; |
michael@0 | 898 | } |
michael@0 | 899 | break; |
michael@0 | 900 | } |
michael@0 | 901 | } |
michael@0 | 902 | break; |
michael@0 | 903 | case ISO_2022_CN: |
michael@0 | 904 | { |
michael@0 | 905 | StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
michael@0 | 906 | switch(tempState) { |
michael@0 | 907 | case INVALID_STATE: |
michael@0 | 908 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 909 | break; |
michael@0 | 910 | case SS2_STATE: |
michael@0 | 911 | if(myData2022->toU2022State.cs[2]!=0) { |
michael@0 | 912 | if(myData2022->toU2022State.g<2) { |
michael@0 | 913 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
michael@0 | 914 | } |
michael@0 | 915 | myData2022->toU2022State.g=2; |
michael@0 | 916 | } else { |
michael@0 | 917 | /* illegal to have SS2 before a matching designator */ |
michael@0 | 918 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 919 | } |
michael@0 | 920 | break; |
michael@0 | 921 | case SS3_STATE: |
michael@0 | 922 | if(myData2022->toU2022State.cs[3]!=0) { |
michael@0 | 923 | if(myData2022->toU2022State.g<2) { |
michael@0 | 924 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
michael@0 | 925 | } |
michael@0 | 926 | myData2022->toU2022State.g=3; |
michael@0 | 927 | } else { |
michael@0 | 928 | /* illegal to have SS3 before a matching designator */ |
michael@0 | 929 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 930 | } |
michael@0 | 931 | break; |
michael@0 | 932 | case ISO_IR_165: |
michael@0 | 933 | if(myData2022->version==0) { |
michael@0 | 934 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 935 | break; |
michael@0 | 936 | } |
michael@0 | 937 | /*fall through*/ |
michael@0 | 938 | case GB2312_1: |
michael@0 | 939 | /*fall through*/ |
michael@0 | 940 | case CNS_11643_1: |
michael@0 | 941 | myData2022->toU2022State.cs[1]=(int8_t)tempState; |
michael@0 | 942 | break; |
michael@0 | 943 | case CNS_11643_2: |
michael@0 | 944 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
michael@0 | 945 | break; |
michael@0 | 946 | default: |
michael@0 | 947 | /* other CNS 11643 planes */ |
michael@0 | 948 | if(myData2022->version==0) { |
michael@0 | 949 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 950 | } else { |
michael@0 | 951 | myData2022->toU2022State.cs[3]=(int8_t)tempState; |
michael@0 | 952 | } |
michael@0 | 953 | break; |
michael@0 | 954 | } |
michael@0 | 955 | } |
michael@0 | 956 | break; |
michael@0 | 957 | case ISO_2022_KR: |
michael@0 | 958 | if(offset==0x30){ |
michael@0 | 959 | /* nothing to be done, just accept this one escape sequence (offset 0x30 is presumably the ISO-2022-KR designator entry -- TODO confirm against the escape-sequence tables) */ |
michael@0 | 960 | } else { |
michael@0 | 961 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
michael@0 | 962 | } |
michael@0 | 963 | break; |
michael@0 | 964 | |
michael@0 | 965 | default: |
michael@0 | 966 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 967 | break; |
michael@0 | 968 | } |
michael@0 | 969 | } |
michael@0 | 970 | if(U_SUCCESS(*err)) { |
michael@0 | 971 | _this->toULength = 0; |
michael@0 | 972 | } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
michael@0 | 973 | if(_this->toULength>1) { |
michael@0 | 974 | /* |
michael@0 | 975 | * Ticket 5691: consistent illegal sequences: |
michael@0 | 976 | * - We include at least the first byte (ESC) in the illegal sequence. |
michael@0 | 977 | * - If any of the non-initial bytes could be the start of a character, |
michael@0 | 978 | * we stop the illegal sequence before the first one of those. |
michael@0 | 979 | * In escape sequences, all following bytes are "printable", that is, |
michael@0 | 980 | * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), |
michael@0 | 981 | * they are valid single/lead bytes. |
michael@0 | 982 | * For simplicity, we always only report the initial ESC byte as the |
michael@0 | 983 | * illegal sequence and back out all other bytes we looked at. |
michael@0 | 984 | */ |
michael@0 | 985 | /* Back out some bytes. */ |
michael@0 | 986 | int8_t backOutDistance=_this->toULength-1; |
michael@0 | 987 | int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
michael@0 | 988 | if(backOutDistance<=bytesFromThisBuffer) { |
michael@0 | 989 | /* same as initialToULength<=1 */ |
michael@0 | 990 | *source-=backOutDistance; |
michael@0 | 991 | } else { |
michael@0 | 992 | /* Back out bytes from the previous buffer: Need to replay them. */ |
michael@0 | 993 | _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); |
michael@0 | 994 | /* same as -(initialToULength-1) */ |
michael@0 | 995 | /* preToULength is negative! */ |
michael@0 | 996 | uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); |
michael@0 | 997 | *source-=bytesFromThisBuffer; |
michael@0 | 998 | } |
michael@0 | 999 | _this->toULength=1; |
michael@0 | 1000 | } |
michael@0 | 1001 | } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
michael@0 | 1002 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
michael@0 | 1003 | } |
michael@0 | 1004 | } |
michael@0 | 1005 | |
michael@0 | 1006 | /* Scans [*source, sourceLimit) for the next ESC_2022 byte. |
michael@0 | 1007 | * Returns a pointer to that ESC byte if one is found, otherwise |
michael@0 | 1008 | * returns sourceLimit. *source itself is only read, never advanced. |
michael@0 | 1009 | */ |
michael@0 | 1010 | /* For 2022 this looks ahead in the stream |
michael@0 | 1011 | * to determine the longest run of bytes that can be converted |
michael@0 | 1012 | * before the next escape sequence starts. |
michael@0 | 1013 | */ |
michael@0 | 1014 | static inline const char* |
michael@0 | 1015 | getEndOfBuffer_2022(const char** source, |
michael@0 | 1016 | const char* sourceLimit, |
michael@0 | 1017 | UBool /*flush*/){ |
michael@0 | 1018 | |
michael@0 | 1019 | const char* mySource = *source; |
michael@0 | 1020 | /* NOTE(review): the generic branch below reads `flush`, but the parameter name is commented out in the signature -- confirm it still builds when U_ENABLE_GENERIC_ISO_2022 is defined */ |
michael@0 | 1021 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 1022 | if (*source >= sourceLimit) |
michael@0 | 1023 | return sourceLimit; |
michael@0 | 1024 | |
michael@0 | 1025 | do{ |
michael@0 | 1026 | |
michael@0 | 1027 | if (*mySource == ESC_2022){ |
michael@0 | 1028 | int8_t i; |
michael@0 | 1029 | int32_t key = 0; |
michael@0 | 1030 | int32_t offset; |
michael@0 | 1031 | UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
michael@0 | 1032 | |
michael@0 | 1033 | /* Kludge: I could not |
michael@0 | 1034 | * figure out the reason for validating an escape sequence |
michael@0 | 1035 | * twice - once here and once in changeState_2022(). |
michael@0 | 1036 | * Is it possible to have an ESC character in an ISO2022 |
michael@0 | 1037 | * byte stream which is valid in a code page? Is it legal? |
michael@0 | 1038 | */ |
michael@0 | 1039 | for (i=0; |
michael@0 | 1040 | (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
michael@0 | 1041 | i++) { |
michael@0 | 1042 | value = getKey_2022(*(mySource+i), &key, &offset); |
michael@0 | 1043 | } |
michael@0 | 1044 | if (value > 0 || *mySource==ESC_2022) |
michael@0 | 1045 | return mySource; |
michael@0 | 1046 | |
michael@0 | 1047 | if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
michael@0 | 1048 | return sourceLimit; |
michael@0 | 1049 | } |
michael@0 | 1050 | }while (++mySource < sourceLimit); |
michael@0 | 1051 | |
michael@0 | 1052 | return sourceLimit; |
michael@0 | 1053 | #else |
michael@0 | 1054 | while(mySource < sourceLimit && *mySource != ESC_2022) { |
michael@0 | 1055 | ++mySource; |
michael@0 | 1056 | } |
michael@0 | 1057 | return mySource; |
michael@0 | 1058 | #endif |
michael@0 | 1059 | } |
michael@0 | 1060 | |
michael@0 | 1061 | |
michael@0 | 1062 | /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c |
michael@0 | 1063 | * any future change in _MBCSFromUChar32() function should be reflected here. |
michael@0 | 1064 | * @return number of bytes in *value; negative number if fallback; 0 if no mapping |
michael@0 | 1065 | * The roundtrip-flag test below shifts an unsigned 1 so that bit 31 (c&0xf==15) is tested without shifting into the sign bit of an int. */ |
michael@0 | 1066 | static inline int32_t |
michael@0 | 1067 | MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
michael@0 | 1068 | UChar32 c, |
michael@0 | 1069 | uint32_t* value, |
michael@0 | 1070 | UBool useFallback, |
michael@0 | 1071 | int outputType) |
michael@0 | 1072 | { |
michael@0 | 1073 | const int32_t *cx; |
michael@0 | 1074 | const uint16_t *table; |
michael@0 | 1075 | uint32_t stage2Entry; |
michael@0 | 1076 | uint32_t myValue; |
michael@0 | 1077 | int32_t length; |
michael@0 | 1078 | const uint8_t *p; |
michael@0 | 1079 | /* |
michael@0 | 1080 | * TODO(markus): Use and require new, faster MBCS conversion table structures. |
michael@0 | 1081 | * Use internal version of ucnv_open() that verifies that the new structures are available, |
michael@0 | 1082 | * else U_INTERNAL_PROGRAM_ERROR. |
michael@0 | 1083 | */ |
michael@0 | 1084 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
michael@0 | 1085 | if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
michael@0 | 1086 | table=sharedData->mbcs.fromUnicodeTable; |
michael@0 | 1087 | stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
michael@0 | 1088 | /* get the bytes and the length for the output */ |
michael@0 | 1089 | if(outputType==MBCS_OUTPUT_2){ |
michael@0 | 1090 | myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
michael@0 | 1091 | if(myValue<=0xff) { |
michael@0 | 1092 | length=1; |
michael@0 | 1093 | } else { |
michael@0 | 1094 | length=2; |
michael@0 | 1095 | } |
michael@0 | 1096 | } else /* outputType==MBCS_OUTPUT_3 */ { |
michael@0 | 1097 | p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
michael@0 | 1098 | myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
michael@0 | 1099 | if(myValue<=0xff) { |
michael@0 | 1100 | length=1; |
michael@0 | 1101 | } else if(myValue<=0xffff) { |
michael@0 | 1102 | length=2; |
michael@0 | 1103 | } else { |
michael@0 | 1104 | length=3; |
michael@0 | 1105 | } |
michael@0 | 1106 | } |
michael@0 | 1107 | /* is this code point assigned, or do we use fallbacks? */ |
michael@0 | 1108 | if((stage2Entry&((uint32_t)1<<(16+(c&0xf))))!=0) { |
michael@0 | 1109 | /* assigned */ |
michael@0 | 1110 | *value=myValue; |
michael@0 | 1111 | return length; |
michael@0 | 1112 | } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
michael@0 | 1113 | /* |
michael@0 | 1114 | * We allow a 0 byte output if the "assigned" bit is set for this entry. |
michael@0 | 1115 | * There is no way with this data structure for fallback output |
michael@0 | 1116 | * to be a zero byte. |
michael@0 | 1117 | */ |
michael@0 | 1118 | *value=myValue; |
michael@0 | 1119 | return -length; |
michael@0 | 1120 | } |
michael@0 | 1121 | } |
michael@0 | 1122 | |
michael@0 | 1123 | cx=sharedData->mbcs.extIndexes; |
michael@0 | 1124 | if(cx!=NULL) { |
michael@0 | 1125 | return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
michael@0 | 1126 | } |
michael@0 | 1127 | |
michael@0 | 1128 | /* unassigned */ |
michael@0 | 1129 | return 0; |
michael@0 | 1130 | } |
michael@0 | 1131 | |
michael@0 | 1132 | /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c; |
michael@0 | 1133 | * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
michael@0 | 1134 | * @param retval pointer to output byte; receives the low 8 bits of the table result on every path except the early supplementary-code-point return |
michael@0 | 1135 | * @return 1 for a roundtrip byte, 0 for no mapping, -1 for a fallback byte |
michael@0 | 1136 | */ |
michael@0 | 1137 | static inline int32_t |
michael@0 | 1138 | MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
michael@0 | 1139 | UChar32 c, |
michael@0 | 1140 | uint32_t* retval, |
michael@0 | 1141 | UBool useFallback) |
michael@0 | 1142 | { |
michael@0 | 1143 | const uint16_t *table; |
michael@0 | 1144 | int32_t value; |
michael@0 | 1145 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
michael@0 | 1146 | if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
michael@0 | 1147 | return 0; |
michael@0 | 1148 | } |
michael@0 | 1149 | /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
michael@0 | 1150 | table=sharedData->mbcs.fromUnicodeTable; |
michael@0 | 1151 | /* get the byte for the output */ |
michael@0 | 1152 | value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
michael@0 | 1153 | /* is this code point assigned, or do we use fallbacks? The bits above the low byte of value decide, see the thresholds below */ |
michael@0 | 1154 | *retval=(uint32_t)(value&0xff); |
michael@0 | 1155 | if(value>=0xf00) { |
michael@0 | 1156 | return 1; /* roundtrip */ |
michael@0 | 1157 | } else if(useFallback ? value>=0x800 : value>=0xc00) { |
michael@0 | 1158 | return -1; /* fallback taken */ |
michael@0 | 1159 | } else { |
michael@0 | 1160 | return 0; /* no mapping */ |
michael@0 | 1161 | } |
michael@0 | 1162 | } |
michael@0 | 1163 | |
michael@0 | 1164 | /* |
michael@0 | 1165 | * Check that the result is a 2-byte value with each byte in the range A1..FE |
michael@0 | 1166 | * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte |
michael@0 | 1167 | * to move it to the ISO 2022 range 21..7E. |
michael@0 | 1168 | * Return 0 if out of range. The casts look only at the low 16 / low 8 bits; callers presumably pass values that fit in 16 bits -- TODO confirm. |
michael@0 | 1169 | */ |
michael@0 | 1170 | static inline uint32_t |
michael@0 | 1171 | _2022FromGR94DBCS(uint32_t value) { |
michael@0 | 1172 | if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && |
michael@0 | 1173 | (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) |
michael@0 | 1174 | ) { |
michael@0 | 1175 | return value - 0x8080; /* shift down to 21..7e byte range */ |
michael@0 | 1176 | } else { |
michael@0 | 1177 | return 0; /* not valid for ISO 2022 */ |
michael@0 | 1178 | } |
michael@0 | 1179 | } |
michael@0 | 1180 | |
michael@0 | 1181 | #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ |
michael@0 | 1182 | /* |
michael@0 | 1183 | * This function does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the |
michael@0 | 1184 | * 2-byte value with 0x8080 added if each resulting byte is in the range A1..FE. Otherwise it returns |
michael@0 | 1185 | * the 2022 code point unchanged. |
michael@0 | 1186 | */ |
michael@0 | 1187 | static inline uint32_t |
michael@0 | 1188 | _2022ToGR94DBCS(uint32_t value) { |
michael@0 | 1189 | uint32_t returnValue = value + 0x8080; |
michael@0 | 1190 | if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && |
michael@0 | 1191 | (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { |
michael@0 | 1192 | return returnValue; |
michael@0 | 1193 | } else { |
michael@0 | 1194 | return value; |
michael@0 | 1195 | } |
michael@0 | 1196 | } |
michael@0 | 1197 | #endif |
michael@0 | 1198 | |
michael@0 | 1199 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 1200 | |
michael@0 | 1201 | /********************************************************************************** |
michael@0 | 1202 | * ISO-2022 Converter |
michael@0 | 1203 | * |
michael@0 | 1204 | * |
michael@0 | 1205 | */ |
michael@0 | 1206 | |
michael@0 | 1207 | static void |
michael@0 | 1208 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
michael@0 | 1209 | UErrorCode* err){ /* generic ISO-2022 toUnicode loop: convert each run of bytes up to mySourceLimit with the current sub-converter, then let changeState_2022() process what follows */ |
michael@0 | 1210 | const char* mySourceLimit, *realSourceLimit; |
michael@0 | 1211 | const char* sourceStart; |
michael@0 | 1212 | const UChar* myTargetStart; |
michael@0 | 1213 | UConverter* saveThis; |
michael@0 | 1214 | UConverterDataISO2022* myData; |
michael@0 | 1215 | int8_t length; |
michael@0 | 1216 | |
michael@0 | 1217 | saveThis = args->converter; /* keep the outer converter: args->converter is swapped to the sub-converter around ucnv_toUnicode() below */ |
michael@0 | 1218 | myData=((UConverterDataISO2022*)(saveThis->extraInfo)); |
michael@0 | 1219 | |
michael@0 | 1220 | realSourceLimit = args->sourceLimit; |
michael@0 | 1221 | while (args->source < realSourceLimit) { |
michael@0 | 1222 | if(myData->key == 0) { /* are we in the middle of an escape sequence? (this branch runs when key==0) */ |
michael@0 | 1223 | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
michael@0 | 1224 | mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); |
michael@0 | 1225 | |
michael@0 | 1226 | if(args->source < mySourceLimit) { |
michael@0 | 1227 | if(myData->currentConverter==NULL) { /* no sub-converter yet: open an ASCII one */ |
michael@0 | 1228 | myData->currentConverter = ucnv_open("ASCII",err); |
michael@0 | 1229 | if(U_FAILURE(*err)){ |
michael@0 | 1230 | return; |
michael@0 | 1231 | } |
michael@0 | 1232 | |
michael@0 | 1233 | myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
michael@0 | 1234 | saveThis->mode = UCNV_SO; |
michael@0 | 1235 | } |
michael@0 | 1236 | |
michael@0 | 1237 | /* convert to before the ESC or until the end of the buffer */ |
michael@0 | 1238 | myData->isFirstBuffer=FALSE; |
michael@0 | 1239 | sourceStart = args->source; /* remembered so progress can be detected after the call */ |
michael@0 | 1240 | myTargetStart = args->target; |
michael@0 | 1241 | args->converter = myData->currentConverter; |
michael@0 | 1242 | ucnv_toUnicode(args->converter, |
michael@0 | 1243 | &args->target, |
michael@0 | 1244 | args->targetLimit, |
michael@0 | 1245 | &args->source, |
michael@0 | 1246 | mySourceLimit, |
michael@0 | 1247 | args->offsets, |
michael@0 | 1248 | (UBool)(args->flush && mySourceLimit == realSourceLimit), /* flush the sub-converter only when this run ends at the real end of input */ |
michael@0 | 1249 | err); |
michael@0 | 1250 | args->converter = saveThis; /* restore the outer converter */ |
michael@0 | 1251 | |
michael@0 | 1252 | if (*err == U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 1253 | /* move the overflow buffer from the sub-converter to the outer converter */ |
michael@0 | 1254 | length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; |
michael@0 | 1255 | myData->currentConverter->UCharErrorBufferLength = 0; |
michael@0 | 1256 | if(length > 0) { |
michael@0 | 1257 | uprv_memcpy(saveThis->UCharErrorBuffer, |
michael@0 | 1258 | myData->currentConverter->UCharErrorBuffer, |
michael@0 | 1259 | length*U_SIZEOF_UCHAR); |
michael@0 | 1260 | } |
michael@0 | 1261 | return; |
michael@0 | 1262 | } |
michael@0 | 1263 | |
michael@0 | 1264 | /* |
michael@0 | 1265 | * At least one of: |
michael@0 | 1266 | * -Error while converting |
michael@0 | 1267 | * -Done with entire buffer |
michael@0 | 1268 | * -Need to write offsets or update the current offset |
michael@0 | 1269 | * (leave that up to the code in ucnv.c) |
michael@0 | 1270 | * |
michael@0 | 1271 | * or else we just stopped at an ESC byte and continue with changeState_2022() |
michael@0 | 1272 | */ |
michael@0 | 1273 | if (U_FAILURE(*err) || |
michael@0 | 1274 | (args->source == realSourceLimit) || |
michael@0 | 1275 | (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || /* && binds tighter than ||: this row and the next are two separate alternatives */ |
michael@0 | 1276 | (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) |
michael@0 | 1277 | ) { |
michael@0 | 1278 | /* copy partial or error input for truncated detection and error handling */ |
michael@0 | 1279 | if(U_FAILURE(*err)) { |
michael@0 | 1280 | length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; |
michael@0 | 1281 | if(length > 0) { |
michael@0 | 1282 | uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); |
michael@0 | 1283 | } |
michael@0 | 1284 | } else { |
michael@0 | 1285 | length = saveThis->toULength = myData->currentConverter->toULength; |
michael@0 | 1286 | if(length > 0) { |
michael@0 | 1287 | uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); |
michael@0 | 1288 | if(args->source < mySourceLimit) { |
michael@0 | 1289 | *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ |
michael@0 | 1290 | } |
michael@0 | 1291 | } |
michael@0 | 1292 | } |
michael@0 | 1293 | return; |
michael@0 | 1294 | } |
michael@0 | 1295 | } |
michael@0 | 1296 | } |
michael@0 | 1297 | |
michael@0 | 1298 | sourceStart = args->source; /* reused to detect whether changeState_2022() consumed input */ |
michael@0 | 1299 | changeState_2022(args->converter, |
michael@0 | 1300 | &(args->source), |
michael@0 | 1301 | realSourceLimit, |
michael@0 | 1302 | ISO_2022, |
michael@0 | 1303 | err); |
michael@0 | 1304 | if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { |
michael@0 | 1305 | /* let the ucnv.c code update its current offset */ |
michael@0 | 1306 | return; |
michael@0 | 1307 | } |
michael@0 | 1308 | } |
michael@0 | 1309 | } |
michael@0 | 1310 | |
michael@0 | 1311 | #endif |
michael@0 | 1312 | |
michael@0 | 1313 | /* |
michael@0 | 1314 | * To Unicode Callback helper function: records the offending input in cnv->toUBytes/cnv->toULength (two bytes, high byte first, when sourceChar>0xff, otherwise one byte) and sets *err to U_INVALID_CHAR_FOUND when targetUniChar equals missingCharMarker-1, otherwise to U_ILLEGAL_CHAR_FOUND. |
michael@0 | 1315 | */ |
michael@0 | 1316 | static void |
michael@0 | 1317 | toUnicodeCallback(UConverter *cnv, |
michael@0 | 1318 | const uint32_t sourceChar, const uint32_t targetUniChar, |
michael@0 | 1319 | UErrorCode* err){ |
michael@0 | 1320 | if(sourceChar>0xff){ /* double-byte input */ |
michael@0 | 1321 | cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
michael@0 | 1322 | cnv->toUBytes[1] = (uint8_t)sourceChar; |
michael@0 | 1323 | cnv->toULength = 2; |
michael@0 | 1324 | } |
michael@0 | 1325 | else{ /* single-byte input */ |
michael@0 | 1326 | cnv->toUBytes[0] =(char) sourceChar; |
michael@0 | 1327 | cnv->toULength = 1; |
michael@0 | 1328 | } |
michael@0 | 1329 | |
michael@0 | 1330 | if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
michael@0 | 1331 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 1332 | } |
michael@0 | 1333 | else{ |
michael@0 | 1334 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1335 | } |
michael@0 | 1336 | } |
michael@0 | 1337 | |
michael@0 | 1338 | /**************************************ISO-2022-JP*************************************************/ |
michael@0 | 1339 | |
michael@0 | 1340 | /************************************** IMPORTANT ************************************************** |
michael@0 | 1341 | * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and |
michael@0 | 1342 | * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
michael@0 | 1343 | * The converter iterates over each Unicode codepoint |
michael@0 | 1344 | * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is |
michael@0 | 1345 | * processed one char at a time it would make sense to reduce the extra processing a canned converter |
michael@0 | 1346 | * would do as far as possible. |
michael@0 | 1347 | * |
michael@0 | 1348 | * If the implementation of these macros or structure of sharedData struct change in the future, make |
michael@0 | 1349 | * sure that ISO-2022 is also changed. |
michael@0 | 1350 | *************************************************************************************************** |
michael@0 | 1351 | */ |
michael@0 | 1352 | |
michael@0 | 1353 | /*************************************************************************************************** |
michael@0 | 1354 | * Rules for ISO-2022-jp encoding |
michael@0 | 1355 | * (i) Escape sequences must be fully contained within a line; they should not |
michael@0 | 1356 | * span new lines or CRs |
michael@0 | 1357 | * (ii) If the last character on a line is represented by two bytes then an ASCII or |
michael@0 | 1358 | * JIS-Roman character escape sequence should follow before the line terminates |
michael@0 | 1359 | * (iii) If the first character on the line is represented by two bytes then a two |
michael@0 | 1360 | * byte character escape sequence should precede it |
michael@0 | 1361 | * (iv) If no escape sequence is encountered then the characters are ASCII |
michael@0 | 1362 | * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, |
michael@0 | 1363 | * and invoked with SS2 (ESC N). |
michael@0 | 1364 | * (vi) If there is any G0 designation in text, there must be a switch to |
michael@0 | 1365 | * ASCII or to JIS X 0201-Roman before a space character (but not |
michael@0 | 1366 | * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control |
michael@0 | 1367 | * characters such as tab or CRLF. |
michael@0 | 1368 | * (vii) Supported encodings: |
michael@0 | 1369 | * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 |
michael@0 | 1370 | * |
michael@0 | 1371 | * source : RFC-1554 |
michael@0 | 1372 | * |
michael@0 | 1373 | * JISX201, JISX208,JISX212 : new .cnv data files created |
michael@0 | 1374 | * KSC5601 : alias to ibm-949 mapping table |
michael@0 | 1375 | * GB2312 : alias to ibm-1386 mapping table |
michael@0 | 1376 | * ISO-8859-1 : Algorithmic implemented as LATIN1 case |
michael@0 | 1377 | * ISO-8859-7 : alias to ibm-9409 mapping table |
michael@0 | 1378 | */ |
michael@0 | 1379 | |
michael@0 | 1380 | /* preference order of JP charsets: the fromUnicode code walks this array in order and appends every charset that is still allowed and not yet listed to its choices[] */ |
michael@0 | 1381 | static const StateEnum jpCharsetPref[]={ |
michael@0 | 1382 | ASCII, |
michael@0 | 1383 | JISX201, |
michael@0 | 1384 | ISO8859_1, |
michael@0 | 1385 | ISO8859_7, |
michael@0 | 1386 | JISX208, |
michael@0 | 1387 | JISX212, |
michael@0 | 1388 | GB2312, |
michael@0 | 1389 | KSC5601, |
michael@0 | 1390 | HWKANA_7BIT |
michael@0 | 1391 | }; |
michael@0 | 1392 | |
michael@0 | 1393 | /* |
michael@0 | 1394 | * Designation escape sequences, indexed by charset value (the fromUnicode code reads escSeqChars[cs]). The escape sequences must be in order of the enum constants like JISX201 = 3, |
michael@0 | 1395 | * not in order of jpCharsetPref[]! The number of bytes to copy from each entry is given by escSeqCharsLen[] at the same index. |
michael@0 | 1396 | */ |
michael@0 | 1397 | static const char escSeqChars[][6] ={ |
michael@0 | 1398 | "\x1B\x28\x42", /* <ESC>(B ASCII */ |
michael@0 | 1399 | "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ |
michael@0 | 1400 | "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ |
michael@0 | 1401 | "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ |
michael@0 | 1402 | "\x1B\x24\x42", /* <ESC>$B JISX-208 */ |
michael@0 | 1403 | "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ |
michael@0 | 1404 | "\x1B\x24\x41", /* <ESC>$A GB2312 */ |
michael@0 | 1405 | "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ |
michael@0 | 1406 | "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ |
michael@0 | 1407 | |
michael@0 | 1408 | }; |
michael@0 | 1409 | static const int8_t escSeqCharsLen[] ={ /* byte length of each escSeqChars[] entry, same index; read as escSeqCharsLen[cs] when a designation sequence is written */ |
michael@0 | 1410 | 3, /* length of <ESC>(B ASCII */ |
michael@0 | 1411 | 3, /* length of <ESC>.A ISO-8859-1 */ |
michael@0 | 1412 | 3, /* length of <ESC>.F ISO-8859-7 */ |
michael@0 | 1413 | 3, /* length of <ESC>(J JISX-201 */ |
michael@0 | 1414 | 3, /* length of <ESC>$B JISX-208 */ |
michael@0 | 1415 | 4, /* length of <ESC>$(D JISX-212 */ |
michael@0 | 1416 | 3, /* length of <ESC>$A GB2312 */ |
michael@0 | 1417 | 4, /* length of <ESC>$(C KSC5601 */ |
michael@0 | 1418 | 3 /* length of <ESC>(I HWKANA_7BIT */ |
michael@0 | 1419 | }; |
michael@0 | 1420 | |
michael@0 | 1421 | /* |
michael@0 | 1422 | * The iteration over various code pages works this way: |
michael@0 | 1423 | * i) Get the currentState from myConverterData->currentState |
michael@0 | 1424 | * ii) Check if the character is mapped to a valid character in the currentState |
michael@0 | 1425 | * Yes -> a) set the initIterState to currentState |
michael@0 | 1426 | * b) remain in this state until an invalid character is found |
michael@0 | 1427 | * No -> a) go to the next code page and find the character |
michael@0 | 1428 | * iii) Before changing the state increment the current state check if the current state |
michael@0 | 1429 | * is equal to the initIterState |
michael@0 | 1430 | * Yes -> A character that cannot be represented in any of the supported encodings |
michael@0 | 1431 | * break and return a U_INVALID_CHAR_FOUND error |
michael@0 | 1432 | * No -> Continue and find the character in next code page |
michael@0 | 1433 | * |
michael@0 | 1434 | * |
michael@0 | 1435 | * TODO: Implement a priority technique where the users are allowed to set the priority of code pages |
michael@0 | 1436 | */ |
michael@0 | 1437 | |
michael@0 | 1438 | /* Map 00..7F to Unicode according to JIS X 0201: every value is returned unchanged except 5C, which becomes U+00A5, and 7E, which becomes U+203E. */ |
michael@0 | 1439 | static inline uint32_t |
michael@0 | 1440 | jisx201ToU(uint32_t value) { |
michael@0 | 1441 | if(value < 0x5c) { |
michael@0 | 1442 | return value; |
michael@0 | 1443 | } else if(value == 0x5c) { |
michael@0 | 1444 | return 0xa5; |
michael@0 | 1445 | } else if(value == 0x7e) { |
michael@0 | 1446 | return 0x203e; |
michael@0 | 1447 | } else /* value <= 0x7f */ { |
michael@0 | 1448 | return value; |
michael@0 | 1449 | } |
michael@0 | 1450 | } |
michael@0 | 1451 | |
michael@0 | 1452 | /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. U+005C and U+007E are themselves unmappable; U+00A5 maps to 5C and U+203E maps to 7E. */ |
michael@0 | 1453 | static inline uint32_t |
michael@0 | 1454 | jisx201FromU(uint32_t value) { |
michael@0 | 1455 | if(value<=0x7f) { |
michael@0 | 1456 | if(value!=0x5c && value!=0x7e) { |
michael@0 | 1457 | return value; |
michael@0 | 1458 | } /* 5C and 7E fall through to the unmappable result below */ |
michael@0 | 1459 | } else if(value==0xa5) { |
michael@0 | 1460 | return 0x5c; |
michael@0 | 1461 | } else if(value==0x203e) { |
michael@0 | 1462 | return 0x7e; |
michael@0 | 1463 | } |
michael@0 | 1464 | return 0xfffe; |
michael@0 | 1465 | } |
michael@0 | 1466 | |
michael@0 | 1467 | /* |
michael@0 | 1468 | * Take a valid Shift-JIS byte pair, check that it is in the range corresponding |
michael@0 | 1469 | * to JIS X 0208, and convert it to a pair of 21..7E bytes. |
michael@0 | 1470 | * Return 0 if the byte pair is out of range (value > 0xEFFC). That is the only check made here; the lead and trail bytes are otherwise trusted, which is why the input must already be a valid Shift-JIS pair. |
michael@0 | 1471 | */ |
michael@0 | 1472 | static inline uint32_t |
michael@0 | 1473 | _2022FromSJIS(uint32_t value) { |
michael@0 | 1474 | uint8_t trail; |
michael@0 | 1475 | |
michael@0 | 1476 | if(value > 0xEFFC) { |
michael@0 | 1477 | return 0; /* beyond JIS X 0208 */ |
michael@0 | 1478 | } |
michael@0 | 1479 | |
michael@0 | 1480 | trail = (uint8_t)value; |
michael@0 | 1481 | |
michael@0 | 1482 | value &= 0xff00; /* lead byte */ |
michael@0 | 1483 | if(value <= 0x9f00) { |
michael@0 | 1484 | value -= 0x7000; |
michael@0 | 1485 | } else /* 0xe000 <= value <= 0xef00 */ { |
michael@0 | 1486 | value -= 0xb000; |
michael@0 | 1487 | } |
michael@0 | 1488 | value <<= 1; /* each Shift-JIS lead byte covers two result lead-byte values */ |
michael@0 | 1489 | |
michael@0 | 1490 | if(trail <= 0x9e) { |
michael@0 | 1491 | value -= 0x100; /* lower trail range: use the first (odd) of the two lead-byte values */ |
michael@0 | 1492 | if(trail <= 0x7e) { |
michael@0 | 1493 | value |= trail - 0x1f; |
michael@0 | 1494 | } else { |
michael@0 | 1495 | value |= trail - 0x20; |
michael@0 | 1496 | } |
michael@0 | 1497 | } else /* trail <= 0xfc */ { |
michael@0 | 1498 | value |= trail - 0x7e; |
michael@0 | 1499 | } |
michael@0 | 1500 | return value; |
michael@0 | 1501 | } |
michael@0 | 1502 | |
michael@0 | 1503 | /* |
michael@0 | 1504 | * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS; the result is written to bytes[0] (lead) and bytes[1] (trail). |
michael@0 | 1505 | * If either byte is outside 21..7E make sure that the result is not valid |
michael@0 | 1506 | * for Shift-JIS so that the converter catches it (the code stores 0 for a byte it rejects). |
michael@0 | 1507 | * Some invalid byte values already turn into equally invalid Shift-JIS |
michael@0 | 1508 | * byte values and need not be tested explicitly. |
michael@0 | 1509 | */ |
michael@0 | 1510 | static inline void |
michael@0 | 1511 | _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { |
michael@0 | 1512 | if(c1&1) { /* odd lead byte: trail lands in the lower Shift-JIS trail range */ |
michael@0 | 1513 | ++c1; |
michael@0 | 1514 | if(c2 <= 0x5f) { |
michael@0 | 1515 | c2 += 0x1f; |
michael@0 | 1516 | } else if(c2 <= 0x7e) { |
michael@0 | 1517 | c2 += 0x20; |
michael@0 | 1518 | } else { |
michael@0 | 1519 | c2 = 0; /* invalid */ |
michael@0 | 1520 | } |
michael@0 | 1521 | } else { /* even lead byte: trail is shifted up by 0x7e */ |
michael@0 | 1522 | if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { |
michael@0 | 1523 | c2 += 0x7e; |
michael@0 | 1524 | } else { |
michael@0 | 1525 | c2 = 0; /* invalid */ |
michael@0 | 1526 | } |
michael@0 | 1527 | } |
michael@0 | 1528 | c1 >>= 1; /* two input lead-byte values share one Shift-JIS lead byte */ |
michael@0 | 1529 | if(c1 <= 0x2f) { |
michael@0 | 1530 | c1 += 0x70; |
michael@0 | 1531 | } else if(c1 <= 0x3f) { |
michael@0 | 1532 | c1 += 0xb0; |
michael@0 | 1533 | } else { |
michael@0 | 1534 | c1 = 0; /* invalid */ |
michael@0 | 1535 | } |
michael@0 | 1536 | bytes[0] = (char)c1; |
michael@0 | 1537 | bytes[1] = (char)c2; |
michael@0 | 1538 | } |
michael@0 | 1539 | |
michael@0 | 1540 | /* |
michael@0 | 1541 | * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) |
michael@0 | 1542 | * Katakana. |
michael@0 | 1543 | * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks |
michael@0 | 1544 | * because Shift-JIS roundtrips half-width Katakana to single bytes. |
michael@0 | 1545 | * These were the only fallbacks in ICU's jisx-208.ucm file. |
michael@0 | 1546 | * The table is indexed by (code point - HWKANA_START); each entry is the two-byte value the JISX208 fallback branch of the fromUnicode code writes out. |
michael@0 | 1547 | static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { |
michael@0 | 1548 | 0x2123, /* U+FF61 */ |
michael@0 | 1549 | 0x2156, |
michael@0 | 1550 | 0x2157, |
michael@0 | 1551 | 0x2122, |
michael@0 | 1552 | 0x2126, |
michael@0 | 1553 | 0x2572, |
michael@0 | 1554 | 0x2521, |
michael@0 | 1555 | 0x2523, |
michael@0 | 1556 | 0x2525, |
michael@0 | 1557 | 0x2527, |
michael@0 | 1558 | 0x2529, |
michael@0 | 1559 | 0x2563, |
michael@0 | 1560 | 0x2565, |
michael@0 | 1561 | 0x2567, |
michael@0 | 1562 | 0x2543, |
michael@0 | 1563 | 0x213C, /* U+FF70 */ |
michael@0 | 1564 | 0x2522, |
michael@0 | 1565 | 0x2524, |
michael@0 | 1566 | 0x2526, |
michael@0 | 1567 | 0x2528, |
michael@0 | 1568 | 0x252A, |
michael@0 | 1569 | 0x252B, |
michael@0 | 1570 | 0x252D, |
michael@0 | 1571 | 0x252F, |
michael@0 | 1572 | 0x2531, |
michael@0 | 1573 | 0x2533, |
michael@0 | 1574 | 0x2535, |
michael@0 | 1575 | 0x2537, |
michael@0 | 1576 | 0x2539, |
michael@0 | 1577 | 0x253B, |
michael@0 | 1578 | 0x253D, |
michael@0 | 1579 | 0x253F, /* U+FF80 */ |
michael@0 | 1580 | 0x2541, |
michael@0 | 1581 | 0x2544, |
michael@0 | 1582 | 0x2546, |
michael@0 | 1583 | 0x2548, |
michael@0 | 1584 | 0x254A, |
michael@0 | 1585 | 0x254B, |
michael@0 | 1586 | 0x254C, |
michael@0 | 1587 | 0x254D, |
michael@0 | 1588 | 0x254E, |
michael@0 | 1589 | 0x254F, |
michael@0 | 1590 | 0x2552, |
michael@0 | 1591 | 0x2555, |
michael@0 | 1592 | 0x2558, |
michael@0 | 1593 | 0x255B, |
michael@0 | 1594 | 0x255E, |
michael@0 | 1595 | 0x255F, /* U+FF90 */ |
michael@0 | 1596 | 0x2560, |
michael@0 | 1597 | 0x2561, |
michael@0 | 1598 | 0x2562, |
michael@0 | 1599 | 0x2564, |
michael@0 | 1600 | 0x2566, |
michael@0 | 1601 | 0x2568, |
michael@0 | 1602 | 0x2569, |
michael@0 | 1603 | 0x256A, |
michael@0 | 1604 | 0x256B, |
michael@0 | 1605 | 0x256C, |
michael@0 | 1606 | 0x256D, |
michael@0 | 1607 | 0x256F, |
michael@0 | 1608 | 0x2573, |
michael@0 | 1609 | 0x212B, |
michael@0 | 1610 | 0x212C /* U+FF9F */ |
michael@0 | 1611 | }; |
michael@0 | 1612 | |
michael@0 | 1613 | static void |
michael@0 | 1614 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
michael@0 | 1615 | UConverter *cnv = args->converter; |
michael@0 | 1616 | UConverterDataISO2022 *converterData; |
michael@0 | 1617 | ISO2022State *pFromU2022State; |
michael@0 | 1618 | uint8_t *target = (uint8_t *) args->target; |
michael@0 | 1619 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
michael@0 | 1620 | const UChar* source = args->source; |
michael@0 | 1621 | const UChar* sourceLimit = args->sourceLimit; |
michael@0 | 1622 | int32_t* offsets = args->offsets; |
michael@0 | 1623 | UChar32 sourceChar; |
michael@0 | 1624 | char buffer[8]; |
michael@0 | 1625 | int32_t len, outLen; |
michael@0 | 1626 | int8_t choices[10]; |
michael@0 | 1627 | int32_t choiceCount; |
michael@0 | 1628 | uint32_t targetValue = 0; |
michael@0 | 1629 | UBool useFallback; |
michael@0 | 1630 | |
michael@0 | 1631 | int32_t i; |
michael@0 | 1632 | int8_t cs, g; |
michael@0 | 1633 | |
michael@0 | 1634 | /* set up the state */ |
michael@0 | 1635 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
michael@0 | 1636 | pFromU2022State = &converterData->fromU2022State; |
michael@0 | 1637 | |
michael@0 | 1638 | choiceCount = 0; |
michael@0 | 1639 | |
michael@0 | 1640 | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
michael@0 | 1641 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
michael@0 | 1642 | goto getTrail; |
michael@0 | 1643 | } |
michael@0 | 1644 | |
michael@0 | 1645 | while(source < sourceLimit) { |
michael@0 | 1646 | if(target < targetLimit) { |
michael@0 | 1647 | |
michael@0 | 1648 | sourceChar = *(source++); |
michael@0 | 1649 | /*check if the char is a First surrogate*/ |
michael@0 | 1650 | if(U16_IS_SURROGATE(sourceChar)) { |
michael@0 | 1651 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
michael@0 | 1652 | getTrail: |
michael@0 | 1653 | /*look ahead to find the trail surrogate*/ |
michael@0 | 1654 | if(source < sourceLimit) { |
michael@0 | 1655 | /* test the following code unit */ |
michael@0 | 1656 | UChar trail=(UChar) *source; |
michael@0 | 1657 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1658 | source++; |
michael@0 | 1659 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
michael@0 | 1660 | cnv->fromUChar32=0x00; |
michael@0 | 1661 | /* convert this supplementary code point */ |
michael@0 | 1662 | /* exit this condition tree */ |
michael@0 | 1663 | } else { |
michael@0 | 1664 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 1665 | /* callback(illegal) */ |
michael@0 | 1666 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1667 | cnv->fromUChar32=sourceChar; |
michael@0 | 1668 | break; |
michael@0 | 1669 | } |
michael@0 | 1670 | } else { |
michael@0 | 1671 | /* no more input */ |
michael@0 | 1672 | cnv->fromUChar32=sourceChar; |
michael@0 | 1673 | break; |
michael@0 | 1674 | } |
michael@0 | 1675 | } else { |
michael@0 | 1676 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 1677 | /* callback(illegal) */ |
michael@0 | 1678 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1679 | cnv->fromUChar32=sourceChar; |
michael@0 | 1680 | break; |
michael@0 | 1681 | } |
michael@0 | 1682 | } |
michael@0 | 1683 | |
michael@0 | 1684 | /* do not convert SO/SI/ESC */ |
michael@0 | 1685 | if(IS_2022_CONTROL(sourceChar)) { |
michael@0 | 1686 | /* callback(illegal) */ |
michael@0 | 1687 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1688 | cnv->fromUChar32=sourceChar; |
michael@0 | 1689 | break; |
michael@0 | 1690 | } |
michael@0 | 1691 | |
michael@0 | 1692 | /* do the conversion */ |
michael@0 | 1693 | |
michael@0 | 1694 | if(choiceCount == 0) { |
michael@0 | 1695 | uint16_t csm; |
michael@0 | 1696 | |
michael@0 | 1697 | /* |
michael@0 | 1698 | * The csm variable keeps track of which charsets are allowed |
michael@0 | 1699 | * and not used yet while building the choices[]. |
michael@0 | 1700 | */ |
michael@0 | 1701 | csm = jpCharsetMasks[converterData->version]; |
michael@0 | 1702 | choiceCount = 0; |
michael@0 | 1703 | |
michael@0 | 1704 | /* JIS7/8: try single-byte half-width Katakana before JISX208 */ |
michael@0 | 1705 | if(converterData->version == 3 || converterData->version == 4) { |
michael@0 | 1706 | choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
michael@0 | 1707 | } |
michael@0 | 1708 | /* Do not try single-byte half-width Katakana for other versions. */ |
michael@0 | 1709 | csm &= ~CSM(HWKANA_7BIT); |
michael@0 | 1710 | |
michael@0 | 1711 | /* try the current G0 charset */ |
michael@0 | 1712 | choices[choiceCount++] = cs = pFromU2022State->cs[0]; |
michael@0 | 1713 | csm &= ~CSM(cs); |
michael@0 | 1714 | |
michael@0 | 1715 | /* try the current G2 charset */ |
michael@0 | 1716 | if((cs = pFromU2022State->cs[2]) != 0) { |
michael@0 | 1717 | choices[choiceCount++] = cs; |
michael@0 | 1718 | csm &= ~CSM(cs); |
michael@0 | 1719 | } |
michael@0 | 1720 | |
michael@0 | 1721 | /* try all the other possible charsets */ |
michael@0 | 1722 | for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { |
michael@0 | 1723 | cs = (int8_t)jpCharsetPref[i]; |
michael@0 | 1724 | if(CSM(cs) & csm) { |
michael@0 | 1725 | choices[choiceCount++] = cs; |
michael@0 | 1726 | csm &= ~CSM(cs); |
michael@0 | 1727 | } |
michael@0 | 1728 | } |
michael@0 | 1729 | } |
michael@0 | 1730 | |
michael@0 | 1731 | cs = g = 0; |
michael@0 | 1732 | /* |
michael@0 | 1733 | * len==0: no mapping found yet |
michael@0 | 1734 | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
michael@0 | 1735 | * len>0: found a roundtrip result, done |
michael@0 | 1736 | */ |
michael@0 | 1737 | len = 0; |
michael@0 | 1738 | /* |
michael@0 | 1739 | * We will turn off useFallback after finding a fallback, |
michael@0 | 1740 | * but we still get fallbacks from PUA code points as usual. |
michael@0 | 1741 | * Therefore, we will also need to check that we don't overwrite |
michael@0 | 1742 | * an early fallback with a later one. |
michael@0 | 1743 | */ |
michael@0 | 1744 | useFallback = cnv->useFallback; |
michael@0 | 1745 | |
michael@0 | 1746 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
michael@0 | 1747 | uint32_t value; |
michael@0 | 1748 | int32_t len2; |
michael@0 | 1749 | int8_t cs0 = choices[i]; |
michael@0 | 1750 | switch(cs0) { |
michael@0 | 1751 | case ASCII: |
michael@0 | 1752 | if(sourceChar <= 0x7f) { |
michael@0 | 1753 | targetValue = (uint32_t)sourceChar; |
michael@0 | 1754 | len = 1; |
michael@0 | 1755 | cs = cs0; |
michael@0 | 1756 | g = 0; |
michael@0 | 1757 | } |
michael@0 | 1758 | break; |
michael@0 | 1759 | case ISO8859_1: |
michael@0 | 1760 | if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
michael@0 | 1761 | targetValue = (uint32_t)sourceChar - 0x80; |
michael@0 | 1762 | len = 1; |
michael@0 | 1763 | cs = cs0; |
michael@0 | 1764 | g = 2; |
michael@0 | 1765 | } |
michael@0 | 1766 | break; |
michael@0 | 1767 | case HWKANA_7BIT: |
michael@0 | 1768 | if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
michael@0 | 1769 | if(converterData->version==3) { |
michael@0 | 1770 | /* JIS7: use G1 (SO) */ |
michael@0 | 1771 | /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
michael@0 | 1772 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); |
michael@0 | 1773 | len = 1; |
michael@0 | 1774 | pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ |
michael@0 | 1775 | g = 1; |
michael@0 | 1776 | } else if(converterData->version==4) { |
michael@0 | 1777 | /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ |
michael@0 | 1778 | /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
michael@0 | 1779 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); |
michael@0 | 1780 | len = 1; |
michael@0 | 1781 | |
michael@0 | 1782 | cs = pFromU2022State->cs[0]; |
michael@0 | 1783 | if(IS_JP_DBCS(cs)) { |
michael@0 | 1784 | /* switch from a DBCS charset to JISX201 */ |
michael@0 | 1785 | cs = (int8_t)JISX201; |
michael@0 | 1786 | } |
michael@0 | 1787 | /* else stay in the current G0 charset */ |
michael@0 | 1788 | g = 0; |
michael@0 | 1789 | } |
michael@0 | 1790 | /* else do not use HWKANA_7BIT with other versions */ |
michael@0 | 1791 | } |
michael@0 | 1792 | break; |
michael@0 | 1793 | case JISX201: |
michael@0 | 1794 | /* G0 SBCS */ |
michael@0 | 1795 | value = jisx201FromU(sourceChar); |
michael@0 | 1796 | if(value <= 0x7f) { |
michael@0 | 1797 | targetValue = value; |
michael@0 | 1798 | len = 1; |
michael@0 | 1799 | cs = cs0; |
michael@0 | 1800 | g = 0; |
michael@0 | 1801 | useFallback = FALSE; |
michael@0 | 1802 | } |
michael@0 | 1803 | break; |
michael@0 | 1804 | case JISX208: |
michael@0 | 1805 | /* G0 DBCS from Shift-JIS table */ |
michael@0 | 1806 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
michael@0 | 1807 | converterData->myConverterArray[cs0], |
michael@0 | 1808 | sourceChar, &value, |
michael@0 | 1809 | useFallback, MBCS_OUTPUT_2); |
michael@0 | 1810 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
michael@0 | 1811 | value = _2022FromSJIS(value); |
michael@0 | 1812 | if(value != 0) { |
michael@0 | 1813 | targetValue = value; |
michael@0 | 1814 | len = len2; |
michael@0 | 1815 | cs = cs0; |
michael@0 | 1816 | g = 0; |
michael@0 | 1817 | useFallback = FALSE; |
michael@0 | 1818 | } |
michael@0 | 1819 | } else if(len == 0 && useFallback && |
michael@0 | 1820 | (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
michael@0 | 1821 | targetValue = hwkana_fb[sourceChar - HWKANA_START]; |
michael@0 | 1822 | len = -2; |
michael@0 | 1823 | cs = cs0; |
michael@0 | 1824 | g = 0; |
michael@0 | 1825 | useFallback = FALSE; |
michael@0 | 1826 | } |
michael@0 | 1827 | break; |
michael@0 | 1828 | case ISO8859_7: |
michael@0 | 1829 | /* G0 SBCS forced to 7-bit output */ |
michael@0 | 1830 | len2 = MBCS_SINGLE_FROM_UCHAR32( |
michael@0 | 1831 | converterData->myConverterArray[cs0], |
michael@0 | 1832 | sourceChar, &value, |
michael@0 | 1833 | useFallback); |
michael@0 | 1834 | if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { |
michael@0 | 1835 | targetValue = value - 0x80; |
michael@0 | 1836 | len = len2; |
michael@0 | 1837 | cs = cs0; |
michael@0 | 1838 | g = 2; |
michael@0 | 1839 | useFallback = FALSE; |
michael@0 | 1840 | } |
michael@0 | 1841 | break; |
michael@0 | 1842 | default: |
michael@0 | 1843 | /* G0 DBCS */ |
michael@0 | 1844 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
michael@0 | 1845 | converterData->myConverterArray[cs0], |
michael@0 | 1846 | sourceChar, &value, |
michael@0 | 1847 | useFallback, MBCS_OUTPUT_2); |
michael@0 | 1848 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
michael@0 | 1849 | if(cs0 == KSC5601) { |
michael@0 | 1850 | /* |
michael@0 | 1851 | * Check for valid bytes for the encoding scheme. |
michael@0 | 1852 | * This is necessary because the sub-converter (windows-949) |
michael@0 | 1853 | * has a broader encoding scheme than is valid for 2022. |
michael@0 | 1854 | */ |
michael@0 | 1855 | value = _2022FromGR94DBCS(value); |
michael@0 | 1856 | if(value == 0) { |
michael@0 | 1857 | break; |
michael@0 | 1858 | } |
michael@0 | 1859 | } |
michael@0 | 1860 | targetValue = value; |
michael@0 | 1861 | len = len2; |
michael@0 | 1862 | cs = cs0; |
michael@0 | 1863 | g = 0; |
michael@0 | 1864 | useFallback = FALSE; |
michael@0 | 1865 | } |
michael@0 | 1866 | break; |
michael@0 | 1867 | } |
michael@0 | 1868 | } |
michael@0 | 1869 | |
michael@0 | 1870 | if(len != 0) { |
michael@0 | 1871 | if(len < 0) { |
michael@0 | 1872 | len = -len; /* fallback */ |
michael@0 | 1873 | } |
michael@0 | 1874 | outLen = 0; /* count output bytes */ |
michael@0 | 1875 | |
michael@0 | 1876 | /* write SI if necessary (only for JIS7) */ |
michael@0 | 1877 | if(pFromU2022State->g == 1 && g == 0) { |
michael@0 | 1878 | buffer[outLen++] = UCNV_SI; |
michael@0 | 1879 | pFromU2022State->g = 0; |
michael@0 | 1880 | } |
michael@0 | 1881 | |
michael@0 | 1882 | /* write the designation sequence if necessary */ |
michael@0 | 1883 | if(cs != pFromU2022State->cs[g]) { |
michael@0 | 1884 | int32_t escLen = escSeqCharsLen[cs]; |
michael@0 | 1885 | uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); |
michael@0 | 1886 | outLen += escLen; |
michael@0 | 1887 | pFromU2022State->cs[g] = cs; |
michael@0 | 1888 | |
michael@0 | 1889 | /* invalidate the choices[] */ |
michael@0 | 1890 | choiceCount = 0; |
michael@0 | 1891 | } |
michael@0 | 1892 | |
michael@0 | 1893 | /* write the shift sequence if necessary */ |
michael@0 | 1894 | if(g != pFromU2022State->g) { |
michael@0 | 1895 | switch(g) { |
michael@0 | 1896 | /* case 0 handled before writing escapes */ |
michael@0 | 1897 | case 1: |
michael@0 | 1898 | buffer[outLen++] = UCNV_SO; |
michael@0 | 1899 | pFromU2022State->g = 1; |
michael@0 | 1900 | break; |
michael@0 | 1901 | default: /* case 2 */ |
michael@0 | 1902 | buffer[outLen++] = 0x1b; |
michael@0 | 1903 | buffer[outLen++] = 0x4e; |
michael@0 | 1904 | break; |
michael@0 | 1905 | /* no case 3: no SS3 in ISO-2022-JP-x */ |
michael@0 | 1906 | } |
michael@0 | 1907 | } |
michael@0 | 1908 | |
michael@0 | 1909 | /* write the output bytes */ |
michael@0 | 1910 | if(len == 1) { |
michael@0 | 1911 | buffer[outLen++] = (char)targetValue; |
michael@0 | 1912 | } else /* len == 2 */ { |
michael@0 | 1913 | buffer[outLen++] = (char)(targetValue >> 8); |
michael@0 | 1914 | buffer[outLen++] = (char)targetValue; |
michael@0 | 1915 | } |
michael@0 | 1916 | } else { |
michael@0 | 1917 | /* |
michael@0 | 1918 | * if we cannot find the character after checking all codepages |
michael@0 | 1919 | * then this is an error |
michael@0 | 1920 | */ |
michael@0 | 1921 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 1922 | cnv->fromUChar32=sourceChar; |
michael@0 | 1923 | break; |
michael@0 | 1924 | } |
michael@0 | 1925 | |
michael@0 | 1926 | if(sourceChar == CR || sourceChar == LF) { |
michael@0 | 1927 | /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ |
michael@0 | 1928 | pFromU2022State->cs[2] = 0; |
michael@0 | 1929 | choiceCount = 0; |
michael@0 | 1930 | } |
michael@0 | 1931 | |
michael@0 | 1932 | /* output outLen>0 bytes in buffer[] */ |
michael@0 | 1933 | if(outLen == 1) { |
michael@0 | 1934 | *target++ = buffer[0]; |
michael@0 | 1935 | if(offsets) { |
michael@0 | 1936 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
michael@0 | 1937 | } |
michael@0 | 1938 | } else if(outLen == 2 && (target + 2) <= targetLimit) { |
michael@0 | 1939 | *target++ = buffer[0]; |
michael@0 | 1940 | *target++ = buffer[1]; |
michael@0 | 1941 | if(offsets) { |
michael@0 | 1942 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
michael@0 | 1943 | *offsets++ = sourceIndex; |
michael@0 | 1944 | *offsets++ = sourceIndex; |
michael@0 | 1945 | } |
michael@0 | 1946 | } else { |
michael@0 | 1947 | fromUWriteUInt8( |
michael@0 | 1948 | cnv, |
michael@0 | 1949 | buffer, outLen, |
michael@0 | 1950 | &target, (const char *)targetLimit, |
michael@0 | 1951 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
michael@0 | 1952 | err); |
michael@0 | 1953 | if(U_FAILURE(*err)) { |
michael@0 | 1954 | break; |
michael@0 | 1955 | } |
michael@0 | 1956 | } |
michael@0 | 1957 | } /* end if(myTargetIndex<myTargetLength) */ |
michael@0 | 1958 | else{ |
michael@0 | 1959 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1960 | break; |
michael@0 | 1961 | } |
michael@0 | 1962 | |
michael@0 | 1963 | }/* end while(mySourceIndex<mySourceLength) */ |
michael@0 | 1964 | |
michael@0 | 1965 | /* |
michael@0 | 1966 | * the end of the input stream and detection of truncated input |
michael@0 | 1967 | * are handled by the framework, but for ISO-2022-JP conversion |
michael@0 | 1968 | * we need to be in ASCII mode at the very end |
michael@0 | 1969 | * |
michael@0 | 1970 | * conditions: |
michael@0 | 1971 | * successful |
michael@0 | 1972 | * in SO mode or not in ASCII mode |
michael@0 | 1973 | * end of input and no truncated input |
michael@0 | 1974 | */ |
michael@0 | 1975 | if( U_SUCCESS(*err) && |
michael@0 | 1976 | (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && |
michael@0 | 1977 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
michael@0 | 1978 | ) { |
michael@0 | 1979 | int32_t sourceIndex; |
michael@0 | 1980 | |
michael@0 | 1981 | outLen = 0; |
michael@0 | 1982 | |
michael@0 | 1983 | if(pFromU2022State->g != 0) { |
michael@0 | 1984 | buffer[outLen++] = UCNV_SI; |
michael@0 | 1985 | pFromU2022State->g = 0; |
michael@0 | 1986 | } |
michael@0 | 1987 | |
michael@0 | 1988 | if(pFromU2022State->cs[0] != ASCII) { |
michael@0 | 1989 | int32_t escLen = escSeqCharsLen[ASCII]; |
michael@0 | 1990 | uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); |
michael@0 | 1991 | outLen += escLen; |
michael@0 | 1992 | pFromU2022State->cs[0] = (int8_t)ASCII; |
michael@0 | 1993 | } |
michael@0 | 1994 | |
michael@0 | 1995 | /* get the source index of the last input character */ |
michael@0 | 1996 | /* |
michael@0 | 1997 | * TODO this would be simpler and more reliable if we used a pair |
michael@0 | 1998 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
michael@0 | 1999 | * so that we could simply use the prevSourceIndex here; |
michael@0 | 2000 | * this code gives an incorrect result for the rare case of an unmatched |
michael@0 | 2001 | * trail surrogate that is alone in the last buffer of the text stream |
michael@0 | 2002 | */ |
michael@0 | 2003 | sourceIndex=(int32_t)(source-args->source); |
michael@0 | 2004 | if(sourceIndex>0) { |
michael@0 | 2005 | --sourceIndex; |
michael@0 | 2006 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
michael@0 | 2007 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
michael@0 | 2008 | ) { |
michael@0 | 2009 | --sourceIndex; |
michael@0 | 2010 | } |
michael@0 | 2011 | } else { |
michael@0 | 2012 | sourceIndex=-1; |
michael@0 | 2013 | } |
michael@0 | 2014 | |
michael@0 | 2015 | fromUWriteUInt8( |
michael@0 | 2016 | cnv, |
michael@0 | 2017 | buffer, outLen, |
michael@0 | 2018 | &target, (const char *)targetLimit, |
michael@0 | 2019 | &offsets, sourceIndex, |
michael@0 | 2020 | err); |
michael@0 | 2021 | } |
michael@0 | 2022 | |
michael@0 | 2023 | /*save the state and return */ |
michael@0 | 2024 | args->source = source; |
michael@0 | 2025 | args->target = (char*)target; |
michael@0 | 2026 | } |
michael@0 | 2027 | |
michael@0 | 2028 | /*************** to unicode *******************/ |
michael@0 | 2029 | |
/*
 * Converts bytes of the ISO-2022-JP family from args->source (up to
 * args->sourceLimit) into UTF-16 code units at args->target (up to
 * args->targetLimit). When args->offsets is set, a source offset is stored
 * for every code unit written.
 *
 * The decoder state (designated charsets cs[], active set g, prevG, the
 * partial-escape key, the isEmptySegment flag and the variant "version") is
 * read from and written to the UConverterDataISO2022 found in
 * args->converter->extraInfo.
 *
 * On every exit path args->source and args->target are updated to the
 * positions reached. Problems are reported through *err
 * (U_BUFFER_OVERFLOW_ERROR, U_ILLEGAL_ESCAPE_SEQUENCE, or whatever
 * changeState_2022() / toUnicodeCallback() set).
 */
michael@0 | 2030 | static void |
michael@0 | 2031 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
michael@0 | 2032 | UErrorCode* err){ |
/* tempBuf: the two bytes handed to ucnv_MBCSSimpleGetNextUChar() for a double-byte character */
michael@0 | 2033 | char tempBuf[2]; |
michael@0 | 2034 | const char *mySource = (char *) args->source; |
michael@0 | 2035 | UChar *myTarget = args->target; |
michael@0 | 2036 | const char *mySourceLimit = args->sourceLimit; |
michael@0 | 2037 | uint32_t targetUniChar = 0x0000; |
michael@0 | 2038 | uint32_t mySourceChar = 0x0000; |
michael@0 | 2039 | uint32_t tmpSourceChar = 0x0000; |
michael@0 | 2040 | UConverterDataISO2022* myData; |
michael@0 | 2041 | ISO2022State *pToU2022State; |
michael@0 | 2042 | StateEnum cs; |
michael@0 | 2043 | |
michael@0 | 2044 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
michael@0 | 2045 | pToU2022State = &myData->toU2022State; |
michael@0 | 2046 | |
/*
 * Resume points: both gotos below jump into the middle of the main loop to
 * continue work that a previous call could not finish.
 */
michael@0 | 2047 | if(myData->key != 0) { |
michael@0 | 2048 | /* continue with a partial escape sequence */ |
michael@0 | 2049 | goto escape; |
michael@0 | 2050 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
michael@0 | 2051 | /* continue with a partial double-byte character */ |
/* toUBytes[0] is the lead byte stored by the "goto endloop" branch further down */
michael@0 | 2052 | mySourceChar = args->converter->toUBytes[0]; |
michael@0 | 2053 | args->converter->toULength = 0; |
michael@0 | 2054 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
michael@0 | 2055 | targetUniChar = missingCharMarker; |
michael@0 | 2056 | goto getTrailByte; |
michael@0 | 2057 | } |
michael@0 | 2058 | |
michael@0 | 2059 | while(mySource < mySourceLimit){ |
michael@0 | 2060 | |
michael@0 | 2061 | targetUniChar =missingCharMarker; |
michael@0 | 2062 | |
michael@0 | 2063 | if(myTarget < args->targetLimit){ |
michael@0 | 2064 | |
michael@0 | 2065 | mySourceChar= (unsigned char) *mySource++; |
michael@0 | 2066 | |
michael@0 | 2067 | switch(mySourceChar) { |
michael@0 | 2068 | case UCNV_SI: |
michael@0 | 2069 | if(myData->version==3) { |
michael@0 | 2070 | pToU2022State->g=0; |
michael@0 | 2071 | continue; |
michael@0 | 2072 | } else { |
michael@0 | 2073 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
michael@0 | 2074 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
michael@0 | 2075 | break; |
michael@0 | 2076 | } |
michael@0 | 2077 | |
michael@0 | 2078 | case UCNV_SO: |
michael@0 | 2079 | if(myData->version==3) { |
michael@0 | 2080 | /* JIS7: switch to G1 half-width Katakana */ |
michael@0 | 2081 | pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
michael@0 | 2082 | pToU2022State->g=1; |
michael@0 | 2083 | continue; |
michael@0 | 2084 | } else { |
michael@0 | 2085 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
michael@0 | 2086 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
michael@0 | 2087 | break; |
michael@0 | 2088 | } |
michael@0 | 2089 | |
michael@0 | 2090 | case ESC_2022: |
/* step back so that mySource points at the ESC byte again when it is passed to changeState_2022() */
michael@0 | 2091 | mySource--; |
michael@0 | 2092 | escape: |
michael@0 | 2093 | { |
/* remember where we were so the length of the consumed escape sequence can be computed for the error case below */
michael@0 | 2094 | const char * mySourceBefore = mySource; |
michael@0 | 2095 | int8_t toULengthBefore = args->converter->toULength; |
michael@0 | 2096 | |
michael@0 | 2097 | changeState_2022(args->converter,&(mySource), |
michael@0 | 2098 | mySourceLimit, ISO_2022_JP,err); |
michael@0 | 2099 | |
michael@0 | 2100 | /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ |
michael@0 | 2101 | if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
michael@0 | 2102 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 2103 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
michael@0 | 2104 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
michael@0 | 2105 | } |
michael@0 | 2106 | } |
michael@0 | 2107 | |
michael@0 | 2108 | /* invalid or illegal escape sequence */ |
michael@0 | 2109 | if(U_FAILURE(*err)){ |
michael@0 | 2110 | args->target = myTarget; |
michael@0 | 2111 | args->source = mySource; |
michael@0 | 2112 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
michael@0 | 2113 | return; |
michael@0 | 2114 | } |
michael@0 | 2115 | /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
michael@0 | 2116 | if(myData->key==0) { |
michael@0 | 2117 | myData->isEmptySegment = TRUE; |
michael@0 | 2118 | } |
michael@0 | 2119 | continue; |
michael@0 | 2120 | |
michael@0 | 2121 | /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
michael@0 | 2122 | |
michael@0 | 2123 | case CR: |
michael@0 | 2124 | /*falls through*/ |
michael@0 | 2125 | case LF: |
michael@0 | 2126 | /* automatically reset to single-byte mode */ |
michael@0 | 2127 | if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { |
michael@0 | 2128 | pToU2022State->cs[0] = (int8_t)ASCII; |
michael@0 | 2129 | } |
michael@0 | 2130 | pToU2022State->cs[2] = 0; |
michael@0 | 2131 | pToU2022State->g = 0; |
michael@0 | 2132 | /* falls through */ |
michael@0 | 2133 | default: |
michael@0 | 2134 | /* convert one or two bytes */ |
michael@0 | 2135 | myData->isEmptySegment = FALSE; |
michael@0 | 2136 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
/* unsigned range check: true for a byte in 0xa1..0xdf */
michael@0 | 2137 | if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && |
michael@0 | 2138 | !IS_JP_DBCS(cs) |
michael@0 | 2139 | ) { |
michael@0 | 2140 | /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ |
michael@0 | 2141 | targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
michael@0 | 2142 | |
michael@0 | 2143 | /* return from a single-shift state to the previous one */ |
michael@0 | 2144 | if(pToU2022State->g >= 2) { |
michael@0 | 2145 | pToU2022State->g=pToU2022State->prevG; |
michael@0 | 2146 | } |
michael@0 | 2147 | } else switch(cs) { |
michael@0 | 2148 | case ASCII: |
michael@0 | 2149 | if(mySourceChar <= 0x7f) { |
michael@0 | 2150 | targetUniChar = mySourceChar; |
michael@0 | 2151 | } |
michael@0 | 2152 | break; |
michael@0 | 2153 | case ISO8859_1: |
michael@0 | 2154 | if(mySourceChar <= 0x7f) { |
michael@0 | 2155 | targetUniChar = mySourceChar + 0x80; |
michael@0 | 2156 | } |
michael@0 | 2157 | /* return from a single-shift state to the previous one */ |
michael@0 | 2158 | pToU2022State->g=pToU2022State->prevG; |
michael@0 | 2159 | break; |
michael@0 | 2160 | case ISO8859_7: |
michael@0 | 2161 | if(mySourceChar <= 0x7f) { |
michael@0 | 2162 | /* convert mySourceChar+0x80 to use a normal 8-bit table */ |
michael@0 | 2163 | targetUniChar = |
michael@0 | 2164 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
michael@0 | 2165 | myData->myConverterArray[cs], |
michael@0 | 2166 | mySourceChar + 0x80); |
michael@0 | 2167 | } |
michael@0 | 2168 | /* return from a single-shift state to the previous one */ |
michael@0 | 2169 | pToU2022State->g=pToU2022State->prevG; |
michael@0 | 2170 | break; |
michael@0 | 2171 | case JISX201: |
michael@0 | 2172 | if(mySourceChar <= 0x7f) { |
michael@0 | 2173 | targetUniChar = jisx201ToU(mySourceChar); |
michael@0 | 2174 | } |
michael@0 | 2175 | break; |
michael@0 | 2176 | case HWKANA_7BIT: |
michael@0 | 2177 | if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
michael@0 | 2178 | /* 7-bit halfwidth Katakana */ |
michael@0 | 2179 | targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
michael@0 | 2180 | } |
michael@0 | 2181 | break; |
michael@0 | 2182 | default: |
michael@0 | 2183 | /* G0 DBCS */ |
michael@0 | 2184 | if(mySource < mySourceLimit) { |
michael@0 | 2185 | int leadIsOk, trailIsOk; |
michael@0 | 2186 | uint8_t trailByte; |
/*
 * Also entered by goto from the top of the function when a lead byte was
 * carried over from the previous call; that goto is guarded by the same
 * "mySource < mySourceLimit" condition as this if-block.
 */
michael@0 | 2187 | getTrailByte: |
michael@0 | 2188 | trailByte = (uint8_t)*mySource; |
michael@0 | 2189 | /* |
michael@0 | 2190 | * Ticket 5691: consistent illegal sequences: |
michael@0 | 2191 | * - We include at least the first byte in the illegal sequence. |
michael@0 | 2192 | * - If any of the non-initial bytes could be the start of a character, |
michael@0 | 2193 | * we stop the illegal sequence before the first one of those. |
michael@0 | 2194 | * |
michael@0 | 2195 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
michael@0 | 2196 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
michael@0 | 2197 | * Otherwise we convert or report the pair of bytes. |
michael@0 | 2198 | */ |
michael@0 | 2199 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
michael@0 | 2200 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
michael@0 | 2201 | if (leadIsOk && trailIsOk) { |
michael@0 | 2202 | ++mySource; |
michael@0 | 2203 | tmpSourceChar = (mySourceChar << 8) | trailByte; |
michael@0 | 2204 | if(cs == JISX208) { |
michael@0 | 2205 | _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); |
michael@0 | 2206 | mySourceChar = tmpSourceChar; |
michael@0 | 2207 | } else { |
michael@0 | 2208 | /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ |
michael@0 | 2209 | mySourceChar = tmpSourceChar; |
michael@0 | 2210 | if (cs == KSC5601) { |
michael@0 | 2211 | tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ |
michael@0 | 2212 | } |
michael@0 | 2213 | tempBuf[0] = (char)(tmpSourceChar >> 8); |
michael@0 | 2214 | tempBuf[1] = (char)(tmpSourceChar); |
michael@0 | 2215 | } |
michael@0 | 2216 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); |
michael@0 | 2217 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
michael@0 | 2218 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
michael@0 | 2219 | ++mySource; |
michael@0 | 2220 | /* add another bit so that the code below writes 2 bytes in case of error */ |
michael@0 | 2221 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
michael@0 | 2222 | } |
michael@0 | 2223 | } else { |
/* input ends after the lead byte: save it and finish; the next call resumes at getTrailByte */
michael@0 | 2224 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
michael@0 | 2225 | args->converter->toULength = 1; |
michael@0 | 2226 | goto endloop; |
michael@0 | 2227 | } |
michael@0 | 2228 | } /* End of inner switch */ |
michael@0 | 2229 | break; |
michael@0 | 2230 | } /* End of outer switch */ |
/*
 * Output stage. mySourceChar holds either one byte (<= 0xff) or a combined
 * two-byte value, so "(mySourceChar <= 0xff ? 1 : 2)" below is the number of
 * source bytes to step back for the offset.
 */
michael@0 | 2231 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
michael@0 | 2232 | if(args->offsets){ |
michael@0 | 2233 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 2234 | } |
michael@0 | 2235 | *(myTarget++)=(UChar)targetUniChar; |
michael@0 | 2236 | } |
michael@0 | 2237 | else if(targetUniChar > missingCharMarker){ |
michael@0 | 2238 | /* disassemble the surrogate pair and write to output*/ |
michael@0 | 2239 | targetUniChar-=0x0010000; |
michael@0 | 2240 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
michael@0 | 2241 | if(args->offsets){ |
michael@0 | 2242 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 2243 | } |
michael@0 | 2244 | ++myTarget; |
michael@0 | 2245 | if(myTarget< args->targetLimit){ |
michael@0 | 2246 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
michael@0 | 2247 | if(args->offsets){ |
michael@0 | 2248 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 2249 | } |
michael@0 | 2250 | ++myTarget; |
michael@0 | 2251 | }else{ |
/*
 * No room for the trail surrogate: park it in the converter's UCharErrorBuffer.
 * NOTE(review): *err is not set to U_BUFFER_OVERFLOW_ERROR on this path;
 * presumably the caller notices the non-empty buffer -- confirm.
 */
michael@0 | 2252 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
michael@0 | 2253 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
michael@0 | 2254 | } |
michael@0 | 2255 | |
michael@0 | 2256 | } |
michael@0 | 2257 | else{ |
michael@0 | 2258 | /* Call the callback function*/ |
/* targetUniChar is missingCharMarker or missingCharMarker-1 here: pass the byte(s) in mySourceChar on and stop converting */
michael@0 | 2259 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
michael@0 | 2260 | break; |
michael@0 | 2261 | } |
michael@0 | 2262 | } |
michael@0 | 2263 | else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
michael@0 | 2264 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2265 | break; |
michael@0 | 2266 | } |
michael@0 | 2267 | } |
michael@0 | 2268 | endloop: |
/* report the positions reached back to the caller */
michael@0 | 2269 | args->target = myTarget; |
michael@0 | 2270 | args->source = mySource; |
michael@0 | 2271 | } |
michael@0 | 2272 | |
michael@0 | 2273 | |
michael@0 | 2274 | /*************************************************************** |
michael@0 | 2275 | * Rules for ISO-2022-KR encoding |
michael@0 | 2276 | * i) The KSC5601 designator sequence should appear only once in a file, |
michael@0 | 2277 | * at the beginning of a line before any KSC5601 characters. This usually |
michael@0 | 2278 | * means that it appears by itself on the first line of the file |
michael@0 | 2279 | * ii) There are only 2 shifting sequences SO to shift into double byte mode |
michael@0 | 2280 | * and SI to shift into single byte mode |
michael@0 | 2281 | */ |
michael@0 | 2282 | static void |
michael@0 | 2283 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
michael@0 | 2284 | |
michael@0 | 2285 | UConverter* saveConv = args->converter; |
michael@0 | 2286 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; |
michael@0 | 2287 | args->converter=myConverterData->currentConverter; |
michael@0 | 2288 | |
michael@0 | 2289 | myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; |
michael@0 | 2290 | ucnv_MBCSFromUnicodeWithOffsets(args,err); |
michael@0 | 2291 | saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
michael@0 | 2292 | |
michael@0 | 2293 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 2294 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
michael@0 | 2295 | uprv_memcpy( |
michael@0 | 2296 | saveConv->charErrorBuffer, |
michael@0 | 2297 | myConverterData->currentConverter->charErrorBuffer, |
michael@0 | 2298 | myConverterData->currentConverter->charErrorBufferLength); |
michael@0 | 2299 | } |
michael@0 | 2300 | saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
michael@0 | 2301 | myConverterData->currentConverter->charErrorBufferLength = 0; |
michael@0 | 2302 | } |
michael@0 | 2303 | args->converter=saveConv; |
michael@0 | 2304 | } |
michael@0 | 2305 | |
/*
 * Converts UTF-16 from args->source (up to args->sourceLimit) into
 * ISO-2022-KR bytes at args->target (up to args->targetLimit). When
 * args->offsets is set, a source offset is stored for every byte written.
 *
 * If the converter data's version is 1 the call is forwarded to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise each code
 * unit is looked up with MBCS_FROM_UCHAR32_ISO2022(); an SO or SI byte is
 * written whenever the output switches between single-byte and double-byte
 * mode, and double-byte results are written with 0x80 subtracted from each
 * byte.
 *
 * The current mode is kept in args->converter->fromUnicodeStatus between
 * calls, and an unfinished or offending code point is kept in
 * args->converter->fromUChar32. Bytes that do not fit into the target go to
 * args->converter->charErrorBuffer together with U_BUFFER_OVERFLOW_ERROR.
 * args->source and args->target are updated before returning.
 */
michael@0 | 2306 | static void |
michael@0 | 2307 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
michael@0 | 2308 | |
michael@0 | 2309 | const UChar *source = args->source; |
michael@0 | 2310 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 2311 | unsigned char *target = (unsigned char *) args->target; |
michael@0 | 2312 | unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
michael@0 | 2313 | int32_t* offsets = args->offsets; |
michael@0 | 2314 | uint32_t targetByteUnit = 0x0000; |
michael@0 | 2315 | UChar32 sourceChar = 0x0000; |
michael@0 | 2316 | UBool isTargetByteDBCS; |
michael@0 | 2317 | UBool oldIsTargetByteDBCS; |
michael@0 | 2318 | UConverterDataISO2022 *converterData; |
michael@0 | 2319 | UConverterSharedData* sharedData; |
michael@0 | 2320 | UBool useFallback; |
michael@0 | 2321 | int32_t length =0; |
michael@0 | 2322 | |
michael@0 | 2323 | converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
michael@0 | 2324 | /* if the version is 1 then the user is requesting |
michael@0 | 2325 | * conversion with ibm-25546 pass the arguments to |
michael@0 | 2326 | * MBCS converter and return |
michael@0 | 2327 | */ |
michael@0 | 2328 | if(converterData->version==1){ |
michael@0 | 2329 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
michael@0 | 2330 | return; |
michael@0 | 2331 | } |
michael@0 | 2332 | |
michael@0 | 2333 | /* initialize data */ |
michael@0 | 2334 | sharedData = converterData->currentConverter->sharedData; |
michael@0 | 2335 | useFallback = args->converter->useFallback; |
michael@0 | 2336 | isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; |
michael@0 | 2337 | oldIsTargetByteDBCS = isTargetByteDBCS; |
michael@0 | 2338 | |
/* NOTE(review): this repeats the assignment made a few lines above and looks redundant -- confirm before removing */
michael@0 | 2339 | isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
/* a code point left in fromUChar32 by a previous call: continue at the trail-surrogate lookup inside the loop */
michael@0 | 2340 | if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
michael@0 | 2341 | goto getTrail; |
michael@0 | 2342 | } |
michael@0 | 2343 | while(source < sourceLimit){ |
michael@0 | 2344 | |
michael@0 | 2345 | targetByteUnit = missingCharMarker; |
michael@0 | 2346 | |
michael@0 | 2347 | if(target < (unsigned char*) args->targetLimit){ |
michael@0 | 2348 | sourceChar = *source++; |
michael@0 | 2349 | |
michael@0 | 2350 | /* do not convert SO/SI/ESC */ |
michael@0 | 2351 | if(IS_2022_CONTROL(sourceChar)) { |
michael@0 | 2352 | /* callback(illegal) */ |
michael@0 | 2353 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2354 | args->converter->fromUChar32=sourceChar; |
michael@0 | 2355 | break; |
michael@0 | 2356 | } |
michael@0 | 2357 | |
michael@0 | 2358 | length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
michael@0 | 2359 | if(length < 0) { |
michael@0 | 2360 | length = -length; /* fallback */ |
michael@0 | 2361 | } |
michael@0 | 2362 | /* only DBCS or SBCS characters are expected*/ |
michael@0 | 2363 | /* DB characters with high bit set to 1 are expected */ |
/* accepted: a single byte <= 0x7f, or two bytes that are both in 0xa1..0xfe; everything else is treated as unmappable */
michael@0 | 2364 | if( length > 2 || length==0 || |
michael@0 | 2365 | (length == 1 && targetByteUnit > 0x7f) || |
michael@0 | 2366 | (length == 2 && |
michael@0 | 2367 | ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
michael@0 | 2368 | (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
michael@0 | 2369 | ) { |
michael@0 | 2370 | targetByteUnit=missingCharMarker; |
michael@0 | 2371 | } |
michael@0 | 2372 | if (targetByteUnit != missingCharMarker){ |
michael@0 | 2373 | |
michael@0 | 2374 | oldIsTargetByteDBCS = isTargetByteDBCS; |
michael@0 | 2375 | isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
michael@0 | 2376 | /* append the shift sequence */ |
michael@0 | 2377 | if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
michael@0 | 2378 | |
/* room for this one shift byte was established by the target check at the top of this loop iteration */
michael@0 | 2379 | if (isTargetByteDBCS) |
michael@0 | 2380 | *target++ = UCNV_SO; |
michael@0 | 2381 | else |
michael@0 | 2382 | *target++ = UCNV_SI; |
michael@0 | 2383 | if(offsets) |
michael@0 | 2384 | *(offsets++) = (int32_t)(source - args->source-1); |
michael@0 | 2385 | } |
michael@0 | 2386 | /* write the bytes of targetByteUnit to target; bytes that do not fit go to charErrorBuffer */ |
michael@0 | 2387 | if(targetByteUnit <= 0x00FF){ |
michael@0 | 2388 | if( target < targetLimit){ |
michael@0 | 2389 | *(target++) = (unsigned char) targetByteUnit; |
michael@0 | 2390 | if(offsets){ |
michael@0 | 2391 | *(offsets++) = (int32_t)(source - args->source-1); |
michael@0 | 2392 | } |
michael@0 | 2393 | |
michael@0 | 2394 | }else{ |
michael@0 | 2395 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); |
michael@0 | 2396 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2397 | } |
michael@0 | 2398 | }else{ |
/* double-byte: each byte (checked above to be 0xa1..0xfe) is written with 0x80 subtracted */
michael@0 | 2399 | if(target < targetLimit){ |
michael@0 | 2400 | *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); |
michael@0 | 2401 | if(offsets){ |
michael@0 | 2402 | *(offsets++) = (int32_t)(source - args->source-1); |
michael@0 | 2403 | } |
michael@0 | 2404 | if(target < targetLimit){ |
michael@0 | 2405 | *(target++) =(unsigned char) (targetByteUnit -0x80); |
michael@0 | 2406 | if(offsets){ |
michael@0 | 2407 | *(offsets++) = (int32_t)(source - args->source-1); |
michael@0 | 2408 | } |
michael@0 | 2409 | }else{ |
michael@0 | 2410 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
michael@0 | 2411 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2412 | } |
michael@0 | 2413 | }else{ |
michael@0 | 2414 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
michael@0 | 2415 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
michael@0 | 2416 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2417 | } |
michael@0 | 2418 | } |
michael@0 | 2419 | |
michael@0 | 2420 | } |
michael@0 | 2421 | else{ |
michael@0 | 2422 | /* oops.. the code point is unassigned |
michael@0 | 2423 | * set the error and reason |
michael@0 | 2424 | */ |
michael@0 | 2425 | |
michael@0 | 2426 | /*check if the char is a First surrogate*/ |
michael@0 | 2427 | if(U16_IS_SURROGATE(sourceChar)) { |
michael@0 | 2428 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
/* also entered by goto from before the loop when fromUChar32 was non-zero on entry */
michael@0 | 2429 | getTrail: |
michael@0 | 2430 | /*look ahead to find the trail surrogate*/ |
michael@0 | 2431 | if(source < sourceLimit) { |
michael@0 | 2432 | /* test the following code unit */ |
michael@0 | 2433 | UChar trail=(UChar) *source; |
michael@0 | 2434 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 2435 | source++; |
michael@0 | 2436 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
/* the combined supplementary code point is not looked up: it is reported with U_INVALID_CHAR_FOUND and the loop is left below */
michael@0 | 2437 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 2438 | /* convert this surrogate code point */ |
michael@0 | 2439 | /* exit this condition tree */ |
michael@0 | 2440 | } else { |
michael@0 | 2441 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 2442 | /* callback(illegal) */ |
michael@0 | 2443 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2444 | } |
michael@0 | 2445 | } else { |
michael@0 | 2446 | /* no more input */ |
/* not an error: the lead surrogate is stored in fromUChar32 below so that a later call can pick it up */
michael@0 | 2447 | *err = U_ZERO_ERROR; |
michael@0 | 2448 | } |
michael@0 | 2449 | } else { |
michael@0 | 2450 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 2451 | /* callback(illegal) */ |
michael@0 | 2452 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2453 | } |
michael@0 | 2454 | } else { |
michael@0 | 2455 | /* callback(unassigned) for a BMP code point */ |
michael@0 | 2456 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 2457 | } |
michael@0 | 2458 | |
michael@0 | 2459 | args->converter->fromUChar32=sourceChar; |
michael@0 | 2460 | break; |
michael@0 | 2461 | } |
michael@0 | 2462 | } /* end if(target < targetLimit) */ |
michael@0 | 2463 | else{ |
michael@0 | 2464 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2465 | break; |
michael@0 | 2466 | } |
michael@0 | 2467 | |
michael@0 | 2468 | }/* end while(source < sourceLimit) */ |
michael@0 | 2469 | |
michael@0 | 2470 | /* |
michael@0 | 2471 | * the end of the input stream and detection of truncated input |
michael@0 | 2472 | * are handled by the framework, but for ISO-2022-KR conversion |
michael@0 | 2473 | * we need to be in ASCII mode at the very end |
michael@0 | 2474 | * |
michael@0 | 2475 | * conditions: |
michael@0 | 2476 | * successful |
michael@0 | 2477 | * not in ASCII mode |
michael@0 | 2478 | * end of input and no truncated input |
michael@0 | 2479 | */ |
michael@0 | 2480 | if( U_SUCCESS(*err) && |
michael@0 | 2481 | isTargetByteDBCS && |
michael@0 | 2482 | args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
michael@0 | 2483 | ) { |
michael@0 | 2484 | int32_t sourceIndex; |
michael@0 | 2485 | |
michael@0 | 2486 | /* we are switching to ASCII */ |
michael@0 | 2487 | isTargetByteDBCS=FALSE; |
michael@0 | 2488 | |
michael@0 | 2489 | /* get the source index of the last input character */ |
michael@0 | 2490 | /* |
michael@0 | 2491 | * TODO this would be simpler and more reliable if we used a pair |
michael@0 | 2492 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
michael@0 | 2493 | * so that we could simply use the prevSourceIndex here; |
michael@0 | 2494 | * this code gives an incorrect result for the rare case of an unmatched |
michael@0 | 2495 | * trail surrogate that is alone in the last buffer of the text stream |
michael@0 | 2496 | */ |
michael@0 | 2497 | sourceIndex=(int32_t)(source-args->source); |
michael@0 | 2498 | if(sourceIndex>0) { |
michael@0 | 2499 | --sourceIndex; |
michael@0 | 2500 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
michael@0 | 2501 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
michael@0 | 2502 | ) { |
michael@0 | 2503 | --sourceIndex; |
michael@0 | 2504 | } |
michael@0 | 2505 | } else { |
michael@0 | 2506 | sourceIndex=-1; |
michael@0 | 2507 | } |
michael@0 | 2508 | |
/* write the one-byte SHIFT_IN_STR through fromUWriteUInt8(), which receives target, limit, offsets and err */
michael@0 | 2509 | fromUWriteUInt8( |
michael@0 | 2510 | args->converter, |
michael@0 | 2511 | SHIFT_IN_STR, 1, |
michael@0 | 2512 | &target, (const char *)targetLimit, |
michael@0 | 2513 | &offsets, sourceIndex, |
michael@0 | 2514 | err); |
michael@0 | 2515 | } |
michael@0 | 2516 | |
michael@0 | 2517 | /*save the state and return */ |
michael@0 | 2518 | args->source = source; |
michael@0 | 2519 | args->target = (char*)target; |
/* remember the single-/double-byte mode for the next call */
michael@0 | 2520 | args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
michael@0 | 2521 | } |
michael@0 | 2522 | |
michael@0 | 2523 | /************************ To Unicode ***************************************/ |
michael@0 | 2524 | |
michael@0 | 2525 | static void |
michael@0 | 2526 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
michael@0 | 2527 | UErrorCode* err){ |
michael@0 | 2528 | char const* sourceStart; |
michael@0 | 2529 | UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
michael@0 | 2530 | |
michael@0 | 2531 | UConverterToUnicodeArgs subArgs; |
michael@0 | 2532 | int32_t minArgsSize; |
michael@0 | 2533 | |
michael@0 | 2534 | /* set up the subconverter arguments */ |
michael@0 | 2535 | if(args->size<sizeof(UConverterToUnicodeArgs)) { |
michael@0 | 2536 | minArgsSize = args->size; |
michael@0 | 2537 | } else { |
michael@0 | 2538 | minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
michael@0 | 2539 | } |
michael@0 | 2540 | |
michael@0 | 2541 | uprv_memcpy(&subArgs, args, minArgsSize); |
michael@0 | 2542 | subArgs.size = (uint16_t)minArgsSize; |
michael@0 | 2543 | subArgs.converter = myData->currentConverter; |
michael@0 | 2544 | |
michael@0 | 2545 | /* remember the original start of the input for offsets */ |
michael@0 | 2546 | sourceStart = args->source; |
michael@0 | 2547 | |
michael@0 | 2548 | if(myData->key != 0) { |
michael@0 | 2549 | /* continue with a partial escape sequence */ |
michael@0 | 2550 | goto escape; |
michael@0 | 2551 | } |
michael@0 | 2552 | |
michael@0 | 2553 | while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
michael@0 | 2554 | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
michael@0 | 2555 | subArgs.source = args->source; |
michael@0 | 2556 | subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); |
michael@0 | 2557 | if(subArgs.source != subArgs.sourceLimit) { |
michael@0 | 2558 | /* |
michael@0 | 2559 | * get the current partial byte sequence |
michael@0 | 2560 | * |
michael@0 | 2561 | * it needs to be moved between the public and the subconverter |
michael@0 | 2562 | * so that the conversion framework, which only sees the public |
michael@0 | 2563 | * converter, can handle truncated and illegal input etc. |
michael@0 | 2564 | */ |
michael@0 | 2565 | if(args->converter->toULength > 0) { |
michael@0 | 2566 | uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); |
michael@0 | 2567 | } |
michael@0 | 2568 | subArgs.converter->toULength = args->converter->toULength; |
michael@0 | 2569 | |
michael@0 | 2570 | /* |
michael@0 | 2571 | * Convert up to the end of the input, or to before the next escape character. |
michael@0 | 2572 | * Does not handle conversion extensions because the preToU[] state etc. |
michael@0 | 2573 | * is not copied. |
michael@0 | 2574 | */ |
michael@0 | 2575 | ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
michael@0 | 2576 | |
michael@0 | 2577 | if(args->offsets != NULL && sourceStart != args->source) { |
michael@0 | 2578 | /* update offsets to base them on the actual start of the input */ |
michael@0 | 2579 | int32_t *offsets = args->offsets; |
michael@0 | 2580 | UChar *target = args->target; |
michael@0 | 2581 | int32_t delta = (int32_t)(args->source - sourceStart); |
michael@0 | 2582 | while(target < subArgs.target) { |
michael@0 | 2583 | if(*offsets >= 0) { |
michael@0 | 2584 | *offsets += delta; |
michael@0 | 2585 | } |
michael@0 | 2586 | ++offsets; |
michael@0 | 2587 | ++target; |
michael@0 | 2588 | } |
michael@0 | 2589 | } |
michael@0 | 2590 | args->source = subArgs.source; |
michael@0 | 2591 | args->target = subArgs.target; |
michael@0 | 2592 | args->offsets = subArgs.offsets; |
michael@0 | 2593 | |
michael@0 | 2594 | /* copy input/error/overflow buffers */ |
michael@0 | 2595 | if(subArgs.converter->toULength > 0) { |
michael@0 | 2596 | uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); |
michael@0 | 2597 | } |
michael@0 | 2598 | args->converter->toULength = subArgs.converter->toULength; |
michael@0 | 2599 | |
michael@0 | 2600 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 2601 | if(subArgs.converter->UCharErrorBufferLength > 0) { |
michael@0 | 2602 | uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, |
michael@0 | 2603 | subArgs.converter->UCharErrorBufferLength); |
michael@0 | 2604 | } |
michael@0 | 2605 | args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; |
michael@0 | 2606 | subArgs.converter->UCharErrorBufferLength = 0; |
michael@0 | 2607 | } |
michael@0 | 2608 | } |
michael@0 | 2609 | |
michael@0 | 2610 | if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
michael@0 | 2611 | return; |
michael@0 | 2612 | } |
michael@0 | 2613 | |
michael@0 | 2614 | escape: |
michael@0 | 2615 | changeState_2022(args->converter, |
michael@0 | 2616 | &(args->source), |
michael@0 | 2617 | args->sourceLimit, |
michael@0 | 2618 | ISO_2022_KR, |
michael@0 | 2619 | err); |
michael@0 | 2620 | } |
michael@0 | 2621 | } |
michael@0 | 2622 | /* Converts ISO-2022-KR bytes from args->source into UChars at args->target, recording source offsets in args->offsets when it is non-NULL. SI/SO select single-byte (g=0) or double-byte (g=1) state, ESC sequences are handed to changeState_2022(), and version 1 is delegated to the _IBM variant. Errors are reported through *err; args->source and args->target are advanced past what was consumed/produced. */ |
michael@0 | 2623 | static void |
michael@0 | 2624 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
michael@0 | 2625 | UErrorCode* err){ |
michael@0 | 2626 | char tempBuf[2]; |
michael@0 | 2627 | const char *mySource = ( char *) args->source; |
michael@0 | 2628 | UChar *myTarget = args->target; |
michael@0 | 2629 | const char *mySourceLimit = args->sourceLimit; |
michael@0 | 2630 | UChar32 targetUniChar = 0x0000; |
michael@0 | 2631 | uint32_t mySourceChar = 0x0000; /* 32 bits wide: must be able to hold the 0x10000 marker that flags an illegal byte pair below */ |
michael@0 | 2632 | UConverterDataISO2022* myData; |
michael@0 | 2633 | UConverterSharedData* sharedData ; |
michael@0 | 2634 | UBool useFallback; |
michael@0 | 2635 | |
michael@0 | 2636 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
michael@0 | 2637 | if(myData->version==1){ |
michael@0 | 2638 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
michael@0 | 2639 | return; |
michael@0 | 2640 | } |
michael@0 | 2641 | |
michael@0 | 2642 | /* initialize state */ |
michael@0 | 2643 | sharedData = myData->currentConverter->sharedData; |
michael@0 | 2644 | useFallback = args->converter->useFallback; |
michael@0 | 2645 | |
michael@0 | 2646 | if(myData->key != 0) { |
michael@0 | 2647 | /* continue with a partial escape sequence */ |
michael@0 | 2648 | goto escape; |
michael@0 | 2649 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
michael@0 | 2650 | /* continue with a partial double-byte character: the lead byte was saved in toUBytes[0] by a previous call */ |
michael@0 | 2651 | mySourceChar = args->converter->toUBytes[0]; |
michael@0 | 2652 | args->converter->toULength = 0; |
michael@0 | 2653 | goto getTrailByte; |
michael@0 | 2654 | } |
michael@0 | 2655 | |
michael@0 | 2656 | while(mySource< mySourceLimit){ |
michael@0 | 2657 | |
michael@0 | 2658 | if(myTarget < args->targetLimit){ |
michael@0 | 2659 | |
michael@0 | 2660 | mySourceChar= (unsigned char) *mySource++; |
michael@0 | 2661 | |
michael@0 | 2662 | if(mySourceChar==UCNV_SI){ |
michael@0 | 2663 | myData->toU2022State.g = 0; |
michael@0 | 2664 | if (myData->isEmptySegment) { |
michael@0 | 2665 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
michael@0 | 2666 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 2667 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
michael@0 | 2668 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
michael@0 | 2669 | args->converter->toULength = 1; |
michael@0 | 2670 | args->target = myTarget; |
michael@0 | 2671 | args->source = mySource; |
michael@0 | 2672 | return; |
michael@0 | 2673 | } |
michael@0 | 2674 | /*consume the source */ |
michael@0 | 2675 | continue; |
michael@0 | 2676 | }else if(mySourceChar==UCNV_SO){ |
michael@0 | 2677 | myData->toU2022State.g = 1; |
michael@0 | 2678 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
michael@0 | 2679 | /*consume the source */ |
michael@0 | 2680 | continue; |
michael@0 | 2681 | }else if(mySourceChar==ESC_2022){ |
michael@0 | 2682 | mySource--; /* un-read the ESC so that changeState_2022() sees the whole sequence */ |
michael@0 | 2683 | escape: |
michael@0 | 2684 | myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
michael@0 | 2685 | changeState_2022(args->converter,&(mySource), |
michael@0 | 2686 | mySourceLimit, ISO_2022_KR, err); |
michael@0 | 2687 | if(U_FAILURE(*err)){ |
michael@0 | 2688 | args->target = myTarget; |
michael@0 | 2689 | args->source = mySource; |
michael@0 | 2690 | return; |
michael@0 | 2691 | } |
michael@0 | 2692 | continue; |
michael@0 | 2693 | } |
michael@0 | 2694 | |
michael@0 | 2695 | myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ |
michael@0 | 2696 | if(myData->toU2022State.g == 1) { |
michael@0 | 2697 | if(mySource < mySourceLimit) { |
michael@0 | 2698 | int leadIsOk, trailIsOk; |
michael@0 | 2699 | uint8_t trailByte; |
michael@0 | 2700 | getTrailByte: |
michael@0 | 2701 | targetUniChar = missingCharMarker; |
michael@0 | 2702 | trailByte = (uint8_t)*mySource; |
michael@0 | 2703 | /* |
michael@0 | 2704 | * Ticket 5691: consistent illegal sequences: |
michael@0 | 2705 | * - We include at least the first byte in the illegal sequence. |
michael@0 | 2706 | * - If any of the non-initial bytes could be the start of a character, |
michael@0 | 2707 | * we stop the illegal sequence before the first one of those. |
michael@0 | 2708 | * |
michael@0 | 2709 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
michael@0 | 2710 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
michael@0 | 2711 | * Otherwise we convert or report the pair of bytes. |
michael@0 | 2712 | */ |
michael@0 | 2713 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
michael@0 | 2714 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
michael@0 | 2715 | if (leadIsOk && trailIsOk) { |
michael@0 | 2716 | ++mySource; |
michael@0 | 2717 | tempBuf[0] = (char)(mySourceChar + 0x80); |
michael@0 | 2718 | tempBuf[1] = (char)(trailByte + 0x80); |
michael@0 | 2719 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); |
michael@0 | 2720 | mySourceChar = (mySourceChar << 8) | trailByte; |
michael@0 | 2721 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
michael@0 | 2722 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
michael@0 | 2723 | ++mySource; |
michael@0 | 2724 | /* add another bit so that the code below writes 2 bytes in case of error (needs mySourceChar wider than 16 bits, e.g. for a 0x00 lead byte) */ |
michael@0 | 2725 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
michael@0 | 2726 | } |
michael@0 | 2727 | } else { |
michael@0 | 2728 | /* input ends after the lead byte: save it so that the next call can resume at getTrailByte */ |
michael@0 | 2729 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
michael@0 | 2730 | args->converter->toULength = 1; |
michael@0 | 2731 | break; |
michael@0 | 2732 | } |
michael@0 | 2733 | } |
michael@0 | 2734 | else if(mySourceChar <= 0x7f) { |
michael@0 | 2735 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
michael@0 | 2736 | } else { |
michael@0 | 2737 | targetUniChar = 0xffff; |
michael@0 | 2738 | } |
michael@0 | 2739 | if(targetUniChar < 0xfffe){ |
michael@0 | 2740 | if(args->offsets) { |
michael@0 | 2741 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 2742 | } |
michael@0 | 2743 | *(myTarget++)=(UChar)targetUniChar; |
michael@0 | 2744 | } |
michael@0 | 2745 | else { |
michael@0 | 2746 | /* Call the callback function*/ |
michael@0 | 2747 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
michael@0 | 2748 | break; |
michael@0 | 2749 | } |
michael@0 | 2750 | } |
michael@0 | 2751 | else{ |
michael@0 | 2752 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2753 | break; |
michael@0 | 2754 | } |
michael@0 | 2755 | } |
michael@0 | 2756 | args->target = myTarget; |
michael@0 | 2757 | args->source = mySource; |
michael@0 | 2758 | } |
michael@0 | 2758 | |
michael@0 | 2759 | /*************************** END ISO2022-KR *********************************/ |
michael@0 | 2760 | |
michael@0 | 2761 | /*************************** ISO-2022-CN ********************************* |
michael@0 | 2762 | * |
michael@0 | 2763 | * Rules for ISO-2022-CN Encoding: |
michael@0 | 2764 | * i) The designator sequence must appear once on a line before any instance |
michael@0 | 2765 | * of character set it designates. |
michael@0 | 2766 | * ii) If two lines contain characters from the same character set, both lines |
michael@0 | 2767 | * must include the designator sequence. |
michael@0 | 2768 | * iii) Once the designator sequence is known, a shifting sequence has to be found |
michael@0 | 2769 | * to invoke the shifting |
michael@0 | 2770 | * iv) All lines start in ASCII and end in ASCII. |
michael@0 | 2771 | * v) Four shifting sequences are employed for this purpose: |
michael@0 | 2772 | * |
michael@0 | 2773 | * Sequence ASCII Eq Charsets |
michael@0 | 2774 | * ---------- ------- --------- |
michael@0 | 2775 | * SI <SI> US-ASCII |
michael@0 | 2776 | * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
michael@0 | 2777 | * SS2 <ESC>N CNS-11643-1992 Plane 2 |
michael@0 | 2778 | * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
michael@0 | 2779 | * |
michael@0 | 2780 | * vi) |
michael@0 | 2781 | * SOdesignator : ESC "$" ")" finalchar_for_SO |
michael@0 | 2782 | * SS2designator : ESC "$" "*" finalchar_for_SS2 |
michael@0 | 2783 | * SS3designator : ESC "$" "+" finalchar_for_SS3 |
michael@0 | 2784 | * |
michael@0 | 2785 | * ESC $ ) A Indicates the bytes following SO are Chinese |
michael@0 | 2786 | * characters as defined in GB 2312-80, until |
michael@0 | 2787 | * another SOdesignation appears |
michael@0 | 2788 | * |
michael@0 | 2789 | * |
michael@0 | 2790 | * ESC $ ) E Indicates the bytes following SO are as defined |
michael@0 | 2791 | * in ISO-IR-165 (for details, see section 2.1), |
michael@0 | 2792 | * until another SOdesignation appears |
michael@0 | 2793 | * |
michael@0 | 2794 | * ESC $ ) G Indicates the bytes following SO are as defined |
michael@0 | 2795 | * in CNS 11643-plane-1, until another |
michael@0 | 2796 | * SOdesignation appears |
michael@0 | 2797 | * |
michael@0 | 2798 | * ESC $ * H Indicates the two bytes immediately following |
michael@0 | 2799 | * SS2 is a Chinese character as defined in CNS |
michael@0 | 2800 | * 11643-plane-2, until another SS2designation |
michael@0 | 2801 | * appears |
michael@0 | 2802 | * (Meaning <ESC>N must precede every 2 byte |
michael@0 | 2803 | * sequence.) |
michael@0 | 2804 | * |
michael@0 | 2805 | * ESC $ + I Indicates the immediate two bytes following SS3 |
michael@0 | 2806 | * is a Chinese character as defined in CNS |
michael@0 | 2807 | * 11643-plane-3, until another SS3designation |
michael@0 | 2808 | * appears |
michael@0 | 2809 | * (Meaning <ESC>O must precede every 2 byte |
michael@0 | 2810 | * sequence.) |
michael@0 | 2811 | * |
michael@0 | 2812 | * ESC $ + J Indicates the immediate two bytes following SS3 |
michael@0 | 2813 | * is a Chinese character as defined in CNS |
michael@0 | 2814 | * 11643-plane-4, until another SS3designation |
michael@0 | 2815 | * appears |
michael@0 | 2816 | * (In English: <ESC>O must precede every 2 byte |
michael@0 | 2817 | * sequence.) |
michael@0 | 2818 | * |
michael@0 | 2819 | * ESC $ + K Indicates the immediate two bytes following SS3 |
michael@0 | 2820 | * is a Chinese character as defined in CNS |
michael@0 | 2821 | * 11643-plane-5, until another SS3designation |
michael@0 | 2822 | * appears |
michael@0 | 2823 | * |
michael@0 | 2824 | * ESC $ + L Indicates the immediate two bytes following SS3 |
michael@0 | 2825 | * is a Chinese character as defined in CNS |
michael@0 | 2826 | * 11643-plane-6, until another SS3designation |
michael@0 | 2827 | * appears |
michael@0 | 2828 | * |
michael@0 | 2829 | * ESC $ + M Indicates the immediate two bytes following SS3 |
michael@0 | 2830 | * is a Chinese character as defined in CNS |
michael@0 | 2831 | * 11643-plane-7, until another SS3designation |
michael@0 | 2832 | * appears |
michael@0 | 2833 | * |
michael@0 | 2834 | * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
michael@0 | 2835 | * has its own designation information before any Chinese characters |
michael@0 | 2836 | * appear |
michael@0 | 2837 | * |
michael@0 | 2838 | */ |
michael@0 | 2839 | |
michael@0 | 2840 | /* ISO-2022-CN designator escape sequences, 4 bytes each (ESC $ intermediate final). The following are defined this way to make the strings truly readonly */ |
michael@0 | 2841 | static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */ |
michael@0 | 2842 | static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */ |
michael@0 | 2843 | static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */ |
michael@0 | 2844 | static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */ |
michael@0 | 2845 | static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */ |
michael@0 | 2846 | static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */ |
michael@0 | 2847 | static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */ |
michael@0 | 2848 | static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */ |
michael@0 | 2849 | static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */ |
michael@0 | 2850 | |
michael@0 | 2851 | /********************** ISO2022-CN Data **************************/ |
michael@0 | 2852 | static const char* const escSeqCharsCN[10] ={ /* designator lookup table; CNS planes are read as escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)] by the fromUnicode code */ |
michael@0 | 2853 | SHIFT_IN_STR, /* 0 ASCII */ |
michael@0 | 2854 | GB_2312_80_STR, /* 1 GB2312_1 */ |
michael@0 | 2855 | ISO_IR_165_STR, /* 2 ISO_IR_165 */ |
michael@0 | 2856 | CNS_11643_1992_Plane_1_STR, /* 3 CNS 11643-1992 plane 1 */ |
michael@0 | 2857 | CNS_11643_1992_Plane_2_STR, /* 4 CNS 11643-1992 plane 2 */ |
michael@0 | 2858 | CNS_11643_1992_Plane_3_STR, /* 5 CNS 11643-1992 plane 3 */ |
michael@0 | 2859 | CNS_11643_1992_Plane_4_STR, /* 6 CNS 11643-1992 plane 4 */ |
michael@0 | 2860 | CNS_11643_1992_Plane_5_STR, /* 7 CNS 11643-1992 plane 5 */ |
michael@0 | 2861 | CNS_11643_1992_Plane_6_STR, /* 8 CNS 11643-1992 plane 6 */ |
michael@0 | 2862 | CNS_11643_1992_Plane_7_STR /* 9 CNS 11643-1992 plane 7 */ |
michael@0 | 2863 | }; |
michael@0 | 2864 | /* Converts UChars from args->source into ISO-2022-CN bytes at args->target, writing designator (ESC $ ...) and shift (SI/SO/SS2/SS3) sequences as needed and filling args->offsets when it is non-NULL. Illegal or unmappable input sets *err and leaves the code point in cnv->fromUChar32; a lone lead surrogate at the end of input is kept there for the next call. On flush at end of input the output is returned to ASCII with SI. */ |
michael@0 | 2865 | static void |
michael@0 | 2866 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
michael@0 | 2867 | UConverter *cnv = args->converter; |
michael@0 | 2868 | UConverterDataISO2022 *converterData; |
michael@0 | 2869 | ISO2022State *pFromU2022State; |
michael@0 | 2870 | uint8_t *target = (uint8_t *) args->target; |
michael@0 | 2871 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
michael@0 | 2872 | const UChar* source = args->source; |
michael@0 | 2873 | const UChar* sourceLimit = args->sourceLimit; |
michael@0 | 2874 | int32_t* offsets = args->offsets; |
michael@0 | 2875 | UChar32 sourceChar; |
michael@0 | 2876 | char buffer[8]; /* worst case: 4-byte designator + 2-byte single shift + 2 output bytes */ |
michael@0 | 2877 | int32_t len; |
michael@0 | 2878 | int8_t choices[3]; |
michael@0 | 2879 | int32_t choiceCount; |
michael@0 | 2880 | uint32_t targetValue = 0; |
michael@0 | 2881 | UBool useFallback; |
michael@0 | 2882 | |
michael@0 | 2883 | /* set up the state */ |
michael@0 | 2884 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
michael@0 | 2885 | pFromU2022State = &converterData->fromU2022State; |
michael@0 | 2886 | |
michael@0 | 2887 | choiceCount = 0; /* 0 means choices[] must be (re)computed before the next non-ASCII lookup */ |
michael@0 | 2888 | |
michael@0 | 2889 | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
michael@0 | 2890 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
michael@0 | 2891 | goto getTrail; |
michael@0 | 2892 | } |
michael@0 | 2893 | |
michael@0 | 2894 | while( source < sourceLimit){ |
michael@0 | 2895 | if(target < targetLimit){ |
michael@0 | 2896 | |
michael@0 | 2897 | sourceChar = *(source++); |
michael@0 | 2898 | /*check if the char is a First surrogate*/ |
michael@0 | 2899 | if(U16_IS_SURROGATE(sourceChar)) { |
michael@0 | 2900 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
michael@0 | 2901 | getTrail: |
michael@0 | 2902 | /*look ahead to find the trail surrogate*/ |
michael@0 | 2903 | if(source < sourceLimit) { |
michael@0 | 2904 | /* test the following code unit */ |
michael@0 | 2905 | UChar trail=(UChar) *source; |
michael@0 | 2906 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 2907 | source++; |
michael@0 | 2908 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
michael@0 | 2909 | cnv->fromUChar32=0x00; |
michael@0 | 2910 | /* convert this supplementary code point */ |
michael@0 | 2911 | /* exit this condition tree */ |
michael@0 | 2912 | } else { |
michael@0 | 2913 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 2914 | /* callback(illegal) */ |
michael@0 | 2915 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2916 | cnv->fromUChar32=sourceChar; |
michael@0 | 2917 | break; |
michael@0 | 2918 | } |
michael@0 | 2919 | } else { |
michael@0 | 2920 | /* no more input */ |
michael@0 | 2921 | cnv->fromUChar32=sourceChar; |
michael@0 | 2922 | break; |
michael@0 | 2923 | } |
michael@0 | 2924 | } else { |
michael@0 | 2925 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 2926 | /* callback(illegal) */ |
michael@0 | 2927 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2928 | cnv->fromUChar32=sourceChar; |
michael@0 | 2929 | break; |
michael@0 | 2930 | } |
michael@0 | 2931 | } |
michael@0 | 2932 | |
michael@0 | 2933 | /* do the conversion */ |
michael@0 | 2934 | if(sourceChar <= 0x007f ){ |
michael@0 | 2935 | /* do not convert SO/SI/ESC */ |
michael@0 | 2936 | if(IS_2022_CONTROL(sourceChar)) { |
michael@0 | 2937 | /* callback(illegal) */ |
michael@0 | 2938 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 2939 | cnv->fromUChar32=sourceChar; |
michael@0 | 2940 | break; |
michael@0 | 2941 | } |
michael@0 | 2942 | |
michael@0 | 2943 | /* US-ASCII */ |
michael@0 | 2944 | if(pFromU2022State->g == 0) { |
michael@0 | 2945 | buffer[0] = (char)sourceChar; |
michael@0 | 2946 | len = 1; |
michael@0 | 2947 | } else { |
michael@0 | 2948 | buffer[0] = UCNV_SI; /* shift back to ASCII before writing the character */ |
michael@0 | 2949 | buffer[1] = (char)sourceChar; |
michael@0 | 2950 | len = 2; |
michael@0 | 2951 | pFromU2022State->g = 0; |
michael@0 | 2952 | choiceCount = 0; |
michael@0 | 2953 | } |
michael@0 | 2954 | if(sourceChar == CR || sourceChar == LF) { |
michael@0 | 2955 | /* reset the state at the end of a line */ |
michael@0 | 2956 | uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); |
michael@0 | 2957 | choiceCount = 0; |
michael@0 | 2958 | } |
michael@0 | 2959 | } |
michael@0 | 2960 | else{ |
michael@0 | 2961 | /* convert U+0080..U+10ffff */ |
michael@0 | 2962 | int32_t i; |
michael@0 | 2963 | int8_t cs, g; |
michael@0 | 2964 | |
michael@0 | 2965 | if(choiceCount == 0) { |
michael@0 | 2966 | /* try the current SO/G1 converter first */ |
michael@0 | 2967 | choices[0] = pFromU2022State->cs[1]; |
michael@0 | 2968 | |
michael@0 | 2969 | /* default to GB2312_1 if none is designated yet */ |
michael@0 | 2970 | if(choices[0] == 0) { |
michael@0 | 2971 | choices[0] = GB2312_1; |
michael@0 | 2972 | } |
michael@0 | 2973 | |
michael@0 | 2974 | if(converterData->version == 0) { |
michael@0 | 2975 | /* ISO-2022-CN */ |
michael@0 | 2976 | |
michael@0 | 2977 | /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ |
michael@0 | 2978 | if(choices[0] == GB2312_1) { |
michael@0 | 2979 | choices[1] = (int8_t)CNS_11643_1; |
michael@0 | 2980 | } else { |
michael@0 | 2981 | choices[1] = (int8_t)GB2312_1; |
michael@0 | 2982 | } |
michael@0 | 2983 | |
michael@0 | 2984 | choiceCount = 2; |
michael@0 | 2985 | } else if (converterData->version == 1) { |
michael@0 | 2986 | /* ISO-2022-CN-EXT */ |
michael@0 | 2987 | |
michael@0 | 2988 | /* try one of the other converters */ |
michael@0 | 2989 | switch(choices[0]) { |
michael@0 | 2990 | case GB2312_1: |
michael@0 | 2991 | choices[1] = (int8_t)CNS_11643_1; |
michael@0 | 2992 | choices[2] = (int8_t)ISO_IR_165; |
michael@0 | 2993 | break; |
michael@0 | 2994 | case ISO_IR_165: |
michael@0 | 2995 | choices[1] = (int8_t)GB2312_1; |
michael@0 | 2996 | choices[2] = (int8_t)CNS_11643_1; |
michael@0 | 2997 | break; |
michael@0 | 2998 | default: /* CNS_11643_x */ |
michael@0 | 2999 | choices[1] = (int8_t)GB2312_1; |
michael@0 | 3000 | choices[2] = (int8_t)ISO_IR_165; |
michael@0 | 3001 | break; |
michael@0 | 3002 | } |
michael@0 | 3003 | |
michael@0 | 3004 | choiceCount = 3; |
michael@0 | 3005 | } else { /* any other version: prefer CNS 11643, then GB 2312 */ |
michael@0 | 3006 | choices[0] = (int8_t)CNS_11643_1; |
michael@0 | 3007 | choices[1] = (int8_t)GB2312_1; choiceCount = 2; /* two candidates; with choiceCount left at 0 the lookup loop below would be skipped and every non-ASCII character rejected */ |
michael@0 | 3008 | } |
michael@0 | 3009 | } |
michael@0 | 3010 | |
michael@0 | 3011 | cs = g = 0; |
michael@0 | 3012 | /* |
michael@0 | 3013 | * len==0: no mapping found yet |
michael@0 | 3014 | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
michael@0 | 3015 | * len>0: found a roundtrip result, done |
michael@0 | 3016 | */ |
michael@0 | 3017 | len = 0; |
michael@0 | 3018 | /* |
michael@0 | 3019 | * We will turn off useFallback after finding a fallback, |
michael@0 | 3020 | * but we still get fallbacks from PUA code points as usual. |
michael@0 | 3021 | * Therefore, we will also need to check that we don't overwrite |
michael@0 | 3022 | * an early fallback with a later one. |
michael@0 | 3023 | */ |
michael@0 | 3024 | useFallback = cnv->useFallback; |
michael@0 | 3025 | |
michael@0 | 3026 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
michael@0 | 3027 | int8_t cs0 = choices[i]; |
michael@0 | 3028 | if(cs0 > 0) { |
michael@0 | 3029 | uint32_t value; |
michael@0 | 3030 | int32_t len2; |
michael@0 | 3031 | if(cs0 >= CNS_11643_0) { |
michael@0 | 3032 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
michael@0 | 3033 | converterData->myConverterArray[CNS_11643], |
michael@0 | 3034 | sourceChar, |
michael@0 | 3035 | &value, |
michael@0 | 3036 | useFallback, |
michael@0 | 3037 | MBCS_OUTPUT_3); |
michael@0 | 3038 | if(len2 == 3 || (len2 == -3 && len == 0)) { |
michael@0 | 3039 | targetValue = value; |
michael@0 | 3040 | cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); /* the first of the three result bytes selects the plane */ |
michael@0 | 3041 | if(len2 >= 0) { |
michael@0 | 3042 | len = 2; |
michael@0 | 3043 | } else { |
michael@0 | 3044 | len = -2; |
michael@0 | 3045 | useFallback = FALSE; |
michael@0 | 3046 | } |
michael@0 | 3047 | if(cs == CNS_11643_1) { |
michael@0 | 3048 | g = 1; |
michael@0 | 3049 | } else if(cs == CNS_11643_2) { |
michael@0 | 3050 | g = 2; |
michael@0 | 3051 | } else /* plane 3..7 */ if(converterData->version == 1) { |
michael@0 | 3052 | g = 3; |
michael@0 | 3053 | } else { |
michael@0 | 3054 | /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ |
michael@0 | 3055 | len = 0; |
michael@0 | 3056 | } |
michael@0 | 3057 | } |
michael@0 | 3058 | } else { |
michael@0 | 3059 | /* GB2312_1 or ISO-IR-165 */ |
michael@0 | 3060 | U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); |
michael@0 | 3061 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
michael@0 | 3062 | converterData->myConverterArray[cs0], |
michael@0 | 3063 | sourceChar, |
michael@0 | 3064 | &value, |
michael@0 | 3065 | useFallback, |
michael@0 | 3066 | MBCS_OUTPUT_2); |
michael@0 | 3067 | if(len2 == 2 || (len2 == -2 && len == 0)) { |
michael@0 | 3068 | targetValue = value; |
michael@0 | 3069 | len = len2; |
michael@0 | 3070 | cs = cs0; |
michael@0 | 3071 | g = 1; |
michael@0 | 3072 | useFallback = FALSE; |
michael@0 | 3073 | } |
michael@0 | 3074 | } |
michael@0 | 3075 | } |
michael@0 | 3076 | } |
michael@0 | 3077 | |
michael@0 | 3078 | if(len != 0) { |
michael@0 | 3079 | len = 0; /* count output bytes; it must have been abs(len) == 2 */ |
michael@0 | 3080 | |
michael@0 | 3081 | /* write the designation sequence if necessary */ |
michael@0 | 3082 | if(cs != pFromU2022State->cs[g]) { |
michael@0 | 3083 | if(cs < CNS_11643) { |
michael@0 | 3084 | uprv_memcpy(buffer, escSeqCharsCN[cs], 4); |
michael@0 | 3085 | } else { |
michael@0 | 3086 | U_ASSERT(cs >= CNS_11643_1); |
michael@0 | 3087 | uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); |
michael@0 | 3088 | } |
michael@0 | 3089 | len = 4; |
michael@0 | 3090 | pFromU2022State->cs[g] = cs; |
michael@0 | 3091 | if(g == 1) { |
michael@0 | 3092 | /* changing the SO/G1 charset invalidates the choices[] */ |
michael@0 | 3093 | choiceCount = 0; |
michael@0 | 3094 | } |
michael@0 | 3095 | } |
michael@0 | 3096 | |
michael@0 | 3097 | /* write the shift sequence if necessary */ |
michael@0 | 3098 | if(g != pFromU2022State->g) { |
michael@0 | 3099 | switch(g) { |
michael@0 | 3100 | case 1: |
michael@0 | 3101 | buffer[len++] = UCNV_SO; |
michael@0 | 3102 | |
michael@0 | 3103 | /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ |
michael@0 | 3104 | pFromU2022State->g = 1; |
michael@0 | 3105 | break; |
michael@0 | 3106 | case 2: |
michael@0 | 3107 | buffer[len++] = 0x1b; /* SS2 = ESC N */ |
michael@0 | 3108 | buffer[len++] = 0x4e; |
michael@0 | 3109 | break; |
michael@0 | 3110 | default: /* case 3 */ |
michael@0 | 3111 | buffer[len++] = 0x1b; /* SS3 = ESC O */ |
michael@0 | 3112 | buffer[len++] = 0x4f; |
michael@0 | 3113 | break; |
michael@0 | 3114 | } |
michael@0 | 3115 | } |
michael@0 | 3116 | |
michael@0 | 3117 | /* write the two output bytes */ |
michael@0 | 3118 | buffer[len++] = (char)(targetValue >> 8); |
michael@0 | 3119 | buffer[len++] = (char)targetValue; |
michael@0 | 3120 | } else { |
michael@0 | 3121 | /* if we cannot find the character after checking all codepages |
michael@0 | 3122 | * then this is an error |
michael@0 | 3123 | */ |
michael@0 | 3124 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 3125 | cnv->fromUChar32=sourceChar; |
michael@0 | 3126 | break; |
michael@0 | 3127 | } |
michael@0 | 3128 | } |
michael@0 | 3129 | |
michael@0 | 3130 | /* output len>0 bytes in buffer[] */ |
michael@0 | 3131 | if(len == 1) { |
michael@0 | 3132 | *target++ = buffer[0]; |
michael@0 | 3133 | if(offsets) { |
michael@0 | 3134 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
michael@0 | 3135 | } |
michael@0 | 3136 | } else if(len == 2 && (target + 2) <= targetLimit) { |
michael@0 | 3137 | *target++ = buffer[0]; |
michael@0 | 3138 | *target++ = buffer[1]; |
michael@0 | 3139 | if(offsets) { |
michael@0 | 3140 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
michael@0 | 3141 | *offsets++ = sourceIndex; |
michael@0 | 3142 | *offsets++ = sourceIndex; |
michael@0 | 3143 | } |
michael@0 | 3144 | } else { |
michael@0 | 3145 | fromUWriteUInt8( |
michael@0 | 3146 | cnv, |
michael@0 | 3147 | buffer, len, |
michael@0 | 3148 | &target, (const char *)targetLimit, |
michael@0 | 3149 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
michael@0 | 3150 | err); |
michael@0 | 3151 | if(U_FAILURE(*err)) { |
michael@0 | 3152 | break; |
michael@0 | 3153 | } |
michael@0 | 3154 | } |
michael@0 | 3155 | } /* end if(target < targetLimit) */ |
michael@0 | 3156 | else{ |
michael@0 | 3157 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 3158 | break; |
michael@0 | 3159 | } |
michael@0 | 3160 | |
michael@0 | 3161 | }/* end while(source < sourceLimit) */ |
michael@0 | 3162 | |
michael@0 | 3163 | /* |
michael@0 | 3164 | * the end of the input stream and detection of truncated input |
michael@0 | 3165 | * are handled by the framework, but for ISO-2022-CN conversion |
michael@0 | 3166 | * we need to be in ASCII mode at the very end |
michael@0 | 3167 | * |
michael@0 | 3168 | * conditions: |
michael@0 | 3169 | * successful |
michael@0 | 3170 | * not in ASCII mode |
michael@0 | 3171 | * end of input and no truncated input |
michael@0 | 3172 | */ |
michael@0 | 3173 | if( U_SUCCESS(*err) && |
michael@0 | 3174 | pFromU2022State->g!=0 && |
michael@0 | 3175 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
michael@0 | 3176 | ) { |
michael@0 | 3177 | int32_t sourceIndex; |
michael@0 | 3178 | |
michael@0 | 3179 | /* we are switching to ASCII */ |
michael@0 | 3180 | pFromU2022State->g=0; |
michael@0 | 3181 | |
michael@0 | 3182 | /* get the source index of the last input character */ |
michael@0 | 3183 | /* |
michael@0 | 3184 | * TODO this would be simpler and more reliable if we used a pair |
michael@0 | 3185 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
michael@0 | 3186 | * so that we could simply use the prevSourceIndex here; |
michael@0 | 3187 | * this code gives an incorrect result for the rare case of an unmatched |
michael@0 | 3188 | * trail surrogate that is alone in the last buffer of the text stream |
michael@0 | 3189 | */ |
michael@0 | 3190 | sourceIndex=(int32_t)(source-args->source); |
michael@0 | 3191 | if(sourceIndex>0) { |
michael@0 | 3192 | --sourceIndex; |
michael@0 | 3193 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
michael@0 | 3194 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
michael@0 | 3195 | ) { |
michael@0 | 3196 | --sourceIndex; |
michael@0 | 3197 | } |
michael@0 | 3198 | } else { |
michael@0 | 3199 | sourceIndex=-1; |
michael@0 | 3200 | } |
michael@0 | 3201 | |
michael@0 | 3202 | fromUWriteUInt8( |
michael@0 | 3203 | cnv, |
michael@0 | 3204 | SHIFT_IN_STR, 1, |
michael@0 | 3205 | &target, (const char *)targetLimit, |
michael@0 | 3206 | &offsets, sourceIndex, |
michael@0 | 3207 | err); |
michael@0 | 3208 | } |
michael@0 | 3209 | |
michael@0 | 3210 | /*save the state and return */ |
michael@0 | 3211 | args->source = source; |
michael@0 | 3212 | args->target = (char*)target; |
michael@0 | 3213 | } |
michael@0 | 3214 | |
michael@0 | 3215 | /* Converts ISO-2022-CN bytes from [args->source, args->sourceLimit) into UChars at args->target, tracking SI/SO shifts and escape sequences in the converter's toU2022State; when args->offsets is set, the source offset of each UChar written is recorded as well. */ |
michael@0 | 3216 | static void |
michael@0 | 3217 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, /* in/out: source and target pointers are advanced on return */ |
michael@0 | 3218 | UErrorCode* err){ /* out: U_ILLEGAL_ESCAPE_SEQUENCE, U_BUFFER_OVERFLOW_ERROR, or whatever changeState_2022()/toUnicodeCallback() store */ |
michael@0 | 3219 | char tempBuf[3]; /* bytes handed to ucnv_MBCSSimpleGetNextUChar(): [CNS selector byte,] lead, trail */ |
michael@0 | 3220 | const char *mySource = (char *) args->source; |
michael@0 | 3221 | UChar *myTarget = args->target; |
michael@0 | 3222 | const char *mySourceLimit = args->sourceLimit; |
michael@0 | 3223 | uint32_t targetUniChar = 0x0000; /* lookup result for the current character */ |
michael@0 | 3224 | uint32_t mySourceChar = 0x0000; /* current byte; becomes (lead << 8) | trail once a byte pair was consumed */ |
michael@0 | 3225 | UConverterDataISO2022* myData; |
michael@0 | 3226 | ISO2022State *pToU2022State; |
michael@0 | 3227 | |
michael@0 | 3228 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
michael@0 | 3229 | pToU2022State = &myData->toU2022State; |
michael@0 | 3230 | /* first resume whatever an earlier call left unfinished */ |
michael@0 | 3231 | if(myData->key != 0) { |
michael@0 | 3232 | /* continue with a partial escape sequence */ |
michael@0 | 3233 | goto escape; |
michael@0 | 3234 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
michael@0 | 3235 | /* continue with a partial double-byte character: its lead byte was saved in toUBytes[0] when the source ran out */ |
michael@0 | 3236 | mySourceChar = args->converter->toUBytes[0]; |
michael@0 | 3237 | args->converter->toULength = 0; |
michael@0 | 3238 | targetUniChar = missingCharMarker; |
michael@0 | 3239 | goto getTrailByte; |
michael@0 | 3240 | } |
michael@0 | 3241 | |
michael@0 | 3242 | while(mySource < mySourceLimit){ |
michael@0 | 3243 | |
michael@0 | 3244 | targetUniChar =missingCharMarker; /* stays at this value unless a conversion below succeeds */ |
michael@0 | 3245 | |
michael@0 | 3246 | if(myTarget < args->targetLimit){ |
michael@0 | 3247 | |
michael@0 | 3248 | mySourceChar= (unsigned char) *mySource++; |
michael@0 | 3249 | |
michael@0 | 3250 | switch(mySourceChar){ |
michael@0 | 3251 | case UCNV_SI: |
michael@0 | 3252 | pToU2022State->g=0; /* g==0 selects the single-byte path in the default case */ |
michael@0 | 3253 | if (myData->isEmptySegment) { /* no character was converted since the preceding SO */ |
michael@0 | 3254 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
michael@0 | 3255 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 3256 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
michael@0 | 3257 | args->converter->toUBytes[0] = mySourceChar; |
michael@0 | 3258 | args->converter->toULength = 1; |
michael@0 | 3259 | args->target = myTarget; |
michael@0 | 3260 | args->source = mySource; |
michael@0 | 3261 | return; |
michael@0 | 3262 | } |
michael@0 | 3263 | continue; |
michael@0 | 3264 | |
michael@0 | 3265 | case UCNV_SO: |
michael@0 | 3266 | if(pToU2022State->cs[1] != 0) { |
michael@0 | 3267 | pToU2022State->g=1; |
michael@0 | 3268 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
michael@0 | 3269 | continue; |
michael@0 | 3270 | } else { |
michael@0 | 3271 | /* illegal to have SO before a matching designator */ |
michael@0 | 3272 | myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ |
michael@0 | 3273 | break; /* targetUniChar is still missingCharMarker, so the SO byte reaches toUnicodeCallback() below */ |
michael@0 | 3274 | } |
michael@0 | 3275 | |
michael@0 | 3276 | case ESC_2022: |
michael@0 | 3277 | mySource--; /* step back onto the ESC byte before calling changeState_2022() */ |
michael@0 | 3278 | escape: |
michael@0 | 3279 | { |
michael@0 | 3280 | const char * mySourceBefore = mySource; |
michael@0 | 3281 | int8_t toULengthBefore = args->converter->toULength; |
michael@0 | 3282 | |
michael@0 | 3283 | changeState_2022(args->converter,&(mySource), |
michael@0 | 3284 | mySourceLimit, ISO_2022_CN,err); |
michael@0 | 3285 | |
michael@0 | 3286 | /* After SO there must be at least one character before a designator (designator error handled separately) */ |
michael@0 | 3287 | if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
michael@0 | 3288 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 3289 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
michael@0 | 3290 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); /* length before the call plus the bytes the call consumed */ |
michael@0 | 3291 | } |
michael@0 | 3292 | } |
michael@0 | 3293 | |
michael@0 | 3294 | /* invalid or illegal escape sequence */ |
michael@0 | 3295 | if(U_FAILURE(*err)){ |
michael@0 | 3296 | args->target = myTarget; |
michael@0 | 3297 | args->source = mySource; |
michael@0 | 3298 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
michael@0 | 3299 | return; |
michael@0 | 3300 | } |
michael@0 | 3301 | continue; |
michael@0 | 3302 | |
michael@0 | 3303 | /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ |
michael@0 | 3304 | |
michael@0 | 3305 | case CR: |
michael@0 | 3306 | /*falls through*/ |
michael@0 | 3307 | case LF: |
michael@0 | 3308 | uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); /* a line end zeroes the whole toUnicode state, so the CR/LF itself is converted with g==0 */ |
michael@0 | 3309 | /* falls through */ |
michael@0 | 3310 | default: |
michael@0 | 3311 | /* convert one or two bytes */ |
michael@0 | 3312 | myData->isEmptySegment = FALSE; |
michael@0 | 3313 | if(pToU2022State->g != 0) { |
michael@0 | 3314 | if(mySource < mySourceLimit) { |
michael@0 | 3315 | UConverterSharedData *cnv; |
michael@0 | 3316 | StateEnum tempState; |
michael@0 | 3317 | int32_t tempBufLen; |
michael@0 | 3318 | int leadIsOk, trailIsOk; |
michael@0 | 3319 | uint8_t trailByte; |
michael@0 | 3320 | getTrailByte: |
michael@0 | 3321 | trailByte = (uint8_t)*mySource; /* only peeked here; mySource is advanced below if the byte is used */ |
michael@0 | 3322 | /* |
michael@0 | 3323 | * Ticket 5691: consistent illegal sequences: |
michael@0 | 3324 | * - We include at least the first byte in the illegal sequence. |
michael@0 | 3325 | * - If any of the non-initial bytes could be the start of a character, |
michael@0 | 3326 | * we stop the illegal sequence before the first one of those. |
michael@0 | 3327 | * |
michael@0 | 3328 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
michael@0 | 3329 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
michael@0 | 3330 | * Otherwise we convert or report the pair of bytes. |
michael@0 | 3331 | */ |
michael@0 | 3332 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
michael@0 | 3333 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
michael@0 | 3334 | if (leadIsOk && trailIsOk) { |
michael@0 | 3335 | ++mySource; |
michael@0 | 3336 | tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
michael@0 | 3337 | if(tempState >= CNS_11643_0) { /* all CNS_11643_x states use the one CNS_11643 table; the state offset is passed as a leading byte */ |
michael@0 | 3338 | cnv = myData->myConverterArray[CNS_11643]; |
michael@0 | 3339 | tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); |
michael@0 | 3340 | tempBuf[1] = (char) (mySourceChar); |
michael@0 | 3341 | tempBuf[2] = (char) trailByte; |
michael@0 | 3342 | tempBufLen = 3; |
michael@0 | 3343 | |
michael@0 | 3344 | }else{ |
michael@0 | 3345 | U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); |
michael@0 | 3346 | cnv = myData->myConverterArray[tempState]; |
michael@0 | 3347 | tempBuf[0] = (char) (mySourceChar); |
michael@0 | 3348 | tempBuf[1] = (char) trailByte; |
michael@0 | 3349 | tempBufLen = 2; |
michael@0 | 3350 | } |
michael@0 | 3351 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); |
michael@0 | 3352 | mySourceChar = (mySourceChar << 8) | trailByte; /* keep both bytes for the offset computation and for error reporting */ |
michael@0 | 3353 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
michael@0 | 3354 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
michael@0 | 3355 | ++mySource; |
michael@0 | 3356 | /* add another bit so that the code below writes 2 bytes in case of error */ |
michael@0 | 3357 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
michael@0 | 3358 | } |
michael@0 | 3359 | if(pToU2022State->g>=2) { |
michael@0 | 3360 | /* return from a single-shift state to the previous one */ |
michael@0 | 3361 | pToU2022State->g=pToU2022State->prevG; |
michael@0 | 3362 | } |
michael@0 | 3363 | } else { /* the source ended right after the lead byte */ |
michael@0 | 3364 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; /* saved so that the next call can resume at getTrailByte */ |
michael@0 | 3365 | args->converter->toULength = 1; |
michael@0 | 3366 | goto endloop; |
michael@0 | 3367 | } |
michael@0 | 3368 | } |
michael@0 | 3369 | else{ |
michael@0 | 3370 | if(mySourceChar <= 0x7f) { |
michael@0 | 3371 | targetUniChar = (UChar) mySourceChar; |
michael@0 | 3372 | } |
michael@0 | 3373 | } |
michael@0 | 3374 | break; |
michael@0 | 3375 | } |
michael@0 | 3376 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ /* the result fits into a single UChar */ |
michael@0 | 3377 | if(args->offsets){ |
michael@0 | 3378 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); /* index of the character's first source byte: 1 back for one byte, 2 back for a pair */ |
michael@0 | 3379 | } |
michael@0 | 3380 | *(myTarget++)=(UChar)targetUniChar; |
michael@0 | 3381 | } |
michael@0 | 3382 | else if(targetUniChar > missingCharMarker){ |
michael@0 | 3383 | /* disassemble the surrogate pair and write to output*/ |
michael@0 | 3384 | targetUniChar-=0x0010000; |
michael@0 | 3385 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
michael@0 | 3386 | if(args->offsets){ |
michael@0 | 3387 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 3388 | } |
michael@0 | 3389 | ++myTarget; |
michael@0 | 3390 | if(myTarget< args->targetLimit){ |
michael@0 | 3391 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
michael@0 | 3392 | if(args->offsets){ |
michael@0 | 3393 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
michael@0 | 3394 | } |
michael@0 | 3395 | ++myTarget; |
michael@0 | 3396 | }else{ /* target is full: the trail surrogate goes into the converter's UCharErrorBuffer instead */ |
michael@0 | 3397 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
michael@0 | 3398 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
michael@0 | 3399 | } |
michael@0 | 3400 | |
michael@0 | 3401 | } |
michael@0 | 3402 | else{ /* targetUniChar is missingCharMarker or missingCharMarker-1: nothing was converted */ |
michael@0 | 3403 | /* Call the callback function*/ |
michael@0 | 3404 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
michael@0 | 3405 | break; /* leaves the while loop; the pointers are written back at endloop */ |
michael@0 | 3406 | } |
michael@0 | 3407 | } |
michael@0 | 3408 | else{ /* source bytes remain but the target has no room */ |
michael@0 | 3409 | *err =U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 3410 | break; |
michael@0 | 3411 | } |
michael@0 | 3412 | } |
michael@0 | 3413 | endloop: |
michael@0 | 3414 | args->target = myTarget; /* report how far the target and the source were advanced */ |
michael@0 | 3415 | args->source = mySource; |
michael@0 | 3416 | } |
michael@0 | 3417 | |
michael@0 | 3418 | static void |
michael@0 | 3419 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { |
michael@0 | 3420 | UConverter *cnv = args->converter; |
michael@0 | 3421 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
michael@0 | 3422 | ISO2022State *pFromU2022State=&myConverterData->fromU2022State; |
michael@0 | 3423 | char *p, *subchar; |
michael@0 | 3424 | char buffer[8]; |
michael@0 | 3425 | int32_t length; |
michael@0 | 3426 | |
michael@0 | 3427 | subchar=(char *)cnv->subChars; |
michael@0 | 3428 | length=cnv->subCharLen; /* assume length==1 for most variants */ |
michael@0 | 3429 | |
michael@0 | 3430 | p = buffer; |
michael@0 | 3431 | switch(myConverterData->locale[0]){ |
michael@0 | 3432 | case 'j': |
michael@0 | 3433 | { |
michael@0 | 3434 | int8_t cs; |
michael@0 | 3435 | |
michael@0 | 3436 | if(pFromU2022State->g == 1) { |
michael@0 | 3437 | /* JIS7: switch from G1 to G0 */ |
michael@0 | 3438 | pFromU2022State->g = 0; |
michael@0 | 3439 | *p++ = UCNV_SI; |
michael@0 | 3440 | } |
michael@0 | 3441 | |
michael@0 | 3442 | cs = pFromU2022State->cs[0]; |
michael@0 | 3443 | if(cs != ASCII && cs != JISX201) { |
michael@0 | 3444 | /* not in ASCII or JIS X 0201: switch to ASCII */ |
michael@0 | 3445 | pFromU2022State->cs[0] = (int8_t)ASCII; |
michael@0 | 3446 | *p++ = '\x1b'; |
michael@0 | 3447 | *p++ = '\x28'; |
michael@0 | 3448 | *p++ = '\x42'; |
michael@0 | 3449 | } |
michael@0 | 3450 | |
michael@0 | 3451 | *p++ = subchar[0]; |
michael@0 | 3452 | break; |
michael@0 | 3453 | } |
michael@0 | 3454 | case 'c': |
michael@0 | 3455 | if(pFromU2022State->g != 0) { |
michael@0 | 3456 | /* not in ASCII mode: switch to ASCII */ |
michael@0 | 3457 | pFromU2022State->g = 0; |
michael@0 | 3458 | *p++ = UCNV_SI; |
michael@0 | 3459 | } |
michael@0 | 3460 | *p++ = subchar[0]; |
michael@0 | 3461 | break; |
michael@0 | 3462 | case 'k': |
michael@0 | 3463 | if(myConverterData->version == 0) { |
michael@0 | 3464 | if(length == 1) { |
michael@0 | 3465 | if((UBool)args->converter->fromUnicodeStatus) { |
michael@0 | 3466 | /* in DBCS mode: switch to SBCS */ |
michael@0 | 3467 | args->converter->fromUnicodeStatus = 0; |
michael@0 | 3468 | *p++ = UCNV_SI; |
michael@0 | 3469 | } |
michael@0 | 3470 | *p++ = subchar[0]; |
michael@0 | 3471 | } else /* length == 2*/ { |
michael@0 | 3472 | if(!(UBool)args->converter->fromUnicodeStatus) { |
michael@0 | 3473 | /* in SBCS mode: switch to DBCS */ |
michael@0 | 3474 | args->converter->fromUnicodeStatus = 1; |
michael@0 | 3475 | *p++ = UCNV_SO; |
michael@0 | 3476 | } |
michael@0 | 3477 | *p++ = subchar[0]; |
michael@0 | 3478 | *p++ = subchar[1]; |
michael@0 | 3479 | } |
michael@0 | 3480 | break; |
michael@0 | 3481 | } else { |
michael@0 | 3482 | /* save the subconverter's substitution string */ |
michael@0 | 3483 | uint8_t *currentSubChars = myConverterData->currentConverter->subChars; |
michael@0 | 3484 | int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; |
michael@0 | 3485 | |
michael@0 | 3486 | /* set our substitution string into the subconverter */ |
michael@0 | 3487 | myConverterData->currentConverter->subChars = (uint8_t *)subchar; |
michael@0 | 3488 | myConverterData->currentConverter->subCharLen = (int8_t)length; |
michael@0 | 3489 | |
michael@0 | 3490 | /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ |
michael@0 | 3491 | args->converter = myConverterData->currentConverter; |
michael@0 | 3492 | myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; |
michael@0 | 3493 | ucnv_cbFromUWriteSub(args, 0, err); |
michael@0 | 3494 | cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
michael@0 | 3495 | args->converter = cnv; |
michael@0 | 3496 | |
michael@0 | 3497 | /* restore the subconverter's substitution string */ |
michael@0 | 3498 | myConverterData->currentConverter->subChars = currentSubChars; |
michael@0 | 3499 | myConverterData->currentConverter->subCharLen = currentSubCharLen; |
michael@0 | 3500 | |
michael@0 | 3501 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 3502 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
michael@0 | 3503 | uprv_memcpy( |
michael@0 | 3504 | cnv->charErrorBuffer, |
michael@0 | 3505 | myConverterData->currentConverter->charErrorBuffer, |
michael@0 | 3506 | myConverterData->currentConverter->charErrorBufferLength); |
michael@0 | 3507 | } |
michael@0 | 3508 | cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
michael@0 | 3509 | myConverterData->currentConverter->charErrorBufferLength = 0; |
michael@0 | 3510 | } |
michael@0 | 3511 | return; |
michael@0 | 3512 | } |
michael@0 | 3513 | default: |
michael@0 | 3514 | /* not expected */ |
michael@0 | 3515 | break; |
michael@0 | 3516 | } |
michael@0 | 3517 | ucnv_cbFromUWriteBytes(args, |
michael@0 | 3518 | buffer, (int32_t)(p - buffer), |
michael@0 | 3519 | offsetIndex, err); |
michael@0 | 3520 | } |
michael@0 | 3521 | |
michael@0 | 3522 | /* |
michael@0 | 3523 | * Structure for cloning an ISO 2022 converter into a single memory block. |
michael@0 | 3524 | * ucnv_safeClone() of the converter will align the entire cloneStruct, |
michael@0 | 3525 | * and then ucnv_safeClone() of the sub-converter may additionally align |
michael@0 | 3526 | * currentConverter inside the cloneStruct, for which we need the deadSpace |
michael@0 | 3527 | * after currentConverter. |
michael@0 | 3528 | * This is because UAlignedMemory may be larger than the actually |
michael@0 | 3529 | * necessary alignment size for the platform. |
michael@0 | 3530 | * The other cloneStruct fields will not be moved around, |
michael@0 | 3531 | * and are aligned properly with cloneStruct's alignment. |
michael@0 | 3532 | */ |
michael@0 | 3533 | struct cloneStruct |
michael@0 | 3534 | { |
michael@0 | 3535 | UConverter cnv; /* the cloned main converter; _ISO_2022_SafeClone() returns its address */ |
michael@0 | 3536 | UConverter currentConverter; /* storage passed to ucnv_safeClone() for the cloned sub-converter */ |
michael@0 | 3537 | UAlignedMemory deadSpace; /* padding that lets the sub-converter clone be re-aligned inside this struct */ |
michael@0 | 3538 | UConverterDataISO2022 mydata; /* the clone's own copy of the ISO 2022 extra data */ |
michael@0 | 3539 | }; |
michael@0 | 3540 | |
michael@0 | 3541 | |
michael@0 | 3542 | static UConverter * |
michael@0 | 3543 | _ISO_2022_SafeClone( |
michael@0 | 3544 | const UConverter *cnv, |
michael@0 | 3545 | void *stackBuffer, |
michael@0 | 3546 | int32_t *pBufferSize, |
michael@0 | 3547 | UErrorCode *status) |
michael@0 | 3548 | { |
michael@0 | 3549 | /* Completes a clone inside stackBuffer, which is treated as a struct cloneStruct. */ |
michael@0 | 3550 | struct cloneStruct *clone; |
michael@0 | 3551 | UConverterDataISO2022 *srcData; |
michael@0 | 3552 | int32_t slot, subCloneSize; |
michael@0 | 3553 | |
michael@0 | 3554 | /* 'preflighting' request: a zero size only asks how much room is needed */ |
michael@0 | 3555 | if (0 == *pBufferSize) { |
michael@0 | 3556 | *pBufferSize = (int32_t)sizeof(struct cloneStruct); |
michael@0 | 3557 | return NULL; |
michael@0 | 3558 | } |
michael@0 | 3559 | |
michael@0 | 3560 | clone = (struct cloneStruct *)stackBuffer; |
michael@0 | 3561 | srcData = (UConverterDataISO2022 *)cnv->extraInfo; |
michael@0 | 3562 | |
michael@0 | 3563 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already: copy the extra data and make the clone use its own copy */ |
michael@0 | 3564 | uprv_memcpy(&clone->mydata, srcData, sizeof(UConverterDataISO2022)); |
michael@0 | 3565 | clone->cnv.isExtraLocal = TRUE; |
michael@0 | 3566 | clone->cnv.extraInfo = &clone->mydata; |
michael@0 | 3567 | |
michael@0 | 3568 | /* the current sub-converter, if any, is cloned into clone->currentConverter */ |
michael@0 | 3569 | if(NULL != srcData->currentConverter) { |
michael@0 | 3570 | subCloneSize = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ |
michael@0 | 3571 | clone->mydata.currentConverter = ucnv_safeClone( |
michael@0 | 3572 | srcData->currentConverter, |
michael@0 | 3573 | &clone->currentConverter, |
michael@0 | 3574 | &subCloneSize, status); |
michael@0 | 3575 | if(U_FAILURE(*status)) { |
michael@0 | 3576 | return NULL; |
michael@0 | 3577 | } |
michael@0 | 3578 | } |
michael@0 | 3579 | |
michael@0 | 3580 | /* the myConverterArray[] entries stay shared with the original: take one more reference on each */ |
michael@0 | 3581 | for(slot = 0; slot < UCNV_2022_MAX_CONVERTERS; ++slot) { |
michael@0 | 3582 | if(NULL != srcData->myConverterArray[slot]) { |
michael@0 | 3583 | ucnv_incrementRefCount(srcData->myConverterArray[slot]); |
michael@0 | 3584 | } |
michael@0 | 3585 | } |
michael@0 | 3586 | return &clone->cnv; |
michael@0 | 3587 | } |
michael@0 | 3588 | |
michael@0 | 3589 | static void |
michael@0 | 3590 | _ISO_2022_GetUnicodeSet(const UConverter *cnv, |
michael@0 | 3591 | const USetAdder *sa, |
michael@0 | 3592 | UConverterUnicodeSet which, |
michael@0 | 3593 | UErrorCode *pErrorCode) |
michael@0 | 3594 | { /* Collects into sa the Unicode code points this ISO 2022 converter maps, as selected by which. */ |
michael@0 | 3595 | UConverterDataISO2022 *data; |
michael@0 | 3596 | int32_t idx; |
michael@0 | 3597 | char variant; /* locale[0] of the converter: 'j', 'c'/'z' or 'k' */ |
michael@0 | 3598 | if (U_FAILURE(*pErrorCode)) { |
michael@0 | 3599 | return; |
michael@0 | 3600 | } |
michael@0 | 3601 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 3602 | if (&_ISO2022Data == cnv->sharedData) { |
michael@0 | 3603 | /* UTF-8 is used in this case: add every code point outside U+D800..U+DFFF */ |
michael@0 | 3604 | sa->addRange(sa->set, 0, 0xd7FF); |
michael@0 | 3605 | sa->addRange(sa->set, 0xE000, 0x10FFFF); |
michael@0 | 3606 | return; |
michael@0 | 3607 | } |
michael@0 | 3608 | #endif |
michael@0 | 3609 | |
michael@0 | 3610 | data = (UConverterDataISO2022*)cnv->extraInfo; |
michael@0 | 3611 | variant = data->locale[0]; |
michael@0 | 3612 | /* start with the code points that the variant round-trips algorithmically */ |
michael@0 | 3613 | switch(variant){ |
michael@0 | 3614 | case 'j': |
michael@0 | 3615 | /* JIS X 0201 is hardcoded, so add U+00A5 and U+203E here */ |
michael@0 | 3616 | sa->add(sa->set, 0xa5); |
michael@0 | 3617 | sa->add(sa->set, 0x203e); |
michael@0 | 3618 | if((jpCharsetMasks[data->version]&CSM(ISO8859_1)) != 0) { |
michael@0 | 3619 | /* some JP variants cover all of Latin-1 */ |
michael@0 | 3620 | sa->addRange(sa->set, 0, 0xff); |
michael@0 | 3621 | } else { |
michael@0 | 3622 | /* the other JP variants get ASCII only */ |
michael@0 | 3623 | sa->addRange(sa->set, 0, 0x7f); |
michael@0 | 3624 | } |
michael@0 | 3625 | if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET || data->version==3 || data->version==4) { |
michael@0 | 3626 | /* |
michael@0 | 3627 | * (jpCharsetMasks[data->version]&CSM(HWKANA_7BIT))!=0 cannot serve |
michael@0 | 3628 | * as the test here: that bit is set for every JP version, |
michael@0 | 3629 | * because toUnicode leniently accepts half-width Katakana |
michael@0 | 3630 | * via the ESC ( I sequence in all ISO-2022-JP variants. |
michael@0 | 3631 | * fromUnicode, however, emits half-width Katakana only where |
michael@0 | 3632 | * the variant's definition has it, i.e. in versions 3 & 4 |
michael@0 | 3633 | * (JIS7 & JIS8). |
michael@0 | 3634 | * |
michael@0 | 3635 | * When fallbacks are included, the half-width Katakana code |
michael@0 | 3636 | * points belong to the set of every JP variant, because JIS X 0208 |
michael@0 | 3637 | * has hardcoded fallbacks mapping them to full-width Katakana. |
michael@0 | 3638 | */ |
michael@0 | 3639 | /* half-width Katakana for JP */ |
michael@0 | 3640 | sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
michael@0 | 3641 | } |
michael@0 | 3642 | break; |
michael@0 | 3643 | case 'c': |
michael@0 | 3644 | case 'z': |
michael@0 | 3645 | /* CN: ASCII */ |
michael@0 | 3646 | sa->addRange(sa->set, 0, 0x7f); |
michael@0 | 3647 | break; |
michael@0 | 3648 | case 'k': |
michael@0 | 3649 | /* KR has a single converter, and it is kept outside myConverterArray[] */ |
michael@0 | 3650 | data->currentConverter->sharedData->impl->getUnicodeSet( |
michael@0 | 3651 | data->currentConverter, sa, which, pErrorCode); |
michael@0 | 3652 | /* the myConverterArray[] loop below will then find nothing more to add */ |
michael@0 | 3653 | break; |
michael@0 | 3654 | default: |
michael@0 | 3655 | break; |
michael@0 | 3656 | } |
michael@0 | 3657 | |
michael@0 | 3658 | #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ |
michael@0 | 3659 | if( (variant=='c' || variant=='z') && |
michael@0 | 3660 | data->version==0 && idx==CNS_11643 |
michael@0 | 3661 | ) { |
michael@0 | 3662 | /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ |
michael@0 | 3663 | ucnv_MBCSGetUnicodeSetForBytes( |
michael@0 | 3664 | data->myConverterArray[idx], |
michael@0 | 3665 | sa, UCNV_ROUNDTRIP_SET, |
michael@0 | 3666 | 0, 0x81, 0x82, |
michael@0 | 3667 | pErrorCode); |
michael@0 | 3668 | } |
michael@0 | 3669 | #endif |
michael@0 | 3670 | |
michael@0 | 3671 | for (idx=0; idx<UCNV_2022_MAX_CONVERTERS; ++idx) { |
michael@0 | 3672 | UConverterSetFilter filter=UCNV_SET_FILTER_NONE; |
michael@0 | 3673 | if(data->myConverterArray[idx]==NULL) { |
michael@0 | 3674 | continue; |
michael@0 | 3675 | } |
michael@0 | 3676 | if((variant=='c' || variant=='z') && data->version==0 && idx==CNS_11643) { |
michael@0 | 3677 | /* |
michael@0 | 3678 | * CN depends on the version: |
michael@0 | 3679 | * version 0 leaves CNS planes 3..7 unmapped even though |
michael@0 | 3680 | * the CNS conversion table contains all of them, |
michael@0 | 3681 | * while version 1 (-EXT) maps every plane. |
michael@0 | 3682 | * The Unicode sets of the two versions therefore differ. |
michael@0 | 3683 | */ |
michael@0 | 3684 | filter=UCNV_SET_FILTER_2022_CN; |
michael@0 | 3685 | } else if(variant=='j' && idx==JISX208) { |
michael@0 | 3686 | /* |
michael@0 | 3687 | * Keep only the code points whose Shift-JIS codes |
michael@0 | 3688 | * correspond to JIS X 0208. |
michael@0 | 3689 | */ |
michael@0 | 3690 | filter=UCNV_SET_FILTER_SJIS; |
michael@0 | 3691 | } else if(idx==KSC5601) { |
michael@0 | 3692 | /* |
michael@0 | 3693 | * Some of the KSC 5601 tables (convrtrs.txt has this alias on |
michael@0 | 3694 | * multiple tables) are broader than GR94. |
michael@0 | 3695 | */ |
michael@0 | 3696 | filter=UCNV_SET_FILTER_GR94DBCS; |
michael@0 | 3697 | } |
michael@0 | 3698 | /* any other table is added unfiltered */ |
michael@0 | 3699 | ucnv_MBCSGetFilteredUnicodeSetForUnicode( |
michael@0 | 3700 | data->myConverterArray[idx], |
michael@0 | 3701 | sa, which, filter, pErrorCode); |
michael@0 | 3702 | } |
michael@0 | 3703 | |
michael@0 | 3704 | /* |
michael@0 | 3705 | * SO, SI and ESC are never converted by an ISO 2022 converter, |
michael@0 | 3706 | * whatever the sub-converters would do with them on their own: |
michael@0 | 3707 | * take these characters out of the set again. |
michael@0 | 3708 | */ |
michael@0 | 3709 | sa->remove(sa->set, 0x0e); |
michael@0 | 3710 | sa->remove(sa->set, 0x0f); |
michael@0 | 3711 | sa->remove(sa->set, 0x1b); |
michael@0 | 3712 | |
michael@0 | 3713 | /* the C1 controls are not converted by ISO 2022 converters either */ |
michael@0 | 3714 | sa->removeRange(sa->set, 0x80, 0x9f); |
michael@0 | 3715 | } |
michael@0 | 3716 | |
michael@0 | 3717 | static const UConverterImpl _ISO2022Impl={ /* function table of the generic ISO_2022 converter; entries are positional (UConverterImpl is declared outside this chunk) */ |
michael@0 | 3718 | UCNV_ISO_2022, |
michael@0 | 3719 | |
michael@0 | 3720 | NULL, |
michael@0 | 3721 | NULL, |
michael@0 | 3722 | /* open/close/reset: the same three functions appear in every ISO 2022 table in this file */ |
michael@0 | 3723 | _ISO2022Open, |
michael@0 | 3724 | _ISO2022Close, |
michael@0 | 3725 | _ISO2022Reset, |
michael@0 | 3726 | /* conversion functions exist only when generic ISO 2022 support is compiled in; otherwise the four slots are NULL */ |
michael@0 | 3727 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
michael@0 | 3728 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
michael@0 | 3729 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
michael@0 | 3730 | ucnv_fromUnicode_UTF8, |
michael@0 | 3731 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
michael@0 | 3732 | #else |
michael@0 | 3733 | NULL, |
michael@0 | 3734 | NULL, |
michael@0 | 3735 | NULL, |
michael@0 | 3736 | NULL, |
michael@0 | 3737 | #endif |
michael@0 | 3738 | NULL, |
michael@0 | 3739 | |
michael@0 | 3740 | NULL, |
michael@0 | 3741 | _ISO2022getName, |
michael@0 | 3742 | _ISO_2022_WriteSub, |
michael@0 | 3743 | _ISO_2022_SafeClone, |
michael@0 | 3744 | _ISO_2022_GetUnicodeSet, |
michael@0 | 3745 | |
michael@0 | 3746 | NULL, |
michael@0 | 3747 | NULL |
michael@0 | 3748 | }; |
michael@0 | 3749 | static const UConverterStaticData _ISO2022StaticData={ /* static description of the generic ISO_2022 converter; positional initializer (UConverterStaticData is declared outside this chunk) */ |
michael@0 | 3750 | sizeof(UConverterStaticData), |
michael@0 | 3751 | "ISO_2022", |
michael@0 | 3752 | 2022, |
michael@0 | 3753 | UCNV_IBM, |
michael@0 | 3754 | UCNV_ISO_2022, |
michael@0 | 3755 | 1, /* NOTE(review): presumably the minimum bytes per UChar, pairing with the maximum on the next line -- confirm against UConverterStaticData */ |
michael@0 | 3756 | 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
michael@0 | 3757 | { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */ |
michael@0 | 3758 | 1, /* NOTE(review): presumably the length of the substitution sequence above -- confirm */ |
michael@0 | 3759 | FALSE, |
michael@0 | 3760 | FALSE, |
michael@0 | 3761 | 0, |
michael@0 | 3762 | 0, |
michael@0 | 3763 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 3764 | }; |
michael@0 | 3765 | const UConverterSharedData _ISO2022Data={ /* shared-data record of the generic ISO_2022 converter: ties _ISO2022StaticData to _ISO2022Impl */ |
michael@0 | 3766 | sizeof(UConverterSharedData), |
michael@0 | 3767 | ~((uint32_t) 0), /* NOTE(review): looks like an all-ones reference counter for statically allocated data -- confirm against UConverterSharedData */ |
michael@0 | 3768 | NULL, |
michael@0 | 3769 | NULL, |
michael@0 | 3770 | &_ISO2022StaticData, |
michael@0 | 3771 | FALSE, |
michael@0 | 3772 | &_ISO2022Impl, |
michael@0 | 3773 | 0, UCNV_MBCS_TABLE_INITIALIZER |
michael@0 | 3774 | }; |
michael@0 | 3775 | |
michael@0 | 3776 | /*************JP****************/ |
michael@0 | 3777 | static const UConverterImpl _ISO2022JPImpl={ /* function table of the ISO_2022_JP converter; entries are positional (UConverterImpl is declared outside this chunk) */ |
michael@0 | 3778 | UCNV_ISO_2022, |
michael@0 | 3779 | |
michael@0 | 3780 | NULL, |
michael@0 | 3781 | NULL, |
michael@0 | 3782 | /* open/close/reset: the same three functions appear in every ISO 2022 table in this file */ |
michael@0 | 3783 | _ISO2022Open, |
michael@0 | 3784 | _ISO2022Close, |
michael@0 | 3785 | _ISO2022Reset, |
michael@0 | 3786 | /* one JP routine fills both toUnicode slots and one fills both fromUnicode slots (presumably the without/with-offsets pairs -- confirm against UConverterImpl) */ |
michael@0 | 3787 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
michael@0 | 3788 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
michael@0 | 3789 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
michael@0 | 3790 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
michael@0 | 3791 | NULL, |
michael@0 | 3792 | |
michael@0 | 3793 | NULL, |
michael@0 | 3794 | _ISO2022getName, |
michael@0 | 3795 | _ISO_2022_WriteSub, |
michael@0 | 3796 | _ISO_2022_SafeClone, |
michael@0 | 3797 | _ISO_2022_GetUnicodeSet, |
michael@0 | 3798 | |
michael@0 | 3799 | NULL, |
michael@0 | 3800 | NULL |
michael@0 | 3801 | }; |
michael@0 | 3802 | static const UConverterStaticData _ISO2022JPStaticData={ /* static description of the ISO_2022_JP converter; positional initializer (UConverterStaticData is declared outside this chunk) */ |
michael@0 | 3803 | sizeof(UConverterStaticData), |
michael@0 | 3804 | "ISO_2022_JP", |
michael@0 | 3805 | 0, |
michael@0 | 3806 | UCNV_IBM, |
michael@0 | 3807 | UCNV_ISO_2022, |
michael@0 | 3808 | 1, /* NOTE(review): presumably the minimum bytes per UChar, pairing with the maximum on the next line -- confirm against UConverterStaticData */ |
michael@0 | 3809 | 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ |
michael@0 | 3810 | { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */ |
michael@0 | 3811 | 1, /* NOTE(review): presumably the length of the substitution sequence above -- confirm */ |
michael@0 | 3812 | FALSE, |
michael@0 | 3813 | FALSE, |
michael@0 | 3814 | 0, |
michael@0 | 3815 | 0, |
michael@0 | 3816 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 3817 | }; |
michael@0 | 3818 | |
michael@0 | 3819 | namespace { |
michael@0 | 3820 | /* shared-data record of the ISO_2022_JP converter: ties _ISO2022JPStaticData to _ISO2022JPImpl */ |
michael@0 | 3821 | const UConverterSharedData _ISO2022JPData={ |
michael@0 | 3822 | sizeof(UConverterSharedData), |
michael@0 | 3823 | ~((uint32_t) 0), /* NOTE(review): looks like an all-ones reference counter for statically allocated data -- confirm against UConverterSharedData */ |
michael@0 | 3824 | NULL, |
michael@0 | 3825 | NULL, |
michael@0 | 3826 | &_ISO2022JPStaticData, |
michael@0 | 3827 | FALSE, |
michael@0 | 3828 | &_ISO2022JPImpl, |
michael@0 | 3829 | 0, UCNV_MBCS_TABLE_INITIALIZER |
michael@0 | 3830 | }; |
michael@0 | 3831 | |
michael@0 | 3832 | } // namespace |
michael@0 | 3833 | |
michael@0 | 3834 | /************* KR ***************/ |
michael@0 | 3835 | static const UConverterImpl _ISO2022KRImpl={ /* function table of the ISO_2022_KR converter; entries are positional (UConverterImpl is declared outside this chunk) */ |
michael@0 | 3836 | UCNV_ISO_2022, |
michael@0 | 3837 | |
michael@0 | 3838 | NULL, |
michael@0 | 3839 | NULL, |
michael@0 | 3840 | /* open/close/reset: the same three functions appear in every ISO 2022 table in this file */ |
michael@0 | 3841 | _ISO2022Open, |
michael@0 | 3842 | _ISO2022Close, |
michael@0 | 3843 | _ISO2022Reset, |
michael@0 | 3844 | /* one KR routine fills both toUnicode slots and one fills both fromUnicode slots (presumably the without/with-offsets pairs -- confirm against UConverterImpl) */ |
michael@0 | 3845 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
michael@0 | 3846 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
michael@0 | 3847 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
michael@0 | 3848 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
michael@0 | 3849 | NULL, |
michael@0 | 3850 | |
michael@0 | 3851 | NULL, |
michael@0 | 3852 | _ISO2022getName, |
michael@0 | 3853 | _ISO_2022_WriteSub, |
michael@0 | 3854 | _ISO_2022_SafeClone, |
michael@0 | 3855 | _ISO_2022_GetUnicodeSet, |
michael@0 | 3856 | |
michael@0 | 3857 | NULL, |
michael@0 | 3858 | NULL |
michael@0 | 3859 | }; |
michael@0 | 3860 | static const UConverterStaticData _ISO2022KRStaticData={ /* static description of the ISO_2022_KR converter; positional initializer (UConverterStaticData is declared outside this chunk) */ |
michael@0 | 3861 | sizeof(UConverterStaticData), |
michael@0 | 3862 | "ISO_2022_KR", |
michael@0 | 3863 | 0, |
michael@0 | 3864 | UCNV_IBM, |
michael@0 | 3865 | UCNV_ISO_2022, |
michael@0 | 3866 | 1, /* NOTE(review): presumably the minimum bytes per UChar, pairing with the maximum on the next line -- confirm against UConverterStaticData */ |
michael@0 | 3867 | 3, /* max 3 bytes per UChar: SO+DBCS */ |
michael@0 | 3868 | { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */ |
michael@0 | 3869 | 1, /* NOTE(review): presumably the length of the substitution sequence above -- confirm */ |
michael@0 | 3870 | FALSE, |
michael@0 | 3871 | FALSE, |
michael@0 | 3872 | 0, |
michael@0 | 3873 | 0, |
michael@0 | 3874 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 3875 | }; |
michael@0 | 3876 | |
michael@0 | 3877 | namespace { |
michael@0 | 3878 | /* shared-data record of the ISO_2022_KR converter: ties _ISO2022KRStaticData to _ISO2022KRImpl */ |
michael@0 | 3879 | const UConverterSharedData _ISO2022KRData={ |
michael@0 | 3880 | sizeof(UConverterSharedData), |
michael@0 | 3881 | ~((uint32_t) 0), /* NOTE(review): looks like an all-ones reference counter for statically allocated data -- confirm against UConverterSharedData */ |
michael@0 | 3882 | NULL, |
michael@0 | 3883 | NULL, |
michael@0 | 3884 | &_ISO2022KRStaticData, |
michael@0 | 3885 | FALSE, |
michael@0 | 3886 | &_ISO2022KRImpl, |
michael@0 | 3887 | 0, UCNV_MBCS_TABLE_INITIALIZER |
michael@0 | 3888 | }; |
michael@0 | 3889 | |
michael@0 | 3890 | } // namespace |
michael@0 | 3891 | |
michael@0 | 3892 | /*************** CN ***************/ |
michael@0 | 3893 | static const UConverterImpl _ISO2022CNImpl={ /* function table of the ISO_2022_CN converter; entries are positional (UConverterImpl is declared outside this chunk) */ |
michael@0 | 3894 | |
michael@0 | 3895 | UCNV_ISO_2022, |
michael@0 | 3896 | |
michael@0 | 3897 | NULL, |
michael@0 | 3898 | NULL, |
michael@0 | 3899 | /* open/close/reset: the same three functions appear in every ISO 2022 table in this file */ |
michael@0 | 3900 | _ISO2022Open, |
michael@0 | 3901 | _ISO2022Close, |
michael@0 | 3902 | _ISO2022Reset, |
michael@0 | 3903 | /* one CN routine fills both toUnicode slots and one fills both fromUnicode slots (presumably the without/with-offsets pairs -- confirm against UConverterImpl) */ |
michael@0 | 3904 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
michael@0 | 3905 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
michael@0 | 3906 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
michael@0 | 3907 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
michael@0 | 3908 | NULL, |
michael@0 | 3909 | |
michael@0 | 3910 | NULL, |
michael@0 | 3911 | _ISO2022getName, |
michael@0 | 3912 | _ISO_2022_WriteSub, |
michael@0 | 3913 | _ISO_2022_SafeClone, |
michael@0 | 3914 | _ISO_2022_GetUnicodeSet, |
michael@0 | 3915 | |
michael@0 | 3916 | NULL, |
michael@0 | 3917 | NULL |
michael@0 | 3918 | }; |
michael@0 | 3919 | static const UConverterStaticData _ISO2022CNStaticData={ /* static description of the ISO_2022_CN converter; positional initializer (UConverterStaticData is declared outside this chunk) */ |
michael@0 | 3920 | sizeof(UConverterStaticData), |
michael@0 | 3921 | "ISO_2022_CN", |
michael@0 | 3922 | 0, |
michael@0 | 3923 | UCNV_IBM, |
michael@0 | 3924 | UCNV_ISO_2022, |
michael@0 | 3925 | 1, /* NOTE(review): presumably the minimum bytes per UChar, pairing with the maximum on the next line -- confirm against UConverterStaticData */ |
michael@0 | 3926 | 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ |
michael@0 | 3927 | { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */ |
michael@0 | 3928 | 1, /* NOTE(review): presumably the length of the substitution sequence above -- confirm */ |
michael@0 | 3929 | FALSE, |
michael@0 | 3930 | FALSE, |
michael@0 | 3931 | 0, |
michael@0 | 3932 | 0, |
michael@0 | 3933 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 3934 | }; |
michael@0 | 3935 | |
michael@0 | 3936 | namespace { |
michael@0 | 3937 | /* shared-data record of the ISO_2022_CN converter: ties _ISO2022CNStaticData to _ISO2022CNImpl */ |
michael@0 | 3938 | const UConverterSharedData _ISO2022CNData={ |
michael@0 | 3939 | sizeof(UConverterSharedData), |
michael@0 | 3940 | ~((uint32_t) 0), /* NOTE(review): looks like an all-ones reference counter for statically allocated data -- confirm against UConverterSharedData */ |
michael@0 | 3941 | NULL, |
michael@0 | 3942 | NULL, |
michael@0 | 3943 | &_ISO2022CNStaticData, |
michael@0 | 3944 | FALSE, |
michael@0 | 3945 | &_ISO2022CNImpl, |
michael@0 | 3946 | 0, UCNV_MBCS_TABLE_INITIALIZER |
michael@0 | 3947 | }; |
michael@0 | 3948 | |
michael@0 | 3949 | } // namespace |
michael@0 | 3950 | |
michael@0 | 3951 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |