/*
**********************************************************************
*   Copyright (C) 2000-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv2022.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000feb03
*   created by: Markus W. Scherer
*
*   Change history:
*
*   06/29/2000  helena  Major rewrite of the callback APIs.
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
*                       Changed implementation of toUnicode
*                       function
*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
*                       ucnvebdc.c
*   09/20/2000  Ram     Added support for ISO-2022-CN
*                       Added implementations for getNextUChar()
*                       for specific 2022 country variants.
*   10/31/2000  Ram     Implemented offsets logic functions
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "unicode/utf16.h"
#include "ucnv_imp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#include "cmemory.h"
#include "uassert.h"

/* Number of elements of a true array (not of a pointer), as int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * I am disabling the generic ISO-2022 converter after proposing to do so on
 * the icu mailing list two days ago.
 *
 * Reasons:
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
 *    its designation sequences, single shifts with return to the previous state,
 *    switch-with-no-return to UTF-16BE or similar, etc.
 *    This is unlike the language-specific variants like ISO-2022-JP which
 *    require a much smaller repertoire of ISO-2022 features.
 *    These variants continue to be supported.
 * 2. I believe that no one is really using the generic ISO-2022 converter
 *    but rather always one of the language-specific variants.
 *    Note that ICU's generic ISO-2022 converter has always output one escape
 *    sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 *    the previous converter is closed and a new one opened,
 *    without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 *    reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 *    This means, for example, that when ISO-8859-7 is designated, the following
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 *    The ICU ISO-2022 converter does not handle this - and has no information
 *    about which subconverter would have to be shifted vs. which is designed
 *    for 7-bit ISO-2022.
 *
 * Markus Scherer 2003-dec-03
 */
#endif

static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";

/* C0 control and space code points referenced by name in this file. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/* Inclusive Unicode range U+FF61..U+FF9F (named for halfwidth Katakana). */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 * The (c)<0x20 test comes first so that the shift count stays below 32.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * for ISO-2022-JP and -CN implementations
 *
 * Note that the JP and CN groups deliberately reuse the numeric values 1..3
 * (e.g. ISO8859_1 == GB2312_1 == 1); a value is only meaningful together with
 * the variant of the converter that uses it.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* charset mask bit for a StateEnum value; only meaningful for values 0..15 (uint16_t) */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
 */
enum { MAX_JA_VERSION=4 };
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};

typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;

typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

#define UCNV_OPTIONS_VERSION_MASK 0xf
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter state, allocated by _ISO2022Open() and stored in UConverter.extraInfo. */
typedef struct{
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* sub-charset data, indexed by StateEnum values */
    UConverter *currentConverter;   /* opened with ucnv_open() for the Korean variant */
    Cnv2022Type currentType;
    ISO2022State toU2022State, fromU2022State;
    uint32_t key;                   /* accumulated escape-sequence key, see getKey_2022() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;
    char name[30];                  /* e.g. "ISO_2022,locale=ja,version=0"; returned by _ISO2022getName() */
    char locale[3];                 /* "ja", "ko" or "cn"; empty for the generic converter */
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
* The way these state transition arrays work is:
* ex : ESC$B is the sequence for JISX208
*      a) First Iteration: char is ESC
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
*             int x = normalize_esq_chars_2022[27] which is equal to 1
*         ii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[0]
*        iii) Save this index as offset
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     b) Switch on this state and continue to next char
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
*             which is normalize_esq_chars_2022[36] == 4
*         ii) x is currently 1(from above)
*               x<<=5 -- x is now 32
*               x+=normalize_esq_chars_2022[36]
*               now x is 36
*        iii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     c) Switch on this state and continue to next char
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
*            which is normalize_esq_chars_2022[66] == 9
*        ii) x is currently 36 (from above)
*            x<<=5 -- x is now 1152
*            x+=normalize_esq_chars_2022[66]
*            now x is 1161
*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
*/


/*Below are the 3 arrays depicting a state transition table*/
/* Maps a byte to its 5-bit code inside an escape sequence; 0 means "cannot occur in one". */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
 * For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

#define MAX_STATES_2022 74
/* Sorted ascending: getKey_2022() binary-searches this table. */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/* Converter names, parallel to escSeqStateTable_Key_2022[]. */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};
michael@0: michael@0: #endif michael@0: michael@0: static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { michael@0: /* 0 1 2 3 4 5 6 7 8 9 */ michael@0: VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 michael@0: }; michael@0: michael@0: michael@0: /* Type def for refactoring changeState_2022 code*/ michael@0: typedef enum{ michael@0: #ifdef 
U_ENABLE_GENERIC_ISO_2022 michael@0: ISO_2022=0, michael@0: #endif michael@0: ISO_2022_JP=1, michael@0: ISO_2022_KR=2, michael@0: ISO_2022_CN=3 michael@0: } Variant2022; michael@0: michael@0: /*********** ISO 2022 Converter Protos ***********/ michael@0: static void michael@0: _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); michael@0: michael@0: static void michael@0: _ISO2022Close(UConverter *converter); michael@0: michael@0: static void michael@0: _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); michael@0: michael@0: static const char* michael@0: _ISO2022getName(const UConverter* cnv); michael@0: michael@0: static void michael@0: _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); michael@0: michael@0: static UConverter * michael@0: _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); michael@0: michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: static void michael@0: T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); michael@0: #endif michael@0: michael@0: namespace { michael@0: michael@0: /*const UConverterSharedData _ISO2022Data;*/ michael@0: extern const UConverterSharedData _ISO2022JPData; michael@0: extern const UConverterSharedData _ISO2022KRData; michael@0: extern const UConverterSharedData _ISO2022CNData; michael@0: michael@0: } // namespace michael@0: michael@0: /*************** Converter implementations ******************/ michael@0: michael@0: /* The purpose of this function is to get around gcc compiler warnings. 
*/ michael@0: static inline void michael@0: fromUWriteUInt8(UConverter *cnv, michael@0: const char *bytes, int32_t length, michael@0: uint8_t **target, const char *targetLimit, michael@0: int32_t **offsets, michael@0: int32_t sourceIndex, michael@0: UErrorCode *pErrorCode) michael@0: { michael@0: char *targetChars = (char *)*target; michael@0: ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, michael@0: offsets, sourceIndex, pErrorCode); michael@0: *target = (uint8_t*)targetChars; michael@0: michael@0: } michael@0: michael@0: static inline void michael@0: setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ michael@0: if(myConverterData->version == 1) { michael@0: UConverter *cnv = myConverterData->currentConverter; michael@0: michael@0: cnv->toUnicodeStatus=0; /* offset */ michael@0: cnv->mode=0; /* state */ michael@0: cnv->toULength=0; /* byteIndex */ michael@0: } michael@0: } michael@0: michael@0: static inline void michael@0: setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ michael@0: /* in ISO-2022-KR the designator sequence appears only once michael@0: * in a file so we append it only once michael@0: */ michael@0: if( converter->charErrorBufferLength==0){ michael@0: michael@0: converter->charErrorBufferLength = 4; michael@0: converter->charErrorBuffer[0] = 0x1b; michael@0: converter->charErrorBuffer[1] = 0x24; michael@0: converter->charErrorBuffer[2] = 0x29; michael@0: converter->charErrorBuffer[3] = 0x43; michael@0: } michael@0: if(myConverterData->version == 1) { michael@0: UConverter *cnv = myConverterData->currentConverter; michael@0: michael@0: cnv->fromUChar32=0; michael@0: cnv->fromUnicodeStatus=1; /* prevLength */ michael@0: } michael@0: } michael@0: michael@0: static void michael@0: _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ michael@0: michael@0: char myLocale[6]={' ',' ',' ',' ',' ',' '}; michael@0: michael@0: 
cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); michael@0: if(cnv->extraInfo != NULL) { michael@0: UConverterNamePieces stackPieces; michael@0: UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; michael@0: UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; michael@0: uint32_t version; michael@0: michael@0: stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; michael@0: michael@0: uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); michael@0: myConverterData->currentType = ASCII1; michael@0: cnv->fromUnicodeStatus =FALSE; michael@0: if(pArgs->locale){ michael@0: uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); michael@0: } michael@0: version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; michael@0: myConverterData->version = version; michael@0: if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && michael@0: (myLocale[2]=='_' || myLocale[2]=='\0')) michael@0: { michael@0: size_t len=0; michael@0: /* open the required converters and cache them */ michael@0: if(version>MAX_JA_VERSION) { michael@0: /* prevent indexing beyond jpCharsetMasks[] */ michael@0: myConverterData->version = version = 0; michael@0: } michael@0: if(jpCharsetMasks[version]&CSM(ISO8859_7)) { michael@0: myConverterData->myConverterArray[ISO8859_7] = michael@0: ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); michael@0: } michael@0: myConverterData->myConverterArray[JISX208] = michael@0: ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); michael@0: if(jpCharsetMasks[version]&CSM(JISX212)) { michael@0: myConverterData->myConverterArray[JISX212] = michael@0: ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); michael@0: } michael@0: if(jpCharsetMasks[version]&CSM(GB2312)) { michael@0: myConverterData->myConverterArray[GB2312] = michael@0: ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ michael@0: } michael@0: 
if(jpCharsetMasks[version]&CSM(KSC5601)) { michael@0: myConverterData->myConverterArray[KSC5601] = michael@0: ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); michael@0: } michael@0: michael@0: /* set the function pointers to appropriate funtions */ michael@0: cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); michael@0: uprv_strcpy(myConverterData->locale,"ja"); michael@0: michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); michael@0: len = uprv_strlen(myConverterData->name); michael@0: myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); michael@0: myConverterData->name[len+1]='\0'; michael@0: } michael@0: else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && michael@0: (myLocale[2]=='_' || myLocale[2]=='\0')) michael@0: { michael@0: const char *cnvName; michael@0: if(version==1) { michael@0: cnvName="icu-internal-25546"; michael@0: } else { michael@0: cnvName="ibm-949"; michael@0: myConverterData->version=version=0; michael@0: } michael@0: if(pArgs->onlyTestIsLoadable) { michael@0: ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ michael@0: uprv_free(cnv->extraInfo); michael@0: cnv->extraInfo=NULL; michael@0: return; michael@0: } else { michael@0: myConverterData->currentConverter=ucnv_open(cnvName, errorCode); michael@0: if (U_FAILURE(*errorCode)) { michael@0: _ISO2022Close(cnv); michael@0: return; michael@0: } michael@0: michael@0: if(version==1) { michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); michael@0: uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); michael@0: cnv->subCharLen = myConverterData->currentConverter->subCharLen; michael@0: }else{ michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); michael@0: } michael@0: michael@0: /* initialize the state variables */ michael@0: setInitialStateToUnicodeKR(cnv, myConverterData); michael@0: 
setInitialStateFromUnicodeKR(cnv, myConverterData); michael@0: michael@0: /* set the function pointers to appropriate funtions */ michael@0: cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; michael@0: uprv_strcpy(myConverterData->locale,"ko"); michael@0: } michael@0: } michael@0: else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& michael@0: (myLocale[2]=='_' || myLocale[2]=='\0')) michael@0: { michael@0: michael@0: /* open the required converters and cache them */ michael@0: myConverterData->myConverterArray[GB2312_1] = michael@0: ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); michael@0: if(version==1) { michael@0: myConverterData->myConverterArray[ISO_IR_165] = michael@0: ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); michael@0: } michael@0: myConverterData->myConverterArray[CNS_11643] = michael@0: ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); michael@0: michael@0: michael@0: /* set the function pointers to appropriate funtions */ michael@0: cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; michael@0: uprv_strcpy(myConverterData->locale,"cn"); michael@0: michael@0: if (version==0){ michael@0: myConverterData->version = 0; michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); michael@0: }else if (version==1){ michael@0: myConverterData->version = 1; michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); michael@0: }else { michael@0: myConverterData->version = 2; michael@0: (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); michael@0: } michael@0: } michael@0: else{ michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: myConverterData->isFirstBuffer = TRUE; michael@0: michael@0: /* append the UTF-8 escape sequence */ michael@0: cnv->charErrorBufferLength = 3; michael@0: cnv->charErrorBuffer[0] = 0x1b; michael@0: cnv->charErrorBuffer[1] = 0x25; michael@0: 
cnv->charErrorBuffer[2] = 0x42; michael@0: michael@0: cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; michael@0: /* initialize the state variables */ michael@0: uprv_strcpy(myConverterData->name,"ISO_2022"); michael@0: #else michael@0: *errorCode = U_UNSUPPORTED_ERROR; michael@0: return; michael@0: #endif michael@0: } michael@0: michael@0: cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; michael@0: michael@0: if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { michael@0: _ISO2022Close(cnv); michael@0: } michael@0: } else { michael@0: *errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: michael@0: static void michael@0: _ISO2022Close(UConverter *converter) { michael@0: UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); michael@0: UConverterSharedData **array = myData->myConverterArray; michael@0: int32_t i; michael@0: michael@0: if (converter->extraInfo != NULL) { michael@0: /*close the array of converter pointers and free the memory*/ michael@0: for (i=0; icurrentConverter); michael@0: michael@0: if(!converter->isExtraLocal){ michael@0: uprv_free (converter->extraInfo); michael@0: converter->extraInfo = NULL; michael@0: } michael@0: } michael@0: } michael@0: michael@0: static void michael@0: _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { michael@0: UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); michael@0: myConverterData->key = 0; michael@0: myConverterData->isEmptySegment = FALSE; michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE) { michael@0: uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); michael@0: } michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: if(myConverterData->locale[0] == 0){ michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: 
myConverterData->isFirstBuffer = TRUE; michael@0: myConverterData->key = 0; michael@0: if (converter->mode == UCNV_SO){ michael@0: ucnv_close (myConverterData->currentConverter); michael@0: myConverterData->currentConverter=NULL; michael@0: } michael@0: converter->mode = UCNV_SI; michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE) { michael@0: /* re-append UTF-8 escape sequence */ michael@0: converter->charErrorBufferLength = 3; michael@0: converter->charErrorBuffer[0] = 0x1b; michael@0: converter->charErrorBuffer[1] = 0x28; michael@0: converter->charErrorBuffer[2] = 0x42; michael@0: } michael@0: } michael@0: else michael@0: #endif michael@0: { michael@0: /* reset the state variables */ michael@0: if(myConverterData->locale[0] == 'k'){ michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: setInitialStateToUnicodeKR(converter, myConverterData); michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE) { michael@0: setInitialStateFromUnicodeKR(converter, myConverterData); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: static const char* michael@0: _ISO2022getName(const UConverter* cnv){ michael@0: if(cnv->extraInfo){ michael@0: UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; michael@0: return myData->name; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: michael@0: /*************** to unicode *******************/ michael@0: /**************************************************************************** michael@0: * Recognized escape sequences are michael@0: * (B ASCII michael@0: * .A ISO-8859-1 michael@0: * .F ISO-8859-7 michael@0: * (J JISX-201 michael@0: * (I JISX-201 michael@0: * $B JISX-208 michael@0: * $@ JISX-208 michael@0: * $(D JISX-212 michael@0: * $A GB2312 michael@0: * $(C KSC5601 michael@0: */ michael@0: static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { michael@0: /* 0 1 2 3 4 5 6 7 8 9 */ michael@0: INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: }; michael@0: michael@0: /*************** to unicode *******************/ michael@0: static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { michael@0: /* 0 1 2 3 4 5 6 7 8 9 */ michael@0: INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 michael@0: ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE michael@0: }; michael@0: michael@0: michael@0: /* Looks up one more escape sequence byte. The byte is normalized through normalize_esq_chars_2022[] and folded into the running key as (*key << 5) + normalized value; the result is binary-searched in escSeqStateTable_Key_2022[]. On a match, *key and *offset receive the new key and its table index and the matching escSeqStateTable_Value_2022[] entry is returned. Otherwise *key and *offset are set to 0 and INVALID_2022 is returned. */ static UCNV_TableStates_2022 michael@0: getKey_2022(char c,int32_t* key,int32_t* offset){ michael@0: int32_t togo; michael@0: int32_t low = 0; michael@0: int32_t hi = MAX_STATES_2022; michael@0: int32_t oldmid=0; michael@0: michael@0: togo = normalize_esq_chars_2022[(uint8_t)c]; michael@0: if(togo == 0) { michael@0: /* not a valid character anywhere in an escape sequence */ michael@0: *key = 0; michael@0: *offset = 0; michael@0: return INVALID_2022; michael@0: } michael@0: togo = (*key << 5) + togo; michael@0: michael@0: while (hi != low) /*binary search*/{ michael@0: michael@0: register int32_t mid = (hi+low) >> 1; /*Finds median*/ michael@0: michael@0: /* low is set to mid (not mid+1), so the interval can stop shrinking; a repeated midpoint means that togo is not in the table */ if (mid == oldmid) michael@0: break; michael@0: michael@0: if (escSeqStateTable_Key_2022[mid] > togo){ michael@0: hi = mid; michael@0: } michael@0: else if (escSeqStateTable_Key_2022[mid] < togo){ michael@0: low = mid; michael@0: } michael@0: else /*we found it*/{ michael@0: *key = togo; michael@0: *offset = mid; michael@0: return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; michael@0: } michael@0: oldmid = mid; michael@0: michael@0: } michael@0: michael@0: *key = 0; michael@0: *offset = 0; michael@0: return INVALID_2022; michael@0: } michael@0: michael@0: /* Runs through a state machine to determine the escape sequence - codepage correspondence. michael@0: * Consumes bytes from *source, recording each one in _this->toUBytes, until getKey_2022() michael@0: * reports a terminal or invalid sequence or the input ends. An incomplete sequence leaves michael@0: * key!=0 in the converter data so that the next call continues it. A complete sequence michael@0: * updates the to-Unicode state for the given variant. Failures are reported through *err as michael@0: * U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE. michael@0: */ michael@0: static void michael@0: changeState_2022(UConverter* _this, michael@0: const char** source, michael@0: const char* sourceLimit, michael@0: Variant2022 var, michael@0:
UErrorCode* err){ michael@0: UCNV_TableStates_2022 value; michael@0: UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); michael@0: uint32_t key = myData2022->key; michael@0: int32_t offset = 0; michael@0: int8_t initialToULength = _this->toULength; /* bytes already buffered in toUBytes on entry; used by the back-out logic at the end */ michael@0: char c; michael@0: michael@0: value = VALID_NON_TERMINAL_2022; michael@0: while (*source < sourceLimit) { michael@0: c = *(*source)++; michael@0: _this->toUBytes[_this->toULength++]=(uint8_t)c; /* keep the raw bytes for error reporting and replay */ michael@0: value = getKey_2022(c,(int32_t *) &key, &offset); michael@0: michael@0: switch (value){ michael@0: michael@0: case VALID_NON_TERMINAL_2022 : michael@0: /* continue with the loop */ michael@0: break; michael@0: michael@0: case VALID_TERMINAL_2022: michael@0: key = 0; michael@0: goto DONE; michael@0: michael@0: case INVALID_2022: michael@0: goto DONE; michael@0: michael@0: case VALID_MAYBE_TERMINAL_2022: michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: /* ESC ( B is ambiguous only for ISO_2022 itself */ michael@0: if(var == ISO_2022) { michael@0: /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ michael@0: _this->toULength = 0; michael@0: michael@0: /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ michael@0: michael@0: /* continue with the loop */ michael@0: value = VALID_NON_TERMINAL_2022; michael@0: break; michael@0: } else michael@0: #endif michael@0: { michael@0: /* not ISO_2022 itself, finish here */ michael@0: value = VALID_TERMINAL_2022; michael@0: key = 0; michael@0: goto DONE; michael@0: } michael@0: } michael@0: } michael@0: michael@0: DONE: michael@0: myData2022->key = key; /* 0 after a complete or rejected sequence (getKey_2022() zeroes it on failure), non-zero while the sequence is incomplete */ michael@0: michael@0: if (value == VALID_NON_TERMINAL_2022) { michael@0: /* indicate that the escape sequence is incomplete: key!=0 */ michael@0: return; michael@0: } else if (value == INVALID_2022 ) { michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: } else /* value ==
VALID_TERMINAL_2022 */ { michael@0: switch(var){ michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: case ISO_2022: michael@0: { michael@0: const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; michael@0: if(chosenConverterName == NULL) { michael@0: /* SS2 or SS3 */ michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: _this->toUCallbackReason = UCNV_UNASSIGNED; michael@0: return; michael@0: } michael@0: michael@0: _this->mode = UCNV_SI; michael@0: ucnv_close(myData2022->currentConverter); michael@0: /* NOTE(review): no declaration of myUConverter is visible in this function; this branch is only compiled with U_ENABLE_GENERIC_ISO_2022 */ myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); michael@0: if(U_SUCCESS(*err)) { michael@0: myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; michael@0: _this->mode = UCNV_SO; michael@0: } michael@0: break; michael@0: } michael@0: #endif michael@0: case ISO_2022_JP: michael@0: { michael@0: StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; michael@0: switch(tempState) { michael@0: case INVALID_STATE: michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: break; michael@0: case SS2_STATE: michael@0: if(myData2022->toU2022State.cs[2]!=0) { michael@0: /* save the current G0/G1 state; presumably restored after the single-shifted character (the restore is outside this function) */ if(myData2022->toU2022State.g<2) { michael@0: myData2022->toU2022State.prevG=myData2022->toU2022State.g; michael@0: } michael@0: myData2022->toU2022State.g=2; michael@0: } else { michael@0: /* illegal to have SS2 before a matching designator */ michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: } michael@0: break; michael@0: /* case SS3_STATE: not used in ISO-2022-JP-x */ michael@0: case ISO8859_1: michael@0: case ISO8859_7: michael@0: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: } else { michael@0: /* G2 charset for SS2 */ michael@0: myData2022->toU2022State.cs[2]=(int8_t)tempState; michael@0: } michael@0: break; michael@0: default: michael@0: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { michael@0: *err =
U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: } else { michael@0: /* G0 charset */ michael@0: myData2022->toU2022State.cs[0]=(int8_t)tempState; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: break; michael@0: case ISO_2022_CN: michael@0: { michael@0: StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; michael@0: switch(tempState) { michael@0: case INVALID_STATE: michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: break; michael@0: case SS2_STATE: michael@0: if(myData2022->toU2022State.cs[2]!=0) { michael@0: if(myData2022->toU2022State.g<2) { michael@0: myData2022->toU2022State.prevG=myData2022->toU2022State.g; michael@0: } michael@0: myData2022->toU2022State.g=2; michael@0: } else { michael@0: /* illegal to have SS2 before a matching designator */ michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: } michael@0: break; michael@0: case SS3_STATE: michael@0: if(myData2022->toU2022State.cs[3]!=0) { michael@0: if(myData2022->toU2022State.g<2) { michael@0: myData2022->toU2022State.prevG=myData2022->toU2022State.g; michael@0: } michael@0: myData2022->toU2022State.g=3; michael@0: } else { michael@0: /* illegal to have SS3 before a matching designator */ michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: } michael@0: break; michael@0: case ISO_IR_165: michael@0: if(myData2022->version==0) { michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: break; michael@0: } michael@0: /*fall through*/ michael@0: case GB2312_1: michael@0: /*fall through*/ michael@0: case CNS_11643_1: michael@0: /* these three are stored in cs[1] */ myData2022->toU2022State.cs[1]=(int8_t)tempState; michael@0: break; michael@0: case CNS_11643_2: michael@0: /* stored in cs[2], the slot that SS2_STATE checks */ myData2022->toU2022State.cs[2]=(int8_t)tempState; michael@0: break; michael@0: default: michael@0: /* other CNS 11643 planes: stored in cs[3], the slot that SS3_STATE checks; rejected for version 0 */ michael@0: if(myData2022->version==0) { michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: } else { michael@0: myData2022->toU2022State.cs[3]=(int8_t)tempState; michael@0: } michael@0: break; michael@0: }
michael@0: } michael@0: break; michael@0: case ISO_2022_KR: michael@0: if(offset==0x30){ michael@0: /* nothing to be done, just accept this one escape sequence */ michael@0: } else { michael@0: *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; michael@0: } michael@0: break; michael@0: michael@0: default: michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: break; michael@0: } michael@0: } michael@0: if(U_SUCCESS(*err)) { michael@0: /* the sequence was handled completely: forget the recorded bytes */ _this->toULength = 0; michael@0: } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { michael@0: if(_this->toULength>1) { michael@0: /* michael@0: * Ticket 5691: consistent illegal sequences: michael@0: * - We include at least the first byte (ESC) in the illegal sequence. michael@0: * - If any of the non-initial bytes could be the start of a character, michael@0: * we stop the illegal sequence before the first one of those. michael@0: * In escape sequences, all following bytes are "printable", that is, michael@0: * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), michael@0: * they are valid single/lead bytes. michael@0: * For simplicity, we always only report the initial ESC byte as the michael@0: * illegal sequence and back out all other bytes we looked at. michael@0: */ michael@0: /* Back out some bytes. */ michael@0: int8_t backOutDistance=_this->toULength-1; michael@0: int8_t bytesFromThisBuffer=_this->toULength-initialToULength; michael@0: if(backOutDistance<=bytesFromThisBuffer) { michael@0: /* same as initialToULength<=1 */ michael@0: *source-=backOutDistance; michael@0: } else { michael@0: /* Back out bytes from the previous buffer: Need to replay them. */ michael@0: _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); michael@0: /* same as -(initialToULength-1) */ michael@0: /* preToULength is negative!
*/ michael@0: uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); michael@0: *source-=bytesFromThisBuffer; michael@0: } michael@0: _this->toULength=1; michael@0: } michael@0: } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { michael@0: _this->toUCallbackReason = UCNV_UNASSIGNED; michael@0: } michael@0: } michael@0: michael@0: /*Checks the characters of the buffer against valid 2022 escape sequences. michael@0: *If they match, we return a pointer to the start of the sequence; otherwise michael@0: *we return sourceLimit. michael@0: */ michael@0: /*For 2022: looks ahead in the stream michael@0: *to determine the longest possible convertible michael@0: *data stream. michael@0: *Without U_ENABLE_GENERIC_ISO_2022 this only scans for the next ESC_2022 byte. michael@0: */ michael@0: static inline const char* michael@0: getEndOfBuffer_2022(const char** source, michael@0: const char* sourceLimit, michael@0: UBool /*flush*/){ michael@0: michael@0: const char* mySource = *source; michael@0: michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: if (*source >= sourceLimit) michael@0: return sourceLimit; michael@0: michael@0: do{ michael@0: michael@0: if (*mySource == ESC_2022){ michael@0: int8_t i; michael@0: int32_t key = 0; michael@0: int32_t offset; michael@0: UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; michael@0: michael@0: /* Kludge: I could not michael@0: * figure out the reason for validating an escape sequence michael@0: * twice - once here and once in changeState_2022(). michael@0: * is it possible to have an ESC character in a ISO2022 michael@0: * byte stream which is valid in a code page? Is it legal?
michael@0: */ michael@0: for (i=0; michael@0: (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); michael@0: i++) { michael@0: value = getKey_2022(*(mySource+i), &key, &offset); michael@0: } michael@0: /* NOTE(review): *mySource==ESC_2022 already holds inside this branch, so this condition is always true and the check after it cannot be reached */ if (value > 0 || *mySource==ESC_2022) michael@0: return mySource; michael@0: michael@0: /* NOTE(review): flush is not declared here because the parameter name is commented out in the signature */ if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) michael@0: return sourceLimit; michael@0: } michael@0: }while (++mySource < sourceLimit); michael@0: michael@0: return sourceLimit; michael@0: #else michael@0: /* generic converter disabled: stop at the next ESC byte, or at sourceLimit if there is none */ while(mySource < sourceLimit && *mySource != ESC_2022) { michael@0: ++mySource; michael@0: } michael@0: return mySource; michael@0: #endif michael@0: } michael@0: michael@0: michael@0: /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c michael@0: * any future change in _MBCSFromUChar32() function should be reflected here. michael@0: * @param value receives the result bytes packed into one integer michael@0: * @param outputType MBCS_OUTPUT_2 selects the 2-byte table lookup; any other value is michael@0: * handled as MBCS_OUTPUT_3 michael@0: * @return number of bytes in *value; negative number if fallback; 0 if no mapping michael@0: */ michael@0: static inline int32_t michael@0: MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, michael@0: UChar32 c, michael@0: uint32_t* value, michael@0: UBool useFallback, michael@0: int outputType) michael@0: { michael@0: const int32_t *cx; michael@0: const uint16_t *table; michael@0: uint32_t stage2Entry; michael@0: uint32_t myValue; michael@0: int32_t length; michael@0: const uint8_t *p; michael@0: /* michael@0: * TODO(markus): Use and require new, faster MBCS conversion table structures. michael@0: * Use internal version of ucnv_open() that verifies that the new structures are available, michael@0: * else U_INTERNAL_PROGRAM_ERROR.
michael@0: */ michael@0: /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ michael@0: if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { michael@0: table=sharedData->mbcs.fromUnicodeTable; michael@0: stage2Entry=MBCS_STAGE_2_FROM_U(table, c); michael@0: /* get the bytes and the length for the output */ michael@0: if(outputType==MBCS_OUTPUT_2){ michael@0: myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); michael@0: if(myValue<=0xff) { michael@0: length=1; michael@0: } else { michael@0: length=2; michael@0: } michael@0: } else /* outputType==MBCS_OUTPUT_3 */ { michael@0: p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); michael@0: myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; michael@0: if(myValue<=0xff) { michael@0: length=1; michael@0: } else if(myValue<=0xffff) { michael@0: length=2; michael@0: } else { michael@0: length=3; michael@0: } michael@0: } michael@0: /* is this code point assigned, or do we use fallbacks? */ michael@0: /* bit 16+(c&0xf) of the stage 2 entry is the flag tested for c */ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { michael@0: /* assigned */ michael@0: *value=myValue; michael@0: return length; michael@0: } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { michael@0: /* michael@0: * We allow a 0 byte output if the "assigned" bit is set for this entry. michael@0: * There is no way with this data structure for fallback output michael@0: * to be a zero byte.
michael@0: */ michael@0: *value=myValue; michael@0: return -length; michael@0: } michael@0: } michael@0: michael@0: /* no result from the base table: try the extension table, if the converter has one */ cx=sharedData->mbcs.extIndexes; michael@0: if(cx!=NULL) { michael@0: return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); michael@0: } michael@0: michael@0: /* unassigned */ michael@0: return 0; michael@0: } michael@0: michael@0: /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c michael@0: * any future change in _MBCSSingleFromUChar32() function should be reflected here. michael@0: * @param retval pointer to output byte; it is written whenever the table is consulted, michael@0: * even if the return value is 0, but not on the early return for michael@0: * supplementary code points michael@0: * @return 1: roundtrip byte; 0: no mapping; -1: fallback byte michael@0: */ michael@0: static inline int32_t michael@0: MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, michael@0: UChar32 c, michael@0: uint32_t* retval, michael@0: UBool useFallback) michael@0: { michael@0: const uint16_t *table; michael@0: int32_t value; michael@0: /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ michael@0: if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { michael@0: return 0; michael@0: } michael@0: /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ michael@0: table=sharedData->mbcs.fromUnicodeTable; michael@0: /* get the byte for the output */ michael@0: value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); michael@0: /* is this code point assigned, or do we use fallbacks? */ michael@0: *retval=(uint32_t)(value&0xff); michael@0: if(value>=0xf00) { michael@0: return 1; /* roundtrip */ michael@0: } else if(useFallback ?
value>=0x800 : value>=0xc00) { michael@0: return -1; /* fallback taken */ michael@0: } else { michael@0: return 0; /* no mapping */ michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Check that the result is a 2-byte value with each byte in the range A1..FE michael@0: * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte michael@0: * to move it to the ISO 2022 range 21..7E. michael@0: * Return 0 if out of range. michael@0: * michael@0: * NOTE(review): both range tests work on truncated values, so only the low michael@0: * 16 bits are checked; callers are presumably expected to pass 2-byte michael@0: * results only -- confirm at the call sites. michael@0: */ michael@0: static inline uint32_t michael@0: _2022FromGR94DBCS(uint32_t value) { michael@0: if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && /* low 16 bits within A1A1..FEFE (unsigned wrap-around range check) */ michael@0: (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) /* trail byte within A1..FE */ michael@0: ) { michael@0: return value - 0x8080; /* shift down to 21..7e byte range */ michael@0: } else { michael@0: return 0; /* not valid for ISO 2022 */ michael@0: } michael@0: } michael@0: michael@0: #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ michael@0: /* michael@0: * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the michael@0: * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point michael@0: * unchanged.
michael@0: */ michael@0: static inline uint32_t michael@0: _2022ToGR94DBCS(uint32_t value) { michael@0: uint32_t returnValue = value + 0x8080; michael@0: if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && michael@0: (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { michael@0: return returnValue; michael@0: } else { michael@0: return value; michael@0: } michael@0: } michael@0: #endif michael@0: michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: michael@0: /********************************************************************************** michael@0: * ISO-2022 Converter (generic; only compiled with U_ENABLE_GENERIC_ISO_2022) michael@0: * michael@0: * The toUnicode function below converts each run of bytes in front of the next ESC michael@0: * with the currently selected sub-converter (ASCII is opened if none has been michael@0: * selected yet) and then hands the escape sequence to changeState_2022(). michael@0: * It returns early on errors, on target overflow, at the end of the input, and michael@0: * whenever offsets are requested and input or output has advanced. michael@0: */ michael@0: michael@0: static void michael@0: T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, michael@0: UErrorCode* err){ michael@0: const char* mySourceLimit, *realSourceLimit; michael@0: const char* sourceStart; michael@0: const UChar* myTargetStart; michael@0: UConverter* saveThis; michael@0: UConverterDataISO2022* myData; michael@0: int8_t length; michael@0: michael@0: saveThis = args->converter; michael@0: myData=((UConverterDataISO2022*)(saveThis->extraInfo)); michael@0: michael@0: realSourceLimit = args->sourceLimit; michael@0: while (args->source < realSourceLimit) { michael@0: if(myData->key == 0) { /* are we in the middle of an escape sequence?
*/ michael@0: /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ michael@0: mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); michael@0: michael@0: if(args->source < mySourceLimit) { michael@0: if(myData->currentConverter==NULL) { michael@0: myData->currentConverter = ucnv_open("ASCII",err); michael@0: if(U_FAILURE(*err)){ michael@0: return; michael@0: } michael@0: michael@0: myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; michael@0: saveThis->mode = UCNV_SO; michael@0: } michael@0: michael@0: /* convert to before the ESC or until the end of the buffer */ michael@0: myData->isFirstBuffer=FALSE; michael@0: sourceStart = args->source; michael@0: myTargetStart = args->target; michael@0: args->converter = myData->currentConverter; michael@0: /* the flush flag is passed on only when this run reaches the real end of the input */ ucnv_toUnicode(args->converter, michael@0: &args->target, michael@0: args->targetLimit, michael@0: &args->source, michael@0: mySourceLimit, michael@0: args->offsets, michael@0: (UBool)(args->flush && mySourceLimit == realSourceLimit), michael@0: err); michael@0: args->converter = saveThis; michael@0: michael@0: if (*err == U_BUFFER_OVERFLOW_ERROR) { michael@0: /* move the overflow buffer */ michael@0: length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; michael@0: myData->currentConverter->UCharErrorBufferLength = 0; michael@0: if(length > 0) { michael@0: uprv_memcpy(saveThis->UCharErrorBuffer, michael@0: myData->currentConverter->UCharErrorBuffer, michael@0: length*U_SIZEOF_UCHAR); michael@0: } michael@0: return; michael@0: } michael@0: michael@0: /* michael@0: * At least one of: michael@0: * -Error while converting michael@0: * -Done with entire buffer michael@0: * -Need to write offsets or update the current offset michael@0: * (leave that up to the code in ucnv.c) michael@0: * michael@0: * or else we just stopped at an ESC byte and continue with changeState_2022() michael@0: */ michael@0: if (U_FAILURE(*err)
|| michael@0: (args->source == realSourceLimit) || michael@0: (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || michael@0: (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) michael@0: ) { michael@0: /* copy partial or error input for truncated detection and error handling */ michael@0: if(U_FAILURE(*err)) { michael@0: length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; michael@0: if(length > 0) { michael@0: uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); michael@0: } michael@0: } else { michael@0: length = saveThis->toULength = myData->currentConverter->toULength; michael@0: if(length > 0) { michael@0: uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); michael@0: if(args->source < mySourceLimit) { michael@0: *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ michael@0: } michael@0: } michael@0: } michael@0: return; michael@0: } michael@0: } michael@0: } michael@0: michael@0: sourceStart = args->source; michael@0: changeState_2022(args->converter, michael@0: &(args->source), michael@0: realSourceLimit, michael@0: ISO_2022, michael@0: err); michael@0: if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { michael@0: /* let the ucnv.c code update its current offset */ michael@0: return; michael@0: } michael@0: } michael@0: } michael@0: michael@0: #endif michael@0: michael@0: /* michael@0: * To Unicode Callback helper function: michael@0: * stores sourceChar (two bytes if it is greater than 0xff, else one byte) in michael@0: * cnv->toUBytes and sets *err to U_INVALID_CHAR_FOUND if targetUniChar is michael@0: * missingCharMarker-1 (0xfffe), otherwise to U_ILLEGAL_CHAR_FOUND. michael@0: */ michael@0: static void michael@0: toUnicodeCallback(UConverter *cnv, michael@0: const uint32_t sourceChar, const uint32_t targetUniChar, michael@0: UErrorCode* err){ michael@0: if(sourceChar>0xff){ michael@0: cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); michael@0: cnv->toUBytes[1] = (uint8_t)sourceChar; michael@0: cnv->toULength = 2; michael@0: } michael@0: else{ michael@0: cnv->toUBytes[0] =(char) sourceChar; michael@0:
cnv->toULength = 1; michael@0: } michael@0: michael@0: if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: } michael@0: else{ michael@0: *err = U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } michael@0: michael@0: /**************************************ISO-2022-JP*************************************************/ michael@0: michael@0: /************************************** IMPORTANT ************************************************** michael@0: * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and michael@0: * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). michael@0: * The converter iterates over each Unicode codepoint michael@0: * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is michael@0: * processed one char at a time it would make sense to reduce the extra processing a canned converter michael@0: * would do as far as possible. michael@0: * michael@0: * If the implementation of these macros or structure of sharedData struct change in the future, make michael@0: * sure that ISO-2022 is also changed.
michael@0: *************************************************************************************************** michael@0: */ michael@0: michael@0: /*************************************************************************************************** michael@0: * Rules for ISO-2022-jp encoding michael@0: * (i) Escape sequences must be fully contained within a line; they should not michael@0: * span new lines or CRs michael@0: * (ii) If the last character on a line is represented by two bytes then an ASCII or michael@0: * JIS-Roman character escape sequence should follow before the line terminates michael@0: * (iii) If the first character on the line is represented by two bytes then a two michael@0: * byte character escape sequence should precede it michael@0: * (iv) If no escape sequence is encountered then the characters are ASCII michael@0: * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, michael@0: * and invoked with SS2 (ESC N). michael@0: * (vi) If there is any G0 designation in text, there must be a switch to michael@0: * ASCII or to JIS X 0201-Roman before a space character (but not michael@0: * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control michael@0: * characters such as tab or CRLF.
michael@0: * (vii) Supported encodings: michael@0: * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 michael@0: * michael@0: * source : RFC-1554 michael@0: * michael@0: * JISX201, JISX208,JISX212 : new .cnv data files created michael@0: * KSC5601 : alias to ibm-949 mapping table michael@0: * GB2312 : alias to ibm-1386 mapping table michael@0: * ISO-8859-1 : Algorithmically implemented as LATIN1 case michael@0: * ISO-8859-7 : alias to ibm-9409 mapping table michael@0: */ michael@0: michael@0: /* preference order of JP charsets */ michael@0: static const StateEnum jpCharsetPref[]={ michael@0: ASCII, michael@0: JISX201, michael@0: ISO8859_1, michael@0: ISO8859_7, michael@0: JISX208, michael@0: JISX212, michael@0: GB2312, michael@0: KSC5601, michael@0: HWKANA_7BIT michael@0: }; michael@0: michael@0: /* michael@0: * The escape sequences must be in order of the enum constants like JISX201 = 3, michael@0: * not in order of jpCharsetPref[]! michael@0: */ michael@0: static const char escSeqChars[][6] ={ michael@0: "\x1B\x28\x42", /* (B ASCII */ michael@0: "\x1B\x2E\x41", /* .A ISO-8859-1 */ michael@0: "\x1B\x2E\x46", /* .F ISO-8859-7 */ michael@0: "\x1B\x28\x4A", /* (J JISX-201 */ michael@0: "\x1B\x24\x42", /* $B JISX-208 */ michael@0: "\x1B\x24\x28\x44", /* $(D JISX-212 */ michael@0: "\x1B\x24\x41", /* $A GB2312 */ michael@0: "\x1B\x24\x28\x43", /* $(C KSC5601 */ michael@0: "\x1B\x28\x49" /* (I HWKANA_7BIT */ michael@0: michael@0: }; michael@0: /* Byte lengths of the escSeqChars[] entries, in the same order. */ static const int8_t escSeqCharsLen[] ={ michael@0: 3, /* length of (B ASCII */ michael@0: 3, /* length of .A ISO-8859-1 */ michael@0: 3, /* length of .F ISO-8859-7 */ michael@0: 3, /* length of (J JISX-201 */ michael@0: 3, /* length of $B JISX-208 */ michael@0: 4, /* length of $(D JISX-212 */ michael@0: 3, /* length of $A GB2312 */ michael@0: 4, /* length of $(C KSC5601 */ michael@0: 3 /* length of (I HWKANA_7BIT */ michael@0: }; michael@0: michael@0: /* michael@0: * The iteration over various code pages works
this way: michael@0: * i) Get the currentState from myConverterData->currentState michael@0: * ii) Check if the character is mapped to a valid character in the currentState michael@0: * Yes -> a) set the initIterState to currentState michael@0: * b) remain in this state until an invalid character is found michael@0: * No -> a) go to the next code page and find the character michael@0: * iii) Before changing the state, increment the current state and check if the current state michael@0: * is equal to the initIterState michael@0: * Yes -> A character that cannot be represented in any of the supported encodings michael@0: * break and return a U_INVALID_CHARACTER error michael@0: * No -> Continue and find the character in next code page michael@0: * michael@0: * michael@0: * TODO: Implement a priority technique where the users are allowed to set the priority of code pages michael@0: */ michael@0: michael@0: /* Map 00..7F to Unicode according to JIS X 0201: 0x5c becomes U+00A5 and 0x7e becomes U+203E; every other value is returned unchanged. */ michael@0: static inline uint32_t michael@0: jisx201ToU(uint32_t value) { michael@0: if(value < 0x5c) { michael@0: return value; michael@0: } else if(value == 0x5c) { michael@0: return 0xa5; michael@0: } else if(value == 0x7e) { michael@0: return 0x203e; michael@0: } else /* value <= 0x7f */ { michael@0: return value; michael@0: } michael@0: } michael@0: michael@0: /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. U+00A5 and U+203E map to 0x5c and 0x7e; U+005C and U+007E themselves are unmappable. */ michael@0: static inline uint32_t michael@0: jisx201FromU(uint32_t value) { michael@0: if(value<=0x7f) { michael@0: if(value!=0x5c && value!=0x7e) { michael@0: return value; michael@0: } michael@0: } else if(value==0xa5) { michael@0: return 0x5c; michael@0: } else if(value==0x203e) { michael@0: return 0x7e; michael@0: } michael@0: return 0xfffe; michael@0: } michael@0: michael@0: /* michael@0: * Take a valid Shift-JIS byte pair, check that it is in the range corresponding michael@0: * to JIS X 0208, and convert it to a pair of 21..7E bytes.
michael@0: * Return 0 if the byte pair is out of range. michael@0: */ michael@0: static inline uint32_t michael@0: _2022FromSJIS(uint32_t value) { michael@0: uint8_t trail; michael@0: michael@0: if(value > 0xEFFC) { michael@0: return 0; /* beyond JIS X 0208 */ michael@0: } michael@0: michael@0: trail = (uint8_t)value; michael@0: michael@0: value &= 0xff00; /* lead byte */ michael@0: /* fold the two lead byte ranges (up to 0x9f, and 0xe0..0xef) into one contiguous range ending at 0x3f */ if(value <= 0x9f00) { michael@0: value -= 0x7000; michael@0: } else /* 0xe000 <= value <= 0xef00 */ { michael@0: value -= 0xb000; michael@0: } michael@0: value <<= 1; /* each Shift-JIS lead byte covers two consecutive result lead bytes */ michael@0: michael@0: if(trail <= 0x9e) { michael@0: value -= 0x100; /* first (odd) lead byte of the pair */ michael@0: if(trail <= 0x7e) { michael@0: value |= trail - 0x1f; michael@0: } else { michael@0: value |= trail - 0x20; /* trail bytes from 0x80 continue right after the ones up to 0x7e */ michael@0: } michael@0: } else /* trail <= 0xfc */ { michael@0: value |= trail - 0x7e; /* second (even) lead byte of the pair */ michael@0: } michael@0: return value; michael@0: } michael@0: michael@0: /* michael@0: * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. michael@0: * If either byte is outside 21..7E make sure that the result is not valid michael@0: * for Shift-JIS so that the converter catches it. michael@0: * Some invalid byte values already turn into equally invalid Shift-JIS michael@0: * byte values and need not be tested explicitly.
michael@0: */ michael@0: static inline void michael@0: _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { michael@0: if(c1&1) { michael@0: /* odd lead byte: the trail byte goes to the lower Shift-JIS trail range, skipping 0x7f */ ++c1; michael@0: if(c2 <= 0x5f) { michael@0: c2 += 0x1f; michael@0: } else if(c2 <= 0x7e) { michael@0: c2 += 0x20; michael@0: } else { michael@0: c2 = 0; /* invalid */ michael@0: } michael@0: } else { michael@0: /* even lead byte: the trail byte goes to the upper Shift-JIS trail range 9F..FC */ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { michael@0: c2 += 0x7e; michael@0: } else { michael@0: c2 = 0; /* invalid */ michael@0: } michael@0: } michael@0: c1 >>= 1; /* two consecutive input lead bytes share one Shift-JIS lead byte */ michael@0: if(c1 <= 0x2f) { michael@0: c1 += 0x70; /* results up to 0x9f */ michael@0: } else if(c1 <= 0x3f) { michael@0: c1 += 0xb0; /* results 0xe0..0xef */ michael@0: } else { michael@0: c1 = 0; /* invalid */ michael@0: } michael@0: bytes[0] = (char)c1; michael@0: bytes[1] = (char)c2; michael@0: } michael@0: michael@0: /* michael@0: * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) michael@0: * Katakana. michael@0: * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks michael@0: * because Shift-JIS roundtrips half-width Katakana to single bytes. michael@0: * These were the only fallbacks in ICU's jisx-208.ucm file.
michael@0: */ michael@0: /* One 16-bit fallback code per half-width Katakana code point, in code point order: the annotated entries place U+FF61 first and U+FF9F last, so entry i belongs to U+FF61+i. Presumably indexed by code point minus HWKANA_START -- the lookup is outside this part of the file. */ static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { michael@0: 0x2123, /* U+FF61 */ michael@0: 0x2156, michael@0: 0x2157, michael@0: 0x2122, michael@0: 0x2126, michael@0: 0x2572, michael@0: 0x2521, michael@0: 0x2523, michael@0: 0x2525, michael@0: 0x2527, michael@0: 0x2529, michael@0: 0x2563, michael@0: 0x2565, michael@0: 0x2567, michael@0: 0x2543, michael@0: 0x213C, /* U+FF70 */ michael@0: 0x2522, michael@0: 0x2524, michael@0: 0x2526, michael@0: 0x2528, michael@0: 0x252A, michael@0: 0x252B, michael@0: 0x252D, michael@0: 0x252F, michael@0: 0x2531, michael@0: 0x2533, michael@0: 0x2535, michael@0: 0x2537, michael@0: 0x2539, michael@0: 0x253B, michael@0: 0x253D, michael@0: 0x253F, /* U+FF80 */ michael@0: 0x2541, michael@0: 0x2544, michael@0: 0x2546, michael@0: 0x2548, michael@0: 0x254A, michael@0: 0x254B, michael@0: 0x254C, michael@0: 0x254D, michael@0: 0x254E, michael@0: 0x254F, michael@0: 0x2552, michael@0: 0x2555, michael@0: 0x2558, michael@0: 0x255B, michael@0: 0x255E, michael@0: 0x255F, /* U+FF90 */ michael@0: 0x2560, michael@0: 0x2561, michael@0: 0x2562, michael@0: 0x2564, michael@0: 0x2566, michael@0: 0x2568, michael@0: 0x2569, michael@0: 0x256A, michael@0: 0x256B, michael@0: 0x256C, michael@0: 0x256D, michael@0: 0x256F, michael@0: 0x2573, michael@0: 0x212B, michael@0: 0x212C /* U+FF9F */ michael@0: }; michael@0: michael@0: static void michael@0: UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { michael@0: UConverter *cnv = args->converter; michael@0: UConverterDataISO2022 *converterData; michael@0: ISO2022State *pFromU2022State; michael@0: uint8_t *target = (uint8_t *) args->target; michael@0: const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; michael@0: const UChar* source = args->source; michael@0: const UChar* sourceLimit = args->sourceLimit; michael@0: int32_t* offsets = args->offsets; michael@0: UChar32 sourceChar; michael@0: char buffer[8]; michael@0:
int32_t len, outLen; michael@0: int8_t choices[10]; michael@0: int32_t choiceCount; michael@0: uint32_t targetValue = 0; michael@0: UBool useFallback; michael@0: michael@0: int32_t i; michael@0: int8_t cs, g; michael@0: michael@0: /* set up the state */ michael@0: converterData = (UConverterDataISO2022*)cnv->extraInfo; michael@0: pFromU2022State = &converterData->fromU2022State; michael@0: michael@0: choiceCount = 0; michael@0: michael@0: /* check if the last codepoint of previous buffer was a lead surrogate*/ michael@0: if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { michael@0: goto getTrail; michael@0: } michael@0: michael@0: while(source < sourceLimit) { michael@0: if(target < targetLimit) { michael@0: michael@0: sourceChar = *(source++); michael@0: /*check if the char is a First surrogate*/ michael@0: if(U16_IS_SURROGATE(sourceChar)) { michael@0: if(U16_IS_SURROGATE_LEAD(sourceChar)) { michael@0: getTrail: michael@0: /*look ahead to find the trail surrogate*/ michael@0: if(source < sourceLimit) { michael@0: /* test the following code unit */ michael@0: UChar trail=(UChar) *source; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: source++; michael@0: sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); michael@0: cnv->fromUChar32=0x00; michael@0: /* convert this supplementary code point */ michael@0: /* exit this condition tree */ michael@0: } else { michael@0: /* this is an unmatched lead code unit (1st surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } else { michael@0: /* no more input */ michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } else { michael@0: /* this is an unmatched trail code unit (2nd surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* 
do not convert SO/SI/ESC */ michael@0: if(IS_2022_CONTROL(sourceChar)) { michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: michael@0: /* do the conversion */ michael@0: michael@0: if(choiceCount == 0) { michael@0: uint16_t csm; michael@0: michael@0: /* michael@0: * The csm variable keeps track of which charsets are allowed michael@0: * and not used yet while building the choices[]. michael@0: */ michael@0: csm = jpCharsetMasks[converterData->version]; michael@0: choiceCount = 0; michael@0: michael@0: /* JIS7/8: try single-byte half-width Katakana before JISX208 */ michael@0: if(converterData->version == 3 || converterData->version == 4) { michael@0: choices[choiceCount++] = (int8_t)HWKANA_7BIT; michael@0: } michael@0: /* Do not try single-byte half-width Katakana for other versions. */ michael@0: csm &= ~CSM(HWKANA_7BIT); michael@0: michael@0: /* try the current G0 charset */ michael@0: choices[choiceCount++] = cs = pFromU2022State->cs[0]; michael@0: csm &= ~CSM(cs); michael@0: michael@0: /* try the current G2 charset */ michael@0: if((cs = pFromU2022State->cs[2]) != 0) { michael@0: choices[choiceCount++] = cs; michael@0: csm &= ~CSM(cs); michael@0: } michael@0: michael@0: /* try all the other possible charsets */ michael@0: for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { michael@0: cs = (int8_t)jpCharsetPref[i]; michael@0: if(CSM(cs) & csm) { michael@0: choices[choiceCount++] = cs; michael@0: csm &= ~CSM(cs); michael@0: } michael@0: } michael@0: } michael@0: michael@0: cs = g = 0; michael@0: /* michael@0: * len==0: no mapping found yet michael@0: * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks michael@0: * len>0: found a roundtrip result, done michael@0: */ michael@0: len = 0; michael@0: /* michael@0: * We will turn off useFallback after finding a fallback, michael@0: * but we still get fallbacks from PUA code points 
as usual. michael@0: * Therefore, we will also need to check that we don't overwrite michael@0: * an early fallback with a later one. michael@0: */ michael@0: useFallback = cnv->useFallback; michael@0: michael@0: for(i = 0; i < choiceCount && len <= 0; ++i) { michael@0: uint32_t value; michael@0: int32_t len2; michael@0: int8_t cs0 = choices[i]; michael@0: switch(cs0) { michael@0: case ASCII: michael@0: if(sourceChar <= 0x7f) { michael@0: targetValue = (uint32_t)sourceChar; michael@0: len = 1; michael@0: cs = cs0; michael@0: g = 0; michael@0: } michael@0: break; michael@0: case ISO8859_1: michael@0: if(GR96_START <= sourceChar && sourceChar <= GR96_END) { michael@0: targetValue = (uint32_t)sourceChar - 0x80; michael@0: len = 1; michael@0: cs = cs0; michael@0: g = 2; michael@0: } michael@0: break; michael@0: case HWKANA_7BIT: michael@0: if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { michael@0: if(converterData->version==3) { michael@0: /* JIS7: use G1 (SO) */ michael@0: /* Shift U+FF61..U+FF9F to bytes 21..5F. */ michael@0: targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); michael@0: len = 1; michael@0: pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ michael@0: g = 1; michael@0: } else if(converterData->version==4) { michael@0: /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ michael@0: /* Shift U+FF61..U+FF9F to bytes A1..DF. 
*/ michael@0: targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); michael@0: len = 1; michael@0: michael@0: cs = pFromU2022State->cs[0]; michael@0: if(IS_JP_DBCS(cs)) { michael@0: /* switch from a DBCS charset to JISX201 */ michael@0: cs = (int8_t)JISX201; michael@0: } michael@0: /* else stay in the current G0 charset */ michael@0: g = 0; michael@0: } michael@0: /* else do not use HWKANA_7BIT with other versions */ michael@0: } michael@0: break; michael@0: case JISX201: michael@0: /* G0 SBCS */ michael@0: value = jisx201FromU(sourceChar); michael@0: if(value <= 0x7f) { michael@0: targetValue = value; michael@0: len = 1; michael@0: cs = cs0; michael@0: g = 0; michael@0: useFallback = FALSE; michael@0: } michael@0: break; michael@0: case JISX208: michael@0: /* G0 DBCS from Shift-JIS table */ michael@0: len2 = MBCS_FROM_UCHAR32_ISO2022( michael@0: converterData->myConverterArray[cs0], michael@0: sourceChar, &value, michael@0: useFallback, MBCS_OUTPUT_2); michael@0: if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ michael@0: value = _2022FromSJIS(value); michael@0: if(value != 0) { michael@0: targetValue = value; michael@0: len = len2; michael@0: cs = cs0; michael@0: g = 0; michael@0: useFallback = FALSE; michael@0: } michael@0: } else if(len == 0 && useFallback && michael@0: (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { michael@0: targetValue = hwkana_fb[sourceChar - HWKANA_START]; michael@0: len = -2; michael@0: cs = cs0; michael@0: g = 0; michael@0: useFallback = FALSE; michael@0: } michael@0: break; michael@0: case ISO8859_7: michael@0: /* G0 SBCS forced to 7-bit output */ michael@0: len2 = MBCS_SINGLE_FROM_UCHAR32( michael@0: converterData->myConverterArray[cs0], michael@0: sourceChar, &value, michael@0: useFallback); michael@0: if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { michael@0: targetValue = value - 0x80; michael@0: len = len2; michael@0: cs = cs0; 
michael@0: g = 2; michael@0: useFallback = FALSE; michael@0: } michael@0: break; michael@0: default: michael@0: /* G0 DBCS */ michael@0: len2 = MBCS_FROM_UCHAR32_ISO2022( michael@0: converterData->myConverterArray[cs0], michael@0: sourceChar, &value, michael@0: useFallback, MBCS_OUTPUT_2); michael@0: if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ michael@0: if(cs0 == KSC5601) { michael@0: /* michael@0: * Check for valid bytes for the encoding scheme. michael@0: * This is necessary because the sub-converter (windows-949) michael@0: * has a broader encoding scheme than is valid for 2022. michael@0: */ michael@0: value = _2022FromGR94DBCS(value); michael@0: if(value == 0) { michael@0: break; michael@0: } michael@0: } michael@0: targetValue = value; michael@0: len = len2; michael@0: cs = cs0; michael@0: g = 0; michael@0: useFallback = FALSE; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if(len != 0) { michael@0: if(len < 0) { michael@0: len = -len; /* fallback */ michael@0: } michael@0: outLen = 0; /* count output bytes */ michael@0: michael@0: /* write SI if necessary (only for JIS7) */ michael@0: if(pFromU2022State->g == 1 && g == 0) { michael@0: buffer[outLen++] = UCNV_SI; michael@0: pFromU2022State->g = 0; michael@0: } michael@0: michael@0: /* write the designation sequence if necessary */ michael@0: if(cs != pFromU2022State->cs[g]) { michael@0: int32_t escLen = escSeqCharsLen[cs]; michael@0: uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); michael@0: outLen += escLen; michael@0: pFromU2022State->cs[g] = cs; michael@0: michael@0: /* invalidate the choices[] */ michael@0: choiceCount = 0; michael@0: } michael@0: michael@0: /* write the shift sequence if necessary */ michael@0: if(g != pFromU2022State->g) { michael@0: switch(g) { michael@0: /* case 0 handled before writing escapes */ michael@0: case 1: michael@0: buffer[outLen++] = UCNV_SO; michael@0: pFromU2022State->g = 1; michael@0: break; 
michael@0: default: /* case 2 */ michael@0: buffer[outLen++] = 0x1b; michael@0: buffer[outLen++] = 0x4e; michael@0: break; michael@0: /* no case 3: no SS3 in ISO-2022-JP-x */ michael@0: } michael@0: } michael@0: michael@0: /* write the output bytes */ michael@0: if(len == 1) { michael@0: buffer[outLen++] = (char)targetValue; michael@0: } else /* len == 2 */ { michael@0: buffer[outLen++] = (char)(targetValue >> 8); michael@0: buffer[outLen++] = (char)targetValue; michael@0: } michael@0: } else { michael@0: /* michael@0: * if we cannot find the character after checking all codepages michael@0: * then this is an error michael@0: */ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: michael@0: if(sourceChar == CR || sourceChar == LF) { michael@0: /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ michael@0: pFromU2022State->cs[2] = 0; michael@0: choiceCount = 0; michael@0: } michael@0: michael@0: /* output outLen>0 bytes in buffer[] */ michael@0: if(outLen == 1) { michael@0: *target++ = buffer[0]; michael@0: if(offsets) { michael@0: *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ michael@0: } michael@0: } else if(outLen == 2 && (target + 2) <= targetLimit) { michael@0: *target++ = buffer[0]; michael@0: *target++ = buffer[1]; michael@0: if(offsets) { michael@0: int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); michael@0: *offsets++ = sourceIndex; michael@0: *offsets++ = sourceIndex; michael@0: } michael@0: } else { michael@0: fromUWriteUInt8( michael@0: cnv, michael@0: buffer, outLen, michael@0: &target, (const char *)targetLimit, michael@0: &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), michael@0: err); michael@0: if(U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } michael@0: } /* end if(myTargetIndexg!=0 || pFromU2022State->cs[0]!=ASCII) && michael@0: 
args->flush && source>=sourceLimit && cnv->fromUChar32==0 michael@0: ) { michael@0: int32_t sourceIndex; michael@0: michael@0: outLen = 0; michael@0: michael@0: if(pFromU2022State->g != 0) { michael@0: buffer[outLen++] = UCNV_SI; michael@0: pFromU2022State->g = 0; michael@0: } michael@0: michael@0: if(pFromU2022State->cs[0] != ASCII) { michael@0: int32_t escLen = escSeqCharsLen[ASCII]; michael@0: uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); michael@0: outLen += escLen; michael@0: pFromU2022State->cs[0] = (int8_t)ASCII; michael@0: } michael@0: michael@0: /* get the source index of the last input character */ michael@0: /* michael@0: * TODO this would be simpler and more reliable if we used a pair michael@0: * of sourceIndex/prevSourceIndex like in ucnvmbcs.c michael@0: * so that we could simply use the prevSourceIndex here; michael@0: * this code gives an incorrect result for the rare case of an unmatched michael@0: * trail surrogate that is alone in the last buffer of the text stream michael@0: */ michael@0: sourceIndex=(int32_t)(source-args->source); michael@0: if(sourceIndex>0) { michael@0: --sourceIndex; michael@0: if( U16_IS_TRAIL(args->source[sourceIndex]) && michael@0: (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) michael@0: ) { michael@0: --sourceIndex; michael@0: } michael@0: } else { michael@0: sourceIndex=-1; michael@0: } michael@0: michael@0: fromUWriteUInt8( michael@0: cnv, michael@0: buffer, outLen, michael@0: &target, (const char *)targetLimit, michael@0: &offsets, sourceIndex, michael@0: err); michael@0: } michael@0: michael@0: /*save the state and return */ michael@0: args->source = source; michael@0: args->target = (char*)target; michael@0: } michael@0: michael@0: /*************** to unicode *******************/ michael@0: michael@0: /* Converts ISO-2022-JP bytes (args->source up to args->sourceLimit) to UTF-16 in args->target, filling args->offsets when it is non-NULL. SI and SO switch between G0 and G1 only when myData->version==3; ESC is handed to changeState_2022(); CR and LF first reset the state to a single-byte G0 charset and are then converted like any other byte; every other byte is decoded as one or two bytes according to the current charset. A call first resumes a partial escape sequence (myData->key != 0) or a pending double-byte lead byte (toULength == 1) left over from the previous call. Unconvertible input goes to toUnicodeCallback(); a full target sets *err to U_BUFFER_OVERFLOW_ERROR. */ static void michael@0: UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, michael@0: UErrorCode* err){ michael@0: char tempBuf[2]; michael@0: const char *mySource 
= (char *) args->source; michael@0: UChar *myTarget = args->target; michael@0: const char *mySourceLimit = args->sourceLimit; michael@0: uint32_t targetUniChar = 0x0000; michael@0: uint32_t mySourceChar = 0x0000; michael@0: uint32_t tmpSourceChar = 0x0000; michael@0: UConverterDataISO2022* myData; michael@0: ISO2022State *pToU2022State; michael@0: StateEnum cs; michael@0: michael@0: myData=(UConverterDataISO2022*)(args->converter->extraInfo); michael@0: pToU2022State = &myData->toU2022State; michael@0: michael@0: if(myData->key != 0) { michael@0: /* continue with a partial escape sequence */ michael@0: goto escape; michael@0: } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { michael@0: /* continue with a partial double-byte character */ michael@0: mySourceChar = args->converter->toUBytes[0]; michael@0: args->converter->toULength = 0; michael@0: cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; michael@0: targetUniChar = missingCharMarker; michael@0: goto getTrailByte; michael@0: } michael@0: michael@0: while(mySource < mySourceLimit){ michael@0: michael@0: targetUniChar =missingCharMarker; michael@0: michael@0: if(myTarget < args->targetLimit){ michael@0: michael@0: mySourceChar= (unsigned char) *mySource++; michael@0: michael@0: switch(mySourceChar) { michael@0: case UCNV_SI: michael@0: if(myData->version==3) { michael@0: pToU2022State->g=0; michael@0: continue; michael@0: } else { michael@0: /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ michael@0: myData->isEmptySegment = FALSE; /* reset this, we have a different error */ michael@0: break; michael@0: } michael@0: michael@0: case UCNV_SO: michael@0: if(myData->version==3) { michael@0: /* JIS7: switch to G1 half-width Katakana */ michael@0: pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; michael@0: pToU2022State->g=1; michael@0: continue; michael@0: } else { michael@0: /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ michael@0: myData->isEmptySegment = FALSE; /* 
reset this, we have a different error */ michael@0: break; michael@0: } michael@0: michael@0: case ESC_2022: michael@0: mySource--; michael@0: escape: michael@0: { michael@0: const char * mySourceBefore = mySource; michael@0: int8_t toULengthBefore = args->converter->toULength; michael@0: michael@0: changeState_2022(args->converter,&(mySource), michael@0: mySourceLimit, ISO_2022_JP,err); michael@0: michael@0: /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ michael@0: if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: args->converter->toUCallbackReason = UCNV_IRREGULAR; michael@0: args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); michael@0: } michael@0: } michael@0: michael@0: /* invalid or illegal escape sequence */ michael@0: if(U_FAILURE(*err)){ michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ michael@0: return; michael@0: } michael@0: /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ michael@0: if(myData->key==0) { michael@0: myData->isEmptySegment = TRUE; michael@0: } michael@0: continue; michael@0: michael@0: /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ michael@0: michael@0: case CR: michael@0: /*falls through*/ michael@0: case LF: michael@0: /* automatically reset to single-byte mode */ michael@0: if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { michael@0: pToU2022State->cs[0] = (int8_t)ASCII; michael@0: } michael@0: pToU2022State->cs[2] = 0; michael@0: pToU2022State->g = 0; michael@0: /* falls through */ michael@0: default: michael@0: /* convert one or two bytes */ michael@0: myData->isEmptySegment = FALSE; michael@0: cs = 
(StateEnum)pToU2022State->cs[pToU2022State->g]; michael@0: if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && michael@0: !IS_JP_DBCS(cs) michael@0: ) { michael@0: /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ michael@0: targetUniChar = mySourceChar + (HWKANA_START - 0xa1); michael@0: michael@0: /* return from a single-shift state to the previous one */ michael@0: if(pToU2022State->g >= 2) { michael@0: pToU2022State->g=pToU2022State->prevG; michael@0: } michael@0: } else switch(cs) { michael@0: case ASCII: michael@0: if(mySourceChar <= 0x7f) { michael@0: targetUniChar = mySourceChar; michael@0: } michael@0: break; michael@0: case ISO8859_1: michael@0: if(mySourceChar <= 0x7f) { michael@0: targetUniChar = mySourceChar + 0x80; michael@0: } michael@0: /* return from a single-shift state to the previous one */ michael@0: pToU2022State->g=pToU2022State->prevG; michael@0: break; michael@0: case ISO8859_7: michael@0: if(mySourceChar <= 0x7f) { michael@0: /* convert mySourceChar+0x80 to use a normal 8-bit table */ michael@0: targetUniChar = michael@0: _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( michael@0: myData->myConverterArray[cs], michael@0: mySourceChar + 0x80); michael@0: } michael@0: /* return from a single-shift state to the previous one */ michael@0: pToU2022State->g=pToU2022State->prevG; michael@0: break; michael@0: case JISX201: michael@0: if(mySourceChar <= 0x7f) { michael@0: targetUniChar = jisx201ToU(mySourceChar); michael@0: } michael@0: break; michael@0: case HWKANA_7BIT: michael@0: if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { michael@0: /* 7-bit halfwidth Katakana */ michael@0: targetUniChar = mySourceChar + (HWKANA_START - 0x21); michael@0: } michael@0: break; michael@0: default: michael@0: /* G0 DBCS */ michael@0: if(mySource < mySourceLimit) { michael@0: int leadIsOk, trailIsOk; michael@0: uint8_t trailByte; michael@0: getTrailByte: michael@0: trailByte = (uint8_t)*mySource; michael@0: /* michael@0: * Ticket 
5691: consistent illegal sequences: michael@0: * - We include at least the first byte in the illegal sequence. michael@0: * - If any of the non-initial bytes could be the start of a character, michael@0: * we stop the illegal sequence before the first one of those. michael@0: * michael@0: * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is michael@0: * an ESC/SO/SI, we report only the first byte as the illegal sequence. michael@0: * Otherwise we convert or report the pair of bytes. michael@0: */ michael@0: leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); michael@0: trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); michael@0: if (leadIsOk && trailIsOk) { michael@0: ++mySource; michael@0: tmpSourceChar = (mySourceChar << 8) | trailByte; michael@0: if(cs == JISX208) { michael@0: _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); michael@0: mySourceChar = tmpSourceChar; michael@0: } else { michael@0: /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ michael@0: mySourceChar = tmpSourceChar; michael@0: if (cs == KSC5601) { michael@0: tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ michael@0: } michael@0: tempBuf[0] = (char)(tmpSourceChar >> 8); michael@0: tempBuf[1] = (char)(tmpSourceChar); michael@0: } michael@0: targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); michael@0: } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { michael@0: /* report a pair of illegal bytes if the second byte is not a DBCS starter */ michael@0: ++mySource; michael@0: /* add another bit so that the code below writes 2 bytes in case of error */ michael@0: mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; michael@0: } michael@0: } else { michael@0: /* the input ended right after the lead byte: save it in toUBytes so the next call can resume at getTrailByte */ args->converter->toUBytes[0] = (uint8_t)mySourceChar; michael@0: args->converter->toULength = 1; michael@0: goto endloop; michael@0: } michael@0: } /* End of inner switch */ michael@0: break; michael@0: } /* End of outer switch */ michael@0: if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); michael@0: } michael@0: *(myTarget++)=(UChar)targetUniChar; michael@0: } michael@0: else if(targetUniChar > missingCharMarker){ michael@0: /* disassemble the surrogate pair and write to output*/ michael@0: targetUniChar-=0x0010000; michael@0: *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); michael@0: } michael@0: ++myTarget; michael@0: if(myTarget< args->targetLimit){ michael@0: *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); michael@0: } michael@0: ++myTarget; michael@0: }else{ michael@0: /* the target is full after the lead surrogate: keep the trail surrogate in the converter's UCharErrorBuffer */ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= michael@0: (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); michael@0: } michael@0: michael@0: } michael@0: else{ michael@0: /* Call the callback function*/ michael@0: toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); michael@0: break; michael@0: } michael@0: } michael@0: else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ michael@0: *err =U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: endloop: michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: } michael@0: michael@0: michael@0: /*************************************************************** michael@0: * Rules for ISO-2022-KR encoding michael@0: * i) The KSC5601 designator sequence should appear only once in a file, michael@0: * at the begining of a line before any KSC5601 characters. 
This usually michael@0: * means that it appears by itself on the first line of the file michael@0: * ii) There are only 2 shifting sequences SO to shift into double byte mode michael@0: * and SI to shift into single byte mode michael@0: */ michael@0: /* fromUnicode path used when converterData->version==1: temporarily replaces args->converter with myConverterData->currentConverter, runs ucnv_MBCSFromUnicodeWithOffsets() on it, carries fromUChar32 into and back out of the subconverter, and on U_BUFFER_OVERFLOW_ERROR moves the subconverter's pending charErrorBuffer bytes to the public converter before restoring args->converter. */ static void michael@0: UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ michael@0: michael@0: UConverter* saveConv = args->converter; michael@0: UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; michael@0: args->converter=myConverterData->currentConverter; michael@0: michael@0: myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; michael@0: ucnv_MBCSFromUnicodeWithOffsets(args,err); michael@0: saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; michael@0: michael@0: if(*err == U_BUFFER_OVERFLOW_ERROR) { michael@0: if(myConverterData->currentConverter->charErrorBufferLength > 0) { michael@0: uprv_memcpy( michael@0: saveConv->charErrorBuffer, michael@0: myConverterData->currentConverter->charErrorBuffer, michael@0: myConverterData->currentConverter->charErrorBufferLength); michael@0: } michael@0: saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; michael@0: myConverterData->currentConverter->charErrorBufferLength = 0; michael@0: } michael@0: args->converter=saveConv; michael@0: } michael@0: michael@0: /* Converts UTF-16 text to ISO-2022-KR bytes. Hands off to the _IBM variant when converterData->version==1. Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022() in the current converter's shared data; only single bytes up to 0x7f and double bytes with both bytes in A1..FE are accepted. SO or SI is written whenever the output switches between double-byte and single-byte, and double bytes are written with 0x80 subtracted from each byte. The single/double-byte flag is kept in args->converter->fromUnicodeStatus between calls. Unmappable input sets *err and stores the code point in args->converter->fromUChar32. */ static void michael@0: UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ michael@0: michael@0: const UChar *source = args->source; michael@0: const UChar *sourceLimit = args->sourceLimit; michael@0: unsigned char *target = (unsigned char *) args->target; michael@0: unsigned char *targetLimit = (unsigned char *) args->targetLimit; michael@0: int32_t* offsets = args->offsets; michael@0: uint32_t targetByteUnit = 0x0000; michael@0: UChar32 sourceChar = 0x0000; michael@0: UBool isTargetByteDBCS; michael@0: UBool 
oldIsTargetByteDBCS; michael@0: UConverterDataISO2022 *converterData; michael@0: UConverterSharedData* sharedData; michael@0: UBool useFallback; michael@0: int32_t length =0; michael@0: michael@0: converterData=(UConverterDataISO2022*)args->converter->extraInfo; michael@0: /* if the version is 1 then the user is requesting michael@0: * conversion with ibm-25546 pass the arguments to michael@0: * MBCS converter and return michael@0: */ michael@0: if(converterData->version==1){ michael@0: UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); michael@0: return; michael@0: } michael@0: michael@0: /* initialize data */ michael@0: sharedData = converterData->currentConverter->sharedData; michael@0: useFallback = args->converter->useFallback; michael@0: isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; michael@0: oldIsTargetByteDBCS = isTargetByteDBCS; michael@0: michael@0: /* NOTE(review): the next statement repeats the assignment made two statements earlier */ isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; michael@0: /* NOTE(review): text appears to be missing from this copy between "target" and "targetLimit" in the condition below (the rest of the condition, its body and the loop header are not visible); restore it from the upstream file */ if((sourceChar = args->converter->fromUChar32)!=0 && target targetLimit){ michael@0: sourceChar = *source++; michael@0: michael@0: /* do not convert SO/SI/ESC */ michael@0: if(IS_2022_CONTROL(sourceChar)) { michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: args->converter->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: michael@0: length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); michael@0: if(length < 0) { michael@0: length = -length; /* fallback */ michael@0: } michael@0: /* only DBCS or SBCS characters are expected*/ michael@0: /* DB characters with high bit set to 1 are expected */ michael@0: if( length > 2 || length==0 || michael@0: (length == 1 && targetByteUnit > 0x7f) || michael@0: (length == 2 && michael@0: ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || michael@0: (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) michael@0: ) { michael@0: targetByteUnit=missingCharMarker; michael@0: } 
michael@0: if (targetByteUnit != missingCharMarker){ michael@0: michael@0: oldIsTargetByteDBCS = isTargetByteDBCS; michael@0: isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); michael@0: /* append the shift sequence */ michael@0: if (oldIsTargetByteDBCS != isTargetByteDBCS ){ michael@0: michael@0: if (isTargetByteDBCS) michael@0: *target++ = UCNV_SO; michael@0: else michael@0: *target++ = UCNV_SI; michael@0: if(offsets) michael@0: *(offsets++) = (int32_t)(source - args->source-1); michael@0: } michael@0: /* write the targetUniChar to target */ michael@0: if(targetByteUnit <= 0x00FF){ michael@0: if( target < targetLimit){ michael@0: *(target++) = (unsigned char) targetByteUnit; michael@0: if(offsets){ michael@0: *(offsets++) = (int32_t)(source - args->source-1); michael@0: } michael@0: michael@0: }else{ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); michael@0: *err = U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: }else{ michael@0: if(target < targetLimit){ michael@0: *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); michael@0: if(offsets){ michael@0: *(offsets++) = (int32_t)(source - args->source-1); michael@0: } michael@0: if(target < targetLimit){ michael@0: *(target++) =(unsigned char) (targetByteUnit -0x80); michael@0: if(offsets){ michael@0: *(offsets++) = (int32_t)(source - args->source-1); michael@0: } michael@0: }else{ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); michael@0: *err = U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: }else{ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); michael@0: *err = U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: } michael@0: michael@0: } michael@0: 
else{ michael@0: /* oops.. the code point is unassigned michael@0: * set the error and reason michael@0: */ michael@0: michael@0: /*check if the char is a First surrogate*/ michael@0: if(U16_IS_SURROGATE(sourceChar)) { michael@0: if(U16_IS_SURROGATE_LEAD(sourceChar)) { michael@0: getTrail: michael@0: /*look ahead to find the trail surrogate*/ michael@0: if(source < sourceLimit) { michael@0: /* test the following code unit */ michael@0: UChar trail=(UChar) *source; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: source++; michael@0: sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: /* convert this surrogate code point */ michael@0: /* exit this condition tree */ michael@0: } else { michael@0: /* this is an unmatched lead code unit (1st surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* no more input */ michael@0: *err = U_ZERO_ERROR; /* a lead surrogate at the end of the input is not an error; it is stored in fromUChar32 just below */ michael@0: } michael@0: } else { michael@0: /* this is an unmatched trail code unit (2nd surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* callback(unassigned) for a BMP code point */ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: } michael@0: michael@0: args->converter->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: /* NOTE(review): the comment that opens right after the next brace swallows code up to "switching to ASCII"; text between "myTargetIndex" and "flush" appears to have been lost from this copy, so compare with the upstream file */ } /* end if(myTargetIndexflush && source>=sourceLimit && args->converter->fromUChar32==0 michael@0: ) { michael@0: int32_t sourceIndex; michael@0: michael@0: /* we are switching to ASCII */ michael@0: isTargetByteDBCS=FALSE; michael@0: michael@0: /* get the source index of the last input character */ michael@0: /* michael@0: * TODO this would be simpler and more reliable if we used a pair michael@0: * of sourceIndex/prevSourceIndex like in ucnvmbcs.c michael@0: * so that we could simply use the prevSourceIndex here; michael@0: * this code gives an incorrect result for the rare case 
of an unmatched michael@0: * trail surrogate that is alone in the last buffer of the text stream michael@0: */ michael@0: sourceIndex=(int32_t)(source-args->source); michael@0: if(sourceIndex>0) { michael@0: --sourceIndex; michael@0: if( U16_IS_TRAIL(args->source[sourceIndex]) && michael@0: (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) michael@0: ) { michael@0: --sourceIndex; michael@0: } michael@0: } else { michael@0: sourceIndex=-1; michael@0: } michael@0: michael@0: fromUWriteUInt8( michael@0: args->converter, michael@0: SHIFT_IN_STR, 1, michael@0: &target, (const char *)targetLimit, michael@0: &offsets, sourceIndex, michael@0: err); michael@0: } michael@0: michael@0: /*save the state and return */ michael@0: args->source = source; michael@0: args->target = (char*)target; michael@0: args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; michael@0: } michael@0: michael@0: /************************ To Unicode ***************************************/ michael@0: michael@0: /* toUnicode counterpart of the _IBM fromUnicode path: works on a copy of args (subArgs) whose converter is myData->currentConverter. In a loop it converts the input up to the position returned by getEndOfBuffer_2022() with ucnv_MBCSToUnicodeWithOffsets(), rebases the offsets written by the subconverter onto the original start of the input, copies the partial-input bytes (toUBytes) and, on overflow, the UCharErrorBuffer back to the public converter, and then calls changeState_2022() on what follows. A partial escape sequence from the previous call (myData->key != 0) is continued first. */ static void michael@0: UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, michael@0: UErrorCode* err){ michael@0: char const* sourceStart; michael@0: UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); michael@0: michael@0: UConverterToUnicodeArgs subArgs; michael@0: int32_t minArgsSize; michael@0: michael@0: /* set up the subconverter arguments */ michael@0: /* NOTE(review): text appears to be missing from this copy inside the condition below ("args->sizesize;"); restore it from the upstream file */ if(args->sizesize; michael@0: } else { michael@0: minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); michael@0: } michael@0: michael@0: uprv_memcpy(&subArgs, args, minArgsSize); michael@0: subArgs.size = (uint16_t)minArgsSize; michael@0: subArgs.converter = myData->currentConverter; michael@0: michael@0: /* remember the original start of the input for offsets */ michael@0: sourceStart = args->source; michael@0: michael@0: if(myData->key != 0) { michael@0: /* continue with a partial escape sequence */ michael@0: goto escape; michael@0: } michael@0: 
michael@0: while(U_SUCCESS(*err) && args->source < args->sourceLimit) { michael@0: /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ michael@0: subArgs.source = args->source; michael@0: subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); michael@0: if(subArgs.source != subArgs.sourceLimit) { michael@0: /* michael@0: * get the current partial byte sequence michael@0: * michael@0: * it needs to be moved between the public and the subconverter michael@0: * so that the conversion framework, which only sees the public michael@0: * converter, can handle truncated and illegal input etc. michael@0: */ michael@0: if(args->converter->toULength > 0) { michael@0: uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); michael@0: } michael@0: subArgs.converter->toULength = args->converter->toULength; michael@0: michael@0: /* michael@0: * Convert up to the end of the input, or to before the next escape character. michael@0: * Does not handle conversion extensions because the preToU[] state etc. michael@0: * is not copied. 
michael@0: */ michael@0: ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); michael@0: michael@0: if(args->offsets != NULL && sourceStart != args->source) { michael@0: /* update offsets to base them on the actual start of the input */ michael@0: int32_t *offsets = args->offsets; michael@0: UChar *target = args->target; michael@0: int32_t delta = (int32_t)(args->source - sourceStart); michael@0: while(target < subArgs.target) { michael@0: if(*offsets >= 0) { michael@0: *offsets += delta; michael@0: } michael@0: ++offsets; michael@0: ++target; michael@0: } michael@0: } michael@0: args->source = subArgs.source; michael@0: args->target = subArgs.target; michael@0: args->offsets = subArgs.offsets; michael@0: michael@0: /* copy input/error/overflow buffers */ michael@0: if(subArgs.converter->toULength > 0) { michael@0: uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); michael@0: } michael@0: args->converter->toULength = subArgs.converter->toULength; michael@0: michael@0: if(*err == U_BUFFER_OVERFLOW_ERROR) { michael@0: if(subArgs.converter->UCharErrorBufferLength > 0) { michael@0: uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, michael@0: subArgs.converter->UCharErrorBufferLength); michael@0: } michael@0: args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; michael@0: subArgs.converter->UCharErrorBufferLength = 0; michael@0: } michael@0: } michael@0: michael@0: if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { michael@0: return; michael@0: } michael@0: michael@0: escape: michael@0: /* input remains and no error occurred, so args->source presumably points at an escape character here -- confirm against getEndOfBuffer_2022() */ changeState_2022(args->converter, michael@0: &(args->source), michael@0: args->sourceLimit, michael@0: ISO_2022_KR, michael@0: err); michael@0: } michael@0: } michael@0: michael@0: static void michael@0: UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, michael@0: UErrorCode* err){ michael@0: char tempBuf[2]; michael@0: const char *mySource = ( 
char *) args->source; michael@0: UChar *myTarget = args->target; michael@0: const char *mySourceLimit = args->sourceLimit; michael@0: UChar32 targetUniChar = 0x0000; michael@0: UChar mySourceChar = 0x0000; michael@0: UConverterDataISO2022* myData; michael@0: UConverterSharedData* sharedData ; michael@0: UBool useFallback; michael@0: michael@0: myData=(UConverterDataISO2022*)(args->converter->extraInfo); michael@0: if(myData->version==1){ michael@0: UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); michael@0: return; michael@0: } michael@0: michael@0: /* initialize state */ michael@0: sharedData = myData->currentConverter->sharedData; michael@0: useFallback = args->converter->useFallback; michael@0: michael@0: if(myData->key != 0) { michael@0: /* continue with a partial escape sequence */ michael@0: goto escape; michael@0: } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { michael@0: /* continue with a partial double-byte character */ michael@0: mySourceChar = args->converter->toUBytes[0]; michael@0: args->converter->toULength = 0; michael@0: goto getTrailByte; michael@0: } michael@0: michael@0: while(mySource< mySourceLimit){ michael@0: michael@0: if(myTarget < args->targetLimit){ michael@0: michael@0: mySourceChar= (unsigned char) *mySource++; michael@0: michael@0: if(mySourceChar==UCNV_SI){ michael@0: myData->toU2022State.g = 0; michael@0: if (myData->isEmptySegment) { michael@0: myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: args->converter->toUCallbackReason = UCNV_IRREGULAR; michael@0: args->converter->toUBytes[0] = (uint8_t)mySourceChar; michael@0: args->converter->toULength = 1; michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: return; michael@0: } michael@0: /*consume the source */ michael@0: continue; michael@0: }else if(mySourceChar==UCNV_SO){ michael@0: 
myData->toU2022State.g = 1; michael@0: myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ michael@0: /*consume the source */ michael@0: continue; michael@0: }else if(mySourceChar==ESC_2022){ michael@0: mySource--; michael@0: escape: michael@0: myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ michael@0: changeState_2022(args->converter,&(mySource), michael@0: mySourceLimit, ISO_2022_KR, err); michael@0: if(U_FAILURE(*err)){ michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: return; michael@0: } michael@0: continue; michael@0: } michael@0: michael@0: myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ michael@0: if(myData->toU2022State.g == 1) { michael@0: if(mySource < mySourceLimit) { michael@0: int leadIsOk, trailIsOk; michael@0: uint8_t trailByte; michael@0: getTrailByte: michael@0: targetUniChar = missingCharMarker; michael@0: trailByte = (uint8_t)*mySource; michael@0: /* michael@0: * Ticket 5691: consistent illegal sequences: michael@0: * - We include at least the first byte in the illegal sequence. michael@0: * - If any of the non-initial bytes could be the start of a character, michael@0: * we stop the illegal sequence before the first one of those. michael@0: * michael@0: * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is michael@0: * an ESC/SO/SI, we report only the first byte as the illegal sequence. michael@0: * Otherwise we convert or report the pair of bytes. 
michael@0: */ michael@0: leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); michael@0: trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); michael@0: if (leadIsOk && trailIsOk) { michael@0: ++mySource; michael@0: tempBuf[0] = (char)(mySourceChar + 0x80); michael@0: tempBuf[1] = (char)(trailByte + 0x80); michael@0: targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); michael@0: mySourceChar = (mySourceChar << 8) | trailByte; michael@0: } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { michael@0: /* report a pair of illegal bytes if the second byte is not a DBCS starter */ michael@0: ++mySource; michael@0: /* add another bit so that the code below writes 2 bytes in case of error */ michael@0: mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; michael@0: } michael@0: } else { michael@0: args->converter->toUBytes[0] = (uint8_t)mySourceChar; michael@0: args->converter->toULength = 1; michael@0: break; michael@0: } michael@0: } michael@0: else if(mySourceChar <= 0x7f) { michael@0: targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); michael@0: } else { michael@0: targetUniChar = 0xffff; michael@0: } michael@0: if(targetUniChar < 0xfffe){ michael@0: if(args->offsets) { michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); michael@0: } michael@0: *(myTarget++)=(UChar)targetUniChar; michael@0: } michael@0: else { michael@0: /* Call the callback function*/ michael@0: toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); michael@0: break; michael@0: } michael@0: } michael@0: else{ michael@0: *err =U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: } michael@0: michael@0: /*************************** END ISO2022-KR *********************************/ michael@0: michael@0: /*************************** ISO-2022-CN ********************************* michael@0: * michael@0: * Rules for ISO-2022-CN Encoding: michael@0: * i) The designator sequence must appear once on a line before any instance michael@0: * of character set it designates. michael@0: * ii) If two lines contain characters from the same character set, both lines michael@0: * must include the designator sequence. michael@0: * iii) Once the designator sequence is known, a shifting sequence has to be found michael@0: * to invoke the shifting michael@0: * iv) All lines start in ASCII and end in ASCII. 
* v)   Four shifting sequences are employed for this purpose:
*
*      Sequence   ASCII Eq    Charsets
*      ---------- ---------   ---------
*      SI         SI (0x0F)   US-ASCII
*      SO         SO (0x0E)   CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2        ESC N       CNS-11643-1992 Plane 2
*      SS3        ESC O       CNS-11643-1992 Planes 3-7
*
* vi)
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
*      SS2designator : ESC "$" "*" finalchar_for_SS2
*      SS3designator : ESC "$" "+" finalchar_for_SS3
*
*      ESC $ ) A       Indicates the bytes following SO are Chinese
*                      characters as defined in GB 2312-80, until
*                      another SOdesignation appears
*
*
*      ESC $ ) E       Indicates the bytes following SO are as defined
*                      in ISO-IR-165 (for details, see section 2.1),
*                      until another SOdesignation appears
*
*      ESC $ ) G       Indicates the bytes following SO are as defined
*                      in CNS 11643-plane-1, until another
*                      SOdesignation appears
*
*      ESC $ * H       Indicates the two bytes immediately following
*                      SS2 is a Chinese character as defined in CNS
*                      11643-plane-2, until another SS2designation
*                      appears
*                      (Meaning ESC N must precede every 2 byte
*                      sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*                      is a Chinese character as defined in CNS
*                      11643-plane-3, until another SS3designation
*                      appears
*                      (Meaning ESC O must precede every 2 byte
*                      sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*                      is a Chinese character as defined in CNS
*                      11643-plane-4, until another SS3designation
*                      appears
*                      (In English: ESC O must precede every 2 byte
*                      sequence.)
michael@0: * michael@0: * ESC $ + K Indicates the immediate two bytes following SS3 michael@0: * is a Chinese character as defined in CNS michael@0: * 11643-plane-5, until another SS3designation michael@0: * appears michael@0: * michael@0: * ESC $ + L Indicates the immediate two bytes following SS3 michael@0: * is a Chinese character as defined in CNS michael@0: * 11643-plane-6, until another SS3designation michael@0: * appears michael@0: * michael@0: * ESC $ + M Indicates the immediate two bytes following SS3 michael@0: * is a Chinese character as defined in CNS michael@0: * 11643-plane-7, until another SS3designation michael@0: * appears michael@0: * michael@0: * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and michael@0: * has its own designation information before any Chinese characters michael@0: * appear michael@0: * michael@0: */ michael@0: michael@0: /* The following are defined this way to make the strings truly readonly */ michael@0: static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; michael@0: static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; michael@0: static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; michael@0: static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; michael@0: static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; michael@0: static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; michael@0: static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; michael@0: static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; michael@0: static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; michael@0: michael@0: /********************** ISO2022-CN Data **************************/ michael@0: static const char* const escSeqCharsCN[10] ={ michael@0: SHIFT_IN_STR, /* 0 ASCII */ michael@0: GB_2312_80_STR, /* 1 GB2312_1 */ michael@0: ISO_IR_165_STR, /* 2 ISO_IR_165 */ michael@0: CNS_11643_1992_Plane_1_STR, michael@0: 
CNS_11643_1992_Plane_2_STR, michael@0: CNS_11643_1992_Plane_3_STR, michael@0: CNS_11643_1992_Plane_4_STR, michael@0: CNS_11643_1992_Plane_5_STR, michael@0: CNS_11643_1992_Plane_6_STR, michael@0: CNS_11643_1992_Plane_7_STR michael@0: }; michael@0: michael@0: static void michael@0: UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ michael@0: UConverter *cnv = args->converter; michael@0: UConverterDataISO2022 *converterData; michael@0: ISO2022State *pFromU2022State; michael@0: uint8_t *target = (uint8_t *) args->target; michael@0: const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; michael@0: const UChar* source = args->source; michael@0: const UChar* sourceLimit = args->sourceLimit; michael@0: int32_t* offsets = args->offsets; michael@0: UChar32 sourceChar; michael@0: char buffer[8]; michael@0: int32_t len; michael@0: int8_t choices[3]; michael@0: int32_t choiceCount; michael@0: uint32_t targetValue = 0; michael@0: UBool useFallback; michael@0: michael@0: /* set up the state */ michael@0: converterData = (UConverterDataISO2022*)cnv->extraInfo; michael@0: pFromU2022State = &converterData->fromU2022State; michael@0: michael@0: choiceCount = 0; michael@0: michael@0: /* check if the last codepoint of previous buffer was a lead surrogate*/ michael@0: if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { michael@0: goto getTrail; michael@0: } michael@0: michael@0: while( source < sourceLimit){ michael@0: if(target < targetLimit){ michael@0: michael@0: sourceChar = *(source++); michael@0: /*check if the char is a First surrogate*/ michael@0: if(U16_IS_SURROGATE(sourceChar)) { michael@0: if(U16_IS_SURROGATE_LEAD(sourceChar)) { michael@0: getTrail: michael@0: /*look ahead to find the trail surrogate*/ michael@0: if(source < sourceLimit) { michael@0: /* test the following code unit */ michael@0: UChar trail=(UChar) *source; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: source++; michael@0: 
sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); michael@0: cnv->fromUChar32=0x00; michael@0: /* convert this supplementary code point */ michael@0: /* exit this condition tree */ michael@0: } else { michael@0: /* this is an unmatched lead code unit (1st surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } else { michael@0: /* no more input */ michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } else { michael@0: /* this is an unmatched trail code unit (2nd surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* do the conversion */ michael@0: if(sourceChar <= 0x007f ){ michael@0: /* do not convert SO/SI/ESC */ michael@0: if(IS_2022_CONTROL(sourceChar)) { michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: michael@0: /* US-ASCII */ michael@0: if(pFromU2022State->g == 0) { michael@0: buffer[0] = (char)sourceChar; michael@0: len = 1; michael@0: } else { michael@0: buffer[0] = UCNV_SI; michael@0: buffer[1] = (char)sourceChar; michael@0: len = 2; michael@0: pFromU2022State->g = 0; michael@0: choiceCount = 0; michael@0: } michael@0: if(sourceChar == CR || sourceChar == LF) { michael@0: /* reset the state at the end of a line */ michael@0: uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); michael@0: choiceCount = 0; michael@0: } michael@0: } michael@0: else{ michael@0: /* convert U+0080..U+10ffff */ michael@0: int32_t i; michael@0: int8_t cs, g; michael@0: michael@0: if(choiceCount == 0) { michael@0: /* try the current SO/G1 converter first */ michael@0: choices[0] = pFromU2022State->cs[1]; michael@0: michael@0: /* default to GB2312_1 if none is designated yet */ michael@0: 
if(choices[0] == 0) { michael@0: choices[0] = GB2312_1; michael@0: } michael@0: michael@0: if(converterData->version == 0) { michael@0: /* ISO-2022-CN */ michael@0: michael@0: /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ michael@0: if(choices[0] == GB2312_1) { michael@0: choices[1] = (int8_t)CNS_11643_1; michael@0: } else { michael@0: choices[1] = (int8_t)GB2312_1; michael@0: } michael@0: michael@0: choiceCount = 2; michael@0: } else if (converterData->version == 1) { michael@0: /* ISO-2022-CN-EXT */ michael@0: michael@0: /* try one of the other converters */ michael@0: switch(choices[0]) { michael@0: case GB2312_1: michael@0: choices[1] = (int8_t)CNS_11643_1; michael@0: choices[2] = (int8_t)ISO_IR_165; michael@0: break; michael@0: case ISO_IR_165: michael@0: choices[1] = (int8_t)GB2312_1; michael@0: choices[2] = (int8_t)CNS_11643_1; michael@0: break; michael@0: default: /* CNS_11643_x */ michael@0: choices[1] = (int8_t)GB2312_1; michael@0: choices[2] = (int8_t)ISO_IR_165; michael@0: break; michael@0: } michael@0: michael@0: choiceCount = 3; michael@0: } else { michael@0: choices[0] = (int8_t)CNS_11643_1; michael@0: choices[1] = (int8_t)GB2312_1; michael@0: } michael@0: } michael@0: michael@0: cs = g = 0; michael@0: /* michael@0: * len==0: no mapping found yet michael@0: * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks michael@0: * len>0: found a roundtrip result, done michael@0: */ michael@0: len = 0; michael@0: /* michael@0: * We will turn off useFallback after finding a fallback, michael@0: * but we still get fallbacks from PUA code points as usual. michael@0: * Therefore, we will also need to check that we don't overwrite michael@0: * an early fallback with a later one. 
michael@0: */ michael@0: useFallback = cnv->useFallback; michael@0: michael@0: for(i = 0; i < choiceCount && len <= 0; ++i) { michael@0: int8_t cs0 = choices[i]; michael@0: if(cs0 > 0) { michael@0: uint32_t value; michael@0: int32_t len2; michael@0: if(cs0 >= CNS_11643_0) { michael@0: len2 = MBCS_FROM_UCHAR32_ISO2022( michael@0: converterData->myConverterArray[CNS_11643], michael@0: sourceChar, michael@0: &value, michael@0: useFallback, michael@0: MBCS_OUTPUT_3); michael@0: if(len2 == 3 || (len2 == -3 && len == 0)) { michael@0: targetValue = value; michael@0: cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); michael@0: if(len2 >= 0) { michael@0: len = 2; michael@0: } else { michael@0: len = -2; michael@0: useFallback = FALSE; michael@0: } michael@0: if(cs == CNS_11643_1) { michael@0: g = 1; michael@0: } else if(cs == CNS_11643_2) { michael@0: g = 2; michael@0: } else /* plane 3..7 */ if(converterData->version == 1) { michael@0: g = 3; michael@0: } else { michael@0: /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ michael@0: len = 0; michael@0: } michael@0: } michael@0: } else { michael@0: /* GB2312_1 or ISO-IR-165 */ michael@0: U_ASSERT(cs0myConverterArray[cs0], michael@0: sourceChar, michael@0: &value, michael@0: useFallback, michael@0: MBCS_OUTPUT_2); michael@0: if(len2 == 2 || (len2 == -2 && len == 0)) { michael@0: targetValue = value; michael@0: len = len2; michael@0: cs = cs0; michael@0: g = 1; michael@0: useFallback = FALSE; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(len != 0) { michael@0: len = 0; /* count output bytes; it must have been abs(len) == 2 */ michael@0: michael@0: /* write the designation sequence if necessary */ michael@0: if(cs != pFromU2022State->cs[g]) { michael@0: if(cs < CNS_11643) { michael@0: uprv_memcpy(buffer, escSeqCharsCN[cs], 4); michael@0: } else { michael@0: U_ASSERT(cs >= CNS_11643_1); michael@0: uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); michael@0: } 
michael@0: len = 4; michael@0: pFromU2022State->cs[g] = cs; michael@0: if(g == 1) { michael@0: /* changing the SO/G1 charset invalidates the choices[] */ michael@0: choiceCount = 0; michael@0: } michael@0: } michael@0: michael@0: /* write the shift sequence if necessary */ michael@0: if(g != pFromU2022State->g) { michael@0: switch(g) { michael@0: case 1: michael@0: buffer[len++] = UCNV_SO; michael@0: michael@0: /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ michael@0: pFromU2022State->g = 1; michael@0: break; michael@0: case 2: michael@0: buffer[len++] = 0x1b; michael@0: buffer[len++] = 0x4e; michael@0: break; michael@0: default: /* case 3 */ michael@0: buffer[len++] = 0x1b; michael@0: buffer[len++] = 0x4f; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* write the two output bytes */ michael@0: buffer[len++] = (char)(targetValue >> 8); michael@0: buffer[len++] = (char)targetValue; michael@0: } else { michael@0: /* if we cannot find the character after checking all codepages michael@0: * then this is an error michael@0: */ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: cnv->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* output len>0 bytes in buffer[] */ michael@0: if(len == 1) { michael@0: *target++ = buffer[0]; michael@0: if(offsets) { michael@0: *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ michael@0: } michael@0: } else if(len == 2 && (target + 2) <= targetLimit) { michael@0: *target++ = buffer[0]; michael@0: *target++ = buffer[1]; michael@0: if(offsets) { michael@0: int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); michael@0: *offsets++ = sourceIndex; michael@0: *offsets++ = sourceIndex; michael@0: } michael@0: } else { michael@0: fromUWriteUInt8( michael@0: cnv, michael@0: buffer, len, michael@0: &target, (const char *)targetLimit, michael@0: &offsets, (int32_t)(source - args->source - 
U16_LENGTH(sourceChar)), michael@0: err); michael@0: if(U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } michael@0: } /* end if(myTargetIndexg!=0 && michael@0: args->flush && source>=sourceLimit && cnv->fromUChar32==0 michael@0: ) { michael@0: int32_t sourceIndex; michael@0: michael@0: /* we are switching to ASCII */ michael@0: pFromU2022State->g=0; michael@0: michael@0: /* get the source index of the last input character */ michael@0: /* michael@0: * TODO this would be simpler and more reliable if we used a pair michael@0: * of sourceIndex/prevSourceIndex like in ucnvmbcs.c michael@0: * so that we could simply use the prevSourceIndex here; michael@0: * this code gives an incorrect result for the rare case of an unmatched michael@0: * trail surrogate that is alone in the last buffer of the text stream michael@0: */ michael@0: sourceIndex=(int32_t)(source-args->source); michael@0: if(sourceIndex>0) { michael@0: --sourceIndex; michael@0: if( U16_IS_TRAIL(args->source[sourceIndex]) && michael@0: (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) michael@0: ) { michael@0: --sourceIndex; michael@0: } michael@0: } else { michael@0: sourceIndex=-1; michael@0: } michael@0: michael@0: fromUWriteUInt8( michael@0: cnv, michael@0: SHIFT_IN_STR, 1, michael@0: &target, (const char *)targetLimit, michael@0: &offsets, sourceIndex, michael@0: err); michael@0: } michael@0: michael@0: /*save the state and return */ michael@0: args->source = source; michael@0: args->target = (char*)target; michael@0: } michael@0: michael@0: michael@0: static void michael@0: UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, michael@0: UErrorCode* err){ michael@0: char tempBuf[3]; michael@0: const char *mySource = (char *) args->source; michael@0: UChar *myTarget = args->target; michael@0: const char *mySourceLimit = args->sourceLimit; michael@0: uint32_t targetUniChar = 0x0000; michael@0: uint32_t mySourceChar = 0x0000; michael@0: 
UConverterDataISO2022* myData; michael@0: ISO2022State *pToU2022State; michael@0: michael@0: myData=(UConverterDataISO2022*)(args->converter->extraInfo); michael@0: pToU2022State = &myData->toU2022State; michael@0: michael@0: if(myData->key != 0) { michael@0: /* continue with a partial escape sequence */ michael@0: goto escape; michael@0: } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { michael@0: /* continue with a partial double-byte character */ michael@0: mySourceChar = args->converter->toUBytes[0]; michael@0: args->converter->toULength = 0; michael@0: targetUniChar = missingCharMarker; michael@0: goto getTrailByte; michael@0: } michael@0: michael@0: while(mySource < mySourceLimit){ michael@0: michael@0: targetUniChar =missingCharMarker; michael@0: michael@0: if(myTarget < args->targetLimit){ michael@0: michael@0: mySourceChar= (unsigned char) *mySource++; michael@0: michael@0: switch(mySourceChar){ michael@0: case UCNV_SI: michael@0: pToU2022State->g=0; michael@0: if (myData->isEmptySegment) { michael@0: myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: args->converter->toUCallbackReason = UCNV_IRREGULAR; michael@0: args->converter->toUBytes[0] = mySourceChar; michael@0: args->converter->toULength = 1; michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: return; michael@0: } michael@0: continue; michael@0: michael@0: case UCNV_SO: michael@0: if(pToU2022State->cs[1] != 0) { michael@0: pToU2022State->g=1; michael@0: myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ michael@0: continue; michael@0: } else { michael@0: /* illegal to have SO before a matching designator */ michael@0: myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ michael@0: break; michael@0: } michael@0: michael@0: case ESC_2022: 
michael@0: mySource--; michael@0: escape: michael@0: { michael@0: const char * mySourceBefore = mySource; michael@0: int8_t toULengthBefore = args->converter->toULength; michael@0: michael@0: changeState_2022(args->converter,&(mySource), michael@0: mySourceLimit, ISO_2022_CN,err); michael@0: michael@0: /* After SO there must be at least one character before a designator (designator error handled separately) */ michael@0: if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { michael@0: *err = U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: args->converter->toUCallbackReason = UCNV_IRREGULAR; michael@0: args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); michael@0: } michael@0: } michael@0: michael@0: /* invalid or illegal escape sequence */ michael@0: if(U_FAILURE(*err)){ michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ michael@0: return; michael@0: } michael@0: continue; michael@0: michael@0: /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ michael@0: michael@0: case CR: michael@0: /*falls through*/ michael@0: case LF: michael@0: uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); michael@0: /* falls through */ michael@0: default: michael@0: /* convert one or two bytes */ michael@0: myData->isEmptySegment = FALSE; michael@0: if(pToU2022State->g != 0) { michael@0: if(mySource < mySourceLimit) { michael@0: UConverterSharedData *cnv; michael@0: StateEnum tempState; michael@0: int32_t tempBufLen; michael@0: int leadIsOk, trailIsOk; michael@0: uint8_t trailByte; michael@0: getTrailByte: michael@0: trailByte = (uint8_t)*mySource; michael@0: /* michael@0: * Ticket 5691: consistent illegal sequences: michael@0: * - We include at least the first byte in the illegal sequence. 
michael@0: * - If any of the non-initial bytes could be the start of a character, michael@0: * we stop the illegal sequence before the first one of those. michael@0: * michael@0: * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is michael@0: * an ESC/SO/SI, we report only the first byte as the illegal sequence. michael@0: * Otherwise we convert or report the pair of bytes. michael@0: */ michael@0: leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); michael@0: trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); michael@0: if (leadIsOk && trailIsOk) { michael@0: ++mySource; michael@0: tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; michael@0: if(tempState >= CNS_11643_0) { michael@0: cnv = myData->myConverterArray[CNS_11643]; michael@0: tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); michael@0: tempBuf[1] = (char) (mySourceChar); michael@0: tempBuf[2] = (char) trailByte; michael@0: tempBufLen = 3; michael@0: michael@0: }else{ michael@0: U_ASSERT(tempStatemyConverterArray[tempState]; michael@0: tempBuf[0] = (char) (mySourceChar); michael@0: tempBuf[1] = (char) trailByte; michael@0: tempBufLen = 2; michael@0: } michael@0: targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); michael@0: mySourceChar = (mySourceChar << 8) | trailByte; michael@0: } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { michael@0: /* report a pair of illegal bytes if the second byte is not a DBCS starter */ michael@0: ++mySource; michael@0: /* add another bit so that the code below writes 2 bytes in case of error */ michael@0: mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; michael@0: } michael@0: if(pToU2022State->g>=2) { michael@0: /* return from a single-shift state to the previous one */ michael@0: pToU2022State->g=pToU2022State->prevG; michael@0: } michael@0: } else { michael@0: args->converter->toUBytes[0] = (uint8_t)mySourceChar; michael@0: args->converter->toULength = 1; michael@0: goto endloop; 
michael@0: } michael@0: } michael@0: else{ michael@0: if(mySourceChar <= 0x7f) { michael@0: targetUniChar = (UChar) mySourceChar; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); michael@0: } michael@0: *(myTarget++)=(UChar)targetUniChar; michael@0: } michael@0: else if(targetUniChar > missingCharMarker){ michael@0: /* disassemble the surrogate pair and write to output*/ michael@0: targetUniChar-=0x0010000; michael@0: *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); michael@0: } michael@0: ++myTarget; michael@0: if(myTarget< args->targetLimit){ michael@0: *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); michael@0: if(args->offsets){ michael@0: args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); michael@0: } michael@0: ++myTarget; michael@0: }else{ michael@0: args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= michael@0: (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); michael@0: } michael@0: michael@0: } michael@0: else{ michael@0: /* Call the callback function*/ michael@0: toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); michael@0: break; michael@0: } michael@0: } michael@0: else{ michael@0: *err =U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: endloop: michael@0: args->target = myTarget; michael@0: args->source = mySource; michael@0: } michael@0: michael@0: static void michael@0: _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { michael@0: UConverter *cnv = args->converter; michael@0: UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; michael@0: ISO2022State *pFromU2022State=&myConverterData->fromU2022State; michael@0: char *p, *subchar; michael@0: char buffer[8]; michael@0: int32_t length; michael@0: michael@0: subchar=(char *)cnv->subChars; michael@0: length=cnv->subCharLen; /* assume length==1 for most variants */ michael@0: michael@0: p = buffer; michael@0: switch(myConverterData->locale[0]){ michael@0: case 'j': michael@0: { michael@0: int8_t cs; michael@0: michael@0: if(pFromU2022State->g == 1) { michael@0: /* JIS7: switch from G1 to G0 */ michael@0: pFromU2022State->g = 0; michael@0: *p++ = UCNV_SI; michael@0: } michael@0: michael@0: cs = pFromU2022State->cs[0]; michael@0: if(cs != ASCII && cs != JISX201) { michael@0: /* not in ASCII or JIS X 0201: switch to ASCII */ michael@0: pFromU2022State->cs[0] = (int8_t)ASCII; michael@0: *p++ = '\x1b'; michael@0: *p++ = '\x28'; michael@0: *p++ = '\x42'; michael@0: } michael@0: michael@0: *p++ = subchar[0]; michael@0: break; michael@0: } michael@0: case 'c': michael@0: if(pFromU2022State->g != 0) { michael@0: /* not in ASCII mode: switch to ASCII */ 
michael@0: pFromU2022State->g = 0; michael@0: *p++ = UCNV_SI; michael@0: } michael@0: *p++ = subchar[0]; michael@0: break; michael@0: case 'k': michael@0: if(myConverterData->version == 0) { michael@0: if(length == 1) { michael@0: if((UBool)args->converter->fromUnicodeStatus) { michael@0: /* in DBCS mode: switch to SBCS */ michael@0: args->converter->fromUnicodeStatus = 0; michael@0: *p++ = UCNV_SI; michael@0: } michael@0: *p++ = subchar[0]; michael@0: } else /* length == 2*/ { michael@0: if(!(UBool)args->converter->fromUnicodeStatus) { michael@0: /* in SBCS mode: switch to DBCS */ michael@0: args->converter->fromUnicodeStatus = 1; michael@0: *p++ = UCNV_SO; michael@0: } michael@0: *p++ = subchar[0]; michael@0: *p++ = subchar[1]; michael@0: } michael@0: break; michael@0: } else { michael@0: /* save the subconverter's substitution string */ michael@0: uint8_t *currentSubChars = myConverterData->currentConverter->subChars; michael@0: int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; michael@0: michael@0: /* set our substitution string into the subconverter */ michael@0: myConverterData->currentConverter->subChars = (uint8_t *)subchar; michael@0: myConverterData->currentConverter->subCharLen = (int8_t)length; michael@0: michael@0: /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ michael@0: args->converter = myConverterData->currentConverter; michael@0: myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; michael@0: ucnv_cbFromUWriteSub(args, 0, err); michael@0: cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; michael@0: args->converter = cnv; michael@0: michael@0: /* restore the subconverter's substitution string */ michael@0: myConverterData->currentConverter->subChars = currentSubChars; michael@0: myConverterData->currentConverter->subCharLen = currentSubCharLen; michael@0: michael@0: if(*err == U_BUFFER_OVERFLOW_ERROR) { michael@0: 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { michael@0: uprv_memcpy( michael@0: cnv->charErrorBuffer, michael@0: myConverterData->currentConverter->charErrorBuffer, michael@0: myConverterData->currentConverter->charErrorBufferLength); michael@0: } michael@0: cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; michael@0: myConverterData->currentConverter->charErrorBufferLength = 0; michael@0: } michael@0: return; michael@0: } michael@0: default: michael@0: /* not expected */ michael@0: break; michael@0: } michael@0: ucnv_cbFromUWriteBytes(args, michael@0: buffer, (int32_t)(p - buffer), michael@0: offsetIndex, err); michael@0: } michael@0: michael@0: /* michael@0: * Structure for cloning an ISO 2022 converter into a single memory block. michael@0: * ucnv_safeClone() of the converter will align the entire cloneStruct, michael@0: * and then ucnv_safeClone() of the sub-converter may additionally align michael@0: * currentConverter inside the cloneStruct, for which we need the deadSpace michael@0: * after currentConverter. michael@0: * This is because UAlignedMemory may be larger than the actually michael@0: * necessary alignment size for the platform. michael@0: * The other cloneStruct fields will not be moved around, michael@0: * and are aligned properly with cloneStruct's alignment. 
michael@0: */ michael@0: struct cloneStruct michael@0: { michael@0: UConverter cnv; michael@0: UConverter currentConverter; michael@0: UAlignedMemory deadSpace; michael@0: UConverterDataISO2022 mydata; michael@0: }; michael@0: michael@0: michael@0: static UConverter * michael@0: _ISO_2022_SafeClone( michael@0: const UConverter *cnv, michael@0: void *stackBuffer, michael@0: int32_t *pBufferSize, michael@0: UErrorCode *status) michael@0: { michael@0: struct cloneStruct * localClone; michael@0: UConverterDataISO2022 *cnvData; michael@0: int32_t i, size; michael@0: michael@0: if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ michael@0: *pBufferSize = (int32_t)sizeof(struct cloneStruct); michael@0: return NULL; michael@0: } michael@0: michael@0: cnvData = (UConverterDataISO2022 *)cnv->extraInfo; michael@0: localClone = (struct cloneStruct *)stackBuffer; michael@0: michael@0: /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ michael@0: michael@0: uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); michael@0: localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ michael@0: localClone->cnv.isExtraLocal = TRUE; michael@0: michael@0: /* share the subconverters */ michael@0: michael@0: if(cnvData->currentConverter != NULL) { michael@0: size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ michael@0: localClone->mydata.currentConverter = michael@0: ucnv_safeClone(cnvData->currentConverter, michael@0: &localClone->currentConverter, michael@0: &size, status); michael@0: if(U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: for(i=0; imyConverterArray[i] != NULL) { michael@0: ucnv_incrementRefCount(cnvData->myConverterArray[i]); michael@0: } michael@0: } michael@0: michael@0: return &localClone->cnv; michael@0: } michael@0: michael@0: static void michael@0: _ISO_2022_GetUnicodeSet(const UConverter 
*cnv, michael@0: const USetAdder *sa, michael@0: UConverterUnicodeSet which, michael@0: UErrorCode *pErrorCode) michael@0: { michael@0: int32_t i; michael@0: UConverterDataISO2022* cnvData; michael@0: michael@0: if (U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: if (cnv->sharedData == &_ISO2022Data) { michael@0: /* We use UTF-8 in this case */ michael@0: sa->addRange(sa->set, 0, 0xd7FF); michael@0: sa->addRange(sa->set, 0xE000, 0x10FFFF); michael@0: return; michael@0: } michael@0: #endif michael@0: michael@0: cnvData = (UConverterDataISO2022*)cnv->extraInfo; michael@0: michael@0: /* open a set and initialize it with code points that are algorithmically round-tripped */ michael@0: switch(cnvData->locale[0]){ michael@0: case 'j': michael@0: /* include JIS X 0201 which is hardcoded */ michael@0: sa->add(sa->set, 0xa5); michael@0: sa->add(sa->set, 0x203e); michael@0: if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { michael@0: /* include Latin-1 for some variants of JP */ michael@0: sa->addRange(sa->set, 0, 0xff); michael@0: } else { michael@0: /* include ASCII for JP */ michael@0: sa->addRange(sa->set, 0, 0x7f); michael@0: } michael@0: if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { michael@0: /* michael@0: * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 michael@0: * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) michael@0: * use half-width Katakana. michael@0: * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) michael@0: * half-width Katakana via the ESC ( I sequence. michael@0: * However, we only emit (fromUnicode) half-width Katakana according to the michael@0: * definition of each variant. 
michael@0: * michael@0: * When including fallbacks, michael@0: * we need to include half-width Katakana Unicode code points for all JP variants because michael@0: * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). michael@0: */ michael@0: /* include half-width Katakana for JP */ michael@0: sa->addRange(sa->set, HWKANA_START, HWKANA_END); michael@0: } michael@0: break; michael@0: case 'c': michael@0: case 'z': michael@0: /* include ASCII for CN */ michael@0: sa->addRange(sa->set, 0, 0x7f); michael@0: break; michael@0: case 'k': michael@0: /* there is only one converter for KR, and it is not in the myConverterArray[] */ michael@0: cnvData->currentConverter->sharedData->impl->getUnicodeSet( michael@0: cnvData->currentConverter, sa, which, pErrorCode); michael@0: /* the loop over myConverterArray[] will simply not find another converter */ michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ michael@0: if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && michael@0: cnvData->version==0 && i==CNS_11643 michael@0: ) { michael@0: /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ michael@0: ucnv_MBCSGetUnicodeSetForBytes( michael@0: cnvData->myConverterArray[i], michael@0: sa, UCNV_ROUNDTRIP_SET, michael@0: 0, 0x81, 0x82, michael@0: pErrorCode); michael@0: } michael@0: #endif michael@0: michael@0: for (i=0; imyConverterArray[i]!=NULL) { michael@0: if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && michael@0: cnvData->version==0 && i==CNS_11643 michael@0: ) { michael@0: /* michael@0: * Version-specific for CN: michael@0: * CN version 0 does not map CNS planes 3..7 although michael@0: * they are all available in the CNS conversion table; michael@0: * CN version 1 (-EXT) does map them all. 
michael@0: * The two versions create different Unicode sets. michael@0: */ michael@0: filter=UCNV_SET_FILTER_2022_CN; michael@0: } else if(cnvData->locale[0]=='j' && i==JISX208) { michael@0: /* michael@0: * Only add code points that map to Shift-JIS codes michael@0: * corresponding to JIS X 0208. michael@0: */ michael@0: filter=UCNV_SET_FILTER_SJIS; michael@0: } else if(i==KSC5601) { michael@0: /* michael@0: * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) michael@0: * are broader than GR94. michael@0: */ michael@0: filter=UCNV_SET_FILTER_GR94DBCS; michael@0: } else { michael@0: filter=UCNV_SET_FILTER_NONE; michael@0: } michael@0: ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * ISO 2022 converters must not convert SO/SI/ESC despite what michael@0: * sub-converters do by themselves. michael@0: * Remove these characters from the set. michael@0: */ michael@0: sa->remove(sa->set, 0x0e); michael@0: sa->remove(sa->set, 0x0f); michael@0: sa->remove(sa->set, 0x1b); michael@0: michael@0: /* ISO 2022 converters do not convert C1 controls either */ michael@0: sa->removeRange(sa->set, 0x80, 0x9f); michael@0: } michael@0: michael@0: static const UConverterImpl _ISO2022Impl={ michael@0: UCNV_ISO_2022, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _ISO2022Open, michael@0: _ISO2022Close, michael@0: _ISO2022Reset, michael@0: michael@0: #ifdef U_ENABLE_GENERIC_ISO_2022 michael@0: T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, michael@0: T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, michael@0: ucnv_fromUnicode_UTF8, michael@0: ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, michael@0: #else michael@0: NULL, michael@0: NULL, michael@0: NULL, michael@0: NULL, michael@0: #endif michael@0: NULL, michael@0: michael@0: NULL, michael@0: _ISO2022getName, michael@0: _ISO_2022_WriteSub, michael@0: _ISO_2022_SafeClone, michael@0: 
_ISO_2022_GetUnicodeSet, michael@0: michael@0: NULL, michael@0: NULL michael@0: }; michael@0: static const UConverterStaticData _ISO2022StaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "ISO_2022", michael@0: 2022, michael@0: UCNV_IBM, michael@0: UCNV_ISO_2022, michael@0: 1, michael@0: 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ michael@0: { 0x1a, 0, 0, 0 }, michael@0: 1, michael@0: FALSE, michael@0: FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: const UConverterSharedData _ISO2022Data={ michael@0: sizeof(UConverterSharedData), michael@0: ~((uint32_t) 0), michael@0: NULL, michael@0: NULL, michael@0: &_ISO2022StaticData, michael@0: FALSE, michael@0: &_ISO2022Impl, michael@0: 0, UCNV_MBCS_TABLE_INITIALIZER michael@0: }; michael@0: michael@0: /*************JP****************/ michael@0: static const UConverterImpl _ISO2022JPImpl={ michael@0: UCNV_ISO_2022, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _ISO2022Open, michael@0: _ISO2022Close, michael@0: _ISO2022Reset, michael@0: michael@0: UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, michael@0: UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, michael@0: NULL, michael@0: michael@0: NULL, michael@0: _ISO2022getName, michael@0: _ISO_2022_WriteSub, michael@0: _ISO_2022_SafeClone, michael@0: _ISO_2022_GetUnicodeSet, michael@0: michael@0: NULL, michael@0: NULL michael@0: }; michael@0: static const UConverterStaticData _ISO2022JPStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "ISO_2022_JP", michael@0: 0, michael@0: UCNV_IBM, michael@0: UCNV_ISO_2022, michael@0: 1, michael@0: 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ michael@0: { 0x1a, 0, 0, 0 }, michael@0: 1, michael@0: FALSE, michael@0: FALSE, michael@0: 0, michael@0: 0, 
michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: namespace { michael@0: michael@0: const UConverterSharedData _ISO2022JPData={ michael@0: sizeof(UConverterSharedData), michael@0: ~((uint32_t) 0), michael@0: NULL, michael@0: NULL, michael@0: &_ISO2022JPStaticData, michael@0: FALSE, michael@0: &_ISO2022JPImpl, michael@0: 0, UCNV_MBCS_TABLE_INITIALIZER michael@0: }; michael@0: michael@0: } // namespace michael@0: michael@0: /************* KR ***************/ michael@0: static const UConverterImpl _ISO2022KRImpl={ michael@0: UCNV_ISO_2022, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _ISO2022Open, michael@0: _ISO2022Close, michael@0: _ISO2022Reset, michael@0: michael@0: UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, michael@0: UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, michael@0: NULL, michael@0: michael@0: NULL, michael@0: _ISO2022getName, michael@0: _ISO_2022_WriteSub, michael@0: _ISO_2022_SafeClone, michael@0: _ISO_2022_GetUnicodeSet, michael@0: michael@0: NULL, michael@0: NULL michael@0: }; michael@0: static const UConverterStaticData _ISO2022KRStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "ISO_2022_KR", michael@0: 0, michael@0: UCNV_IBM, michael@0: UCNV_ISO_2022, michael@0: 1, michael@0: 3, /* max 3 bytes per UChar: SO+DBCS */ michael@0: { 0x1a, 0, 0, 0 }, michael@0: 1, michael@0: FALSE, michael@0: FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: namespace { michael@0: michael@0: const UConverterSharedData _ISO2022KRData={ michael@0: sizeof(UConverterSharedData), michael@0: ~((uint32_t) 0), michael@0: NULL, michael@0: NULL, michael@0: &_ISO2022KRStaticData, michael@0: FALSE, michael@0: &_ISO2022KRImpl, michael@0: 0, UCNV_MBCS_TABLE_INITIALIZER michael@0: 
}; michael@0: michael@0: } // namespace michael@0: michael@0: /*************** CN ***************/ michael@0: static const UConverterImpl _ISO2022CNImpl={ michael@0: michael@0: UCNV_ISO_2022, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _ISO2022Open, michael@0: _ISO2022Close, michael@0: _ISO2022Reset, michael@0: michael@0: UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, michael@0: UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, michael@0: NULL, michael@0: michael@0: NULL, michael@0: _ISO2022getName, michael@0: _ISO_2022_WriteSub, michael@0: _ISO_2022_SafeClone, michael@0: _ISO_2022_GetUnicodeSet, michael@0: michael@0: NULL, michael@0: NULL michael@0: }; michael@0: static const UConverterStaticData _ISO2022CNStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "ISO_2022_CN", michael@0: 0, michael@0: UCNV_IBM, michael@0: UCNV_ISO_2022, michael@0: 1, michael@0: 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ michael@0: { 0x1a, 0, 0, 0 }, michael@0: 1, michael@0: FALSE, michael@0: FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: namespace { michael@0: michael@0: const UConverterSharedData _ISO2022CNData={ michael@0: sizeof(UConverterSharedData), michael@0: ~((uint32_t) 0), michael@0: NULL, michael@0: NULL, michael@0: &_ISO2022CNStaticData, michael@0: FALSE, michael@0: &_ISO2022CNImpl, michael@0: 0, UCNV_MBCS_TABLE_INITIALIZER michael@0: }; michael@0: michael@0: } // namespace michael@0: michael@0: #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */