|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2000-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * file name: ucnv2022.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2000feb03 |
|
12 * created by: Markus W. Scherer |
|
13 * |
|
14 * Change history: |
|
15 * |
|
16 * 06/29/2000 helena Major rewrite of the callback APIs. |
|
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
|
18 * Changed implementation of toUnicode |
|
19 * function |
|
20 * 08/21/2000 Ram Added support for ISO-2022-KR |
|
21 * 08/29/2000 Ram Separated implementation of EBCDIC to |
|
22 * ucnvebdc.c |
|
23 * 09/20/2000 Ram Added support for ISO-2022-CN |
|
24 * Added implementations for getNextUChar() |
|
25 * for specific 2022 country variants. |
|
26 * 10/31/2000 Ram Implemented offsets logic functions |
|
27 */ |
|
28 |
|
29 #include "unicode/utypes.h" |
|
30 |
|
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
|
32 |
|
33 #include "unicode/ucnv.h" |
|
34 #include "unicode/uset.h" |
|
35 #include "unicode/ucnv_err.h" |
|
36 #include "unicode/ucnv_cb.h" |
|
37 #include "unicode/utf16.h" |
|
38 #include "ucnv_imp.h" |
|
39 #include "ucnv_bld.h" |
|
40 #include "ucnv_cnv.h" |
|
41 #include "ucnvmbcs.h" |
|
42 #include "cstring.h" |
|
43 #include "cmemory.h" |
|
44 #include "uassert.h" |
|
45 |
|
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression in every context (for example as a sizeof operand).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
47 |
|
48 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
49 /* |
|
50 * I am disabling the generic ISO-2022 converter after proposing to do so on |
|
51 * the icu mailing list two days ago. |
|
52 * |
|
53 * Reasons: |
|
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
|
55 * its designation sequences, single shifts with return to the previous state, |
|
56 * switch-with-no-return to UTF-16BE or similar, etc. |
|
57 * This is unlike the language-specific variants like ISO-2022-JP which |
|
58 * require a much smaller repertoire of ISO-2022 features. |
|
59 * These variants continue to be supported. |
|
60 * 2. I believe that no one is really using the generic ISO-2022 converter |
|
61 * but rather always one of the language-specific variants. |
|
62 * Note that ICU's generic ISO-2022 converter has always output one escape |
|
63 * sequence followed by UTF-8 for the whole stream. |
|
64 * 3. Switching between subcharsets is extremely slow, because each time |
|
65 * the previous converter is closed and a new one opened, |
|
66 * without any kind of caching, least-recently-used list, etc. |
|
67 * 4. The code is currently buggy, and given the above it does not seem |
|
68 * reasonable to spend the time on maintenance. |
|
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
|
70 * This means, for example, that when ISO-8859-7 is designated, the following |
|
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
|
72 * The ICU ISO-2022 converter does not handle this - and has no information |
|
73 * about which subconverter would have to be shifted vs. which is designed |
|
74 * for 7-bit ISO-2022. |
|
75 * |
|
76 * Markus Scherer 2003-dec-03 |
|
77 */ |
|
78 #endif |
|
79 |
|
/* One-byte string holding the Shift-In control code (0x0F). */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";

/* ASCII control characters and space, listed by ascending code value. */
#define H_TAB 0x09
#define LF    0x0A
#define V_TAB 0x0B
#define CR    0x0D
#define SPACE 0x20
|
88 |
|
/* First and last code points of the halfwidth Katakana range (U+FF61..U+FF9F). */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
|
93 |
|
/*
 * Native (GR) byte ranges of the graphic character sets:
 *  - 94-character sets use native bytes A1..FE and appear in ISO 2022
 *    as bytes 21..7E (native value minus 0x80);
 *  - 96-character sets use native bytes A0..FF and appear in ISO 2022
 *    as bytes 20..7F (native value minus 0x80).
 * C1 control codes (native bytes 80..9F) must not be encoded
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
|
108 |
|
/*
 * ISO 2022 control codes must never be produced from Unicode input because
 * they would corrupt the byte stream.
 * True for SO (0x0e), SI (0x0f) and ESC (0x1b); the three shifted bits below
 * together form the mask 0x0800c000.
 * Note: the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) \
    (((c)<0x20) && \
     (((uint32_t)1<<(c)) & \
      (((uint32_t)1<<0x0e) | ((uint32_t)1<<0x0f) | ((uint32_t)1<<0x1b)))!=0)
|
116 |
|
/*
 * Charset/state identifiers used by the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups reuse the same small integers
 * (for example ISO8859_1 and GB2312_1 are both 1).
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ISO-2022-JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,   /* Halfwidth Katakana 7 bit */

    /* ISO-2022-CN charsets */
    /* these three constants must keep their values: they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The following are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
|
155 |
|
/* True when the StateEnum charset value lies in the JISX208..KSC5601 range (the JP DBCS charsets). */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* Charset mask: a uint16_t 1 shifted left by the charset number. */
#define CSM(cs) ((uint16_t)0x0001<<(cs))
|
160 |
|
161 /* |
|
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence |
|
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x |
|
164 * |
|
165 * Note: The converter uses some leniency: |
|
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
|
167 * all versions, not just JIS7 and JIS8. |
|
168 * - ICU does not distinguish between different versions of JIS X 0208. |
|
169 */ |
|
170 enum { MAX_JA_VERSION=4 }; |
|
171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
|
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
|
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
|
174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
|
175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
|
176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) |
|
177 }; |
|
178 |
|
/* Converter type tag; UConverterDataISO2022.currentType starts out as ASCII1. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
|
187 |
|
/* Shift/designation state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
|
193 |
|
/* Low four bits of the converter options carry the variant version number. */
#define UCNV_OPTIONS_VERSION_MASK 0x0F
/* Capacity of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS  10
|
196 |
|
197 typedef struct{ |
|
198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
|
199 UConverter *currentConverter; |
|
200 Cnv2022Type currentType; |
|
201 ISO2022State toU2022State, fromU2022State; |
|
202 uint32_t key; |
|
203 uint32_t version; |
|
204 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
205 UBool isFirstBuffer; |
|
206 #endif |
|
207 UBool isEmptySegment; |
|
208 char name[30]; |
|
209 char locale[3]; |
|
210 }UConverterDataISO2022; |
|
211 |
|
212 /* Protos */ |
|
213 /* ISO-2022 ----------------------------------------------------------------- */ |
|
214 |
|
215 /*Forward declaration */ |
|
216 U_CFUNC void |
|
217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
|
218 UErrorCode * err); |
|
219 U_CFUNC void |
|
220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
|
221 UErrorCode * err); |
|
222 |
|
/* The ESC control code that introduces every escape sequence. */
#define ESC_2022 0x1B

/* Classification of the bytes read so far while matching an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* a valid prefix of an ISO 2022 escape sequence; more bytes are needed */
    VALID_NON_TERMINAL_2022 = 0,
    /* a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one escape sequence already, but more bytes might match a longer one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
|
232 |
|
233 /* |
|
234 * The way these state transition arrays work is: |
|
235 * ex : ESC$B is the sequence for JISX208 |
|
236 * a) First Iteration: char is ESC |
|
237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index |
|
238 * int x = normalize_esq_chars_2022[27] which is equal to 1 |
|
239 * ii) Search for this value in escSeqStateTable_Key_2022[] |
|
240 * value of x is stored at escSeqStateTable_Key_2022[0] |
|
241 * iii) Save this index as offset |
|
242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
|
243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
|
244 * b) Switch on this state and continue to next char |
|
245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index |
|
246 * which is normalize_esq_chars_2022[36] == 4 |
|
247 * ii) x is currently 1(from above) |
|
248 * x<<=5 -- x is now 32 |
|
249 * x+=normalize_esq_chars_2022[36] |
|
250 * now x is 36 |
|
251 * iii) Search for this value in escSeqStateTable_Key_2022[] |
|
252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 |
|
253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
|
254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
|
255 * c) Switch on this state and continue to next char |
|
256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index |
|
257 * ii) x is currently 36 (from above) |
|
258 * x<<=5 -- x is now 1152 |
|
259 * x+=normalize_esq_chars_2022[66] |
|
260 * now x is 1161 |
|
261 * iii) Search for this value in escSeqStateTable_Key_2022[] |
|
262 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 |
|
263 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
|
264 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
|
265 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208 |
|
266 */ |
|
267 |
|
268 |
|
269 /*Below are the 3 arrays depicting a state transition table*/ |
|
/*
 * Maps each byte value to a small code (1..29) used to build escape-sequence
 * keys; 0 marks a byte that cannot occur anywhere in an escape sequence.
 * Laid out as 16 rows of 16 so that row/column match the byte's hex digits.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  /* ESC */
/* 2x */    0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,  /* $ % & ( ) * + - . / */
/* 3x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */    5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,  /* @ A..O */
/* 5x */    0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,  /* R Z */
/* 6x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 7x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 8x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 9x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ax */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Bx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Cx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Dx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ex */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Fx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
|
300 |
|
301 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
302 /* |
|
303 * When the generic ISO-2022 converter is completely removed, not just disabled |
|
304 * per #ifdef, then the following state table and the associated tables that are |
|
305 * dimensioned with MAX_STATES_2022 should be trimmed. |
|
306 * |
|
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
|
308 * the associated escape sequences starting with ESC ( B should be removed. |
|
309 * This includes the ones with key values 1097 and all of the ones above 1000000. |
|
310 * |
|
311 * For the latter, the tables can simply be truncated. |
|
312 * For the former, since the tables must be kept parallel, it is probably best |
|
313 * to simply duplicate an adjacent table cell, parallel in all tables. |
|
314 * |
|
315 * It may make sense to restructure the tables, especially by using small search |
|
316 * tables for the variants instead of indexing them parallel to the table here. |
|
317 */ |
|
318 #endif |
|
319 |
|
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape-sequence prefixes, in strictly ascending order
 * so that getKey_2022() can binary-search them. A key is built byte by byte as
 * (previous key << 5) + normalize_esq_chars_2022[byte].
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
|
333 |
|
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to
 * escSeqStateTable_Key_2022[]; NULL where no name is associated with the entry.
 * Only compiled for the generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL, NULL, NULL, NULL, NULL,
    /*  5 */ NULL, NULL, NULL, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    /* 15 */ "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    /* 25 */ NULL, NULL, NULL, NULL, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    /* 35 */ "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    /* 45 */ "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    /* 65 */ "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
|
350 |
|
351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
|
352 /* 0 1 2 3 4 5 6 7 8 9 */ |
|
353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
|
361 }; |
|
362 |
|
363 |
|
/*
 * Selects the ISO-2022 variant for changeState_2022().
 * The generic value exists only when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
|
373 |
|
374 /*********** ISO 2022 Converter Protos ***********/ |
|
375 static void |
|
376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
|
377 |
|
378 static void |
|
379 _ISO2022Close(UConverter *converter); |
|
380 |
|
381 static void |
|
382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
|
383 |
|
384 static const char* |
|
385 _ISO2022getName(const UConverter* cnv); |
|
386 |
|
387 static void |
|
388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
|
389 |
|
390 static UConverter * |
|
391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
|
392 |
|
393 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
394 static void |
|
395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
|
396 #endif |
|
397 |
|
398 namespace { |
|
399 |
|
400 /*const UConverterSharedData _ISO2022Data;*/ |
|
401 extern const UConverterSharedData _ISO2022JPData; |
|
402 extern const UConverterSharedData _ISO2022KRData; |
|
403 extern const UConverterSharedData _ISO2022CNData; |
|
404 |
|
405 } // namespace |
|
406 |
|
407 /*************** Converter implementations ******************/ |
|
408 |
|
409 /* The purpose of this function is to get around gcc compiler warnings. */ |
|
410 static inline void |
|
411 fromUWriteUInt8(UConverter *cnv, |
|
412 const char *bytes, int32_t length, |
|
413 uint8_t **target, const char *targetLimit, |
|
414 int32_t **offsets, |
|
415 int32_t sourceIndex, |
|
416 UErrorCode *pErrorCode) |
|
417 { |
|
418 char *targetChars = (char *)*target; |
|
419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
|
420 offsets, sourceIndex, pErrorCode); |
|
421 *target = (uint8_t*)targetChars; |
|
422 |
|
423 } |
|
424 |
|
425 static inline void |
|
426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ |
|
427 if(myConverterData->version == 1) { |
|
428 UConverter *cnv = myConverterData->currentConverter; |
|
429 |
|
430 cnv->toUnicodeStatus=0; /* offset */ |
|
431 cnv->mode=0; /* state */ |
|
432 cnv->toULength=0; /* byteIndex */ |
|
433 } |
|
434 } |
|
435 |
|
436 static inline void |
|
437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
|
438 /* in ISO-2022-KR the designator sequence appears only once |
|
439 * in a file so we append it only once |
|
440 */ |
|
441 if( converter->charErrorBufferLength==0){ |
|
442 |
|
443 converter->charErrorBufferLength = 4; |
|
444 converter->charErrorBuffer[0] = 0x1b; |
|
445 converter->charErrorBuffer[1] = 0x24; |
|
446 converter->charErrorBuffer[2] = 0x29; |
|
447 converter->charErrorBuffer[3] = 0x43; |
|
448 } |
|
449 if(myConverterData->version == 1) { |
|
450 UConverter *cnv = myConverterData->currentConverter; |
|
451 |
|
452 cnv->fromUChar32=0; |
|
453 cnv->fromUnicodeStatus=1; /* prevLength */ |
|
454 } |
|
455 } |
|
456 |
|
457 static void |
|
458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
|
459 |
|
460 char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
|
461 |
|
462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
|
463 if(cnv->extraInfo != NULL) { |
|
464 UConverterNamePieces stackPieces; |
|
465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
|
466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
|
467 uint32_t version; |
|
468 |
|
469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
|
470 |
|
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
|
472 myConverterData->currentType = ASCII1; |
|
473 cnv->fromUnicodeStatus =FALSE; |
|
474 if(pArgs->locale){ |
|
475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); |
|
476 } |
|
477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
|
478 myConverterData->version = version; |
|
479 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
|
480 (myLocale[2]=='_' || myLocale[2]=='\0')) |
|
481 { |
|
482 size_t len=0; |
|
483 /* open the required converters and cache them */ |
|
484 if(version>MAX_JA_VERSION) { |
|
485 /* prevent indexing beyond jpCharsetMasks[] */ |
|
486 myConverterData->version = version = 0; |
|
487 } |
|
488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
|
489 myConverterData->myConverterArray[ISO8859_7] = |
|
490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); |
|
491 } |
|
492 myConverterData->myConverterArray[JISX208] = |
|
493 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); |
|
494 if(jpCharsetMasks[version]&CSM(JISX212)) { |
|
495 myConverterData->myConverterArray[JISX212] = |
|
496 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); |
|
497 } |
|
498 if(jpCharsetMasks[version]&CSM(GB2312)) { |
|
499 myConverterData->myConverterArray[GB2312] = |
|
500 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ |
|
501 } |
|
502 if(jpCharsetMasks[version]&CSM(KSC5601)) { |
|
503 myConverterData->myConverterArray[KSC5601] = |
|
504 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); |
|
505 } |
|
506 |
|
507 /* set the function pointers to appropriate funtions */ |
|
508 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
|
509 uprv_strcpy(myConverterData->locale,"ja"); |
|
510 |
|
511 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
|
512 len = uprv_strlen(myConverterData->name); |
|
513 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
|
514 myConverterData->name[len+1]='\0'; |
|
515 } |
|
516 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
|
517 (myLocale[2]=='_' || myLocale[2]=='\0')) |
|
518 { |
|
519 const char *cnvName; |
|
520 if(version==1) { |
|
521 cnvName="icu-internal-25546"; |
|
522 } else { |
|
523 cnvName="ibm-949"; |
|
524 myConverterData->version=version=0; |
|
525 } |
|
526 if(pArgs->onlyTestIsLoadable) { |
|
527 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ |
|
528 uprv_free(cnv->extraInfo); |
|
529 cnv->extraInfo=NULL; |
|
530 return; |
|
531 } else { |
|
532 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
|
533 if (U_FAILURE(*errorCode)) { |
|
534 _ISO2022Close(cnv); |
|
535 return; |
|
536 } |
|
537 |
|
538 if(version==1) { |
|
539 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
|
540 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); |
|
541 cnv->subCharLen = myConverterData->currentConverter->subCharLen; |
|
542 }else{ |
|
543 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); |
|
544 } |
|
545 |
|
546 /* initialize the state variables */ |
|
547 setInitialStateToUnicodeKR(cnv, myConverterData); |
|
548 setInitialStateFromUnicodeKR(cnv, myConverterData); |
|
549 |
|
550 /* set the function pointers to appropriate funtions */ |
|
551 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
|
552 uprv_strcpy(myConverterData->locale,"ko"); |
|
553 } |
|
554 } |
|
555 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
|
556 (myLocale[2]=='_' || myLocale[2]=='\0')) |
|
557 { |
|
558 |
|
559 /* open the required converters and cache them */ |
|
560 myConverterData->myConverterArray[GB2312_1] = |
|
561 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); |
|
562 if(version==1) { |
|
563 myConverterData->myConverterArray[ISO_IR_165] = |
|
564 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); |
|
565 } |
|
566 myConverterData->myConverterArray[CNS_11643] = |
|
567 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); |
|
568 |
|
569 |
|
570 /* set the function pointers to appropriate funtions */ |
|
571 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
|
572 uprv_strcpy(myConverterData->locale,"cn"); |
|
573 |
|
574 if (version==0){ |
|
575 myConverterData->version = 0; |
|
576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
|
577 }else if (version==1){ |
|
578 myConverterData->version = 1; |
|
579 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
|
580 }else { |
|
581 myConverterData->version = 2; |
|
582 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); |
|
583 } |
|
584 } |
|
585 else{ |
|
586 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
587 myConverterData->isFirstBuffer = TRUE; |
|
588 |
|
589 /* append the UTF-8 escape sequence */ |
|
590 cnv->charErrorBufferLength = 3; |
|
591 cnv->charErrorBuffer[0] = 0x1b; |
|
592 cnv->charErrorBuffer[1] = 0x25; |
|
593 cnv->charErrorBuffer[2] = 0x42; |
|
594 |
|
595 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
|
596 /* initialize the state variables */ |
|
597 uprv_strcpy(myConverterData->name,"ISO_2022"); |
|
598 #else |
|
599 *errorCode = U_UNSUPPORTED_ERROR; |
|
600 return; |
|
601 #endif |
|
602 } |
|
603 |
|
604 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
|
605 |
|
606 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
|
607 _ISO2022Close(cnv); |
|
608 } |
|
609 } else { |
|
610 *errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
611 } |
|
612 } |
|
613 |
|
614 |
|
615 static void |
|
616 _ISO2022Close(UConverter *converter) { |
|
617 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
|
618 UConverterSharedData **array = myData->myConverterArray; |
|
619 int32_t i; |
|
620 |
|
621 if (converter->extraInfo != NULL) { |
|
622 /*close the array of converter pointers and free the memory*/ |
|
623 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
|
624 if(array[i]!=NULL) { |
|
625 ucnv_unloadSharedDataIfReady(array[i]); |
|
626 } |
|
627 } |
|
628 |
|
629 ucnv_close(myData->currentConverter); |
|
630 |
|
631 if(!converter->isExtraLocal){ |
|
632 uprv_free (converter->extraInfo); |
|
633 converter->extraInfo = NULL; |
|
634 } |
|
635 } |
|
636 } |
|
637 |
|
638 static void |
|
639 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
|
640 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); |
|
641 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
642 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
|
643 myConverterData->key = 0; |
|
644 myConverterData->isEmptySegment = FALSE; |
|
645 } |
|
646 if(choice!=UCNV_RESET_TO_UNICODE) { |
|
647 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
|
648 } |
|
649 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
650 if(myConverterData->locale[0] == 0){ |
|
651 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
652 myConverterData->isFirstBuffer = TRUE; |
|
653 myConverterData->key = 0; |
|
654 if (converter->mode == UCNV_SO){ |
|
655 ucnv_close (myConverterData->currentConverter); |
|
656 myConverterData->currentConverter=NULL; |
|
657 } |
|
658 converter->mode = UCNV_SI; |
|
659 } |
|
660 if(choice!=UCNV_RESET_TO_UNICODE) { |
|
661 /* re-append UTF-8 escape sequence */ |
|
662 converter->charErrorBufferLength = 3; |
|
663 converter->charErrorBuffer[0] = 0x1b; |
|
664 converter->charErrorBuffer[1] = 0x28; |
|
665 converter->charErrorBuffer[2] = 0x42; |
|
666 } |
|
667 } |
|
668 else |
|
669 #endif |
|
670 { |
|
671 /* reset the state variables */ |
|
672 if(myConverterData->locale[0] == 'k'){ |
|
673 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
674 setInitialStateToUnicodeKR(converter, myConverterData); |
|
675 } |
|
676 if(choice!=UCNV_RESET_TO_UNICODE) { |
|
677 setInitialStateFromUnicodeKR(converter, myConverterData); |
|
678 } |
|
679 } |
|
680 } |
|
681 } |
|
682 |
|
683 static const char* |
|
684 _ISO2022getName(const UConverter* cnv){ |
|
685 if(cnv->extraInfo){ |
|
686 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
|
687 return myData->name; |
|
688 } |
|
689 return NULL; |
|
690 } |
|
691 |
|
692 |
|
693 /*************** to unicode *******************/ |
|
694 /**************************************************************************** |
|
695 * Recognized escape sequences are |
|
696 * <ESC>(B ASCII |
|
697 * <ESC>.A ISO-8859-1 |
|
698 * <ESC>.F ISO-8859-7 |
|
699 * <ESC>(J JISX-201 |
|
700 * <ESC>(I JISX-201 |
|
701 * <ESC>$B JISX-208 |
|
702 * <ESC>$@ JISX-208 |
|
703 * <ESC>$(D JISX-212 |
|
704 * <ESC>$A GB2312 |
|
705 * <ESC>$(C KSC5601 |
|
706 */ |
|
707 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
|
708 /* 0 1 2 3 4 5 6 7 8 9 */ |
|
709 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
710 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE |
|
711 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
712 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE |
|
713 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
714 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
715 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
716 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
717 }; |
|
718 |
|
719 /*************** to unicode *******************/ |
|
720 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
|
721 /* 0 1 2 3 4 5 6 7 8 9 */ |
|
722 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
724 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
725 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
726 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
|
727 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
728 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
729 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
|
730 }; |
|
731 |
|
732 |
|
733 static UCNV_TableStates_2022 |
|
734 getKey_2022(char c,int32_t* key,int32_t* offset){ |
|
735 int32_t togo; |
|
736 int32_t low = 0; |
|
737 int32_t hi = MAX_STATES_2022; |
|
738 int32_t oldmid=0; |
|
739 |
|
740 togo = normalize_esq_chars_2022[(uint8_t)c]; |
|
741 if(togo == 0) { |
|
742 /* not a valid character anywhere in an escape sequence */ |
|
743 *key = 0; |
|
744 *offset = 0; |
|
745 return INVALID_2022; |
|
746 } |
|
747 togo = (*key << 5) + togo; |
|
748 |
|
749 while (hi != low) /*binary search*/{ |
|
750 |
|
751 register int32_t mid = (hi+low) >> 1; /*Finds median*/ |
|
752 |
|
753 if (mid == oldmid) |
|
754 break; |
|
755 |
|
756 if (escSeqStateTable_Key_2022[mid] > togo){ |
|
757 hi = mid; |
|
758 } |
|
759 else if (escSeqStateTable_Key_2022[mid] < togo){ |
|
760 low = mid; |
|
761 } |
|
762 else /*we found it*/{ |
|
763 *key = togo; |
|
764 *offset = mid; |
|
765 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
|
766 } |
|
767 oldmid = mid; |
|
768 |
|
769 } |
|
770 |
|
771 *key = 0; |
|
772 *offset = 0; |
|
773 return INVALID_2022; |
|
774 } |
|
775 |
|
776 /*runs through a state machine to determine the escape sequence - codepage correspondance |
|
777 */ |
|
778 static void |
|
779 changeState_2022(UConverter* _this, |
|
780 const char** source, |
|
781 const char* sourceLimit, |
|
782 Variant2022 var, |
|
783 UErrorCode* err){ |
|
784 UCNV_TableStates_2022 value; |
|
785 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); |
|
786 uint32_t key = myData2022->key; |
|
787 int32_t offset = 0; |
|
788 int8_t initialToULength = _this->toULength; |
|
789 char c; |
|
790 |
|
791 value = VALID_NON_TERMINAL_2022; |
|
792 while (*source < sourceLimit) { |
|
793 c = *(*source)++; |
|
794 _this->toUBytes[_this->toULength++]=(uint8_t)c; |
|
795 value = getKey_2022(c,(int32_t *) &key, &offset); |
|
796 |
|
797 switch (value){ |
|
798 |
|
799 case VALID_NON_TERMINAL_2022 : |
|
800 /* continue with the loop */ |
|
801 break; |
|
802 |
|
803 case VALID_TERMINAL_2022: |
|
804 key = 0; |
|
805 goto DONE; |
|
806 |
|
807 case INVALID_2022: |
|
808 goto DONE; |
|
809 |
|
810 case VALID_MAYBE_TERMINAL_2022: |
|
811 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
812 /* ESC ( B is ambiguous only for ISO_2022 itself */ |
|
813 if(var == ISO_2022) { |
|
814 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ |
|
815 _this->toULength = 0; |
|
816 |
|
817 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
|
818 |
|
819 /* continue with the loop */ |
|
820 value = VALID_NON_TERMINAL_2022; |
|
821 break; |
|
822 } else |
|
823 #endif |
|
824 { |
|
825 /* not ISO_2022 itself, finish here */ |
|
826 value = VALID_TERMINAL_2022; |
|
827 key = 0; |
|
828 goto DONE; |
|
829 } |
|
830 } |
|
831 } |
|
832 |
|
833 DONE: |
|
834 myData2022->key = key; |
|
835 |
|
836 if (value == VALID_NON_TERMINAL_2022) { |
|
837 /* indicate that the escape sequence is incomplete: key!=0 */ |
|
838 return; |
|
839 } else if (value == INVALID_2022 ) { |
|
840 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
841 } else /* value == VALID_TERMINAL_2022 */ { |
|
842 switch(var){ |
|
843 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
844 case ISO_2022: |
|
845 { |
|
846 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; |
|
847 if(chosenConverterName == NULL) { |
|
848 /* SS2 or SS3 */ |
|
849 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
850 _this->toUCallbackReason = UCNV_UNASSIGNED; |
|
851 return; |
|
852 } |
|
853 |
|
854 _this->mode = UCNV_SI; |
|
855 ucnv_close(myData2022->currentConverter); |
|
856 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); |
|
857 if(U_SUCCESS(*err)) { |
|
858 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
|
859 _this->mode = UCNV_SO; |
|
860 } |
|
861 break; |
|
862 } |
|
863 #endif |
|
864 case ISO_2022_JP: |
|
865 { |
|
866 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
|
867 switch(tempState) { |
|
868 case INVALID_STATE: |
|
869 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
870 break; |
|
871 case SS2_STATE: |
|
872 if(myData2022->toU2022State.cs[2]!=0) { |
|
873 if(myData2022->toU2022State.g<2) { |
|
874 myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
|
875 } |
|
876 myData2022->toU2022State.g=2; |
|
877 } else { |
|
878 /* illegal to have SS2 before a matching designator */ |
|
879 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
880 } |
|
881 break; |
|
882 /* case SS3_STATE: not used in ISO-2022-JP-x */ |
|
883 case ISO8859_1: |
|
884 case ISO8859_7: |
|
885 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
|
886 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
887 } else { |
|
888 /* G2 charset for SS2 */ |
|
889 myData2022->toU2022State.cs[2]=(int8_t)tempState; |
|
890 } |
|
891 break; |
|
892 default: |
|
893 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
|
894 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
895 } else { |
|
896 /* G0 charset */ |
|
897 myData2022->toU2022State.cs[0]=(int8_t)tempState; |
|
898 } |
|
899 break; |
|
900 } |
|
901 } |
|
902 break; |
|
903 case ISO_2022_CN: |
|
904 { |
|
905 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
|
906 switch(tempState) { |
|
907 case INVALID_STATE: |
|
908 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
909 break; |
|
910 case SS2_STATE: |
|
911 if(myData2022->toU2022State.cs[2]!=0) { |
|
912 if(myData2022->toU2022State.g<2) { |
|
913 myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
|
914 } |
|
915 myData2022->toU2022State.g=2; |
|
916 } else { |
|
917 /* illegal to have SS2 before a matching designator */ |
|
918 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
919 } |
|
920 break; |
|
921 case SS3_STATE: |
|
922 if(myData2022->toU2022State.cs[3]!=0) { |
|
923 if(myData2022->toU2022State.g<2) { |
|
924 myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
|
925 } |
|
926 myData2022->toU2022State.g=3; |
|
927 } else { |
|
928 /* illegal to have SS3 before a matching designator */ |
|
929 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
930 } |
|
931 break; |
|
932 case ISO_IR_165: |
|
933 if(myData2022->version==0) { |
|
934 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
935 break; |
|
936 } |
|
937 /*fall through*/ |
|
938 case GB2312_1: |
|
939 /*fall through*/ |
|
940 case CNS_11643_1: |
|
941 myData2022->toU2022State.cs[1]=(int8_t)tempState; |
|
942 break; |
|
943 case CNS_11643_2: |
|
944 myData2022->toU2022State.cs[2]=(int8_t)tempState; |
|
945 break; |
|
946 default: |
|
947 /* other CNS 11643 planes */ |
|
948 if(myData2022->version==0) { |
|
949 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
950 } else { |
|
951 myData2022->toU2022State.cs[3]=(int8_t)tempState; |
|
952 } |
|
953 break; |
|
954 } |
|
955 } |
|
956 break; |
|
957 case ISO_2022_KR: |
|
958 if(offset==0x30){ |
|
959 /* nothing to be done, just accept this one escape sequence */ |
|
960 } else { |
|
961 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
|
962 } |
|
963 break; |
|
964 |
|
965 default: |
|
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
967 break; |
|
968 } |
|
969 } |
|
970 if(U_SUCCESS(*err)) { |
|
971 _this->toULength = 0; |
|
972 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
|
973 if(_this->toULength>1) { |
|
974 /* |
|
975 * Ticket 5691: consistent illegal sequences: |
|
976 * - We include at least the first byte (ESC) in the illegal sequence. |
|
977 * - If any of the non-initial bytes could be the start of a character, |
|
978 * we stop the illegal sequence before the first one of those. |
|
979 * In escape sequences, all following bytes are "printable", that is, |
|
980 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), |
|
981 * they are valid single/lead bytes. |
|
982 * For simplicity, we always only report the initial ESC byte as the |
|
983 * illegal sequence and back out all other bytes we looked at. |
|
984 */ |
|
985 /* Back out some bytes. */ |
|
986 int8_t backOutDistance=_this->toULength-1; |
|
987 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
|
988 if(backOutDistance<=bytesFromThisBuffer) { |
|
989 /* same as initialToULength<=1 */ |
|
990 *source-=backOutDistance; |
|
991 } else { |
|
992 /* Back out bytes from the previous buffer: Need to replay them. */ |
|
993 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); |
|
994 /* same as -(initialToULength-1) */ |
|
995 /* preToULength is negative! */ |
|
996 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); |
|
997 *source-=bytesFromThisBuffer; |
|
998 } |
|
999 _this->toULength=1; |
|
1000 } |
|
1001 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
|
1002 _this->toUCallbackReason = UCNV_UNASSIGNED; |
|
1003 } |
|
1004 } |
|
1005 |
|
1006 /*Checks the characters of the buffer against valid 2022 escape sequences |
|
1007 *if the match we return a pointer to the initial start of the sequence otherwise |
|
1008 *we return sourceLimit |
|
1009 */ |
|
1010 /*for 2022 looks ahead in the stream |
|
1011 *to determine the longest possible convertible |
|
1012 *data stream |
|
1013 */ |
|
1014 static inline const char* |
|
1015 getEndOfBuffer_2022(const char** source, |
|
1016 const char* sourceLimit, |
|
1017 UBool /*flush*/){ |
|
1018 |
|
1019 const char* mySource = *source; |
|
1020 |
|
1021 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
1022 if (*source >= sourceLimit) |
|
1023 return sourceLimit; |
|
1024 |
|
1025 do{ |
|
1026 |
|
1027 if (*mySource == ESC_2022){ |
|
1028 int8_t i; |
|
1029 int32_t key = 0; |
|
1030 int32_t offset; |
|
1031 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
|
1032 |
|
1033 /* Kludge: I could not |
|
1034 * figure out the reason for validating an escape sequence |
|
1035 * twice - once here and once in changeState_2022(). |
|
1036 * is it possible to have an ESC character in a ISO2022 |
|
1037 * byte stream which is valid in a code page? Is it legal? |
|
1038 */ |
|
1039 for (i=0; |
|
1040 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
|
1041 i++) { |
|
1042 value = getKey_2022(*(mySource+i), &key, &offset); |
|
1043 } |
|
1044 if (value > 0 || *mySource==ESC_2022) |
|
1045 return mySource; |
|
1046 |
|
1047 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
|
1048 return sourceLimit; |
|
1049 } |
|
1050 }while (++mySource < sourceLimit); |
|
1051 |
|
1052 return sourceLimit; |
|
1053 #else |
|
1054 while(mySource < sourceLimit && *mySource != ESC_2022) { |
|
1055 ++mySource; |
|
1056 } |
|
1057 return mySource; |
|
1058 #endif |
|
1059 } |
|
1060 |
|
1061 |
|
1062 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c |
|
1063 * any future change in _MBCSFromUChar32() function should be reflected here. |
|
1064 * @return number of bytes in *value; negative number if fallback; 0 if no mapping |
|
1065 */ |
|
1066 static inline int32_t |
|
1067 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
|
1068 UChar32 c, |
|
1069 uint32_t* value, |
|
1070 UBool useFallback, |
|
1071 int outputType) |
|
1072 { |
|
1073 const int32_t *cx; |
|
1074 const uint16_t *table; |
|
1075 uint32_t stage2Entry; |
|
1076 uint32_t myValue; |
|
1077 int32_t length; |
|
1078 const uint8_t *p; |
|
1079 /* |
|
1080 * TODO(markus): Use and require new, faster MBCS conversion table structures. |
|
1081 * Use internal version of ucnv_open() that verifies that the new structures are available, |
|
1082 * else U_INTERNAL_PROGRAM_ERROR. |
|
1083 */ |
|
1084 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
|
1085 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
|
1086 table=sharedData->mbcs.fromUnicodeTable; |
|
1087 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
|
1088 /* get the bytes and the length for the output */ |
|
1089 if(outputType==MBCS_OUTPUT_2){ |
|
1090 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
|
1091 if(myValue<=0xff) { |
|
1092 length=1; |
|
1093 } else { |
|
1094 length=2; |
|
1095 } |
|
1096 } else /* outputType==MBCS_OUTPUT_3 */ { |
|
1097 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
|
1098 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
|
1099 if(myValue<=0xff) { |
|
1100 length=1; |
|
1101 } else if(myValue<=0xffff) { |
|
1102 length=2; |
|
1103 } else { |
|
1104 length=3; |
|
1105 } |
|
1106 } |
|
1107 /* is this code point assigned, or do we use fallbacks? */ |
|
1108 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
|
1109 /* assigned */ |
|
1110 *value=myValue; |
|
1111 return length; |
|
1112 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
|
1113 /* |
|
1114 * We allow a 0 byte output if the "assigned" bit is set for this entry. |
|
1115 * There is no way with this data structure for fallback output |
|
1116 * to be a zero byte. |
|
1117 */ |
|
1118 *value=myValue; |
|
1119 return -length; |
|
1120 } |
|
1121 } |
|
1122 |
|
1123 cx=sharedData->mbcs.extIndexes; |
|
1124 if(cx!=NULL) { |
|
1125 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
|
1126 } |
|
1127 |
|
1128 /* unassigned */ |
|
1129 return 0; |
|
1130 } |
|
1131 |
|
1132 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c |
|
1133 * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
|
1134 * @param retval pointer to output byte |
|
1135 * @return 1 roundtrip byte 0 no mapping -1 fallback byte |
|
1136 */ |
|
1137 static inline int32_t |
|
1138 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
|
1139 UChar32 c, |
|
1140 uint32_t* retval, |
|
1141 UBool useFallback) |
|
1142 { |
|
1143 const uint16_t *table; |
|
1144 int32_t value; |
|
1145 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
|
1146 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
|
1147 return 0; |
|
1148 } |
|
1149 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
|
1150 table=sharedData->mbcs.fromUnicodeTable; |
|
1151 /* get the byte for the output */ |
|
1152 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
|
1153 /* is this code point assigned, or do we use fallbacks? */ |
|
1154 *retval=(uint32_t)(value&0xff); |
|
1155 if(value>=0xf00) { |
|
1156 return 1; /* roundtrip */ |
|
1157 } else if(useFallback ? value>=0x800 : value>=0xc00) { |
|
1158 return -1; /* fallback taken */ |
|
1159 } else { |
|
1160 return 0; /* no mapping */ |
|
1161 } |
|
1162 } |
|
1163 |
|
/*
 * Accept only a 2-byte value in which each byte is in the range A1..FE
 * (strict EUC DBCS), and move it to the ISO 2022 range 21..7E by
 * subtracting 0x80 from each byte.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance of the byte pair from A1A1, and of the trail byte from A1 */
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
|
1180 |
|
1181 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ |
|
1182 /* |
|
1183 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the |
|
1184 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point |
|
1185 * unchanged. |
|
1186 */ |
|
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* candidate result: 0x80 added to each of the two bytes */
    const uint32_t shifted = value + 0x8080;

    if( (uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1) ||
        (uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        /* not a pair of A1..FE bytes: hand back the input unchanged */
        return value;
    }
    return shifted;
}
|
1197 #endif |
|
1198 |
|
1199 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
1200 |
|
1201 /********************************************************************************** |
|
1202 * ISO-2022 Converter |
|
1203 * |
|
1204 * |
|
1205 */ |
|
1206 |
|
1207 static void |
|
1208 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
|
1209 UErrorCode* err){ |
|
1210 const char* mySourceLimit, *realSourceLimit; |
|
1211 const char* sourceStart; |
|
1212 const UChar* myTargetStart; |
|
1213 UConverter* saveThis; |
|
1214 UConverterDataISO2022* myData; |
|
1215 int8_t length; |
|
1216 |
|
1217 saveThis = args->converter; |
|
1218 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); |
|
1219 |
|
1220 realSourceLimit = args->sourceLimit; |
|
1221 while (args->source < realSourceLimit) { |
|
1222 if(myData->key == 0) { /* are we in the middle of an escape sequence? */ |
|
1223 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
|
1224 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); |
|
1225 |
|
1226 if(args->source < mySourceLimit) { |
|
1227 if(myData->currentConverter==NULL) { |
|
1228 myData->currentConverter = ucnv_open("ASCII",err); |
|
1229 if(U_FAILURE(*err)){ |
|
1230 return; |
|
1231 } |
|
1232 |
|
1233 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
|
1234 saveThis->mode = UCNV_SO; |
|
1235 } |
|
1236 |
|
1237 /* convert to before the ESC or until the end of the buffer */ |
|
1238 myData->isFirstBuffer=FALSE; |
|
1239 sourceStart = args->source; |
|
1240 myTargetStart = args->target; |
|
1241 args->converter = myData->currentConverter; |
|
1242 ucnv_toUnicode(args->converter, |
|
1243 &args->target, |
|
1244 args->targetLimit, |
|
1245 &args->source, |
|
1246 mySourceLimit, |
|
1247 args->offsets, |
|
1248 (UBool)(args->flush && mySourceLimit == realSourceLimit), |
|
1249 err); |
|
1250 args->converter = saveThis; |
|
1251 |
|
1252 if (*err == U_BUFFER_OVERFLOW_ERROR) { |
|
1253 /* move the overflow buffer */ |
|
1254 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; |
|
1255 myData->currentConverter->UCharErrorBufferLength = 0; |
|
1256 if(length > 0) { |
|
1257 uprv_memcpy(saveThis->UCharErrorBuffer, |
|
1258 myData->currentConverter->UCharErrorBuffer, |
|
1259 length*U_SIZEOF_UCHAR); |
|
1260 } |
|
1261 return; |
|
1262 } |
|
1263 |
|
1264 /* |
|
1265 * At least one of: |
|
1266 * -Error while converting |
|
1267 * -Done with entire buffer |
|
1268 * -Need to write offsets or update the current offset |
|
1269 * (leave that up to the code in ucnv.c) |
|
1270 * |
|
1271 * or else we just stopped at an ESC byte and continue with changeState_2022() |
|
1272 */ |
|
1273 if (U_FAILURE(*err) || |
|
1274 (args->source == realSourceLimit) || |
|
1275 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || |
|
1276 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) |
|
1277 ) { |
|
1278 /* copy partial or error input for truncated detection and error handling */ |
|
1279 if(U_FAILURE(*err)) { |
|
1280 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; |
|
1281 if(length > 0) { |
|
1282 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); |
|
1283 } |
|
1284 } else { |
|
1285 length = saveThis->toULength = myData->currentConverter->toULength; |
|
1286 if(length > 0) { |
|
1287 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); |
|
1288 if(args->source < mySourceLimit) { |
|
1289 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ |
|
1290 } |
|
1291 } |
|
1292 } |
|
1293 return; |
|
1294 } |
|
1295 } |
|
1296 } |
|
1297 |
|
1298 sourceStart = args->source; |
|
1299 changeState_2022(args->converter, |
|
1300 &(args->source), |
|
1301 realSourceLimit, |
|
1302 ISO_2022, |
|
1303 err); |
|
1304 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { |
|
1305 /* let the ucnv.c code update its current offset */ |
|
1306 return; |
|
1307 } |
|
1308 } |
|
1309 } |
|
1310 |
|
1311 #endif |
|
1312 |
|
1313 /* |
|
1314 * To Unicode Callback helper function |
|
1315 */ |
|
1316 static void |
|
1317 toUnicodeCallback(UConverter *cnv, |
|
1318 const uint32_t sourceChar, const uint32_t targetUniChar, |
|
1319 UErrorCode* err){ |
|
1320 if(sourceChar>0xff){ |
|
1321 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
|
1322 cnv->toUBytes[1] = (uint8_t)sourceChar; |
|
1323 cnv->toULength = 2; |
|
1324 } |
|
1325 else{ |
|
1326 cnv->toUBytes[0] =(char) sourceChar; |
|
1327 cnv->toULength = 1; |
|
1328 } |
|
1329 |
|
1330 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
|
1331 *err = U_INVALID_CHAR_FOUND; |
|
1332 } |
|
1333 else{ |
|
1334 *err = U_ILLEGAL_CHAR_FOUND; |
|
1335 } |
|
1336 } |
|
1337 |
|
1338 /**************************************ISO-2022-JP*************************************************/ |
|
1339 |
|
1340 /************************************** IMPORTANT ************************************************** |
|
1341 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and |
|
1342 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
|
1343 * The converter iterates over each Unicode codepoint |
|
1344 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is |
|
1345 * processed one char at a time it would make sense to reduce the extra processing a canned converter |
|
1346 * would do as far as possible. |
|
1347 * |
|
1348 * If the implementation of these macros or structure of sharedData struct change in the future, make |
|
1349 * sure that ISO-2022 is also changed. |
|
1350 *************************************************************************************************** |
|
1351 */ |
|
1352 |
|
1353 /*************************************************************************************************** |
|
1354 * Rules for ISO-2022-jp encoding |
|
1355 * (i) Escape sequences must be fully contained within a line they should not |
|
1356 * span new lines or CRs |
|
1357 * (ii) If the last character on a line is represented by two bytes then an ASCII or |
|
1358 * JIS-Roman character escape sequence should follow before the line terminates |
|
1359 * (iii) If the first character on the line is represented by two bytes then a two |
|
1360 * byte character escape sequence should precede it |
|
1361 * (iv) If no escape sequence is encountered then the characters are ASCII |
|
1362 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, |
|
1363 * and invoked with SS2 (ESC N). |
|
1364 * (vi) If there is any G0 designation in text, there must be a switch to |
|
1365 * ASCII or to JIS X 0201-Roman before a space character (but not |
|
1366 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control |
|
1367 * characters such as tab or CRLF. |
|
1368 * (vii) Supported encodings: |
|
1369 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 |
|
1370 * |
|
1371 * source : RFC-1554 |
|
1372 * |
|
1373 * JISX201, JISX208,JISX212 : new .cnv data files created |
|
1374 * KSC5601 : alias to ibm-949 mapping table |
|
1375 * GB2312 : alias to ibm-1386 mapping table |
|
1376 * ISO-8859-1 : Algorithmic implemented as LATIN1 case |
|
1377 * ISO-8859-7 : alias to ibm-9409 mapping table |
|
1378 */ |
|
1379 |
|
1380 /* preference order of JP charsets */ |
|
1381 static const StateEnum jpCharsetPref[]={ |
|
1382 ASCII, |
|
1383 JISX201, |
|
1384 ISO8859_1, |
|
1385 ISO8859_7, |
|
1386 JISX208, |
|
1387 JISX212, |
|
1388 GB2312, |
|
1389 KSC5601, |
|
1390 HWKANA_7BIT |
|
1391 }; |
|
1392 |
|
/*
 * Designation escape sequences, one per charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B" "(B",    /* <ESC>(B  ASCII       */
    "\x1B" ".A",    /* <ESC>.A  ISO-8859-1  */
    "\x1B" ".F",    /* <ESC>.F  ISO-8859-7  */
    "\x1B" "(J",    /* <ESC>(J  JISX-201    */
    "\x1B" "$B",    /* <ESC>$B  JISX-208    */
    "\x1B" "$(D",   /* <ESC>$(D JISX-212    */
    "\x1B" "$A",    /* <ESC>$A  GB2312      */
    "\x1B" "$(C",   /* <ESC>$(C KSC5601     */
    "\x1B" "(I"     /* <ESC>(I  HWKANA_7BIT */
};
|
/* Byte lengths of the escape sequences, in the same order as escSeqChars[] */
static const int8_t escSeqCharsLen[] ={
    3,  /* <ESC>(B  ASCII       */
    3,  /* <ESC>.A  ISO-8859-1  */
    3,  /* <ESC>.F  ISO-8859-7  */
    3,  /* <ESC>(J  JISX-201    */
    3,  /* <ESC>$B  JISX-208    */
    4,  /* <ESC>$(D JISX-212    */
    3,  /* <ESC>$A  GB2312      */
    4,  /* <ESC>$(C KSC5601     */
    3   /* <ESC>(I  HWKANA_7BIT */
};
|
1420 |
|
1421 /* |
|
1422 * The iteration over various code pages works this way: |
|
1423 * i) Get the currentState from myConverterData->currentState |
|
1424 * ii) Check if the character is mapped to a valid character in the currentState |
|
1425 * Yes -> a) set the initIterState to currentState |
|
1426 * b) remain in this state until an invalid character is found |
|
1427 * No -> a) go to the next code page and find the character |
|
1428 * iii) Before changing the state increment the current state check if the current state |
|
1429 * is equal to the initIterState |
|
1430 * Yes -> A character that cannot be represented in any of the supported encodings |
|
1431 * break and return a U_INVALID_CHARACTER error |
|
1432 * No -> Continue and find the character in next code page |
|
1433 * |
|
1434 * |
|
1435 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages |
|
1436 */ |
|
1437 |
|
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C becomes U+00A5 and 7E becomes U+203E, every other value maps to itself.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
|
1451 |
|
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the places of 5C and 7E, so U+005C and U+007E
 * themselves are unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
|
1466 |
|
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;
    uint32_t result;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* lead byte: undo the Shift-JIS offset, then double it */
    result = value & 0xff00;
    result -= (result <= 0x9f00) ? 0x7000 : 0xb000;  /* else 0xe000 <= lead <= 0xef00 */
    result <<= 1;

    /* trail byte: selects the lower or upper of the two rows of this lead byte */
    if(trail > 0x9e) {
        /* trail <= 0xfc */
        result |= trail - 0x7e;
    } else {
        result -= 0x100;
        if(trail > 0x7e) {
            result |= trail - 0x20;
        } else {
            result |= trail - 0x1f;
        }
    }
    return result;
}
|
1502 |
|
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first (row) byte
 * @param c2    second (cell) byte
 * @param bytes output: the two Shift-JIS bytes; a byte of 0 marks invalid input
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail = c2;

    if((lead & 1) != 0) {
        /* odd row: first half of the Shift-JIS trail byte range */
        lead = (uint8_t)(lead + 1);
        if(trail <= 0x5f) {
            trail = (uint8_t)(trail + 0x1f);
        } else if(trail <= 0x7e) {
            trail = (uint8_t)(trail + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    } else if((uint8_t)(trail-0x21) <= ((0x7e)-0x21)) {
        /* even row: second half of the Shift-JIS trail byte range */
        trail = (uint8_t)(trail + 0x7e);
    } else {
        trail = 0;  /* invalid */
    }

    /* two rows share one Shift-JIS lead byte */
    lead = (uint8_t)(lead >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
|
1539 |
|
1540 /* |
|
1541 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) |
|
1542 * Katakana. |
|
1543 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks |
|
1544 * because Shift-JIS roundtrips half-width Katakana to single bytes. |
|
1545 * These were the only fallbacks in ICU's jisx-208.ucm file. |
|
1546 */ |
|
1547 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { |
|
1548 0x2123, /* U+FF61 */ |
|
1549 0x2156, |
|
1550 0x2157, |
|
1551 0x2122, |
|
1552 0x2126, |
|
1553 0x2572, |
|
1554 0x2521, |
|
1555 0x2523, |
|
1556 0x2525, |
|
1557 0x2527, |
|
1558 0x2529, |
|
1559 0x2563, |
|
1560 0x2565, |
|
1561 0x2567, |
|
1562 0x2543, |
|
1563 0x213C, /* U+FF70 */ |
|
1564 0x2522, |
|
1565 0x2524, |
|
1566 0x2526, |
|
1567 0x2528, |
|
1568 0x252A, |
|
1569 0x252B, |
|
1570 0x252D, |
|
1571 0x252F, |
|
1572 0x2531, |
|
1573 0x2533, |
|
1574 0x2535, |
|
1575 0x2537, |
|
1576 0x2539, |
|
1577 0x253B, |
|
1578 0x253D, |
|
1579 0x253F, /* U+FF80 */ |
|
1580 0x2541, |
|
1581 0x2544, |
|
1582 0x2546, |
|
1583 0x2548, |
|
1584 0x254A, |
|
1585 0x254B, |
|
1586 0x254C, |
|
1587 0x254D, |
|
1588 0x254E, |
|
1589 0x254F, |
|
1590 0x2552, |
|
1591 0x2555, |
|
1592 0x2558, |
|
1593 0x255B, |
|
1594 0x255E, |
|
1595 0x255F, /* U+FF90 */ |
|
1596 0x2560, |
|
1597 0x2561, |
|
1598 0x2562, |
|
1599 0x2564, |
|
1600 0x2566, |
|
1601 0x2568, |
|
1602 0x2569, |
|
1603 0x256A, |
|
1604 0x256B, |
|
1605 0x256C, |
|
1606 0x256D, |
|
1607 0x256F, |
|
1608 0x2573, |
|
1609 0x212B, |
|
1610 0x212C /* U+FF9F */ |
|
1611 }; |
|
1612 |
|
1613 static void |
|
1614 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
|
1615 UConverter *cnv = args->converter; |
|
1616 UConverterDataISO2022 *converterData; |
|
1617 ISO2022State *pFromU2022State; |
|
1618 uint8_t *target = (uint8_t *) args->target; |
|
1619 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
|
1620 const UChar* source = args->source; |
|
1621 const UChar* sourceLimit = args->sourceLimit; |
|
1622 int32_t* offsets = args->offsets; |
|
1623 UChar32 sourceChar; |
|
1624 char buffer[8]; |
|
1625 int32_t len, outLen; |
|
1626 int8_t choices[10]; |
|
1627 int32_t choiceCount; |
|
1628 uint32_t targetValue = 0; |
|
1629 UBool useFallback; |
|
1630 |
|
1631 int32_t i; |
|
1632 int8_t cs, g; |
|
1633 |
|
1634 /* set up the state */ |
|
1635 converterData = (UConverterDataISO2022*)cnv->extraInfo; |
|
1636 pFromU2022State = &converterData->fromU2022State; |
|
1637 |
|
1638 choiceCount = 0; |
|
1639 |
|
1640 /* check if the last codepoint of previous buffer was a lead surrogate*/ |
|
1641 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
|
1642 goto getTrail; |
|
1643 } |
|
1644 |
|
1645 while(source < sourceLimit) { |
|
1646 if(target < targetLimit) { |
|
1647 |
|
1648 sourceChar = *(source++); |
|
1649 /*check if the char is a First surrogate*/ |
|
1650 if(U16_IS_SURROGATE(sourceChar)) { |
|
1651 if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
|
1652 getTrail: |
|
1653 /*look ahead to find the trail surrogate*/ |
|
1654 if(source < sourceLimit) { |
|
1655 /* test the following code unit */ |
|
1656 UChar trail=(UChar) *source; |
|
1657 if(U16_IS_TRAIL(trail)) { |
|
1658 source++; |
|
1659 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
|
1660 cnv->fromUChar32=0x00; |
|
1661 /* convert this supplementary code point */ |
|
1662 /* exit this condition tree */ |
|
1663 } else { |
|
1664 /* this is an unmatched lead code unit (1st surrogate) */ |
|
1665 /* callback(illegal) */ |
|
1666 *err=U_ILLEGAL_CHAR_FOUND; |
|
1667 cnv->fromUChar32=sourceChar; |
|
1668 break; |
|
1669 } |
|
1670 } else { |
|
1671 /* no more input */ |
|
1672 cnv->fromUChar32=sourceChar; |
|
1673 break; |
|
1674 } |
|
1675 } else { |
|
1676 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
1677 /* callback(illegal) */ |
|
1678 *err=U_ILLEGAL_CHAR_FOUND; |
|
1679 cnv->fromUChar32=sourceChar; |
|
1680 break; |
|
1681 } |
|
1682 } |
|
1683 |
|
1684 /* do not convert SO/SI/ESC */ |
|
1685 if(IS_2022_CONTROL(sourceChar)) { |
|
1686 /* callback(illegal) */ |
|
1687 *err=U_ILLEGAL_CHAR_FOUND; |
|
1688 cnv->fromUChar32=sourceChar; |
|
1689 break; |
|
1690 } |
|
1691 |
|
1692 /* do the conversion */ |
|
1693 |
|
1694 if(choiceCount == 0) { |
|
1695 uint16_t csm; |
|
1696 |
|
1697 /* |
|
1698 * The csm variable keeps track of which charsets are allowed |
|
1699 * and not used yet while building the choices[]. |
|
1700 */ |
|
1701 csm = jpCharsetMasks[converterData->version]; |
|
1702 choiceCount = 0; |
|
1703 |
|
1704 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ |
|
1705 if(converterData->version == 3 || converterData->version == 4) { |
|
1706 choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
|
1707 } |
|
1708 /* Do not try single-byte half-width Katakana for other versions. */ |
|
1709 csm &= ~CSM(HWKANA_7BIT); |
|
1710 |
|
1711 /* try the current G0 charset */ |
|
1712 choices[choiceCount++] = cs = pFromU2022State->cs[0]; |
|
1713 csm &= ~CSM(cs); |
|
1714 |
|
1715 /* try the current G2 charset */ |
|
1716 if((cs = pFromU2022State->cs[2]) != 0) { |
|
1717 choices[choiceCount++] = cs; |
|
1718 csm &= ~CSM(cs); |
|
1719 } |
|
1720 |
|
1721 /* try all the other possible charsets */ |
|
1722 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { |
|
1723 cs = (int8_t)jpCharsetPref[i]; |
|
1724 if(CSM(cs) & csm) { |
|
1725 choices[choiceCount++] = cs; |
|
1726 csm &= ~CSM(cs); |
|
1727 } |
|
1728 } |
|
1729 } |
|
1730 |
|
1731 cs = g = 0; |
|
1732 /* |
|
1733 * len==0: no mapping found yet |
|
1734 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
|
1735 * len>0: found a roundtrip result, done |
|
1736 */ |
|
1737 len = 0; |
|
1738 /* |
|
1739 * We will turn off useFallback after finding a fallback, |
|
1740 * but we still get fallbacks from PUA code points as usual. |
|
1741 * Therefore, we will also need to check that we don't overwrite |
|
1742 * an early fallback with a later one. |
|
1743 */ |
|
1744 useFallback = cnv->useFallback; |
|
1745 |
|
1746 for(i = 0; i < choiceCount && len <= 0; ++i) { |
|
1747 uint32_t value; |
|
1748 int32_t len2; |
|
1749 int8_t cs0 = choices[i]; |
|
1750 switch(cs0) { |
|
1751 case ASCII: |
|
1752 if(sourceChar <= 0x7f) { |
|
1753 targetValue = (uint32_t)sourceChar; |
|
1754 len = 1; |
|
1755 cs = cs0; |
|
1756 g = 0; |
|
1757 } |
|
1758 break; |
|
1759 case ISO8859_1: |
|
1760 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
|
1761 targetValue = (uint32_t)sourceChar - 0x80; |
|
1762 len = 1; |
|
1763 cs = cs0; |
|
1764 g = 2; |
|
1765 } |
|
1766 break; |
|
1767 case HWKANA_7BIT: |
|
1768 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
|
1769 if(converterData->version==3) { |
|
1770 /* JIS7: use G1 (SO) */ |
|
1771 /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
|
1772 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); |
|
1773 len = 1; |
|
1774 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ |
|
1775 g = 1; |
|
1776 } else if(converterData->version==4) { |
|
1777 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ |
|
1778 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
|
1779 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); |
|
1780 len = 1; |
|
1781 |
|
1782 cs = pFromU2022State->cs[0]; |
|
1783 if(IS_JP_DBCS(cs)) { |
|
1784 /* switch from a DBCS charset to JISX201 */ |
|
1785 cs = (int8_t)JISX201; |
|
1786 } |
|
1787 /* else stay in the current G0 charset */ |
|
1788 g = 0; |
|
1789 } |
|
1790 /* else do not use HWKANA_7BIT with other versions */ |
|
1791 } |
|
1792 break; |
|
1793 case JISX201: |
|
1794 /* G0 SBCS */ |
|
1795 value = jisx201FromU(sourceChar); |
|
1796 if(value <= 0x7f) { |
|
1797 targetValue = value; |
|
1798 len = 1; |
|
1799 cs = cs0; |
|
1800 g = 0; |
|
1801 useFallback = FALSE; |
|
1802 } |
|
1803 break; |
|
1804 case JISX208: |
|
1805 /* G0 DBCS from Shift-JIS table */ |
|
1806 len2 = MBCS_FROM_UCHAR32_ISO2022( |
|
1807 converterData->myConverterArray[cs0], |
|
1808 sourceChar, &value, |
|
1809 useFallback, MBCS_OUTPUT_2); |
|
1810 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
|
1811 value = _2022FromSJIS(value); |
|
1812 if(value != 0) { |
|
1813 targetValue = value; |
|
1814 len = len2; |
|
1815 cs = cs0; |
|
1816 g = 0; |
|
1817 useFallback = FALSE; |
|
1818 } |
|
1819 } else if(len == 0 && useFallback && |
|
1820 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
|
1821 targetValue = hwkana_fb[sourceChar - HWKANA_START]; |
|
1822 len = -2; |
|
1823 cs = cs0; |
|
1824 g = 0; |
|
1825 useFallback = FALSE; |
|
1826 } |
|
1827 break; |
|
1828 case ISO8859_7: |
|
1829 /* G0 SBCS forced to 7-bit output */ |
|
1830 len2 = MBCS_SINGLE_FROM_UCHAR32( |
|
1831 converterData->myConverterArray[cs0], |
|
1832 sourceChar, &value, |
|
1833 useFallback); |
|
1834 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { |
|
1835 targetValue = value - 0x80; |
|
1836 len = len2; |
|
1837 cs = cs0; |
|
1838 g = 2; |
|
1839 useFallback = FALSE; |
|
1840 } |
|
1841 break; |
|
1842 default: |
|
1843 /* G0 DBCS */ |
|
1844 len2 = MBCS_FROM_UCHAR32_ISO2022( |
|
1845 converterData->myConverterArray[cs0], |
|
1846 sourceChar, &value, |
|
1847 useFallback, MBCS_OUTPUT_2); |
|
1848 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
|
1849 if(cs0 == KSC5601) { |
|
1850 /* |
|
1851 * Check for valid bytes for the encoding scheme. |
|
1852 * This is necessary because the sub-converter (windows-949) |
|
1853 * has a broader encoding scheme than is valid for 2022. |
|
1854 */ |
|
1855 value = _2022FromGR94DBCS(value); |
|
1856 if(value == 0) { |
|
1857 break; |
|
1858 } |
|
1859 } |
|
1860 targetValue = value; |
|
1861 len = len2; |
|
1862 cs = cs0; |
|
1863 g = 0; |
|
1864 useFallback = FALSE; |
|
1865 } |
|
1866 break; |
|
1867 } |
|
1868 } |
|
1869 |
|
1870 if(len != 0) { |
|
1871 if(len < 0) { |
|
1872 len = -len; /* fallback */ |
|
1873 } |
|
1874 outLen = 0; /* count output bytes */ |
|
1875 |
|
1876 /* write SI if necessary (only for JIS7) */ |
|
1877 if(pFromU2022State->g == 1 && g == 0) { |
|
1878 buffer[outLen++] = UCNV_SI; |
|
1879 pFromU2022State->g = 0; |
|
1880 } |
|
1881 |
|
1882 /* write the designation sequence if necessary */ |
|
1883 if(cs != pFromU2022State->cs[g]) { |
|
1884 int32_t escLen = escSeqCharsLen[cs]; |
|
1885 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); |
|
1886 outLen += escLen; |
|
1887 pFromU2022State->cs[g] = cs; |
|
1888 |
|
1889 /* invalidate the choices[] */ |
|
1890 choiceCount = 0; |
|
1891 } |
|
1892 |
|
1893 /* write the shift sequence if necessary */ |
|
1894 if(g != pFromU2022State->g) { |
|
1895 switch(g) { |
|
1896 /* case 0 handled before writing escapes */ |
|
1897 case 1: |
|
1898 buffer[outLen++] = UCNV_SO; |
|
1899 pFromU2022State->g = 1; |
|
1900 break; |
|
1901 default: /* case 2 */ |
|
1902 buffer[outLen++] = 0x1b; |
|
1903 buffer[outLen++] = 0x4e; |
|
1904 break; |
|
1905 /* no case 3: no SS3 in ISO-2022-JP-x */ |
|
1906 } |
|
1907 } |
|
1908 |
|
1909 /* write the output bytes */ |
|
1910 if(len == 1) { |
|
1911 buffer[outLen++] = (char)targetValue; |
|
1912 } else /* len == 2 */ { |
|
1913 buffer[outLen++] = (char)(targetValue >> 8); |
|
1914 buffer[outLen++] = (char)targetValue; |
|
1915 } |
|
1916 } else { |
|
1917 /* |
|
1918 * if we cannot find the character after checking all codepages |
|
1919 * then this is an error |
|
1920 */ |
|
1921 *err = U_INVALID_CHAR_FOUND; |
|
1922 cnv->fromUChar32=sourceChar; |
|
1923 break; |
|
1924 } |
|
1925 |
|
1926 if(sourceChar == CR || sourceChar == LF) { |
|
1927 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ |
|
1928 pFromU2022State->cs[2] = 0; |
|
1929 choiceCount = 0; |
|
1930 } |
|
1931 |
|
1932 /* output outLen>0 bytes in buffer[] */ |
|
1933 if(outLen == 1) { |
|
1934 *target++ = buffer[0]; |
|
1935 if(offsets) { |
|
1936 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
|
1937 } |
|
1938 } else if(outLen == 2 && (target + 2) <= targetLimit) { |
|
1939 *target++ = buffer[0]; |
|
1940 *target++ = buffer[1]; |
|
1941 if(offsets) { |
|
1942 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
|
1943 *offsets++ = sourceIndex; |
|
1944 *offsets++ = sourceIndex; |
|
1945 } |
|
1946 } else { |
|
1947 fromUWriteUInt8( |
|
1948 cnv, |
|
1949 buffer, outLen, |
|
1950 &target, (const char *)targetLimit, |
|
1951 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
|
1952 err); |
|
1953 if(U_FAILURE(*err)) { |
|
1954 break; |
|
1955 } |
|
1956 } |
|
1957 } /* end if(myTargetIndex<myTargetLength) */ |
|
1958 else{ |
|
1959 *err =U_BUFFER_OVERFLOW_ERROR; |
|
1960 break; |
|
1961 } |
|
1962 |
|
1963 }/* end while(mySourceIndex<mySourceLength) */ |
|
1964 |
|
1965 /* |
|
1966 * the end of the input stream and detection of truncated input |
|
1967 * are handled by the framework, but for ISO-2022-JP conversion |
|
1968 * we need to be in ASCII mode at the very end |
|
1969 * |
|
1970 * conditions: |
|
1971 * successful |
|
1972 * in SO mode or not in ASCII mode |
|
1973 * end of input and no truncated input |
|
1974 */ |
|
1975 if( U_SUCCESS(*err) && |
|
1976 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && |
|
1977 args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
|
1978 ) { |
|
1979 int32_t sourceIndex; |
|
1980 |
|
1981 outLen = 0; |
|
1982 |
|
1983 if(pFromU2022State->g != 0) { |
|
1984 buffer[outLen++] = UCNV_SI; |
|
1985 pFromU2022State->g = 0; |
|
1986 } |
|
1987 |
|
1988 if(pFromU2022State->cs[0] != ASCII) { |
|
1989 int32_t escLen = escSeqCharsLen[ASCII]; |
|
1990 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); |
|
1991 outLen += escLen; |
|
1992 pFromU2022State->cs[0] = (int8_t)ASCII; |
|
1993 } |
|
1994 |
|
1995 /* get the source index of the last input character */ |
|
1996 /* |
|
1997 * TODO this would be simpler and more reliable if we used a pair |
|
1998 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
|
1999 * so that we could simply use the prevSourceIndex here; |
|
2000 * this code gives an incorrect result for the rare case of an unmatched |
|
2001 * trail surrogate that is alone in the last buffer of the text stream |
|
2002 */ |
|
2003 sourceIndex=(int32_t)(source-args->source); |
|
2004 if(sourceIndex>0) { |
|
2005 --sourceIndex; |
|
2006 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
|
2007 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
|
2008 ) { |
|
2009 --sourceIndex; |
|
2010 } |
|
2011 } else { |
|
2012 sourceIndex=-1; |
|
2013 } |
|
2014 |
|
2015 fromUWriteUInt8( |
|
2016 cnv, |
|
2017 buffer, outLen, |
|
2018 &target, (const char *)targetLimit, |
|
2019 &offsets, sourceIndex, |
|
2020 err); |
|
2021 } |
|
2022 |
|
2023 /*save the state and return */ |
|
2024 args->source = source; |
|
2025 args->target = (char*)target; |
|
2026 } |
|
2027 |
|
2028 /*************** to unicode *******************/ |
|
2029 |
|
2030 static void |
|
2031 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
|
2032 UErrorCode* err){ |
|
2033 char tempBuf[2]; |
|
2034 const char *mySource = (char *) args->source; |
|
2035 UChar *myTarget = args->target; |
|
2036 const char *mySourceLimit = args->sourceLimit; |
|
2037 uint32_t targetUniChar = 0x0000; |
|
2038 uint32_t mySourceChar = 0x0000; |
|
2039 uint32_t tmpSourceChar = 0x0000; |
|
2040 UConverterDataISO2022* myData; |
|
2041 ISO2022State *pToU2022State; |
|
2042 StateEnum cs; |
|
2043 |
|
2044 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
|
2045 pToU2022State = &myData->toU2022State; |
|
2046 |
|
2047 if(myData->key != 0) { |
|
2048 /* continue with a partial escape sequence */ |
|
2049 goto escape; |
|
2050 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
|
2051 /* continue with a partial double-byte character */ |
|
2052 mySourceChar = args->converter->toUBytes[0]; |
|
2053 args->converter->toULength = 0; |
|
2054 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
|
2055 targetUniChar = missingCharMarker; |
|
2056 goto getTrailByte; |
|
2057 } |
|
2058 |
|
2059 while(mySource < mySourceLimit){ |
|
2060 |
|
2061 targetUniChar =missingCharMarker; |
|
2062 |
|
2063 if(myTarget < args->targetLimit){ |
|
2064 |
|
2065 mySourceChar= (unsigned char) *mySource++; |
|
2066 |
|
2067 switch(mySourceChar) { |
|
2068 case UCNV_SI: |
|
2069 if(myData->version==3) { |
|
2070 pToU2022State->g=0; |
|
2071 continue; |
|
2072 } else { |
|
2073 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
|
2074 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
|
2075 break; |
|
2076 } |
|
2077 |
|
2078 case UCNV_SO: |
|
2079 if(myData->version==3) { |
|
2080 /* JIS7: switch to G1 half-width Katakana */ |
|
2081 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
|
2082 pToU2022State->g=1; |
|
2083 continue; |
|
2084 } else { |
|
2085 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
|
2086 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
|
2087 break; |
|
2088 } |
|
2089 |
|
2090 case ESC_2022: |
|
2091 mySource--; |
|
2092 escape: |
|
2093 { |
|
2094 const char * mySourceBefore = mySource; |
|
2095 int8_t toULengthBefore = args->converter->toULength; |
|
2096 |
|
2097 changeState_2022(args->converter,&(mySource), |
|
2098 mySourceLimit, ISO_2022_JP,err); |
|
2099 |
|
2100 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ |
|
2101 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
|
2102 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
2103 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
|
2104 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
|
2105 } |
|
2106 } |
|
2107 |
|
2108 /* invalid or illegal escape sequence */ |
|
2109 if(U_FAILURE(*err)){ |
|
2110 args->target = myTarget; |
|
2111 args->source = mySource; |
|
2112 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
|
2113 return; |
|
2114 } |
|
2115 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
|
2116 if(myData->key==0) { |
|
2117 myData->isEmptySegment = TRUE; |
|
2118 } |
|
2119 continue; |
|
2120 |
|
2121 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
|
2122 |
|
2123 case CR: |
|
2124 /*falls through*/ |
|
2125 case LF: |
|
2126 /* automatically reset to single-byte mode */ |
|
2127 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { |
|
2128 pToU2022State->cs[0] = (int8_t)ASCII; |
|
2129 } |
|
2130 pToU2022State->cs[2] = 0; |
|
2131 pToU2022State->g = 0; |
|
2132 /* falls through */ |
|
2133 default: |
|
2134 /* convert one or two bytes */ |
|
2135 myData->isEmptySegment = FALSE; |
|
2136 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
|
2137 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && |
|
2138 !IS_JP_DBCS(cs) |
|
2139 ) { |
|
2140 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ |
|
2141 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
|
2142 |
|
2143 /* return from a single-shift state to the previous one */ |
|
2144 if(pToU2022State->g >= 2) { |
|
2145 pToU2022State->g=pToU2022State->prevG; |
|
2146 } |
|
2147 } else switch(cs) { |
|
2148 case ASCII: |
|
2149 if(mySourceChar <= 0x7f) { |
|
2150 targetUniChar = mySourceChar; |
|
2151 } |
|
2152 break; |
|
2153 case ISO8859_1: |
|
2154 if(mySourceChar <= 0x7f) { |
|
2155 targetUniChar = mySourceChar + 0x80; |
|
2156 } |
|
2157 /* return from a single-shift state to the previous one */ |
|
2158 pToU2022State->g=pToU2022State->prevG; |
|
2159 break; |
|
2160 case ISO8859_7: |
|
2161 if(mySourceChar <= 0x7f) { |
|
2162 /* convert mySourceChar+0x80 to use a normal 8-bit table */ |
|
2163 targetUniChar = |
|
2164 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
|
2165 myData->myConverterArray[cs], |
|
2166 mySourceChar + 0x80); |
|
2167 } |
|
2168 /* return from a single-shift state to the previous one */ |
|
2169 pToU2022State->g=pToU2022State->prevG; |
|
2170 break; |
|
2171 case JISX201: |
|
2172 if(mySourceChar <= 0x7f) { |
|
2173 targetUniChar = jisx201ToU(mySourceChar); |
|
2174 } |
|
2175 break; |
|
2176 case HWKANA_7BIT: |
|
2177 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
|
2178 /* 7-bit halfwidth Katakana */ |
|
2179 targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
|
2180 } |
|
2181 break; |
|
2182 default: |
|
2183 /* G0 DBCS */ |
|
2184 if(mySource < mySourceLimit) { |
|
2185 int leadIsOk, trailIsOk; |
|
2186 uint8_t trailByte; |
|
2187 getTrailByte: |
|
2188 trailByte = (uint8_t)*mySource; |
|
2189 /* |
|
2190 * Ticket 5691: consistent illegal sequences: |
|
2191 * - We include at least the first byte in the illegal sequence. |
|
2192 * - If any of the non-initial bytes could be the start of a character, |
|
2193 * we stop the illegal sequence before the first one of those. |
|
2194 * |
|
2195 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
|
2196 * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
|
2197 * Otherwise we convert or report the pair of bytes. |
|
2198 */ |
|
2199 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
|
2200 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
|
2201 if (leadIsOk && trailIsOk) { |
|
2202 ++mySource; |
|
2203 tmpSourceChar = (mySourceChar << 8) | trailByte; |
|
2204 if(cs == JISX208) { |
|
2205 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); |
|
2206 mySourceChar = tmpSourceChar; |
|
2207 } else { |
|
2208 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ |
|
2209 mySourceChar = tmpSourceChar; |
|
2210 if (cs == KSC5601) { |
|
2211 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ |
|
2212 } |
|
2213 tempBuf[0] = (char)(tmpSourceChar >> 8); |
|
2214 tempBuf[1] = (char)(tmpSourceChar); |
|
2215 } |
|
2216 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); |
|
2217 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
|
2218 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
|
2219 ++mySource; |
|
2220 /* add another bit so that the code below writes 2 bytes in case of error */ |
|
2221 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
|
2222 } |
|
2223 } else { |
|
2224 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
|
2225 args->converter->toULength = 1; |
|
2226 goto endloop; |
|
2227 } |
|
2228 } /* End of inner switch */ |
|
2229 break; |
|
2230 } /* End of outer switch */ |
|
2231 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
|
2232 if(args->offsets){ |
|
2233 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
2234 } |
|
2235 *(myTarget++)=(UChar)targetUniChar; |
|
2236 } |
|
2237 else if(targetUniChar > missingCharMarker){ |
|
2238 /* disassemble the surrogate pair and write to output*/ |
|
2239 targetUniChar-=0x0010000; |
|
2240 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
|
2241 if(args->offsets){ |
|
2242 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
2243 } |
|
2244 ++myTarget; |
|
2245 if(myTarget< args->targetLimit){ |
|
2246 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
|
2247 if(args->offsets){ |
|
2248 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
2249 } |
|
2250 ++myTarget; |
|
2251 }else{ |
|
2252 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
|
2253 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
|
2254 } |
|
2255 |
|
2256 } |
|
2257 else{ |
|
2258 /* Call the callback function*/ |
|
2259 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
|
2260 break; |
|
2261 } |
|
2262 } |
|
2263 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
|
2264 *err =U_BUFFER_OVERFLOW_ERROR; |
|
2265 break; |
|
2266 } |
|
2267 } |
|
2268 endloop: |
|
2269 args->target = myTarget; |
|
2270 args->source = mySource; |
|
2271 } |
|
2272 |
|
2273 |
|
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
*      and SI to shift into single byte mode
*/
|
2282 static void |
|
2283 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
|
2284 |
|
2285 UConverter* saveConv = args->converter; |
|
2286 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; |
|
2287 args->converter=myConverterData->currentConverter; |
|
2288 |
|
2289 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; |
|
2290 ucnv_MBCSFromUnicodeWithOffsets(args,err); |
|
2291 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
|
2292 |
|
2293 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
|
2294 if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
|
2295 uprv_memcpy( |
|
2296 saveConv->charErrorBuffer, |
|
2297 myConverterData->currentConverter->charErrorBuffer, |
|
2298 myConverterData->currentConverter->charErrorBufferLength); |
|
2299 } |
|
2300 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
|
2301 myConverterData->currentConverter->charErrorBufferLength = 0; |
|
2302 } |
|
2303 args->converter=saveConv; |
|
2304 } |
|
2305 |
|
2306 static void |
|
2307 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
|
2308 |
|
2309 const UChar *source = args->source; |
|
2310 const UChar *sourceLimit = args->sourceLimit; |
|
2311 unsigned char *target = (unsigned char *) args->target; |
|
2312 unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
|
2313 int32_t* offsets = args->offsets; |
|
2314 uint32_t targetByteUnit = 0x0000; |
|
2315 UChar32 sourceChar = 0x0000; |
|
2316 UBool isTargetByteDBCS; |
|
2317 UBool oldIsTargetByteDBCS; |
|
2318 UConverterDataISO2022 *converterData; |
|
2319 UConverterSharedData* sharedData; |
|
2320 UBool useFallback; |
|
2321 int32_t length =0; |
|
2322 |
|
2323 converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
|
2324 /* if the version is 1 then the user is requesting |
|
2325 * conversion with ibm-25546 pass the arguments to |
|
2326 * MBCS converter and return |
|
2327 */ |
|
2328 if(converterData->version==1){ |
|
2329 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
|
2330 return; |
|
2331 } |
|
2332 |
|
2333 /* initialize data */ |
|
2334 sharedData = converterData->currentConverter->sharedData; |
|
2335 useFallback = args->converter->useFallback; |
|
2336 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; |
|
2337 oldIsTargetByteDBCS = isTargetByteDBCS; |
|
2338 |
|
2339 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
|
2340 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
|
2341 goto getTrail; |
|
2342 } |
|
2343 while(source < sourceLimit){ |
|
2344 |
|
2345 targetByteUnit = missingCharMarker; |
|
2346 |
|
2347 if(target < (unsigned char*) args->targetLimit){ |
|
2348 sourceChar = *source++; |
|
2349 |
|
2350 /* do not convert SO/SI/ESC */ |
|
2351 if(IS_2022_CONTROL(sourceChar)) { |
|
2352 /* callback(illegal) */ |
|
2353 *err=U_ILLEGAL_CHAR_FOUND; |
|
2354 args->converter->fromUChar32=sourceChar; |
|
2355 break; |
|
2356 } |
|
2357 |
|
2358 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
|
2359 if(length < 0) { |
|
2360 length = -length; /* fallback */ |
|
2361 } |
|
2362 /* only DBCS or SBCS characters are expected*/ |
|
2363 /* DB characters with high bit set to 1 are expected */ |
|
2364 if( length > 2 || length==0 || |
|
2365 (length == 1 && targetByteUnit > 0x7f) || |
|
2366 (length == 2 && |
|
2367 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
|
2368 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
|
2369 ) { |
|
2370 targetByteUnit=missingCharMarker; |
|
2371 } |
|
2372 if (targetByteUnit != missingCharMarker){ |
|
2373 |
|
2374 oldIsTargetByteDBCS = isTargetByteDBCS; |
|
2375 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
|
2376 /* append the shift sequence */ |
|
2377 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
|
2378 |
|
2379 if (isTargetByteDBCS) |
|
2380 *target++ = UCNV_SO; |
|
2381 else |
|
2382 *target++ = UCNV_SI; |
|
2383 if(offsets) |
|
2384 *(offsets++) = (int32_t)(source - args->source-1); |
|
2385 } |
|
2386 /* write the targetUniChar to target */ |
|
2387 if(targetByteUnit <= 0x00FF){ |
|
2388 if( target < targetLimit){ |
|
2389 *(target++) = (unsigned char) targetByteUnit; |
|
2390 if(offsets){ |
|
2391 *(offsets++) = (int32_t)(source - args->source-1); |
|
2392 } |
|
2393 |
|
2394 }else{ |
|
2395 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); |
|
2396 *err = U_BUFFER_OVERFLOW_ERROR; |
|
2397 } |
|
2398 }else{ |
|
2399 if(target < targetLimit){ |
|
2400 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); |
|
2401 if(offsets){ |
|
2402 *(offsets++) = (int32_t)(source - args->source-1); |
|
2403 } |
|
2404 if(target < targetLimit){ |
|
2405 *(target++) =(unsigned char) (targetByteUnit -0x80); |
|
2406 if(offsets){ |
|
2407 *(offsets++) = (int32_t)(source - args->source-1); |
|
2408 } |
|
2409 }else{ |
|
2410 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
|
2411 *err = U_BUFFER_OVERFLOW_ERROR; |
|
2412 } |
|
2413 }else{ |
|
2414 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
|
2415 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
|
2416 *err = U_BUFFER_OVERFLOW_ERROR; |
|
2417 } |
|
2418 } |
|
2419 |
|
2420 } |
|
2421 else{ |
|
2422 /* oops.. the code point is unassingned |
|
2423 * set the error and reason |
|
2424 */ |
|
2425 |
|
2426 /*check if the char is a First surrogate*/ |
|
2427 if(U16_IS_SURROGATE(sourceChar)) { |
|
2428 if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
|
2429 getTrail: |
|
2430 /*look ahead to find the trail surrogate*/ |
|
2431 if(source < sourceLimit) { |
|
2432 /* test the following code unit */ |
|
2433 UChar trail=(UChar) *source; |
|
2434 if(U16_IS_TRAIL(trail)) { |
|
2435 source++; |
|
2436 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
|
2437 *err = U_INVALID_CHAR_FOUND; |
|
2438 /* convert this surrogate code point */ |
|
2439 /* exit this condition tree */ |
|
2440 } else { |
|
2441 /* this is an unmatched lead code unit (1st surrogate) */ |
|
2442 /* callback(illegal) */ |
|
2443 *err=U_ILLEGAL_CHAR_FOUND; |
|
2444 } |
|
2445 } else { |
|
2446 /* no more input */ |
|
2447 *err = U_ZERO_ERROR; |
|
2448 } |
|
2449 } else { |
|
2450 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
2451 /* callback(illegal) */ |
|
2452 *err=U_ILLEGAL_CHAR_FOUND; |
|
2453 } |
|
2454 } else { |
|
2455 /* callback(unassigned) for a BMP code point */ |
|
2456 *err = U_INVALID_CHAR_FOUND; |
|
2457 } |
|
2458 |
|
2459 args->converter->fromUChar32=sourceChar; |
|
2460 break; |
|
2461 } |
|
2462 } /* end if(myTargetIndex<myTargetLength) */ |
|
2463 else{ |
|
2464 *err =U_BUFFER_OVERFLOW_ERROR; |
|
2465 break; |
|
2466 } |
|
2467 |
|
2468 }/* end while(mySourceIndex<mySourceLength) */ |
|
2469 |
|
2470 /* |
|
2471 * the end of the input stream and detection of truncated input |
|
2472 * are handled by the framework, but for ISO-2022-KR conversion |
|
2473 * we need to be in ASCII mode at the very end |
|
2474 * |
|
2475 * conditions: |
|
2476 * successful |
|
2477 * not in ASCII mode |
|
2478 * end of input and no truncated input |
|
2479 */ |
|
2480 if( U_SUCCESS(*err) && |
|
2481 isTargetByteDBCS && |
|
2482 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
|
2483 ) { |
|
2484 int32_t sourceIndex; |
|
2485 |
|
2486 /* we are switching to ASCII */ |
|
2487 isTargetByteDBCS=FALSE; |
|
2488 |
|
2489 /* get the source index of the last input character */ |
|
2490 /* |
|
2491 * TODO this would be simpler and more reliable if we used a pair |
|
2492 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
|
2493 * so that we could simply use the prevSourceIndex here; |
|
2494 * this code gives an incorrect result for the rare case of an unmatched |
|
2495 * trail surrogate that is alone in the last buffer of the text stream |
|
2496 */ |
|
2497 sourceIndex=(int32_t)(source-args->source); |
|
2498 if(sourceIndex>0) { |
|
2499 --sourceIndex; |
|
2500 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
|
2501 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
|
2502 ) { |
|
2503 --sourceIndex; |
|
2504 } |
|
2505 } else { |
|
2506 sourceIndex=-1; |
|
2507 } |
|
2508 |
|
2509 fromUWriteUInt8( |
|
2510 args->converter, |
|
2511 SHIFT_IN_STR, 1, |
|
2512 &target, (const char *)targetLimit, |
|
2513 &offsets, sourceIndex, |
|
2514 err); |
|
2515 } |
|
2516 |
|
2517 /*save the state and return */ |
|
2518 args->source = source; |
|
2519 args->target = (char*)target; |
|
2520 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
|
2521 } |
|
2522 |
|
2523 /************************ To Unicode ***************************************/ |
|
2524 |
|
2525 static void |
|
2526 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
|
2527 UErrorCode* err){ |
|
2528 char const* sourceStart; |
|
2529 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
|
2530 |
|
2531 UConverterToUnicodeArgs subArgs; |
|
2532 int32_t minArgsSize; |
|
2533 |
|
2534 /* set up the subconverter arguments */ |
|
2535 if(args->size<sizeof(UConverterToUnicodeArgs)) { |
|
2536 minArgsSize = args->size; |
|
2537 } else { |
|
2538 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
|
2539 } |
|
2540 |
|
2541 uprv_memcpy(&subArgs, args, minArgsSize); |
|
2542 subArgs.size = (uint16_t)minArgsSize; |
|
2543 subArgs.converter = myData->currentConverter; |
|
2544 |
|
2545 /* remember the original start of the input for offsets */ |
|
2546 sourceStart = args->source; |
|
2547 |
|
2548 if(myData->key != 0) { |
|
2549 /* continue with a partial escape sequence */ |
|
2550 goto escape; |
|
2551 } |
|
2552 |
|
2553 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
|
2554 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
|
2555 subArgs.source = args->source; |
|
2556 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); |
|
2557 if(subArgs.source != subArgs.sourceLimit) { |
|
2558 /* |
|
2559 * get the current partial byte sequence |
|
2560 * |
|
2561 * it needs to be moved between the public and the subconverter |
|
2562 * so that the conversion framework, which only sees the public |
|
2563 * converter, can handle truncated and illegal input etc. |
|
2564 */ |
|
2565 if(args->converter->toULength > 0) { |
|
2566 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); |
|
2567 } |
|
2568 subArgs.converter->toULength = args->converter->toULength; |
|
2569 |
|
2570 /* |
|
2571 * Convert up to the end of the input, or to before the next escape character. |
|
2572 * Does not handle conversion extensions because the preToU[] state etc. |
|
2573 * is not copied. |
|
2574 */ |
|
2575 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
|
2576 |
|
2577 if(args->offsets != NULL && sourceStart != args->source) { |
|
2578 /* update offsets to base them on the actual start of the input */ |
|
2579 int32_t *offsets = args->offsets; |
|
2580 UChar *target = args->target; |
|
2581 int32_t delta = (int32_t)(args->source - sourceStart); |
|
2582 while(target < subArgs.target) { |
|
2583 if(*offsets >= 0) { |
|
2584 *offsets += delta; |
|
2585 } |
|
2586 ++offsets; |
|
2587 ++target; |
|
2588 } |
|
2589 } |
|
2590 args->source = subArgs.source; |
|
2591 args->target = subArgs.target; |
|
2592 args->offsets = subArgs.offsets; |
|
2593 |
|
2594 /* copy input/error/overflow buffers */ |
|
2595 if(subArgs.converter->toULength > 0) { |
|
2596 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); |
|
2597 } |
|
2598 args->converter->toULength = subArgs.converter->toULength; |
|
2599 |
|
2600 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
|
2601 if(subArgs.converter->UCharErrorBufferLength > 0) { |
|
2602 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, |
|
2603 subArgs.converter->UCharErrorBufferLength); |
|
2604 } |
|
2605 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; |
|
2606 subArgs.converter->UCharErrorBufferLength = 0; |
|
2607 } |
|
2608 } |
|
2609 |
|
2610 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
|
2611 return; |
|
2612 } |
|
2613 |
|
2614 escape: |
|
2615 changeState_2022(args->converter, |
|
2616 &(args->source), |
|
2617 args->sourceLimit, |
|
2618 ISO_2022_KR, |
|
2619 err); |
|
2620 } |
|
2621 } |
|
2622 |
|
2623 static void |
|
2624 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
|
2625 UErrorCode* err){ |
|
2626 char tempBuf[2]; |
|
2627 const char *mySource = ( char *) args->source; |
|
2628 UChar *myTarget = args->target; |
|
2629 const char *mySourceLimit = args->sourceLimit; |
|
2630 UChar32 targetUniChar = 0x0000; |
|
2631 UChar mySourceChar = 0x0000; |
|
2632 UConverterDataISO2022* myData; |
|
2633 UConverterSharedData* sharedData ; |
|
2634 UBool useFallback; |
|
2635 |
|
2636 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
|
2637 if(myData->version==1){ |
|
2638 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
|
2639 return; |
|
2640 } |
|
2641 |
|
2642 /* initialize state */ |
|
2643 sharedData = myData->currentConverter->sharedData; |
|
2644 useFallback = args->converter->useFallback; |
|
2645 |
|
2646 if(myData->key != 0) { |
|
2647 /* continue with a partial escape sequence */ |
|
2648 goto escape; |
|
2649 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
|
2650 /* continue with a partial double-byte character */ |
|
2651 mySourceChar = args->converter->toUBytes[0]; |
|
2652 args->converter->toULength = 0; |
|
2653 goto getTrailByte; |
|
2654 } |
|
2655 |
|
2656 while(mySource< mySourceLimit){ |
|
2657 |
|
2658 if(myTarget < args->targetLimit){ |
|
2659 |
|
2660 mySourceChar= (unsigned char) *mySource++; |
|
2661 |
|
2662 if(mySourceChar==UCNV_SI){ |
|
2663 myData->toU2022State.g = 0; |
|
2664 if (myData->isEmptySegment) { |
|
2665 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
|
2666 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
2667 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
|
2668 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
|
2669 args->converter->toULength = 1; |
|
2670 args->target = myTarget; |
|
2671 args->source = mySource; |
|
2672 return; |
|
2673 } |
|
2674 /*consume the source */ |
|
2675 continue; |
|
2676 }else if(mySourceChar==UCNV_SO){ |
|
2677 myData->toU2022State.g = 1; |
|
2678 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
|
2679 /*consume the source */ |
|
2680 continue; |
|
2681 }else if(mySourceChar==ESC_2022){ |
|
2682 mySource--; |
|
2683 escape: |
|
2684 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
|
2685 changeState_2022(args->converter,&(mySource), |
|
2686 mySourceLimit, ISO_2022_KR, err); |
|
2687 if(U_FAILURE(*err)){ |
|
2688 args->target = myTarget; |
|
2689 args->source = mySource; |
|
2690 return; |
|
2691 } |
|
2692 continue; |
|
2693 } |
|
2694 |
|
2695 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ |
|
2696 if(myData->toU2022State.g == 1) { |
|
2697 if(mySource < mySourceLimit) { |
|
2698 int leadIsOk, trailIsOk; |
|
2699 uint8_t trailByte; |
|
2700 getTrailByte: |
|
2701 targetUniChar = missingCharMarker; |
|
2702 trailByte = (uint8_t)*mySource; |
|
2703 /* |
|
2704 * Ticket 5691: consistent illegal sequences: |
|
2705 * - We include at least the first byte in the illegal sequence. |
|
2706 * - If any of the non-initial bytes could be the start of a character, |
|
2707 * we stop the illegal sequence before the first one of those. |
|
2708 * |
|
2709 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
|
2710 * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
|
2711 * Otherwise we convert or report the pair of bytes. |
|
2712 */ |
|
2713 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
|
2714 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
|
2715 if (leadIsOk && trailIsOk) { |
|
2716 ++mySource; |
|
2717 tempBuf[0] = (char)(mySourceChar + 0x80); |
|
2718 tempBuf[1] = (char)(trailByte + 0x80); |
|
2719 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); |
|
2720 mySourceChar = (mySourceChar << 8) | trailByte; |
|
2721 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
|
2722 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
|
2723 ++mySource; |
|
2724 /* add another bit so that the code below writes 2 bytes in case of error */ |
|
2725 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
|
2726 } |
|
2727 } else { |
|
2728 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
|
2729 args->converter->toULength = 1; |
|
2730 break; |
|
2731 } |
|
2732 } |
|
2733 else if(mySourceChar <= 0x7f) { |
|
2734 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
|
2735 } else { |
|
2736 targetUniChar = 0xffff; |
|
2737 } |
|
2738 if(targetUniChar < 0xfffe){ |
|
2739 if(args->offsets) { |
|
2740 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
2741 } |
|
2742 *(myTarget++)=(UChar)targetUniChar; |
|
2743 } |
|
2744 else { |
|
2745 /* Call the callback function*/ |
|
2746 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
|
2747 break; |
|
2748 } |
|
2749 } |
|
2750 else{ |
|
2751 *err =U_BUFFER_OVERFLOW_ERROR; |
|
2752 break; |
|
2753 } |
|
2754 } |
|
2755 args->target = myTarget; |
|
2756 args->source = mySource; |
|
2757 } |
|
2758 |
|
2759 /*************************** END ISO2022-KR *********************************/ |
|
2760 |
|
2761 /*************************** ISO-2022-CN ********************************* |
|
2762 * |
|
2763 * Rules for ISO-2022-CN Encoding: |
|
2764 * i) The designator sequence must appear once on a line before any instance |
|
2765 * of character set it designates. |
|
2766 * ii) If two lines contain characters from the same character set, both lines |
|
2767 * must include the designator sequence. |
|
2768 * iii) Once the designator sequence is known, a shifting sequence has to be found |
|
2769 * to invoke the shifting |
|
2770 * iv) All lines start in ASCII and end in ASCII. |
|
2771 * v) Four shifting sequences are employed for this purpose: |
|
2772 * |
|
2773 * Sequence ASCII Eq Charsets |
|
2774 * ---------- ------- --------- |
|
2775 * SI <SI> US-ASCII |
|
2776 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
|
2777 * SS2 <ESC>N CNS-11643-1992 Plane 2 |
|
2778 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
|
2779 * |
|
2780 * vi) |
|
2781 * SOdesignator : ESC "$" ")" finalchar_for_SO |
|
2782 * SS2designator : ESC "$" "*" finalchar_for_SS2 |
|
2783 * SS3designator : ESC "$" "+" finalchar_for_SS3 |
|
2784 * |
|
2785 * ESC $ ) A Indicates the bytes following SO are Chinese |
|
2786 * characters as defined in GB 2312-80, until |
|
2787 * another SOdesignation appears |
|
2788 * |
|
2789 * |
|
2790 * ESC $ ) E Indicates the bytes following SO are as defined |
|
2791 * in ISO-IR-165 (for details, see section 2.1), |
|
2792 * until another SOdesignation appears |
|
2793 * |
|
2794 * ESC $ ) G Indicates the bytes following SO are as defined |
|
2795 * in CNS 11643-plane-1, until another |
|
2796 * SOdesignation appears |
|
2797 * |
|
2798 * ESC $ * H Indicates the two bytes immediately following |
|
2799 * SS2 is a Chinese character as defined in CNS |
|
2800 * 11643-plane-2, until another SS2designation |
|
2801 * appears |
|
2802 * (Meaning <ESC>N must precede every 2 byte |
|
2803 * sequence.) |
|
2804 * |
|
2805 * ESC $ + I Indicates the immediate two bytes following SS3 |
|
2806 * is a Chinese character as defined in CNS |
|
2807 * 11643-plane-3, until another SS3designation |
|
2808 * appears |
|
2809 * (Meaning <ESC>O must precede every 2 byte |
|
2810 * sequence.) |
|
2811 * |
|
2812 * ESC $ + J Indicates the immediate two bytes following SS3 |
|
2813 * is a Chinese character as defined in CNS |
|
2814 * 11643-plane-4, until another SS3designation |
|
2815 * appears |
|
2816 * (In English: <ESC>O must precede every 2 byte |
|
2817 * sequence.) |
|
2818 * |
|
2819 * ESC $ + K Indicates the immediate two bytes following SS3 |
|
2820 * is a Chinese character as defined in CNS |
|
2821 * 11643-plane-5, until another SS3designation |
|
2822 * appears |
|
2823 * |
|
2824 * ESC $ + L Indicates the immediate two bytes following SS3 |
|
2825 * is a Chinese character as defined in CNS |
|
2826 * 11643-plane-6, until another SS3designation |
|
2827 * appears |
|
2828 * |
|
2829 * ESC $ + M Indicates the immediate two bytes following SS3 |
|
2830 * is a Chinese character as defined in CNS |
|
2831 * 11643-plane-7, until another SS3designation |
|
2832 * appears |
|
2833 * |
|
2834 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
|
2835 * has its own designation information before any Chinese characters |
|
2836 * appear |
|
2837 * |
|
2838 */ |
|
2839 |
|
2840 /* The following are defined this way to make the strings truly readonly */ |
|
2841 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; |
|
2842 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; |
|
2843 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; |
|
2844 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; |
|
2845 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; |
|
2846 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; |
|
2847 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; |
|
2848 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; |
|
2849 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; |
|
2850 |
|
2851 /********************** ISO2022-CN Data **************************/ |
|
2852 static const char* const escSeqCharsCN[10] ={ |
|
2853 SHIFT_IN_STR, /* 0 ASCII */ |
|
2854 GB_2312_80_STR, /* 1 GB2312_1 */ |
|
2855 ISO_IR_165_STR, /* 2 ISO_IR_165 */ |
|
2856 CNS_11643_1992_Plane_1_STR, |
|
2857 CNS_11643_1992_Plane_2_STR, |
|
2858 CNS_11643_1992_Plane_3_STR, |
|
2859 CNS_11643_1992_Plane_4_STR, |
|
2860 CNS_11643_1992_Plane_5_STR, |
|
2861 CNS_11643_1992_Plane_6_STR, |
|
2862 CNS_11643_1992_Plane_7_STR |
|
2863 }; |
|
2864 |
|
2865 static void |
|
2866 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
|
2867 UConverter *cnv = args->converter; |
|
2868 UConverterDataISO2022 *converterData; |
|
2869 ISO2022State *pFromU2022State; |
|
2870 uint8_t *target = (uint8_t *) args->target; |
|
2871 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
|
2872 const UChar* source = args->source; |
|
2873 const UChar* sourceLimit = args->sourceLimit; |
|
2874 int32_t* offsets = args->offsets; |
|
2875 UChar32 sourceChar; |
|
2876 char buffer[8]; |
|
2877 int32_t len; |
|
2878 int8_t choices[3]; |
|
2879 int32_t choiceCount; |
|
2880 uint32_t targetValue = 0; |
|
2881 UBool useFallback; |
|
2882 |
|
2883 /* set up the state */ |
|
2884 converterData = (UConverterDataISO2022*)cnv->extraInfo; |
|
2885 pFromU2022State = &converterData->fromU2022State; |
|
2886 |
|
2887 choiceCount = 0; |
|
2888 |
|
2889 /* check if the last codepoint of previous buffer was a lead surrogate*/ |
|
2890 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
|
2891 goto getTrail; |
|
2892 } |
|
2893 |
|
2894 while( source < sourceLimit){ |
|
2895 if(target < targetLimit){ |
|
2896 |
|
2897 sourceChar = *(source++); |
|
2898 /*check if the char is a First surrogate*/ |
|
2899 if(U16_IS_SURROGATE(sourceChar)) { |
|
2900 if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
|
2901 getTrail: |
|
2902 /*look ahead to find the trail surrogate*/ |
|
2903 if(source < sourceLimit) { |
|
2904 /* test the following code unit */ |
|
2905 UChar trail=(UChar) *source; |
|
2906 if(U16_IS_TRAIL(trail)) { |
|
2907 source++; |
|
2908 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
|
2909 cnv->fromUChar32=0x00; |
|
2910 /* convert this supplementary code point */ |
|
2911 /* exit this condition tree */ |
|
2912 } else { |
|
2913 /* this is an unmatched lead code unit (1st surrogate) */ |
|
2914 /* callback(illegal) */ |
|
2915 *err=U_ILLEGAL_CHAR_FOUND; |
|
2916 cnv->fromUChar32=sourceChar; |
|
2917 break; |
|
2918 } |
|
2919 } else { |
|
2920 /* no more input */ |
|
2921 cnv->fromUChar32=sourceChar; |
|
2922 break; |
|
2923 } |
|
2924 } else { |
|
2925 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
2926 /* callback(illegal) */ |
|
2927 *err=U_ILLEGAL_CHAR_FOUND; |
|
2928 cnv->fromUChar32=sourceChar; |
|
2929 break; |
|
2930 } |
|
2931 } |
|
2932 |
|
2933 /* do the conversion */ |
|
2934 if(sourceChar <= 0x007f ){ |
|
2935 /* do not convert SO/SI/ESC */ |
|
2936 if(IS_2022_CONTROL(sourceChar)) { |
|
2937 /* callback(illegal) */ |
|
2938 *err=U_ILLEGAL_CHAR_FOUND; |
|
2939 cnv->fromUChar32=sourceChar; |
|
2940 break; |
|
2941 } |
|
2942 |
|
2943 /* US-ASCII */ |
|
2944 if(pFromU2022State->g == 0) { |
|
2945 buffer[0] = (char)sourceChar; |
|
2946 len = 1; |
|
2947 } else { |
|
2948 buffer[0] = UCNV_SI; |
|
2949 buffer[1] = (char)sourceChar; |
|
2950 len = 2; |
|
2951 pFromU2022State->g = 0; |
|
2952 choiceCount = 0; |
|
2953 } |
|
2954 if(sourceChar == CR || sourceChar == LF) { |
|
2955 /* reset the state at the end of a line */ |
|
2956 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); |
|
2957 choiceCount = 0; |
|
2958 } |
|
2959 } |
|
2960 else{ |
|
2961 /* convert U+0080..U+10ffff */ |
|
2962 int32_t i; |
|
2963 int8_t cs, g; |
|
2964 |
|
2965 if(choiceCount == 0) { |
|
2966 /* try the current SO/G1 converter first */ |
|
2967 choices[0] = pFromU2022State->cs[1]; |
|
2968 |
|
2969 /* default to GB2312_1 if none is designated yet */ |
|
2970 if(choices[0] == 0) { |
|
2971 choices[0] = GB2312_1; |
|
2972 } |
|
2973 |
|
2974 if(converterData->version == 0) { |
|
2975 /* ISO-2022-CN */ |
|
2976 |
|
2977 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ |
|
2978 if(choices[0] == GB2312_1) { |
|
2979 choices[1] = (int8_t)CNS_11643_1; |
|
2980 } else { |
|
2981 choices[1] = (int8_t)GB2312_1; |
|
2982 } |
|
2983 |
|
2984 choiceCount = 2; |
|
2985 } else if (converterData->version == 1) { |
|
2986 /* ISO-2022-CN-EXT */ |
|
2987 |
|
2988 /* try one of the other converters */ |
|
2989 switch(choices[0]) { |
|
2990 case GB2312_1: |
|
2991 choices[1] = (int8_t)CNS_11643_1; |
|
2992 choices[2] = (int8_t)ISO_IR_165; |
|
2993 break; |
|
2994 case ISO_IR_165: |
|
2995 choices[1] = (int8_t)GB2312_1; |
|
2996 choices[2] = (int8_t)CNS_11643_1; |
|
2997 break; |
|
2998 default: /* CNS_11643_x */ |
|
2999 choices[1] = (int8_t)GB2312_1; |
|
3000 choices[2] = (int8_t)ISO_IR_165; |
|
3001 break; |
|
3002 } |
|
3003 |
|
3004 choiceCount = 3; |
|
3005 } else { |
|
3006 choices[0] = (int8_t)CNS_11643_1; |
|
3007 choices[1] = (int8_t)GB2312_1; |
|
3008 } |
|
3009 } |
|
3010 |
|
3011 cs = g = 0; |
|
3012 /* |
|
3013 * len==0: no mapping found yet |
|
3014 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
|
3015 * len>0: found a roundtrip result, done |
|
3016 */ |
|
3017 len = 0; |
|
3018 /* |
|
3019 * We will turn off useFallback after finding a fallback, |
|
3020 * but we still get fallbacks from PUA code points as usual. |
|
3021 * Therefore, we will also need to check that we don't overwrite |
|
3022 * an early fallback with a later one. |
|
3023 */ |
|
3024 useFallback = cnv->useFallback; |
|
3025 |
|
3026 for(i = 0; i < choiceCount && len <= 0; ++i) { |
|
3027 int8_t cs0 = choices[i]; |
|
3028 if(cs0 > 0) { |
|
3029 uint32_t value; |
|
3030 int32_t len2; |
|
3031 if(cs0 >= CNS_11643_0) { |
|
3032 len2 = MBCS_FROM_UCHAR32_ISO2022( |
|
3033 converterData->myConverterArray[CNS_11643], |
|
3034 sourceChar, |
|
3035 &value, |
|
3036 useFallback, |
|
3037 MBCS_OUTPUT_3); |
|
3038 if(len2 == 3 || (len2 == -3 && len == 0)) { |
|
3039 targetValue = value; |
|
3040 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); |
|
3041 if(len2 >= 0) { |
|
3042 len = 2; |
|
3043 } else { |
|
3044 len = -2; |
|
3045 useFallback = FALSE; |
|
3046 } |
|
3047 if(cs == CNS_11643_1) { |
|
3048 g = 1; |
|
3049 } else if(cs == CNS_11643_2) { |
|
3050 g = 2; |
|
3051 } else /* plane 3..7 */ if(converterData->version == 1) { |
|
3052 g = 3; |
|
3053 } else { |
|
3054 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ |
|
3055 len = 0; |
|
3056 } |
|
3057 } |
|
3058 } else { |
|
3059 /* GB2312_1 or ISO-IR-165 */ |
|
3060 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); |
|
3061 len2 = MBCS_FROM_UCHAR32_ISO2022( |
|
3062 converterData->myConverterArray[cs0], |
|
3063 sourceChar, |
|
3064 &value, |
|
3065 useFallback, |
|
3066 MBCS_OUTPUT_2); |
|
3067 if(len2 == 2 || (len2 == -2 && len == 0)) { |
|
3068 targetValue = value; |
|
3069 len = len2; |
|
3070 cs = cs0; |
|
3071 g = 1; |
|
3072 useFallback = FALSE; |
|
3073 } |
|
3074 } |
|
3075 } |
|
3076 } |
|
3077 |
|
3078 if(len != 0) { |
|
3079 len = 0; /* count output bytes; it must have been abs(len) == 2 */ |
|
3080 |
|
3081 /* write the designation sequence if necessary */ |
|
3082 if(cs != pFromU2022State->cs[g]) { |
|
3083 if(cs < CNS_11643) { |
|
3084 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); |
|
3085 } else { |
|
3086 U_ASSERT(cs >= CNS_11643_1); |
|
3087 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); |
|
3088 } |
|
3089 len = 4; |
|
3090 pFromU2022State->cs[g] = cs; |
|
3091 if(g == 1) { |
|
3092 /* changing the SO/G1 charset invalidates the choices[] */ |
|
3093 choiceCount = 0; |
|
3094 } |
|
3095 } |
|
3096 |
|
3097 /* write the shift sequence if necessary */ |
|
3098 if(g != pFromU2022State->g) { |
|
3099 switch(g) { |
|
3100 case 1: |
|
3101 buffer[len++] = UCNV_SO; |
|
3102 |
|
3103 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ |
|
3104 pFromU2022State->g = 1; |
|
3105 break; |
|
3106 case 2: |
|
3107 buffer[len++] = 0x1b; |
|
3108 buffer[len++] = 0x4e; |
|
3109 break; |
|
3110 default: /* case 3 */ |
|
3111 buffer[len++] = 0x1b; |
|
3112 buffer[len++] = 0x4f; |
|
3113 break; |
|
3114 } |
|
3115 } |
|
3116 |
|
3117 /* write the two output bytes */ |
|
3118 buffer[len++] = (char)(targetValue >> 8); |
|
3119 buffer[len++] = (char)targetValue; |
|
3120 } else { |
|
3121 /* if we cannot find the character after checking all codepages |
|
3122 * then this is an error |
|
3123 */ |
|
3124 *err = U_INVALID_CHAR_FOUND; |
|
3125 cnv->fromUChar32=sourceChar; |
|
3126 break; |
|
3127 } |
|
3128 } |
|
3129 |
|
3130 /* output len>0 bytes in buffer[] */ |
|
3131 if(len == 1) { |
|
3132 *target++ = buffer[0]; |
|
3133 if(offsets) { |
|
3134 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
|
3135 } |
|
3136 } else if(len == 2 && (target + 2) <= targetLimit) { |
|
3137 *target++ = buffer[0]; |
|
3138 *target++ = buffer[1]; |
|
3139 if(offsets) { |
|
3140 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
|
3141 *offsets++ = sourceIndex; |
|
3142 *offsets++ = sourceIndex; |
|
3143 } |
|
3144 } else { |
|
3145 fromUWriteUInt8( |
|
3146 cnv, |
|
3147 buffer, len, |
|
3148 &target, (const char *)targetLimit, |
|
3149 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
|
3150 err); |
|
3151 if(U_FAILURE(*err)) { |
|
3152 break; |
|
3153 } |
|
3154 } |
|
3155 } /* end if(myTargetIndex<myTargetLength) */ |
|
3156 else{ |
|
3157 *err =U_BUFFER_OVERFLOW_ERROR; |
|
3158 break; |
|
3159 } |
|
3160 |
|
3161 }/* end while(mySourceIndex<mySourceLength) */ |
|
3162 |
|
3163 /* |
|
3164 * the end of the input stream and detection of truncated input |
|
3165 * are handled by the framework, but for ISO-2022-CN conversion |
|
3166 * we need to be in ASCII mode at the very end |
|
3167 * |
|
3168 * conditions: |
|
3169 * successful |
|
3170 * not in ASCII mode |
|
3171 * end of input and no truncated input |
|
3172 */ |
|
3173 if( U_SUCCESS(*err) && |
|
3174 pFromU2022State->g!=0 && |
|
3175 args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
|
3176 ) { |
|
3177 int32_t sourceIndex; |
|
3178 |
|
3179 /* we are switching to ASCII */ |
|
3180 pFromU2022State->g=0; |
|
3181 |
|
3182 /* get the source index of the last input character */ |
|
3183 /* |
|
3184 * TODO this would be simpler and more reliable if we used a pair |
|
3185 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
|
3186 * so that we could simply use the prevSourceIndex here; |
|
3187 * this code gives an incorrect result for the rare case of an unmatched |
|
3188 * trail surrogate that is alone in the last buffer of the text stream |
|
3189 */ |
|
3190 sourceIndex=(int32_t)(source-args->source); |
|
3191 if(sourceIndex>0) { |
|
3192 --sourceIndex; |
|
3193 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
|
3194 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
|
3195 ) { |
|
3196 --sourceIndex; |
|
3197 } |
|
3198 } else { |
|
3199 sourceIndex=-1; |
|
3200 } |
|
3201 |
|
3202 fromUWriteUInt8( |
|
3203 cnv, |
|
3204 SHIFT_IN_STR, 1, |
|
3205 &target, (const char *)targetLimit, |
|
3206 &offsets, sourceIndex, |
|
3207 err); |
|
3208 } |
|
3209 |
|
3210 /*save the state and return */ |
|
3211 args->source = source; |
|
3212 args->target = (char*)target; |
|
3213 } |
|
3214 |
|
3215 |
|
3216 static void |
|
3217 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
|
3218 UErrorCode* err){ |
|
3219 char tempBuf[3]; |
|
3220 const char *mySource = (char *) args->source; |
|
3221 UChar *myTarget = args->target; |
|
3222 const char *mySourceLimit = args->sourceLimit; |
|
3223 uint32_t targetUniChar = 0x0000; |
|
3224 uint32_t mySourceChar = 0x0000; |
|
3225 UConverterDataISO2022* myData; |
|
3226 ISO2022State *pToU2022State; |
|
3227 |
|
3228 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
|
3229 pToU2022State = &myData->toU2022State; |
|
3230 |
|
3231 if(myData->key != 0) { |
|
3232 /* continue with a partial escape sequence */ |
|
3233 goto escape; |
|
3234 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
|
3235 /* continue with a partial double-byte character */ |
|
3236 mySourceChar = args->converter->toUBytes[0]; |
|
3237 args->converter->toULength = 0; |
|
3238 targetUniChar = missingCharMarker; |
|
3239 goto getTrailByte; |
|
3240 } |
|
3241 |
|
3242 while(mySource < mySourceLimit){ |
|
3243 |
|
3244 targetUniChar =missingCharMarker; |
|
3245 |
|
3246 if(myTarget < args->targetLimit){ |
|
3247 |
|
3248 mySourceChar= (unsigned char) *mySource++; |
|
3249 |
|
3250 switch(mySourceChar){ |
|
3251 case UCNV_SI: |
|
3252 pToU2022State->g=0; |
|
3253 if (myData->isEmptySegment) { |
|
3254 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
|
3255 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
3256 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
|
3257 args->converter->toUBytes[0] = mySourceChar; |
|
3258 args->converter->toULength = 1; |
|
3259 args->target = myTarget; |
|
3260 args->source = mySource; |
|
3261 return; |
|
3262 } |
|
3263 continue; |
|
3264 |
|
3265 case UCNV_SO: |
|
3266 if(pToU2022State->cs[1] != 0) { |
|
3267 pToU2022State->g=1; |
|
3268 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
|
3269 continue; |
|
3270 } else { |
|
3271 /* illegal to have SO before a matching designator */ |
|
3272 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ |
|
3273 break; |
|
3274 } |
|
3275 |
|
3276 case ESC_2022: |
|
3277 mySource--; |
|
3278 escape: |
|
3279 { |
|
3280 const char * mySourceBefore = mySource; |
|
3281 int8_t toULengthBefore = args->converter->toULength; |
|
3282 |
|
3283 changeState_2022(args->converter,&(mySource), |
|
3284 mySourceLimit, ISO_2022_CN,err); |
|
3285 |
|
3286 /* After SO there must be at least one character before a designator (designator error handled separately) */ |
|
3287 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
|
3288 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
|
3289 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
|
3290 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
|
3291 } |
|
3292 } |
|
3293 |
|
3294 /* invalid or illegal escape sequence */ |
|
3295 if(U_FAILURE(*err)){ |
|
3296 args->target = myTarget; |
|
3297 args->source = mySource; |
|
3298 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
|
3299 return; |
|
3300 } |
|
3301 continue; |
|
3302 |
|
3303 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ |
|
3304 |
|
3305 case CR: |
|
3306 /*falls through*/ |
|
3307 case LF: |
|
3308 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); |
|
3309 /* falls through */ |
|
3310 default: |
|
3311 /* convert one or two bytes */ |
|
3312 myData->isEmptySegment = FALSE; |
|
3313 if(pToU2022State->g != 0) { |
|
3314 if(mySource < mySourceLimit) { |
|
3315 UConverterSharedData *cnv; |
|
3316 StateEnum tempState; |
|
3317 int32_t tempBufLen; |
|
3318 int leadIsOk, trailIsOk; |
|
3319 uint8_t trailByte; |
|
3320 getTrailByte: |
|
3321 trailByte = (uint8_t)*mySource; |
|
3322 /* |
|
3323 * Ticket 5691: consistent illegal sequences: |
|
3324 * - We include at least the first byte in the illegal sequence. |
|
3325 * - If any of the non-initial bytes could be the start of a character, |
|
3326 * we stop the illegal sequence before the first one of those. |
|
3327 * |
|
3328 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
|
3329 * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
|
3330 * Otherwise we convert or report the pair of bytes. |
|
3331 */ |
|
3332 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
|
3333 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
|
3334 if (leadIsOk && trailIsOk) { |
|
3335 ++mySource; |
|
3336 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
|
3337 if(tempState >= CNS_11643_0) { |
|
3338 cnv = myData->myConverterArray[CNS_11643]; |
|
3339 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); |
|
3340 tempBuf[1] = (char) (mySourceChar); |
|
3341 tempBuf[2] = (char) trailByte; |
|
3342 tempBufLen = 3; |
|
3343 |
|
3344 }else{ |
|
3345 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); |
|
3346 cnv = myData->myConverterArray[tempState]; |
|
3347 tempBuf[0] = (char) (mySourceChar); |
|
3348 tempBuf[1] = (char) trailByte; |
|
3349 tempBufLen = 2; |
|
3350 } |
|
3351 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); |
|
3352 mySourceChar = (mySourceChar << 8) | trailByte; |
|
3353 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
|
3354 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
|
3355 ++mySource; |
|
3356 /* add another bit so that the code below writes 2 bytes in case of error */ |
|
3357 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
|
3358 } |
|
3359 if(pToU2022State->g>=2) { |
|
3360 /* return from a single-shift state to the previous one */ |
|
3361 pToU2022State->g=pToU2022State->prevG; |
|
3362 } |
|
3363 } else { |
|
3364 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
|
3365 args->converter->toULength = 1; |
|
3366 goto endloop; |
|
3367 } |
|
3368 } |
|
3369 else{ |
|
3370 if(mySourceChar <= 0x7f) { |
|
3371 targetUniChar = (UChar) mySourceChar; |
|
3372 } |
|
3373 } |
|
3374 break; |
|
3375 } |
|
3376 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
|
3377 if(args->offsets){ |
|
3378 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
3379 } |
|
3380 *(myTarget++)=(UChar)targetUniChar; |
|
3381 } |
|
3382 else if(targetUniChar > missingCharMarker){ |
|
3383 /* disassemble the surrogate pair and write to output*/ |
|
3384 targetUniChar-=0x0010000; |
|
3385 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
|
3386 if(args->offsets){ |
|
3387 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
3388 } |
|
3389 ++myTarget; |
|
3390 if(myTarget< args->targetLimit){ |
|
3391 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
|
3392 if(args->offsets){ |
|
3393 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
|
3394 } |
|
3395 ++myTarget; |
|
3396 }else{ |
|
3397 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
|
3398 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
|
3399 } |
|
3400 |
|
3401 } |
|
3402 else{ |
|
3403 /* Call the callback function*/ |
|
3404 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
|
3405 break; |
|
3406 } |
|
3407 } |
|
3408 else{ |
|
3409 *err =U_BUFFER_OVERFLOW_ERROR; |
|
3410 break; |
|
3411 } |
|
3412 } |
|
3413 endloop: |
|
3414 args->target = myTarget; |
|
3415 args->source = mySource; |
|
3416 } |
|
3417 |
|
3418 static void |
|
3419 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { |
|
3420 UConverter *cnv = args->converter; |
|
3421 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
|
3422 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; |
|
3423 char *p, *subchar; |
|
3424 char buffer[8]; |
|
3425 int32_t length; |
|
3426 |
|
3427 subchar=(char *)cnv->subChars; |
|
3428 length=cnv->subCharLen; /* assume length==1 for most variants */ |
|
3429 |
|
3430 p = buffer; |
|
3431 switch(myConverterData->locale[0]){ |
|
3432 case 'j': |
|
3433 { |
|
3434 int8_t cs; |
|
3435 |
|
3436 if(pFromU2022State->g == 1) { |
|
3437 /* JIS7: switch from G1 to G0 */ |
|
3438 pFromU2022State->g = 0; |
|
3439 *p++ = UCNV_SI; |
|
3440 } |
|
3441 |
|
3442 cs = pFromU2022State->cs[0]; |
|
3443 if(cs != ASCII && cs != JISX201) { |
|
3444 /* not in ASCII or JIS X 0201: switch to ASCII */ |
|
3445 pFromU2022State->cs[0] = (int8_t)ASCII; |
|
3446 *p++ = '\x1b'; |
|
3447 *p++ = '\x28'; |
|
3448 *p++ = '\x42'; |
|
3449 } |
|
3450 |
|
3451 *p++ = subchar[0]; |
|
3452 break; |
|
3453 } |
|
3454 case 'c': |
|
3455 if(pFromU2022State->g != 0) { |
|
3456 /* not in ASCII mode: switch to ASCII */ |
|
3457 pFromU2022State->g = 0; |
|
3458 *p++ = UCNV_SI; |
|
3459 } |
|
3460 *p++ = subchar[0]; |
|
3461 break; |
|
3462 case 'k': |
|
3463 if(myConverterData->version == 0) { |
|
3464 if(length == 1) { |
|
3465 if((UBool)args->converter->fromUnicodeStatus) { |
|
3466 /* in DBCS mode: switch to SBCS */ |
|
3467 args->converter->fromUnicodeStatus = 0; |
|
3468 *p++ = UCNV_SI; |
|
3469 } |
|
3470 *p++ = subchar[0]; |
|
3471 } else /* length == 2*/ { |
|
3472 if(!(UBool)args->converter->fromUnicodeStatus) { |
|
3473 /* in SBCS mode: switch to DBCS */ |
|
3474 args->converter->fromUnicodeStatus = 1; |
|
3475 *p++ = UCNV_SO; |
|
3476 } |
|
3477 *p++ = subchar[0]; |
|
3478 *p++ = subchar[1]; |
|
3479 } |
|
3480 break; |
|
3481 } else { |
|
3482 /* save the subconverter's substitution string */ |
|
3483 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; |
|
3484 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; |
|
3485 |
|
3486 /* set our substitution string into the subconverter */ |
|
3487 myConverterData->currentConverter->subChars = (uint8_t *)subchar; |
|
3488 myConverterData->currentConverter->subCharLen = (int8_t)length; |
|
3489 |
|
3490 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ |
|
3491 args->converter = myConverterData->currentConverter; |
|
3492 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; |
|
3493 ucnv_cbFromUWriteSub(args, 0, err); |
|
3494 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
|
3495 args->converter = cnv; |
|
3496 |
|
3497 /* restore the subconverter's substitution string */ |
|
3498 myConverterData->currentConverter->subChars = currentSubChars; |
|
3499 myConverterData->currentConverter->subCharLen = currentSubCharLen; |
|
3500 |
|
3501 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
|
3502 if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
|
3503 uprv_memcpy( |
|
3504 cnv->charErrorBuffer, |
|
3505 myConverterData->currentConverter->charErrorBuffer, |
|
3506 myConverterData->currentConverter->charErrorBufferLength); |
|
3507 } |
|
3508 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
|
3509 myConverterData->currentConverter->charErrorBufferLength = 0; |
|
3510 } |
|
3511 return; |
|
3512 } |
|
3513 default: |
|
3514 /* not expected */ |
|
3515 break; |
|
3516 } |
|
3517 ucnv_cbFromUWriteBytes(args, |
|
3518 buffer, (int32_t)(p - buffer), |
|
3519 offsetIndex, err); |
|
3520 } |
|
3521 |
|
3522 /* |
|
3523 * Structure for cloning an ISO 2022 converter into a single memory block. |
|
3524 * ucnv_safeClone() of the converter will align the entire cloneStruct, |
|
3525 * and then ucnv_safeClone() of the sub-converter may additionally align |
|
3526 * currentConverter inside the cloneStruct, for which we need the deadSpace |
|
3527 * after currentConverter. |
|
3528 * This is because UAlignedMemory may be larger than the actually |
|
3529 * necessary alignment size for the platform. |
|
3530 * The other cloneStruct fields will not be moved around, |
|
3531 * and are aligned properly with cloneStruct's alignment. |
|
3532 */ |
|
3533 struct cloneStruct |
|
3534 { |
|
3535 UConverter cnv; |
|
3536 UConverter currentConverter; |
|
3537 UAlignedMemory deadSpace; |
|
3538 UConverterDataISO2022 mydata; |
|
3539 }; |
|
3540 |
|
3541 |
|
3542 static UConverter * |
|
3543 _ISO_2022_SafeClone( |
|
3544 const UConverter *cnv, |
|
3545 void *stackBuffer, |
|
3546 int32_t *pBufferSize, |
|
3547 UErrorCode *status) |
|
3548 { |
|
3549 struct cloneStruct * localClone; |
|
3550 UConverterDataISO2022 *cnvData; |
|
3551 int32_t i, size; |
|
3552 |
|
3553 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ |
|
3554 *pBufferSize = (int32_t)sizeof(struct cloneStruct); |
|
3555 return NULL; |
|
3556 } |
|
3557 |
|
3558 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; |
|
3559 localClone = (struct cloneStruct *)stackBuffer; |
|
3560 |
|
3561 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
|
3562 |
|
3563 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); |
|
3564 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ |
|
3565 localClone->cnv.isExtraLocal = TRUE; |
|
3566 |
|
3567 /* share the subconverters */ |
|
3568 |
|
3569 if(cnvData->currentConverter != NULL) { |
|
3570 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ |
|
3571 localClone->mydata.currentConverter = |
|
3572 ucnv_safeClone(cnvData->currentConverter, |
|
3573 &localClone->currentConverter, |
|
3574 &size, status); |
|
3575 if(U_FAILURE(*status)) { |
|
3576 return NULL; |
|
3577 } |
|
3578 } |
|
3579 |
|
3580 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { |
|
3581 if(cnvData->myConverterArray[i] != NULL) { |
|
3582 ucnv_incrementRefCount(cnvData->myConverterArray[i]); |
|
3583 } |
|
3584 } |
|
3585 |
|
3586 return &localClone->cnv; |
|
3587 } |
|
3588 |
|
3589 static void |
|
3590 _ISO_2022_GetUnicodeSet(const UConverter *cnv, |
|
3591 const USetAdder *sa, |
|
3592 UConverterUnicodeSet which, |
|
3593 UErrorCode *pErrorCode) |
|
3594 { |
|
3595 int32_t i; |
|
3596 UConverterDataISO2022* cnvData; |
|
3597 |
|
3598 if (U_FAILURE(*pErrorCode)) { |
|
3599 return; |
|
3600 } |
|
3601 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
3602 if (cnv->sharedData == &_ISO2022Data) { |
|
3603 /* We use UTF-8 in this case */ |
|
3604 sa->addRange(sa->set, 0, 0xd7FF); |
|
3605 sa->addRange(sa->set, 0xE000, 0x10FFFF); |
|
3606 return; |
|
3607 } |
|
3608 #endif |
|
3609 |
|
3610 cnvData = (UConverterDataISO2022*)cnv->extraInfo; |
|
3611 |
|
3612 /* open a set and initialize it with code points that are algorithmically round-tripped */ |
|
3613 switch(cnvData->locale[0]){ |
|
3614 case 'j': |
|
3615 /* include JIS X 0201 which is hardcoded */ |
|
3616 sa->add(sa->set, 0xa5); |
|
3617 sa->add(sa->set, 0x203e); |
|
3618 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { |
|
3619 /* include Latin-1 for some variants of JP */ |
|
3620 sa->addRange(sa->set, 0, 0xff); |
|
3621 } else { |
|
3622 /* include ASCII for JP */ |
|
3623 sa->addRange(sa->set, 0, 0x7f); |
|
3624 } |
|
3625 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { |
|
3626 /* |
|
3627 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 |
|
3628 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) |
|
3629 * use half-width Katakana. |
|
3630 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) |
|
3631 * half-width Katakana via the ESC ( I sequence. |
|
3632 * However, we only emit (fromUnicode) half-width Katakana according to the |
|
3633 * definition of each variant. |
|
3634 * |
|
3635 * When including fallbacks, |
|
3636 * we need to include half-width Katakana Unicode code points for all JP variants because |
|
3637 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). |
|
3638 */ |
|
3639 /* include half-width Katakana for JP */ |
|
3640 sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
|
3641 } |
|
3642 break; |
|
3643 case 'c': |
|
3644 case 'z': |
|
3645 /* include ASCII for CN */ |
|
3646 sa->addRange(sa->set, 0, 0x7f); |
|
3647 break; |
|
3648 case 'k': |
|
3649 /* there is only one converter for KR, and it is not in the myConverterArray[] */ |
|
3650 cnvData->currentConverter->sharedData->impl->getUnicodeSet( |
|
3651 cnvData->currentConverter, sa, which, pErrorCode); |
|
3652 /* the loop over myConverterArray[] will simply not find another converter */ |
|
3653 break; |
|
3654 default: |
|
3655 break; |
|
3656 } |
|
3657 |
|
3658 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ |
|
3659 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
|
3660 cnvData->version==0 && i==CNS_11643 |
|
3661 ) { |
|
3662 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ |
|
3663 ucnv_MBCSGetUnicodeSetForBytes( |
|
3664 cnvData->myConverterArray[i], |
|
3665 sa, UCNV_ROUNDTRIP_SET, |
|
3666 0, 0x81, 0x82, |
|
3667 pErrorCode); |
|
3668 } |
|
3669 #endif |
|
3670 |
|
3671 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
|
3672 UConverterSetFilter filter; |
|
3673 if(cnvData->myConverterArray[i]!=NULL) { |
|
3674 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
|
3675 cnvData->version==0 && i==CNS_11643 |
|
3676 ) { |
|
3677 /* |
|
3678 * Version-specific for CN: |
|
3679 * CN version 0 does not map CNS planes 3..7 although |
|
3680 * they are all available in the CNS conversion table; |
|
3681 * CN version 1 (-EXT) does map them all. |
|
3682 * The two versions create different Unicode sets. |
|
3683 */ |
|
3684 filter=UCNV_SET_FILTER_2022_CN; |
|
3685 } else if(cnvData->locale[0]=='j' && i==JISX208) { |
|
3686 /* |
|
3687 * Only add code points that map to Shift-JIS codes |
|
3688 * corresponding to JIS X 0208. |
|
3689 */ |
|
3690 filter=UCNV_SET_FILTER_SJIS; |
|
3691 } else if(i==KSC5601) { |
|
3692 /* |
|
3693 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) |
|
3694 * are broader than GR94. |
|
3695 */ |
|
3696 filter=UCNV_SET_FILTER_GR94DBCS; |
|
3697 } else { |
|
3698 filter=UCNV_SET_FILTER_NONE; |
|
3699 } |
|
3700 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); |
|
3701 } |
|
3702 } |
|
3703 |
|
3704 /* |
|
3705 * ISO 2022 converters must not convert SO/SI/ESC despite what |
|
3706 * sub-converters do by themselves. |
|
3707 * Remove these characters from the set. |
|
3708 */ |
|
3709 sa->remove(sa->set, 0x0e); |
|
3710 sa->remove(sa->set, 0x0f); |
|
3711 sa->remove(sa->set, 0x1b); |
|
3712 |
|
3713 /* ISO 2022 converters do not convert C1 controls either */ |
|
3714 sa->removeRange(sa->set, 0x80, 0x9f); |
|
3715 } |
|
3716 |
|
3717 static const UConverterImpl _ISO2022Impl={ |
|
3718 UCNV_ISO_2022, |
|
3719 |
|
3720 NULL, |
|
3721 NULL, |
|
3722 |
|
3723 _ISO2022Open, |
|
3724 _ISO2022Close, |
|
3725 _ISO2022Reset, |
|
3726 |
|
3727 #ifdef U_ENABLE_GENERIC_ISO_2022 |
|
3728 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
|
3729 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
|
3730 ucnv_fromUnicode_UTF8, |
|
3731 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
|
3732 #else |
|
3733 NULL, |
|
3734 NULL, |
|
3735 NULL, |
|
3736 NULL, |
|
3737 #endif |
|
3738 NULL, |
|
3739 |
|
3740 NULL, |
|
3741 _ISO2022getName, |
|
3742 _ISO_2022_WriteSub, |
|
3743 _ISO_2022_SafeClone, |
|
3744 _ISO_2022_GetUnicodeSet, |
|
3745 |
|
3746 NULL, |
|
3747 NULL |
|
3748 }; |
|
3749 static const UConverterStaticData _ISO2022StaticData={ |
|
3750 sizeof(UConverterStaticData), |
|
3751 "ISO_2022", |
|
3752 2022, |
|
3753 UCNV_IBM, |
|
3754 UCNV_ISO_2022, |
|
3755 1, |
|
3756 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
|
3757 { 0x1a, 0, 0, 0 }, |
|
3758 1, |
|
3759 FALSE, |
|
3760 FALSE, |
|
3761 0, |
|
3762 0, |
|
3763 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
3764 }; |
|
3765 const UConverterSharedData _ISO2022Data={ |
|
3766 sizeof(UConverterSharedData), |
|
3767 ~((uint32_t) 0), |
|
3768 NULL, |
|
3769 NULL, |
|
3770 &_ISO2022StaticData, |
|
3771 FALSE, |
|
3772 &_ISO2022Impl, |
|
3773 0, UCNV_MBCS_TABLE_INITIALIZER |
|
3774 }; |
|
3775 |
|
3776 /*************JP****************/ |
|
3777 static const UConverterImpl _ISO2022JPImpl={ |
|
3778 UCNV_ISO_2022, |
|
3779 |
|
3780 NULL, |
|
3781 NULL, |
|
3782 |
|
3783 _ISO2022Open, |
|
3784 _ISO2022Close, |
|
3785 _ISO2022Reset, |
|
3786 |
|
3787 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
|
3788 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
|
3789 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
|
3790 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
|
3791 NULL, |
|
3792 |
|
3793 NULL, |
|
3794 _ISO2022getName, |
|
3795 _ISO_2022_WriteSub, |
|
3796 _ISO_2022_SafeClone, |
|
3797 _ISO_2022_GetUnicodeSet, |
|
3798 |
|
3799 NULL, |
|
3800 NULL |
|
3801 }; |
|
3802 static const UConverterStaticData _ISO2022JPStaticData={ |
|
3803 sizeof(UConverterStaticData), |
|
3804 "ISO_2022_JP", |
|
3805 0, |
|
3806 UCNV_IBM, |
|
3807 UCNV_ISO_2022, |
|
3808 1, |
|
3809 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ |
|
3810 { 0x1a, 0, 0, 0 }, |
|
3811 1, |
|
3812 FALSE, |
|
3813 FALSE, |
|
3814 0, |
|
3815 0, |
|
3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
3817 }; |
|
3818 |
|
3819 namespace { |
|
3820 |
|
3821 const UConverterSharedData _ISO2022JPData={ |
|
3822 sizeof(UConverterSharedData), |
|
3823 ~((uint32_t) 0), |
|
3824 NULL, |
|
3825 NULL, |
|
3826 &_ISO2022JPStaticData, |
|
3827 FALSE, |
|
3828 &_ISO2022JPImpl, |
|
3829 0, UCNV_MBCS_TABLE_INITIALIZER |
|
3830 }; |
|
3831 |
|
3832 } // namespace |
|
3833 |
|
3834 /************* KR ***************/ |
|
3835 static const UConverterImpl _ISO2022KRImpl={ |
|
3836 UCNV_ISO_2022, |
|
3837 |
|
3838 NULL, |
|
3839 NULL, |
|
3840 |
|
3841 _ISO2022Open, |
|
3842 _ISO2022Close, |
|
3843 _ISO2022Reset, |
|
3844 |
|
3845 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
|
3846 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
|
3847 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
|
3848 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
|
3849 NULL, |
|
3850 |
|
3851 NULL, |
|
3852 _ISO2022getName, |
|
3853 _ISO_2022_WriteSub, |
|
3854 _ISO_2022_SafeClone, |
|
3855 _ISO_2022_GetUnicodeSet, |
|
3856 |
|
3857 NULL, |
|
3858 NULL |
|
3859 }; |
|
3860 static const UConverterStaticData _ISO2022KRStaticData={ |
|
3861 sizeof(UConverterStaticData), |
|
3862 "ISO_2022_KR", |
|
3863 0, |
|
3864 UCNV_IBM, |
|
3865 UCNV_ISO_2022, |
|
3866 1, |
|
3867 3, /* max 3 bytes per UChar: SO+DBCS */ |
|
3868 { 0x1a, 0, 0, 0 }, |
|
3869 1, |
|
3870 FALSE, |
|
3871 FALSE, |
|
3872 0, |
|
3873 0, |
|
3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
3875 }; |
|
3876 |
|
3877 namespace { |
|
3878 |
|
3879 const UConverterSharedData _ISO2022KRData={ |
|
3880 sizeof(UConverterSharedData), |
|
3881 ~((uint32_t) 0), |
|
3882 NULL, |
|
3883 NULL, |
|
3884 &_ISO2022KRStaticData, |
|
3885 FALSE, |
|
3886 &_ISO2022KRImpl, |
|
3887 0, UCNV_MBCS_TABLE_INITIALIZER |
|
3888 }; |
|
3889 |
|
3890 } // namespace |
|
3891 |
|
3892 /*************** CN ***************/ |
|
3893 static const UConverterImpl _ISO2022CNImpl={ |
|
3894 |
|
3895 UCNV_ISO_2022, |
|
3896 |
|
3897 NULL, |
|
3898 NULL, |
|
3899 |
|
3900 _ISO2022Open, |
|
3901 _ISO2022Close, |
|
3902 _ISO2022Reset, |
|
3903 |
|
3904 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
|
3905 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
|
3906 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
|
3907 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
|
3908 NULL, |
|
3909 |
|
3910 NULL, |
|
3911 _ISO2022getName, |
|
3912 _ISO_2022_WriteSub, |
|
3913 _ISO_2022_SafeClone, |
|
3914 _ISO_2022_GetUnicodeSet, |
|
3915 |
|
3916 NULL, |
|
3917 NULL |
|
3918 }; |
|
3919 static const UConverterStaticData _ISO2022CNStaticData={ |
|
3920 sizeof(UConverterStaticData), |
|
3921 "ISO_2022_CN", |
|
3922 0, |
|
3923 UCNV_IBM, |
|
3924 UCNV_ISO_2022, |
|
3925 1, |
|
3926 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ |
|
3927 { 0x1a, 0, 0, 0 }, |
|
3928 1, |
|
3929 FALSE, |
|
3930 FALSE, |
|
3931 0, |
|
3932 0, |
|
3933 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
3934 }; |
|
3935 |
|
3936 namespace { |
|
3937 |
|
3938 const UConverterSharedData _ISO2022CNData={ |
|
3939 sizeof(UConverterSharedData), |
|
3940 ~((uint32_t) 0), |
|
3941 NULL, |
|
3942 NULL, |
|
3943 &_ISO2022CNStaticData, |
|
3944 FALSE, |
|
3945 &_ISO2022CNImpl, |
|
3946 0, UCNV_MBCS_TABLE_INITIALIZER |
|
3947 }; |
|
3948 |
|
3949 } // namespace |
|
3950 |
|
3951 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |