1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnvmbcs.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,5661 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2000-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: ucnvmbcs.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2000jul03 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* The current code in this file replaces the previous implementation 1.20 +* of conversion code from multi-byte codepages to Unicode and back. 1.21 +* This implementation supports the following: 1.22 +* - legacy variable-length codepages with up to 4 bytes per character 1.23 +* - all Unicode code points (up to 0x10ffff) 1.24 +* - efficient distinction of unassigned vs. illegal byte sequences 1.25 +* - it is possible in fromUnicode() to directly deal with simple 1.26 +* stateful encodings (used for EBCDIC_STATEFUL) 1.27 +* - it is possible to convert Unicode code points 1.28 +* to a single zero byte (but not as a fallback except for SBCS) 1.29 +* 1.30 +* Remaining limitations in fromUnicode: 1.31 +* - byte sequences must not have leading zero bytes 1.32 +* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 1.33 +* - limitation to up to 4 bytes per character 1.34 +* 1.35 +* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 1.36 +* limitations and adds m:n character mappings and other features. 1.37 +* See ucnv_ext.h for details. 
1.38 +* 1.39 +* Change history: 1.40 +* 1.41 +* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 1.42 +* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 1.43 +* macros to ucnvmbcs.h file 1.44 +*/ 1.45 + 1.46 +#include "unicode/utypes.h" 1.47 + 1.48 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 1.49 + 1.50 +#include "unicode/ucnv.h" 1.51 +#include "unicode/ucnv_cb.h" 1.52 +#include "unicode/udata.h" 1.53 +#include "unicode/uset.h" 1.54 +#include "unicode/utf8.h" 1.55 +#include "unicode/utf16.h" 1.56 +#include "ucnv_bld.h" 1.57 +#include "ucnvmbcs.h" 1.58 +#include "ucnv_ext.h" 1.59 +#include "ucnv_cnv.h" 1.60 +#include "cmemory.h" 1.61 +#include "cstring.h" 1.62 +#include "cmutex.h" 1.63 + 1.64 +/* control optimizations according to the platform */ 1.65 +#define MBCS_UNROLL_SINGLE_TO_BMP 1 1.66 +#define MBCS_UNROLL_SINGLE_FROM_BMP 0 1.67 + 1.68 +/* 1.69 + * _MBCSHeader versions 5.3 & 4.3 1.70 + * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 1.71 + * 1.72 + * This version is optional. Version 5 is used for incompatible data format changes. 1.73 + * makeconv will continue to generate version 4 files if possible. 1.74 + * 1.75 + * Changes from version 4: 1.76 + * 1.77 + * The main difference is an additional _MBCSHeader field with 1.78 + * - the length (number of uint32_t) of the _MBCSHeader 1.79 + * - flags for further incompatible data format changes 1.80 + * - flags for further, backward compatible data format changes 1.81 + * 1.82 + * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 1.83 + * the file and needs to be reconstituted at load time. 1.84 + * This requires a utf8Friendly format with an additional mbcsIndex table for fast 1.85 + * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 1.86 + * (For details about these structures see below, and see ucnvmbcs.h.) 
1.87 + * 1.88 + * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 1.89 + * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 1.90 + * precision markers for all mappings.) 1.91 + * 1.92 + * All fallbacks have been moved to the extension table, leaving only roundtrips in the 1.93 + * omitted data that can be reconstituted from the toUnicode data. 1.94 + * 1.95 + * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 1.96 + * With only roundtrip mappings in the base fromUnicode data, this part is fully 1.97 + * redundant with the mbcsIndex and will be reconstituted from that (also using the 1.98 + * stage 1 table which contains the information about how stage 2 was compacted). 1.99 + * 1.100 + * The rest of the stage 2 table, the part for code points above maxFastUChar, 1.101 + * is stored in the file and will be appended to the reconstituted part. 1.102 + * 1.103 + * The entire fromUBytes array is omitted from the file and will be reconstituted. 1.104 + * This is done by enumerating all toUnicode roundtrip mappings, performing 1.105 + * each mapping (using the stage 1 and reconstituted stage 2 tables) and 1.106 + * writing instead of reading the byte values. 1.107 + * 1.108 + * _MBCSHeader version 4.3 1.109 + * 1.110 + * Change from version 4.2: 1.111 + * - Optional utf8Friendly data structures, with 64-entry stage 3 block 1.112 + * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 1.113 + * files which can be used instead of stages 1 & 2. 1.114 + * Faster lookups for roundtrips from most commonly used characters, 1.115 + * and lookups from UTF-8 byte sequences with a natural bit distribution. 1.116 + * See ucnvmbcs.h for more details. 1.117 + * 1.118 + * Change from version 4.1: 1.119 + * - Added an optional extension table structure at the end of the .cnv file. 
1.120 + * It is present if the upper bits of the header flags field contains a non-zero 1.121 + * byte offset to it. 1.122 + * Files that contain only a conversion table and no base table 1.123 + * use the special outputType MBCS_OUTPUT_EXT_ONLY. 1.124 + * These contain the base table name between the MBCS header and the extension 1.125 + * data. 1.126 + * 1.127 + * Change from version 4.0: 1.128 + * - Replace header.reserved with header.fromUBytesLength so that all 1.129 + * fields in the data have length. 1.130 + * 1.131 + * Changes from version 3 (for performance improvements): 1.132 + * - new bit distribution for state table entries 1.133 + * - reordered action codes 1.134 + * - new data structure for single-byte fromUnicode 1.135 + * + stage 2 only contains indexes 1.136 + * + stage 3 stores 16 bits per character with classification bits 15..8 1.137 + * - no multiplier for stage 1 entries 1.138 + * - stage 2 for non-single-byte codepages contains the index and the flags in 1.139 + * one 32-bit value 1.140 + * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 1.141 + * 1.142 + * For more details about old versions of the MBCS data structure, see 1.143 + * the corresponding versions of this file. 1.144 + * 1.145 + * Converting stateless codepage data ---------------------------------------*** 1.146 + * (or codepage data with simple states) to Unicode. 1.147 + * 1.148 + * Data structure and algorithm for converting from complex legacy codepages 1.149 + * to Unicode. (Designed before 2000-may-22.) 1.150 + * 1.151 + * The basic idea is that the structure of legacy codepages can be described 1.152 + * with state tables. 1.153 + * When reading a byte stream, each input byte causes a state transition. 1.154 + * Some transitions result in the output of a code point, some result in 1.155 + * "unassigned" or "illegal" output. 1.156 + * This is used here for character conversion. 
1.157 + * 1.158 + * The data structure begins with a state table consisting of a row 1.159 + * per state, with 256 entries (columns) per row for each possible input 1.160 + * byte value. 1.161 + * Each entry is 32 bits wide, with two formats distinguished by 1.162 + * the sign bit (bit 31): 1.163 + * 1.164 + * One format for transitional entries (bit 31 not set) for non-final bytes, and 1.165 + * one format for final entries (bit 31 set). 1.166 + * Both formats contain the number of the next state in the same bit 1.167 + * positions. 1.168 + * State 0 is the initial state. 1.169 + * 1.170 + * Most of the time, the offset values of subsequent states are added 1.171 + * up to a scalar value. This value will eventually be the index of 1.172 + * the Unicode code point in a table that follows the state table. 1.173 + * The effect is that the code points for final state table rows 1.174 + * are contiguous. The code points of final state rows follow each other 1.175 + * in the order of the references to those final states by previous 1.176 + * states, etc. 1.177 + * 1.178 + * For some terminal states, the offset is itself the output Unicode 1.179 + * code point (16 bits for a BMP code point or 20 bits for a supplementary 1.180 + * code point (stored as code point minus 0x10000 so that 20 bits are enough). 1.181 + * For others, the code point in the Unicode table is stored with either 1.182 + * one or two code units: one for BMP code points, two for a pair of 1.183 + * surrogates. 1.184 + * All code points for a final state entry take up the same number of code 1.185 + * units, regardless of whether they all actually _use_ the same number 1.186 + * of code units. This is necessary for simple array access. 
1.187 + * 1.188 + * An additional feature comes in with what in ICU is called "fallback" 1.189 + * mappings: 1.190 + * 1.191 + * In addition to round-trippable, precise, 1:1 mappings, there are often 1.192 + * mappings defined between similar, though not the same, characters. 1.193 + * Typically, such mappings occur only in fromUnicode mapping tables because 1.194 + * Unicode has a superset repertoire of most other codepages. However, it 1.195 + * is possible to provide such mappings in the toUnicode tables, too. 1.196 + * In this case, the fallback mappings are partly integrated into the 1.197 + * general state tables because the structure of the encoding includes their 1.198 + * byte sequences. 1.199 + * For final entries in an initial state, fallback mappings are stored in 1.200 + * the entry itself like with roundtrip mappings. 1.201 + * For other final entries, they are stored in the code units table if 1.202 + * the entry is for a pair of code units. 1.203 + * For single-unit results in the code units table, there is no space to 1.204 + * alternatively hold a fallback mapping; in this case, the code unit 1.205 + * is stored as U+fffe (unassigned), and the fallback mapping needs to 1.206 + * be looked up by the scalar offset value in a separate table. 1.207 + * 1.208 + * "Unassigned" state entries really mean "structurally unassigned", 1.209 + * i.e., such a byte sequence will never have a mapping result. 
1.210 + * 1.211 + * The interpretation of the bits in each entry is as follows: 1.212 + * 1.213 + * Bit 31 not set, not a terminal entry ("transitional"): 1.214 + * 30..24 next state 1.215 + * 23..0 offset delta, to be added up 1.216 + * 1.217 + * Bit 31 set, terminal ("final") entry: 1.218 + * 30..24 next state (regardless of action code) 1.219 + * 23..20 action code: 1.220 + * action codes 0 and 1 result in precise-mapping Unicode code points 1.221 + * 0 valid byte sequence 1.222 + * 19..16 not used, 0 1.223 + * 15..0 16-bit Unicode BMP code point 1.224 + * never U+fffe or U+ffff 1.225 + * 1 valid byte sequence 1.226 + * 19..0 20-bit Unicode supplementary code point 1.227 + * never U+fffe or U+ffff 1.228 + * 1.229 + * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 1.230 + * 2 valid byte sequence (fallback) 1.231 + * 19..16 not used, 0 1.232 + * 15..0 16-bit Unicode BMP code point as fallback result 1.233 + * 3 valid byte sequence (fallback) 1.234 + * 19..0 20-bit Unicode supplementary code point as fallback result 1.235 + * 1.236 + * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 1.237 + * depending on the code units they result in 1.238 + * 4 valid byte sequence 1.239 + * 19..9 not used, 0 1.240 + * 8..0 final offset delta 1.241 + * pointing to one 16-bit code unit which may be 1.242 + * fffe unassigned -- look for a fallback for this offset 1.243 + * ffff illegal 1.244 + * 5 valid byte sequence 1.245 + * 19..9 not used, 0 1.246 + * 8..0 final offset delta 1.247 + * pointing to two 16-bit code units 1.248 + * (typically UTF-16 surrogates) 1.249 + * the result depends on the first code unit as follows: 1.250 + * 0000..d7ff roundtrip BMP code point (1st alone) 1.251 + * d800..dbff roundtrip surrogate pair (1st, 2nd) 1.252 + * dc00..dfff fallback surrogate pair (1st-400, 2nd) 1.253 + * e000 roundtrip BMP code point (2nd alone) 1.254 + * e001 fallback BMP code point (2nd alone) 1.255 + * fffe 
unassigned 1.256 + * ffff illegal 1.257 + * (the final offset deltas are at most 255 * 2, 1.258 + * times 2 because of storing code unit pairs) 1.259 + * 1.260 + * 6 unassigned byte sequence 1.261 + * 19..16 not used, 0 1.262 + * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 1.263 + * this does not contain a final offset delta because the main 1.264 + * purpose of this action code is to save scalar offset values; 1.265 + * therefore, fallback values cannot be assigned to byte 1.266 + * sequences that result in this action code 1.267 + * 7 illegal byte sequence 1.268 + * 19..16 not used, 0 1.269 + * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 1.270 + * 8 state change only 1.271 + * 19..0 not used, 0 1.272 + * useful for state changes in simple stateful encodings, 1.273 + * at Shift-In/Shift-Out codes 1.274 + * 1.275 + * 1.276 + * 9..15 reserved for future use 1.277 + * current implementations will only perform a state change 1.278 + * and ignore bits 19..0 1.279 + * 1.280 + * An encoding with contiguous ranges of unassigned byte sequences, like 1.281 + * Shift-JIS and especially EUC-TW, can be stored efficiently by having 1.282 + * at least two states for the trail bytes: 1.283 + * One trail byte state that results in code points, and one that only 1.284 + * has "unassigned" and "illegal" terminal states. 1.285 + * 1.286 + * Note: partly by accident, this data structure supports simple stateful 1.287 + * encodings without any additional logic. 1.288 + * Currently, only simple Shift-In/Shift-Out schemes are handled with 1.289 + * appropriate state tables (especially EBCDIC_STATEFUL!). 
1.290 + * 1.291 + * MBCS version 2 added: 1.292 + * unassigned and illegal action codes have U+fffe and U+ffff 1.293 + * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 1.294 + * 1.295 + * Converting from Unicode to codepage bytes --------------------------------*** 1.296 + * 1.297 + * The conversion data structure for fromUnicode is designed for the known 1.298 + * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 1.299 + * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 1.300 + * a roundtrip mapping. 1.301 + * 1.302 + * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 1.303 + * like in the character properties table. 1.304 + * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 1.305 + * with the resulting bytes is at offsetFromUBytes. 1.306 + * 1.307 + * Beginning with version 4, single-byte codepages have a significantly different 1.308 + * trie compared to other codepages. 1.309 + * In all cases, the entry in stage 1 is directly the index of the block of 1.310 + * 64 entries in stage 2. 1.311 + * 1.312 + * Single-byte lookup: 1.313 + * 1.314 + * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 1.315 + * Stage 3 contains one 16-bit word per result: 1.316 + * Bits 15..8 indicate the kind of result: 1.317 + * f roundtrip result 1.318 + * c fallback result from private-use code point 1.319 + * 8 fallback result from other code points 1.320 + * 0 unassigned 1.321 + * Bits 7..0 contain the codepage byte. A zero byte is always possible. 1.322 + * 1.323 + * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 1.324 + * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 1.325 + * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 1.326 + * ASCII code points can be looked up with a linear array access into stage 3. 
1.327 + * See maxFastUChar and other details in ucnvmbcs.h. 1.328 + * 1.329 + * Multi-byte lookup: 1.330 + * 1.331 + * Stage 2 contains a 32-bit word for each 16-block in stage 3: 1.332 + * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 1.333 + * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 1.334 + * If this test is false, then a non-zero result will be interpreted as 1.335 + * a fallback mapping. 1.336 + * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 1.337 + * 1.338 + * Stage 3 contains 2, 3, or 4 bytes per result. 1.339 + * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 1.340 + * while 3 bytes are stored as bytes in big-endian order. 1.341 + * Leading zero bytes are ignored, and the number of bytes is counted. 1.342 + * A zero byte mapping result is possible as a roundtrip result. 1.343 + * For some output types, the actual result is processed from this; 1.344 + * see ucnv_MBCSFromUnicodeWithOffsets(). 1.345 + * 1.346 + * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 1.347 + * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 1.348 + * 1.349 + * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 1.350 + * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 1.351 + * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 1.352 + * ASCII code points can be looked up with a linear array access into stage 3. 1.353 + * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 1.354 + * 1.355 + * In version 3, stage 2 blocks may overlap by multiples of the multiplier 1.356 + * for compaction. 1.357 + * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 1.358 + * may overlap by any number of entries. 
1.359 + * 1.360 + * MBCS version 2 added: 1.361 + * the converter checks for known output types, which allows 1.362 + * adding new ones without crashing an unaware converter 1.363 + */ 1.364 + 1.365 +static const UConverterImpl _SBCSUTF8Impl; 1.366 +static const UConverterImpl _DBCSUTF8Impl; 1.367 + 1.368 +/* GB 18030 data ------------------------------------------------------------ */ 1.369 + 1.370 +/* helper macros for linear values for GB 18030 four-byte sequences */ 1.371 +#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 1.372 + 1.373 +#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 1.374 + 1.375 +#define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 1.376 + 1.377 +/* 1.378 + * Some ranges of GB 18030 where both the Unicode code points and the 1.379 + * GB four-byte sequences are contiguous and are handled algorithmically by 1.380 + * the special callback functions below. 1.381 + * The values are start & end of Unicode & GB codes. 1.382 + * 1.383 + * Note that single surrogates are not mapped by GB 18030 1.384 + * as of the re-released mapping tables from 2000-nov-30. 
1.385 + */ 1.386 +static const uint32_t 1.387 +gb18030Ranges[14][4]={ 1.388 + {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 1.389 + {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 1.390 + {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 1.391 + {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 1.392 + {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 1.393 + {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 1.394 + {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 1.395 + {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 1.396 + {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 1.397 + {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 1.398 + {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 1.399 + {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 1.400 + {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 1.401 + {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 1.402 +}; 1.403 + 1.404 +/* bit flag for UConverter.options indicating GB 18030 special handling */ 1.405 +#define _MBCS_OPTION_GB18030 0x8000 1.406 + 1.407 +/* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 1.408 +#define _MBCS_OPTION_KEIS 0x01000 1.409 +#define _MBCS_OPTION_JEF 0x02000 1.410 +#define _MBCS_OPTION_JIPS 0x04000 1.411 + 1.412 +#define KEIS_SO_CHAR_1 0x0A 1.413 +#define KEIS_SO_CHAR_2 0x42 1.414 +#define KEIS_SI_CHAR_1 0x0A 1.415 +#define KEIS_SI_CHAR_2 0x41 1.416 + 1.417 +#define JEF_SO_CHAR 0x28 1.418 +#define JEF_SI_CHAR 0x29 1.419 + 1.420 +#define JIPS_SO_CHAR_1 0x1A 1.421 +#define JIPS_SO_CHAR_2 0x70 1.422 +#define JIPS_SI_CHAR_1 0x1A 1.423 +#define JIPS_SI_CHAR_2 0x71 1.424 + 1.425 +enum SISO_Option { 1.426 + SI, 1.427 + SO 1.428 +}; 1.429 +typedef enum SISO_Option SISO_Option; 1.430 + 1.431 +static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 1.432 + int32_t SISOLength = 0; 1.433 + 1.434 + switch (option) 
{ 1.435 + case SI: 1.436 + if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 1.437 + value[0] = KEIS_SI_CHAR_1; 1.438 + value[1] = KEIS_SI_CHAR_2; 1.439 + SISOLength = 2; 1.440 + } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 1.441 + value[0] = JEF_SI_CHAR; 1.442 + SISOLength = 1; 1.443 + } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 1.444 + value[0] = JIPS_SI_CHAR_1; 1.445 + value[1] = JIPS_SI_CHAR_2; 1.446 + SISOLength = 2; 1.447 + } else { 1.448 + value[0] = UCNV_SI; 1.449 + SISOLength = 1; 1.450 + } 1.451 + break; 1.452 + case SO: 1.453 + if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 1.454 + value[0] = KEIS_SO_CHAR_1; 1.455 + value[1] = KEIS_SO_CHAR_2; 1.456 + SISOLength = 2; 1.457 + } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 1.458 + value[0] = JEF_SO_CHAR; 1.459 + SISOLength = 1; 1.460 + } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 1.461 + value[0] = JIPS_SO_CHAR_1; 1.462 + value[1] = JIPS_SO_CHAR_2; 1.463 + SISOLength = 2; 1.464 + } else { 1.465 + value[0] = UCNV_SO; 1.466 + SISOLength = 1; 1.467 + } 1.468 + break; 1.469 + default: 1.470 + /* Should never happen. */ 1.471 + break; 1.472 + } 1.473 + 1.474 + return SISOLength; 1.475 +} 1.476 + 1.477 +/* Miscellaneous ------------------------------------------------------------ */ 1.478 + 1.479 +/** 1.480 + * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 1.481 + * consecutive sequences of bytes, starting from the one encoded in value, 1.482 + * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 1.483 + * Does not currently support m:n mappings or reverse fallbacks. 1.484 + * This function will not be called for sequences of bytes with leading zeros. 
1.485 + * 1.486 + * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 1.487 + * @param value contains 1..4 bytes of the first byte sequence, right-aligned 1.488 + * @param codePoints resulting Unicode code points, or negative if a byte sequence does 1.489 + * not map to anything 1.490 + * @return TRUE to continue enumeration, FALSE to stop 1.491 + */ 1.492 +typedef UBool U_CALLCONV 1.493 +UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 1.494 + 1.495 +/* similar to ucnv_MBCSGetNextUChar() but recursive */ 1.496 +static UBool 1.497 +enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 1.498 + int32_t state, uint32_t offset, 1.499 + uint32_t value, 1.500 + UConverterEnumToUCallback *callback, const void *context, 1.501 + UErrorCode *pErrorCode) { 1.502 + UChar32 codePoints[32]; 1.503 + const int32_t *row; 1.504 + const uint16_t *unicodeCodeUnits; 1.505 + UChar32 anyCodePoints; 1.506 + int32_t b, limit; 1.507 + 1.508 + row=mbcsTable->stateTable[state]; 1.509 + unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 1.510 + 1.511 + value<<=8; 1.512 + anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 1.513 + 1.514 + b=(stateProps[state]&0x38)<<2; 1.515 + if(b==0 && stateProps[state]>=0x40) { 1.516 + /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 1.517 + codePoints[0]=U_SENTINEL; 1.518 + b=1; 1.519 + } 1.520 + limit=((stateProps[state]&7)+1)<<5; 1.521 + while(b<limit) { 1.522 + int32_t entry=row[b]; 1.523 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.524 + int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 1.525 + if(stateProps[nextState]>=0) { 1.526 + /* recurse to a state with non-ignorable actions */ 1.527 + if(!enumToU( 1.528 + mbcsTable, stateProps, nextState, 1.529 + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 1.530 + value|(uint32_t)b, 1.531 + callback, context, 1.532 + pErrorCode)) { 1.533 + return FALSE; 1.534 + } 1.535 + } 1.536 + 
codePoints[b&0x1f]=U_SENTINEL; 1.537 + } else { 1.538 + UChar32 c; 1.539 + int32_t action; 1.540 + 1.541 + /* 1.542 + * An if-else-if chain provides more reliable performance for 1.543 + * the most common cases compared to a switch. 1.544 + */ 1.545 + action=MBCS_ENTRY_FINAL_ACTION(entry); 1.546 + if(action==MBCS_STATE_VALID_DIRECT_16) { 1.547 + /* output BMP code point */ 1.548 + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.549 + } else if(action==MBCS_STATE_VALID_16) { 1.550 + int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.551 + c=unicodeCodeUnits[finalOffset]; 1.552 + if(c<0xfffe) { 1.553 + /* output BMP code point */ 1.554 + } else { 1.555 + c=U_SENTINEL; 1.556 + } 1.557 + } else if(action==MBCS_STATE_VALID_16_PAIR) { 1.558 + int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.559 + c=unicodeCodeUnits[finalOffset++]; 1.560 + if(c<0xd800) { 1.561 + /* output BMP code point below 0xd800 */ 1.562 + } else if(c<=0xdbff) { 1.563 + /* output roundtrip or fallback supplementary code point */ 1.564 + c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 1.565 + } else if(c==0xe000) { 1.566 + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 1.567 + c=unicodeCodeUnits[finalOffset]; 1.568 + } else { 1.569 + c=U_SENTINEL; 1.570 + } 1.571 + } else if(action==MBCS_STATE_VALID_DIRECT_20) { 1.572 + /* output supplementary code point */ 1.573 + c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 1.574 + } else { 1.575 + c=U_SENTINEL; 1.576 + } 1.577 + 1.578 + codePoints[b&0x1f]=c; 1.579 + anyCodePoints&=c; 1.580 + } 1.581 + if(((++b)&0x1f)==0) { 1.582 + if(anyCodePoints>=0) { 1.583 + if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 1.584 + return FALSE; 1.585 + } 1.586 + anyCodePoints=-1; 1.587 + } 1.588 + } 1.589 + } 1.590 + return TRUE; 1.591 +} 1.592 + 1.593 +/* 1.594 + * Only called if stateProps[state]==-1. 
1.595 + * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 1.596 + * MBCS_STATE_CHANGE_ONLY. 1.597 + */ 1.598 +static int8_t 1.599 +getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 1.600 + const int32_t *row; 1.601 + int32_t min, max, entry, nextState; 1.602 + 1.603 + row=stateTable[state]; 1.604 + stateProps[state]=0; 1.605 + 1.606 + /* find first non-ignorable state */ 1.607 + for(min=0;; ++min) { 1.608 + entry=row[min]; 1.609 + nextState=MBCS_ENTRY_STATE(entry); 1.610 + if(stateProps[nextState]==-1) { 1.611 + getStateProp(stateTable, stateProps, nextState); 1.612 + } 1.613 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.614 + if(stateProps[nextState]>=0) { 1.615 + break; 1.616 + } 1.617 + } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 1.618 + break; 1.619 + } 1.620 + if(min==0xff) { 1.621 + stateProps[state]=-0x40; /* (int8_t)0xc0 */ 1.622 + return stateProps[state]; 1.623 + } 1.624 + } 1.625 + stateProps[state]|=(int8_t)((min>>5)<<3); 1.626 + 1.627 + /* find last non-ignorable state */ 1.628 + for(max=0xff; min<max; --max) { 1.629 + entry=row[max]; 1.630 + nextState=MBCS_ENTRY_STATE(entry); 1.631 + if(stateProps[nextState]==-1) { 1.632 + getStateProp(stateTable, stateProps, nextState); 1.633 + } 1.634 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.635 + if(stateProps[nextState]>=0) { 1.636 + break; 1.637 + } 1.638 + } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 1.639 + break; 1.640 + } 1.641 + } 1.642 + stateProps[state]|=(int8_t)(max>>5); 1.643 + 1.644 + /* recurse further and collect direct-state information */ 1.645 + while(min<=max) { 1.646 + entry=row[min]; 1.647 + nextState=MBCS_ENTRY_STATE(entry); 1.648 + if(stateProps[nextState]==-1) { 1.649 + getStateProp(stateTable, stateProps, nextState); 1.650 + } 1.651 + if(MBCS_ENTRY_IS_FINAL(entry)) { 1.652 + stateProps[nextState]|=0x40; 1.653 + if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 1.654 + 
stateProps[state]|=0x40; 1.655 + } 1.656 + } 1.657 + ++min; 1.658 + } 1.659 + return stateProps[state]; 1.660 +} 1.661 + 1.662 +/* 1.663 + * Internal function enumerating the toUnicode data of an MBCS converter. 1.664 + * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 1.665 + * table, but could also be used for a future ucnv_getUnicodeSet() option 1.666 + * that includes reverse fallbacks (after updating this function's implementation). 1.667 + * Currently only handles roundtrip mappings. 1.668 + * Does not currently handle extensions. 1.669 + */ 1.670 +static void 1.671 +ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 1.672 + UConverterEnumToUCallback *callback, const void *context, 1.673 + UErrorCode *pErrorCode) { 1.674 + /* 1.675 + * Properties for each state, to speed up the enumeration. 1.676 + * Ignorable actions are unassigned/illegal/state-change-only: 1.677 + * They do not lead to mappings. 1.678 + * 1.679 + * Bits 7..6: 1.680 + * 1 direct/initial state (stateful converters have multiple) 1.681 + * 0 non-initial state with transitions or with non-ignorable result actions 1.682 + * -1 final state with only ignorable actions 1.683 + * 1.684 + * Bits 5..3: 1.685 + * The lowest byte value with non-ignorable actions is 1.686 + * value<<5 (rounded down). 1.687 + * 1.688 + * Bits 2..0: 1.689 + * The highest byte value with non-ignorable actions is 1.690 + * (value<<5)&0x1f (rounded up). 
1.691 + */ 1.692 + int8_t stateProps[MBCS_MAX_STATE_COUNT]; 1.693 + int32_t state; 1.694 + 1.695 + uprv_memset(stateProps, -1, sizeof(stateProps)); 1.696 + 1.697 + /* recurse from state 0 and set all stateProps */ 1.698 + getStateProp(mbcsTable->stateTable, stateProps, 0); 1.699 + 1.700 + for(state=0; state<mbcsTable->countStates; ++state) { 1.701 + /*if(stateProps[state]==-1) { 1.702 + printf("unused/unreachable <icu:state> %d\n", state); 1.703 + }*/ 1.704 + if(stateProps[state]>=0x40) { 1.705 + /* start from each direct state */ 1.706 + enumToU( 1.707 + mbcsTable, stateProps, state, 0, 0, 1.708 + callback, context, 1.709 + pErrorCode); 1.710 + } 1.711 + } 1.712 +} 1.713 + 1.714 +U_CFUNC void 1.715 +ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 1.716 + const USetAdder *sa, 1.717 + UConverterUnicodeSet which, 1.718 + UConverterSetFilter filter, 1.719 + UErrorCode *pErrorCode) { 1.720 + const UConverterMBCSTable *mbcsTable; 1.721 + const uint16_t *table; 1.722 + 1.723 + uint32_t st3; 1.724 + uint16_t st1, maxStage1, st2; 1.725 + 1.726 + UChar32 c; 1.727 + 1.728 + /* enumerate the from-Unicode trie table */ 1.729 + mbcsTable=&sharedData->mbcs; 1.730 + table=mbcsTable->fromUnicodeTable; 1.731 + if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 1.732 + maxStage1=0x440; 1.733 + } else { 1.734 + maxStage1=0x40; 1.735 + } 1.736 + 1.737 + c=0; /* keep track of the current code point while enumerating */ 1.738 + 1.739 + if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1.740 + const uint16_t *stage2, *stage3, *results; 1.741 + uint16_t minValue; 1.742 + 1.743 + results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 1.744 + 1.745 + /* 1.746 + * Set a threshold variable for selecting which mappings to use. 1.747 + * See ucnv_MBCSSingleFromBMPWithOffsets() and 1.748 + * MBCS_SINGLE_RESULT_FROM_U() for details. 
1.749 + */ 1.750 + if(which==UCNV_ROUNDTRIP_SET) { 1.751 + /* use only roundtrips */ 1.752 + minValue=0xf00; 1.753 + } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 1.754 + /* use all roundtrip and fallback results */ 1.755 + minValue=0x800; 1.756 + } 1.757 + 1.758 + for(st1=0; st1<maxStage1; ++st1) { 1.759 + st2=table[st1]; 1.760 + if(st2>maxStage1) { 1.761 + stage2=table+st2; 1.762 + for(st2=0; st2<64; ++st2) { 1.763 + if((st3=stage2[st2])!=0) { 1.764 + /* read the stage 3 block */ 1.765 + stage3=results+st3; 1.766 + 1.767 + do { 1.768 + if(*stage3++>=minValue) { 1.769 + sa->add(sa->set, c); 1.770 + } 1.771 + } while((++c&0xf)!=0); 1.772 + } else { 1.773 + c+=16; /* empty stage 3 block */ 1.774 + } 1.775 + } 1.776 + } else { 1.777 + c+=1024; /* empty stage 2 block */ 1.778 + } 1.779 + } 1.780 + } else { 1.781 + const uint32_t *stage2; 1.782 + const uint8_t *stage3, *bytes; 1.783 + uint32_t st3Multiplier; 1.784 + uint32_t value; 1.785 + UBool useFallback; 1.786 + 1.787 + bytes=mbcsTable->fromUnicodeBytes; 1.788 + 1.789 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 1.790 + 1.791 + switch(mbcsTable->outputType) { 1.792 + case MBCS_OUTPUT_3: 1.793 + case MBCS_OUTPUT_4_EUC: 1.794 + st3Multiplier=3; 1.795 + break; 1.796 + case MBCS_OUTPUT_4: 1.797 + st3Multiplier=4; 1.798 + break; 1.799 + default: 1.800 + st3Multiplier=2; 1.801 + break; 1.802 + } 1.803 + 1.804 + for(st1=0; st1<maxStage1; ++st1) { 1.805 + st2=table[st1]; 1.806 + if(st2>(maxStage1>>1)) { 1.807 + stage2=(const uint32_t *)table+st2; 1.808 + for(st2=0; st2<64; ++st2) { 1.809 + if((st3=stage2[st2])!=0) { 1.810 + /* read the stage 3 block */ 1.811 + stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 1.812 + 1.813 + /* get the roundtrip flags for the stage 3 block */ 1.814 + st3>>=16; 1.815 + 1.816 + /* 1.817 + * Add code points for which the roundtrip flag is set, 1.818 + * or which map to non-zero bytes if we use fallbacks. 1.819 + * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
1.820 + */ 1.821 + switch(filter) { 1.822 + case UCNV_SET_FILTER_NONE: 1.823 + do { 1.824 + if(st3&1) { 1.825 + sa->add(sa->set, c); 1.826 + stage3+=st3Multiplier; 1.827 + } else if(useFallback) { 1.828 + uint8_t b=0; 1.829 + switch(st3Multiplier) { 1.830 + case 4: 1.831 + b|=*stage3++; 1.832 + case 3: /*fall through*/ 1.833 + b|=*stage3++; 1.834 + case 2: /*fall through*/ 1.835 + b|=stage3[0]|stage3[1]; 1.836 + stage3+=2; 1.837 + default: 1.838 + break; 1.839 + } 1.840 + if(b!=0) { 1.841 + sa->add(sa->set, c); 1.842 + } 1.843 + } 1.844 + st3>>=1; 1.845 + } while((++c&0xf)!=0); 1.846 + break; 1.847 + case UCNV_SET_FILTER_DBCS_ONLY: 1.848 + /* Ignore single-byte results (<0x100). */ 1.849 + do { 1.850 + if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 1.851 + sa->add(sa->set, c); 1.852 + } 1.853 + st3>>=1; 1.854 + stage3+=2; /* +=st3Multiplier */ 1.855 + } while((++c&0xf)!=0); 1.856 + break; 1.857 + case UCNV_SET_FILTER_2022_CN: 1.858 + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 1.859 + do { 1.860 + if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 1.861 + sa->add(sa->set, c); 1.862 + } 1.863 + st3>>=1; 1.864 + stage3+=3; /* +=st3Multiplier */ 1.865 + } while((++c&0xf)!=0); 1.866 + break; 1.867 + case UCNV_SET_FILTER_SJIS: 1.868 + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 1.869 + do { 1.870 + if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 1.871 + sa->add(sa->set, c); 1.872 + } 1.873 + st3>>=1; 1.874 + stage3+=2; /* +=st3Multiplier */ 1.875 + } while((++c&0xf)!=0); 1.876 + break; 1.877 + case UCNV_SET_FILTER_GR94DBCS: 1.878 + /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ 1.879 + do { 1.880 + if( ((st3&1)!=0 || useFallback) && 1.881 + (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 1.882 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1.883 + ) { 1.884 + sa->add(sa->set, c); 1.885 + } 1.886 + st3>>=1; 1.887 + stage3+=2; /* +=st3Multiplier */ 1.888 + } while((++c&0xf)!=0); 1.889 + break; 1.890 + case UCNV_SET_FILTER_HZ: 1.891 + /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ 1.892 + do { 1.893 + if( ((st3&1)!=0 || useFallback) && 1.894 + (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1.895 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1.896 + ) { 1.897 + sa->add(sa->set, c); 1.898 + } 1.899 + st3>>=1; 1.900 + stage3+=2; /* +=st3Multiplier */ 1.901 + } while((++c&0xf)!=0); 1.902 + break; 1.903 + default: 1.904 + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1.905 + return; 1.906 + } 1.907 + } else { 1.908 + c+=16; /* empty stage 3 block */ 1.909 + } 1.910 + } 1.911 + } else { 1.912 + c+=1024; /* empty stage 2 block */ 1.913 + } 1.914 + } 1.915 + } 1.916 + 1.917 + ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 1.918 +} 1.919 + 1.920 +U_CFUNC void 1.921 +ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 1.922 + const USetAdder *sa, 1.923 + UConverterUnicodeSet which, 1.924 + UErrorCode *pErrorCode) { 1.925 + ucnv_MBCSGetFilteredUnicodeSetForUnicode( 1.926 + sharedData, sa, which, 1.927 + sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 
1.928 + UCNV_SET_FILTER_DBCS_ONLY : 1.929 + UCNV_SET_FILTER_NONE, 1.930 + pErrorCode); 1.931 +} 1.932 + 1.933 +static void 1.934 +ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 1.935 + const USetAdder *sa, 1.936 + UConverterUnicodeSet which, 1.937 + UErrorCode *pErrorCode) { 1.938 + if(cnv->options&_MBCS_OPTION_GB18030) { 1.939 + sa->addRange(sa->set, 0, 0xd7ff); 1.940 + sa->addRange(sa->set, 0xe000, 0x10ffff); 1.941 + } else { 1.942 + ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 1.943 + } 1.944 +} 1.945 + 1.946 +/* conversion extensions for input not in the main table -------------------- */ 1.947 + 1.948 +/* 1.949 + * Hardcoded extension handling for GB 18030. 1.950 + * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 1.951 + * 1.952 + * In the future, conversion extensions may handle m:n mappings and delta tables, 1.953 + * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 1.954 + * 1.955 + * If an input character cannot be mapped, then these functions set an error 1.956 + * code. The framework will then call the callback function. 
1.957 + */ 1.958 + 1.959 +/* 1.960 + * @return if(U_FAILURE) return the code point for cnv->fromUChar32 1.961 + * else return 0 after output has been written to the target 1.962 + */ 1.963 +static UChar32 1.964 +_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 1.965 + UChar32 cp, 1.966 + const UChar **source, const UChar *sourceLimit, 1.967 + uint8_t **target, const uint8_t *targetLimit, 1.968 + int32_t **offsets, int32_t sourceIndex, 1.969 + UBool flush, 1.970 + UErrorCode *pErrorCode) { 1.971 + const int32_t *cx; 1.972 + 1.973 + cnv->useSubChar1=FALSE; 1.974 + 1.975 + if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1.976 + ucnv_extInitialMatchFromU( 1.977 + cnv, cx, 1.978 + cp, source, sourceLimit, 1.979 + (char **)target, (char *)targetLimit, 1.980 + offsets, sourceIndex, 1.981 + flush, 1.982 + pErrorCode) 1.983 + ) { 1.984 + return 0; /* an extension mapping handled the input */ 1.985 + } 1.986 + 1.987 + /* GB 18030 */ 1.988 + if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 1.989 + const uint32_t *range; 1.990 + int32_t i; 1.991 + 1.992 + range=gb18030Ranges[0]; 1.993 + for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 1.994 + if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 1.995 + /* found the Unicode code point, output the four-byte sequence for it */ 1.996 + uint32_t linear; 1.997 + char bytes[4]; 1.998 + 1.999 + /* get the linear value of the first GB 18030 code in this range */ 1.1000 + linear=range[2]-LINEAR_18030_BASE; 1.1001 + 1.1002 + /* add the offset from the beginning of the range */ 1.1003 + linear+=((uint32_t)cp-range[0]); 1.1004 + 1.1005 + /* turn this into a four-byte sequence */ 1.1006 + bytes[3]=(char)(0x30+linear%10); linear/=10; 1.1007 + bytes[2]=(char)(0x81+linear%126); linear/=126; 1.1008 + bytes[1]=(char)(0x30+linear%10); linear/=10; 1.1009 + bytes[0]=(char)(0x81+linear); 1.1010 + 1.1011 + /* output this sequence */ 1.1012 + ucnv_fromUWriteBytes(cnv, 1.1013 + bytes, 4, (char **)target, (char 
*)targetLimit, 1.1014 + offsets, sourceIndex, pErrorCode); 1.1015 + return 0; 1.1016 + } 1.1017 + } 1.1018 + } 1.1019 + 1.1020 + /* no mapping */ 1.1021 + *pErrorCode=U_INVALID_CHAR_FOUND; 1.1022 + return cp; 1.1023 +} 1.1024 + 1.1025 +/* 1.1026 + * Input sequence: cnv->toUBytes[0..length[ 1.1027 + * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1.1028 + * else return 0 after output has been written to the target 1.1029 + */ 1.1030 +static int8_t 1.1031 +_extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1.1032 + int8_t length, 1.1033 + const uint8_t **source, const uint8_t *sourceLimit, 1.1034 + UChar **target, const UChar *targetLimit, 1.1035 + int32_t **offsets, int32_t sourceIndex, 1.1036 + UBool flush, 1.1037 + UErrorCode *pErrorCode) { 1.1038 + const int32_t *cx; 1.1039 + 1.1040 + if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1.1041 + ucnv_extInitialMatchToU( 1.1042 + cnv, cx, 1.1043 + length, (const char **)source, (const char *)sourceLimit, 1.1044 + target, targetLimit, 1.1045 + offsets, sourceIndex, 1.1046 + flush, 1.1047 + pErrorCode) 1.1048 + ) { 1.1049 + return 0; /* an extension mapping handled the input */ 1.1050 + } 1.1051 + 1.1052 + /* GB 18030 */ 1.1053 + if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1.1054 + const uint32_t *range; 1.1055 + uint32_t linear; 1.1056 + int32_t i; 1.1057 + 1.1058 + linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1.1059 + range=gb18030Ranges[0]; 1.1060 + for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 1.1061 + if(range[2]<=linear && linear<=range[3]) { 1.1062 + /* found the sequence, output the Unicode code point for it */ 1.1063 + *pErrorCode=U_ZERO_ERROR; 1.1064 + 1.1065 + /* add the linear difference between the input and start sequences to the start code point */ 1.1066 + linear=range[0]+(linear-range[2]); 1.1067 + 1.1068 + /* output this code point */ 1.1069 + ucnv_toUWriteCodePoint(cnv, 
linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1.1070 + 1.1071 + return 0; 1.1072 + } 1.1073 + } 1.1074 + } 1.1075 + 1.1076 + /* no mapping */ 1.1077 + *pErrorCode=U_INVALID_CHAR_FOUND; 1.1078 + return length; 1.1079 +} 1.1080 + 1.1081 +/* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1.1082 + 1.1083 +/* 1.1084 + * This code modifies a standard EBCDIC<->Unicode mapping table for 1.1085 + * OS/390 (z/OS) Unix System Services (Open Edition). 1.1086 + * The difference is in the mapping of Line Feed and New Line control codes: 1.1087 + * Standard EBCDIC maps 1.1088 + * 1.1089 + * <U000A> \x25 |0 1.1090 + * <U0085> \x15 |0 1.1091 + * 1.1092 + * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1.1093 + * mapping 1.1094 + * 1.1095 + * <U000A> \x15 |0 1.1096 + * <U0085> \x25 |0 1.1097 + * 1.1098 + * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1.1099 + * by copying it into allocated memory and swapping the LF and NL values. 1.1100 + * It allows to support the same EBCDIC charset in both versions without 1.1101 + * duplicating the entire installed table. 
1.1102 + */ 1.1103 + 1.1104 +/* standard EBCDIC codes */ 1.1105 +#define EBCDIC_LF 0x25 1.1106 +#define EBCDIC_NL 0x15 1.1107 + 1.1108 +/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1.1109 +#define EBCDIC_RT_LF 0xf25 1.1110 +#define EBCDIC_RT_NL 0xf15 1.1111 + 1.1112 +/* Unicode code points */ 1.1113 +#define U_LF 0x0a 1.1114 +#define U_NL 0x85 1.1115 + 1.1116 +static UBool 1.1117 +_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1.1118 + UConverterMBCSTable *mbcsTable; 1.1119 + 1.1120 + const uint16_t *table, *results; 1.1121 + const uint8_t *bytes; 1.1122 + 1.1123 + int32_t (*newStateTable)[256]; 1.1124 + uint16_t *newResults; 1.1125 + uint8_t *p; 1.1126 + char *name; 1.1127 + 1.1128 + uint32_t stage2Entry; 1.1129 + uint32_t size, sizeofFromUBytes; 1.1130 + 1.1131 + mbcsTable=&sharedData->mbcs; 1.1132 + 1.1133 + table=mbcsTable->fromUnicodeTable; 1.1134 + bytes=mbcsTable->fromUnicodeBytes; 1.1135 + results=(const uint16_t *)bytes; 1.1136 + 1.1137 + /* 1.1138 + * Check that this is an EBCDIC table with SBCS portion - 1.1139 + * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1.1140 + * 1.1141 + * If not, ignore the option. Options are always ignored if they do not apply. 
1.1142 + */ 1.1143 + if(!( 1.1144 + (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1.1145 + mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1.1146 + mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1.1147 + )) { 1.1148 + return FALSE; 1.1149 + } 1.1150 + 1.1151 + if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1.1152 + if(!( 1.1153 + EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1.1154 + EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1.1155 + )) { 1.1156 + return FALSE; 1.1157 + } 1.1158 + } else /* MBCS_OUTPUT_2_SISO */ { 1.1159 + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1.1160 + if(!( 1.1161 + MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1.1162 + EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1.1163 + )) { 1.1164 + return FALSE; 1.1165 + } 1.1166 + 1.1167 + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1.1168 + if(!( 1.1169 + MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1.1170 + EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1.1171 + )) { 1.1172 + return FALSE; 1.1173 + } 1.1174 + } 1.1175 + 1.1176 + if(mbcsTable->fromUBytesLength>0) { 1.1177 + /* 1.1178 + * We _know_ the number of bytes in the fromUnicodeBytes array 1.1179 + * starting with header.version 4.1. 1.1180 + */ 1.1181 + sizeofFromUBytes=mbcsTable->fromUBytesLength; 1.1182 + } else { 1.1183 + /* 1.1184 + * Otherwise: 1.1185 + * There used to be code to enumerate the fromUnicode 1.1186 + * trie and find the highest entry, but it was removed in ICU 3.2 1.1187 + * because it was not tested and caused a low code coverage number. 1.1188 + * See Jitterbug 3674. 1.1189 + * This affects only some .cnv file formats with a header.version 1.1190 + * below 4.1, and only when swaplfnl is requested. 
1.1191 + * 1.1192 + * ucnvmbcs.c revision 1.99 is the last one with the 1.1193 + * ucnv_MBCSSizeofFromUBytes() function. 1.1194 + */ 1.1195 + *pErrorCode=U_INVALID_FORMAT_ERROR; 1.1196 + return FALSE; 1.1197 + } 1.1198 + 1.1199 + /* 1.1200 + * The table has an appropriate format. 1.1201 + * Allocate and build 1.1202 + * - a modified to-Unicode state table 1.1203 + * - a modified from-Unicode output array 1.1204 + * - a converter name string with the swap option appended 1.1205 + */ 1.1206 + size= 1.1207 + mbcsTable->countStates*1024+ 1.1208 + sizeofFromUBytes+ 1.1209 + UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1.1210 + p=(uint8_t *)uprv_malloc(size); 1.1211 + if(p==NULL) { 1.1212 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.1213 + return FALSE; 1.1214 + } 1.1215 + 1.1216 + /* copy and modify the to-Unicode state table */ 1.1217 + newStateTable=(int32_t (*)[256])p; 1.1218 + uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1.1219 + 1.1220 + newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1.1221 + newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1.1222 + 1.1223 + /* copy and modify the from-Unicode result table */ 1.1224 + newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1.1225 + uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1.1226 + 1.1227 + /* conveniently, the table access macros work on the left side of expressions */ 1.1228 + if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1.1229 + MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1.1230 + MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1.1231 + } else /* MBCS_OUTPUT_2_SISO */ { 1.1232 + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1.1233 + MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1.1234 + 1.1235 + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1.1236 + MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1.1237 + } 1.1238 + 1.1239 + /* 
set the canonical converter name */ 1.1240 + name=(char *)newResults+sizeofFromUBytes; 1.1241 + uprv_strcpy(name, sharedData->staticData->name); 1.1242 + uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1.1243 + 1.1244 + /* set the pointers */ 1.1245 + umtx_lock(NULL); 1.1246 + if(mbcsTable->swapLFNLStateTable==NULL) { 1.1247 + mbcsTable->swapLFNLStateTable=newStateTable; 1.1248 + mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1.1249 + mbcsTable->swapLFNLName=name; 1.1250 + 1.1251 + newStateTable=NULL; 1.1252 + } 1.1253 + umtx_unlock(NULL); 1.1254 + 1.1255 + /* release the allocated memory if another thread beat us to it */ 1.1256 + if(newStateTable!=NULL) { 1.1257 + uprv_free(newStateTable); 1.1258 + } 1.1259 + return TRUE; 1.1260 +} 1.1261 + 1.1262 +/* reconstitute omitted fromUnicode data ------------------------------------ */ 1.1263 + 1.1264 +/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1.1265 +static UBool U_CALLCONV 1.1266 +writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1.1267 + UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1.1268 + const uint16_t *table; 1.1269 + uint32_t *stage2; 1.1270 + uint8_t *bytes, *p; 1.1271 + UChar32 c; 1.1272 + int32_t i, st3; 1.1273 + 1.1274 + table=mbcsTable->fromUnicodeTable; 1.1275 + bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1.1276 + 1.1277 + /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1.1278 + switch(mbcsTable->outputType) { 1.1279 + case MBCS_OUTPUT_3_EUC: 1.1280 + if(value<=0xffff) { 1.1281 + /* short sequences are stored directly */ 1.1282 + /* code set 0 or 1 */ 1.1283 + } else if(value<=0x8effff) { 1.1284 + /* code set 2 */ 1.1285 + value&=0x7fff; 1.1286 + } else /* first byte is 0x8f */ { 1.1287 + /* code set 3 */ 1.1288 + value&=0xff7f; 1.1289 + } 1.1290 + break; 1.1291 + case MBCS_OUTPUT_4_EUC: 1.1292 + if(value<=0xffffff) { 1.1293 + /* short sequences are stored directly */ 1.1294 
+ /* code set 0 or 1 */ 1.1295 + } else if(value<=0x8effffff) { 1.1296 + /* code set 2 */ 1.1297 + value&=0x7fffff; 1.1298 + } else /* first byte is 0x8f */ { 1.1299 + /* code set 3 */ 1.1300 + value&=0xff7fff; 1.1301 + } 1.1302 + break; 1.1303 + default: 1.1304 + break; 1.1305 + } 1.1306 + 1.1307 + for(i=0; i<=0x1f; ++value, ++i) { 1.1308 + c=codePoints[i]; 1.1309 + if(c<0) { 1.1310 + continue; 1.1311 + } 1.1312 + 1.1313 + /* locate the stage 2 & 3 data */ 1.1314 + stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1.1315 + p=bytes; 1.1316 + st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1.1317 + 1.1318 + /* write the codepage bytes into stage 3 */ 1.1319 + switch(mbcsTable->outputType) { 1.1320 + case MBCS_OUTPUT_3: 1.1321 + case MBCS_OUTPUT_4_EUC: 1.1322 + p+=st3*3; 1.1323 + p[0]=(uint8_t)(value>>16); 1.1324 + p[1]=(uint8_t)(value>>8); 1.1325 + p[2]=(uint8_t)value; 1.1326 + break; 1.1327 + case MBCS_OUTPUT_4: 1.1328 + ((uint32_t *)p)[st3]=value; 1.1329 + break; 1.1330 + default: 1.1331 + /* 2 bytes per character */ 1.1332 + ((uint16_t *)p)[st3]=(uint16_t)value; 1.1333 + break; 1.1334 + } 1.1335 + 1.1336 + /* set the roundtrip flag */ 1.1337 + *stage2|=(1UL<<(16+(c&0xf))); 1.1338 + } 1.1339 + return TRUE; 1.1340 + } 1.1341 + 1.1342 +static void 1.1343 +reconstituteData(UConverterMBCSTable *mbcsTable, 1.1344 + uint32_t stage1Length, uint32_t stage2Length, 1.1345 + uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1.1346 + UErrorCode *pErrorCode) { 1.1347 + uint16_t *stage1; 1.1348 + uint32_t *stage2; 1.1349 + uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1.1350 + mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1.1351 + if(mbcsTable->reconstitutedData==NULL) { 1.1352 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.1353 + return; 1.1354 + } 1.1355 + uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1.1356 + 1.1357 + /* copy existing data and reroute the pointers */ 1.1358 + 
stage1=(uint16_t *)mbcsTable->reconstitutedData; 1.1359 + uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1.1360 + 1.1361 + stage2=(uint32_t *)(stage1+stage1Length); 1.1362 + uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1.1363 + mbcsTable->fromUnicodeTable+stage1Length, 1.1364 + stage2Length*4); 1.1365 + 1.1366 + mbcsTable->fromUnicodeTable=stage1; 1.1367 + mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length); 1.1368 + 1.1369 + /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1.1370 + stage2=(uint32_t *)stage1; 1.1371 + 1.1372 + /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1.1373 + { 1.1374 + int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1.1375 + int32_t stageUTF8Index=0; 1.1376 + int32_t st1, st2, st3, i; 1.1377 + 1.1378 + for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1.1379 + st2=stage1[st1]; 1.1380 + if(st2!=stage1Length/2) { 1.1381 + /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1.1382 + for(i=0; i<16; ++i) { 1.1383 + st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1.1384 + if(st3!=0) { 1.1385 + /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1.1386 + st3>>=4; 1.1387 + /* 1.1388 + * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1.1389 + * allocated together as a single 64-block for access from the mbcsIndex 1.1390 + */ 1.1391 + stage2[st2++]=st3++; 1.1392 + stage2[st2++]=st3++; 1.1393 + stage2[st2++]=st3++; 1.1394 + stage2[st2++]=st3; 1.1395 + } else { 1.1396 + /* no stage 3 block, skip */ 1.1397 + st2+=4; 1.1398 + } 1.1399 + } 1.1400 + } else { 1.1401 + /* no stage 2 block, skip */ 1.1402 + stageUTF8Index+=16; 1.1403 + } 1.1404 + } 1.1405 + } 1.1406 + 1.1407 + /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1.1408 + ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1.1409 +} 1.1410 + 1.1411 +/* MBCS setup functions 
----------------------------------------------------- */ 1.1412 + 1.1413 +static void 1.1414 +ucnv_MBCSLoad(UConverterSharedData *sharedData, 1.1415 + UConverterLoadArgs *pArgs, 1.1416 + const uint8_t *raw, 1.1417 + UErrorCode *pErrorCode) { 1.1418 + UDataInfo info; 1.1419 + UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1.1420 + _MBCSHeader *header=(_MBCSHeader *)raw; 1.1421 + uint32_t offset; 1.1422 + uint32_t headerLength; 1.1423 + UBool noFromU=FALSE; 1.1424 + 1.1425 + if(header->version[0]==4) { 1.1426 + headerLength=MBCS_HEADER_V4_LENGTH; 1.1427 + } else if(header->version[0]==5 && header->version[1]>=3 && 1.1428 + (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1.1429 + headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1.1430 + noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1.1431 + } else { 1.1432 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1433 + return; 1.1434 + } 1.1435 + 1.1436 + mbcsTable->outputType=(uint8_t)header->flags; 1.1437 + if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1.1438 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1439 + return; 1.1440 + } 1.1441 + 1.1442 + /* extension data, header version 4.2 and higher */ 1.1443 + offset=header->flags>>8; 1.1444 + if(offset!=0) { 1.1445 + mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1.1446 + } 1.1447 + 1.1448 + if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1.1449 + UConverterLoadArgs args={ 0 }; 1.1450 + UConverterSharedData *baseSharedData; 1.1451 + const int32_t *extIndexes; 1.1452 + const char *baseName; 1.1453 + 1.1454 + /* extension-only file, load the base table and set values appropriately */ 1.1455 + if((extIndexes=mbcsTable->extIndexes)==NULL) { 1.1456 + /* extension-only file without extension */ 1.1457 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1458 + return; 1.1459 + } 1.1460 + 1.1461 + if(pArgs->nestedLoads!=1) { 1.1462 + /* an extension table must not be loaded as a base table */ 1.1463 + *pErrorCode=U_INVALID_TABLE_FILE; 1.1464 + return; 1.1465 + } 
1.1466 + 1.1467 + /* load the base table */ 1.1468 + baseName=(const char *)header+headerLength*4; 1.1469 + if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1.1470 + /* forbid loading this same extension-only file */ 1.1471 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1472 + return; 1.1473 + } 1.1474 + 1.1475 + /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1.1476 + args.size=sizeof(UConverterLoadArgs); 1.1477 + args.nestedLoads=2; 1.1478 + args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1.1479 + args.reserved=pArgs->reserved; 1.1480 + args.options=pArgs->options; 1.1481 + args.pkg=pArgs->pkg; 1.1482 + args.name=baseName; 1.1483 + baseSharedData=ucnv_load(&args, pErrorCode); 1.1484 + if(U_FAILURE(*pErrorCode)) { 1.1485 + return; 1.1486 + } 1.1487 + if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1.1488 + baseSharedData->mbcs.baseSharedData!=NULL 1.1489 + ) { 1.1490 + ucnv_unload(baseSharedData); 1.1491 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1492 + return; 1.1493 + } 1.1494 + if(pArgs->onlyTestIsLoadable) { 1.1495 + /* 1.1496 + * Exit as soon as we know that we can load the converter 1.1497 + * and the format is valid and supported. 1.1498 + * The worst that can happen in the following code is a memory 1.1499 + * allocation error. 1.1500 + */ 1.1501 + ucnv_unload(baseSharedData); 1.1502 + return; 1.1503 + } 1.1504 + 1.1505 + /* copy the base table data */ 1.1506 + uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1.1507 + 1.1508 + /* overwrite values with relevant ones for the extension converter */ 1.1509 + mbcsTable->baseSharedData=baseSharedData; 1.1510 + mbcsTable->extIndexes=extIndexes; 1.1511 + 1.1512 + /* 1.1513 + * It would be possible to share the swapLFNL data with a base converter, 1.1514 + * but the generated name would have to be different, and the memory 1.1515 + * would have to be free'd only once. 
1.1516 + * It is easier to just create the data for the extension converter 1.1517 + * separately when it is requested. 1.1518 + */ 1.1519 + mbcsTable->swapLFNLStateTable=NULL; 1.1520 + mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1.1521 + mbcsTable->swapLFNLName=NULL; 1.1522 + 1.1523 + /* 1.1524 + * The reconstitutedData must be deleted only when the base converter 1.1525 + * is unloaded. 1.1526 + */ 1.1527 + mbcsTable->reconstitutedData=NULL; 1.1528 + 1.1529 + /* 1.1530 + * Set a special, runtime-only outputType if the extension converter 1.1531 + * is a DBCS version of a base converter that also maps single bytes. 1.1532 + */ 1.1533 + if( sharedData->staticData->conversionType==UCNV_DBCS || 1.1534 + (sharedData->staticData->conversionType==UCNV_MBCS && 1.1535 + sharedData->staticData->minBytesPerChar>=2) 1.1536 + ) { 1.1537 + if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1.1538 + /* the base converter is SI/SO-stateful */ 1.1539 + int32_t entry; 1.1540 + 1.1541 + /* get the dbcs state from the state table entry for SO=0x0e */ 1.1542 + entry=mbcsTable->stateTable[0][0xe]; 1.1543 + if( MBCS_ENTRY_IS_FINAL(entry) && 1.1544 + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1.1545 + MBCS_ENTRY_FINAL_STATE(entry)!=0 1.1546 + ) { 1.1547 + mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1.1548 + 1.1549 + mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1.1550 + } 1.1551 + } else if( 1.1552 + baseSharedData->staticData->conversionType==UCNV_MBCS && 1.1553 + baseSharedData->staticData->minBytesPerChar==1 && 1.1554 + baseSharedData->staticData->maxBytesPerChar==2 && 1.1555 + mbcsTable->countStates<=127 1.1556 + ) { 1.1557 + /* non-stateful base converter, need to modify the state table */ 1.1558 + int32_t (*newStateTable)[256]; 1.1559 + int32_t *state; 1.1560 + int32_t i, count; 1.1561 + 1.1562 + /* allocate a new state table and copy the base state table contents */ 1.1563 + count=mbcsTable->countStates; 1.1564 + newStateTable=(int32_t 
(*)[256])uprv_malloc((count+1)*1024); 1.1565 + if(newStateTable==NULL) { 1.1566 + ucnv_unload(baseSharedData); 1.1567 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.1568 + return; 1.1569 + } 1.1570 + 1.1571 + uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1.1572 + 1.1573 + /* change all final single-byte entries to go to a new all-illegal state */ 1.1574 + state=newStateTable[0]; 1.1575 + for(i=0; i<256; ++i) { 1.1576 + if(MBCS_ENTRY_IS_FINAL(state[i])) { 1.1577 + state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1.1578 + } 1.1579 + } 1.1580 + 1.1581 + /* build the new all-illegal state */ 1.1582 + state=newStateTable[count]; 1.1583 + for(i=0; i<256; ++i) { 1.1584 + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1.1585 + } 1.1586 + mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1.1587 + mbcsTable->countStates=(uint8_t)(count+1); 1.1588 + mbcsTable->stateTableOwned=TRUE; 1.1589 + 1.1590 + mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1.1591 + } 1.1592 + } 1.1593 + 1.1594 + /* 1.1595 + * unlike below for files with base tables, do not get the unicodeMask 1.1596 + * from the sharedData; instead, use the base table's unicodeMask, 1.1597 + * which we copied in the memcpy above; 1.1598 + * this is necessary because the static data unicodeMask, especially 1.1599 + * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1.1600 + */ 1.1601 + } else { 1.1602 + /* conversion file with a base table; an additional extension table is optional */ 1.1603 + /* make sure that the output type is known */ 1.1604 + switch(mbcsTable->outputType) { 1.1605 + case MBCS_OUTPUT_1: 1.1606 + case MBCS_OUTPUT_2: 1.1607 + case MBCS_OUTPUT_3: 1.1608 + case MBCS_OUTPUT_4: 1.1609 + case MBCS_OUTPUT_3_EUC: 1.1610 + case MBCS_OUTPUT_4_EUC: 1.1611 + case MBCS_OUTPUT_2_SISO: 1.1612 + /* OK */ 1.1613 + break; 1.1614 + default: 1.1615 + *pErrorCode=U_INVALID_TABLE_FORMAT; 1.1616 + return; 1.1617 + } 1.1618 + if(pArgs->onlyTestIsLoadable) { 1.1619 + /* 1.1620 + * Exit as 
soon as we know that we can load the converter 1.1621 + * and the format is valid and supported. 1.1622 + * The worst that can happen in the following code is a memory 1.1623 + * allocation error. 1.1624 + */ 1.1625 + return; 1.1626 + } 1.1627 + 1.1628 + mbcsTable->countStates=(uint8_t)header->countStates; 1.1629 + mbcsTable->countToUFallbacks=header->countToUFallbacks; 1.1630 + mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1.1631 + mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1.1632 + mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1.1633 + 1.1634 + mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1.1635 + mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1.1636 + mbcsTable->fromUBytesLength=header->fromUBytesLength; 1.1637 + 1.1638 + /* 1.1639 + * converter versions 6.1 and up contain a unicodeMask that is 1.1640 + * used here to select the most efficient function implementations 1.1641 + */ 1.1642 + info.size=sizeof(UDataInfo); 1.1643 + udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1.1644 + if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1.1645 + /* mask off possible future extensions to be safe */ 1.1646 + mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1.1647 + } else { 1.1648 + /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1.1649 + mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1.1650 + } 1.1651 + 1.1652 + /* 1.1653 + * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1.1654 + * Check for the header version, SBCS vs. MBCS, and for whether the 1.1655 + * data structures are optimized for code points as high as what the 1.1656 + * runtime code is designed for. 
1.1657 + * The implementation does not handle mapping tables with entries for 1.1658 + * unpaired surrogates. 1.1659 + */ 1.1660 + if( header->version[1]>=3 && 1.1661 + (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1.1662 + (mbcsTable->countStates==1 ? 1.1663 + (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1.1664 + (header->version[2]>=(MBCS_FAST_MAX>>8)) 1.1665 + ) 1.1666 + ) { 1.1667 + mbcsTable->utf8Friendly=TRUE; 1.1668 + 1.1669 + if(mbcsTable->countStates==1) { 1.1670 + /* 1.1671 + * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1.1672 + * Build a table with indexes to each block, to be used instead of 1.1673 + * the regular stage 1/2 table. 1.1674 + */ 1.1675 + int32_t i; 1.1676 + for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1.1677 + mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1.1678 + } 1.1679 + /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1.1680 + mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1.1681 + } else { 1.1682 + /* 1.1683 + * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1.1684 + * The .cnv file is prebuilt with an additional stage table with indexes 1.1685 + * to each block. 1.1686 + */ 1.1687 + mbcsTable->mbcsIndex=(const uint16_t *) 1.1688 + (mbcsTable->fromUnicodeBytes+ 1.1689 + (noFromU ? 
0 : mbcsTable->fromUBytesLength)); 1.1690 + mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1.1691 + } 1.1692 + } 1.1693 + 1.1694 + /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1.1695 + { 1.1696 + uint32_t asciiRoundtrips=0xffffffff; 1.1697 + int32_t i; 1.1698 + 1.1699 + for(i=0; i<0x80; ++i) { 1.1700 + if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1.1701 + asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1.1702 + } 1.1703 + } 1.1704 + mbcsTable->asciiRoundtrips=asciiRoundtrips; 1.1705 + } 1.1706 + 1.1707 + if(noFromU) { 1.1708 + uint32_t stage1Length= 1.1709 + mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 1.1710 + 0x440 : 0x40; 1.1711 + uint32_t stage2Length= 1.1712 + (header->offsetFromUBytes-header->offsetFromUTable)/4- 1.1713 + stage1Length/2; 1.1714 + reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1.1715 + } 1.1716 + } 1.1717 + 1.1718 + /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1.1719 + if(mbcsTable->utf8Friendly) { 1.1720 + if(mbcsTable->countStates==1) { 1.1721 + sharedData->impl=&_SBCSUTF8Impl; 1.1722 + } else { 1.1723 + if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1.1724 + sharedData->impl=&_DBCSUTF8Impl; 1.1725 + } 1.1726 + } 1.1727 + } 1.1728 + 1.1729 + if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1.1730 + /* 1.1731 + * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1.1732 + * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 
1.1733 + */ 1.1734 + mbcsTable->asciiRoundtrips=0; 1.1735 + } 1.1736 +} 1.1737 + 1.1738 +static void 1.1739 +ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1.1740 + UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1.1741 + 1.1742 + if(mbcsTable->swapLFNLStateTable!=NULL) { 1.1743 + uprv_free(mbcsTable->swapLFNLStateTable); 1.1744 + } 1.1745 + if(mbcsTable->stateTableOwned) { 1.1746 + uprv_free((void *)mbcsTable->stateTable); 1.1747 + } 1.1748 + if(mbcsTable->baseSharedData!=NULL) { 1.1749 + ucnv_unload(mbcsTable->baseSharedData); 1.1750 + } 1.1751 + if(mbcsTable->reconstitutedData!=NULL) { 1.1752 + uprv_free(mbcsTable->reconstitutedData); 1.1753 + } 1.1754 +} 1.1755 + 1.1756 +static void 1.1757 +ucnv_MBCSOpen(UConverter *cnv, 1.1758 + UConverterLoadArgs *pArgs, 1.1759 + UErrorCode *pErrorCode) { 1.1760 + UConverterMBCSTable *mbcsTable; 1.1761 + const int32_t *extIndexes; 1.1762 + uint8_t outputType; 1.1763 + int8_t maxBytesPerUChar; 1.1764 + 1.1765 + if(pArgs->onlyTestIsLoadable) { 1.1766 + return; 1.1767 + } 1.1768 + 1.1769 + mbcsTable=&cnv->sharedData->mbcs; 1.1770 + outputType=mbcsTable->outputType; 1.1771 + 1.1772 + if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1.1773 + /* the swaplfnl option does not apply, remove it */ 1.1774 + cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1.1775 + } 1.1776 + 1.1777 + if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.1778 + /* do this because double-checked locking is broken */ 1.1779 + UBool isCached; 1.1780 + 1.1781 + umtx_lock(NULL); 1.1782 + isCached=mbcsTable->swapLFNLStateTable!=NULL; 1.1783 + umtx_unlock(NULL); 1.1784 + 1.1785 + if(!isCached) { 1.1786 + if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1.1787 + if(U_FAILURE(*pErrorCode)) { 1.1788 + return; /* something went wrong */ 1.1789 + } 1.1790 + 1.1791 + /* the option does not apply, remove it */ 1.1792 + cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1.1793 + } 1.1794 + } 1.1795 + } 1.1796 + 1.1797 + if(uprv_strstr(pArgs->name, "18030")!=NULL) { 
1.1798 + if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1.1799 + /* set a flag for GB 18030 mode, which changes the callback behavior */ 1.1800 + cnv->options|=_MBCS_OPTION_GB18030; 1.1801 + } 1.1802 + } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) { 1.1803 + /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1.1804 + cnv->options|=_MBCS_OPTION_KEIS; 1.1805 + } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) { 1.1806 + /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1.1807 + cnv->options|=_MBCS_OPTION_JEF; 1.1808 + } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) { 1.1809 + /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1.1810 + cnv->options|=_MBCS_OPTION_JIPS; 1.1811 + } 1.1812 + 1.1813 + /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1.1814 + if(outputType==MBCS_OUTPUT_2_SISO) { 1.1815 + cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1.1816 + } 1.1817 + 1.1818 + extIndexes=mbcsTable->extIndexes; 1.1819 + if(extIndexes!=NULL) { 1.1820 + maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1.1821 + if(outputType==MBCS_OUTPUT_2_SISO) { 1.1822 + ++maxBytesPerUChar; /* SO + multiple DBCS */ 1.1823 + } 1.1824 + 1.1825 + if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1.1826 + cnv->maxBytesPerUChar=maxBytesPerUChar; 1.1827 + } 1.1828 + } 1.1829 + 1.1830 +#if 0 1.1831 + /* 1.1832 + * documentation of UConverter fields used for status 1.1833 + * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1.1834 + */ 1.1835 + 1.1836 + /* toUnicode */ 1.1837 + cnv->toUnicodeStatus=0; /* offset */ 1.1838 + cnv->mode=0; /* state */ 1.1839 + cnv->toULength=0; /* byteIndex */ 1.1840 + 1.1841 + /* fromUnicode */ 1.1842 + cnv->fromUChar32=0; 1.1843 + cnv->fromUnicodeStatus=1; /* prevLength */ 1.1844 +#endif 1.1845 +} 1.1846 + 1.1847 +static const char * 1.1848 +ucnv_MBCSGetName(const UConverter *cnv) { 1.1849 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1.1850 + return cnv->sharedData->mbcs.swapLFNLName; 1.1851 + } else { 1.1852 + return cnv->sharedData->staticData->name; 1.1853 + } 1.1854 +} 1.1855 + 1.1856 +/* MBCS-to-Unicode conversion functions ------------------------------------- */ 1.1857 + 1.1858 +static UChar32 1.1859 +ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 1.1860 + const _MBCSToUFallback *toUFallbacks; 1.1861 + uint32_t i, start, limit; 1.1862 + 1.1863 + limit=mbcsTable->countToUFallbacks; 1.1864 + if(limit>0) { 1.1865 + /* do a binary search for the fallback mapping */ 1.1866 + toUFallbacks=mbcsTable->toUFallbacks; 1.1867 + start=0; 1.1868 + while(start<limit-1) { 1.1869 + i=(start+limit)/2; 1.1870 + if(offset<toUFallbacks[i].offset) { 1.1871 + limit=i; 1.1872 + } else { 1.1873 + start=i; 1.1874 + } 1.1875 + 
}

        /* did we really find it? */
        if(offset==toUFallbacks[start].offset) {
            return toUFallbacks[start].codePoint;
        }
    }

    /* no fallback entry for this offset */
    return 0xfffe;
}

/*
 * This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages.
 *
 * Each source byte is looked up in row 0 of the state table and its
 * final entry is acted on directly.
 *
 * pArgs:      converter, source/sourceLimit, target/targetLimit, offsets
 *             (may be NULL) and flush are read; source, target and offsets
 *             are written back before returning.
 * pErrorCode: set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *             to U_ILLEGAL_CHAR_FOUND for an illegal byte, or to whatever
 *             _extToU() reports for an unassigned byte.
 */
static void
ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* with the swaplfnl option, use the alternate state table instead of the regular one */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        /* single state: every byte is looked up in row 0 */
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }

            /* normal end of action codes: prepare for a new character */
            ++sourceIndex;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         *
         * NOTE(review): a MBCS_STATE_FALLBACK_DIRECT_20 entry with fallbacks
         * turned off matches none of the named branches and ends up in the
         * final "reserved" else, i.e. the byte is skipped without output or
         * callback, unlike MBCS_STATE_FALLBACK_DIRECT_16 which is then treated
         * as unassigned -- confirm that this is intended.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_VALID_DIRECT_20 ||
           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            entry=MBCS_ENTRY_FINAL_VALUE(entry);
            /* output surrogate pair */
            *target++=(UChar)(0xd800|(UChar)(entry>>10));
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
            if(target<targetLimit) {
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
            } else {
                /* target overflow: park the trail surrogate in the converter's overflow buffer */
                cnv->UCharErrorBuffer[0]=c;
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }

            ++sourceIndex;
            continue;
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }

                ++sourceIndex;
                continue;
            }
            /* fallbacks are off: handle the byte like an unassigned one below */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            ++sourceIndex;
            continue;
        }

        if(U_FAILURE(*pErrorCode)) {
            /*
             * callback(illegal)
             * NOTE(review): unlike the extension path below, this path leaves
             * without storing the offending byte in cnv->toUBytes/toULength --
             * confirm that the caller does not rely on them here.
             */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember where source stood so that the advance made by _extToU() can be measured */
            pArgs->source=(const char *)source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the byte consumed above, plus however far _extToU() moved source */
            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
 * that only map to and from the BMP.
 * In addition to single-byte optimizations, the offset calculations
 * become much easier.
*/
static void
ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;
    UChar *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* with the swaplfnl option, use the alternate state table instead of the regular one */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;
    /* offsets are written lazily: lastSource marks the first byte whose offset is still pending */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

#if MBCS_UNROLL_SINGLE_TO_BMP
    /* unrolling makes it faster on Pentium III/Windows 2000 */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=16) {
        int32_t count, loops, oredEntries;

        /* number of complete groups of 16 that fit */
        loops=count=targetCapacity>>4;
        do {
            /*
             * Convert 16 bytes optimistically, OR-ing all entries together
             * so that a single test afterwards covers the whole group.
             */
            oredEntries=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

            /* were all 16 entries really valid? */
            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
                /* no, return to the first of these 16 */
                source-=16;
                target-=16;
                break;
            }
        } while(--count>0);
        /* count becomes the number of groups of 16 that were fully converted */
        count=loops-count;
        targetCapacity-=16*count;

        if(offsets!=NULL) {
            /* write the offsets for the converted groups now and move the pending mark past them */
            lastSource+=16*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }
    }
#endif

    /* conversion loop */
    while(targetCapacity > 0 && source < sourceLimit) {
        /* single state: every byte is looked up in row 0 */
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            --targetCapacity;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                --targetCapacity;
                continue;
            }
            /* fallbacks are off: handle the byte like an unassigned one below */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            /* the byte is consumed without output; targetCapacity is left unchanged */
            continue;
        }

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* predecrement: do not set the offset for the callback-causing character */
            while(--count>0) {
                *offsets++=sourceIndex++;
            }
            /* offset and sourceIndex are now set for the current character */
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember where source stood so that the advance made by _extToU() can be measured */
            lastSource=source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, pArgs->targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the byte consumed above, plus however far _extToU() moved source */
            sourceIndex+=1+(int32_t)(source-lastSource);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_TO_BMP
        /* unrolling makes it faster on Pentium III/Windows 2000 */
        /* re-enter the fast path; control comes back to this loop through the code after the unrolled block */
        goto unrolled;
#endif
    }

    /*
     * targetCapacity was clamped to the source length, so a full target
     * has to be detected from the real pointers.
     */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * Does the given state lead to at least one byte sequence that is not illegal?
 * Returns TRUE as soon as a final entry with an action other than
 * MBCS_STATE_ILLEGAL is found in this state's row, or in any state that
 * is reachable from it through transition entries.
 */
static UBool
hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
    const int32_t *row=stateTable[state];
    int32_t b, entry;
    /* First test for final entries in this state for some commonly valid byte values. */
    entry=row[0xa1];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    entry=row[0x41];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    /* Then test for final entries in this state. */
    for(b=0; b<=0xff; ++b) {
        entry=row[b];
        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
        ) {
            return TRUE;
        }
    }
    /* Then recurse for transition entries.
*/ 1.2287 + for(b=0; b<=0xff; ++b) { 1.2288 + entry=row[b]; 1.2289 + if( MBCS_ENTRY_IS_TRANSITION(entry) && 1.2290 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 1.2291 + ) { 1.2292 + return TRUE; 1.2293 + } 1.2294 + } 1.2295 + return FALSE; 1.2296 +} 1.2297 + 1.2298 +/* 1.2299 + * Is byte b a single/lead byte in this state? 1.2300 + * Recurse for transition states, because here we don't want to say that 1.2301 + * b is a lead byte if all byte sequences that start with b are illegal. 1.2302 + */ 1.2303 +static UBool 1.2304 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 1.2305 + const int32_t *row=stateTable[state]; 1.2306 + int32_t entry=row[b]; 1.2307 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 1.2308 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 1.2309 + } else { 1.2310 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.2311 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 1.2312 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 1.2313 + } else { 1.2314 + return action!=MBCS_STATE_ILLEGAL; 1.2315 + } 1.2316 + } 1.2317 +} 1.2318 + 1.2319 +U_CFUNC void 1.2320 +ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.2321 + UErrorCode *pErrorCode) { 1.2322 + UConverter *cnv; 1.2323 + const uint8_t *source, *sourceLimit; 1.2324 + UChar *target; 1.2325 + const UChar *targetLimit; 1.2326 + int32_t *offsets; 1.2327 + 1.2328 + const int32_t (*stateTable)[256]; 1.2329 + const uint16_t *unicodeCodeUnits; 1.2330 + 1.2331 + uint32_t offset; 1.2332 + uint8_t state; 1.2333 + int8_t byteIndex; 1.2334 + uint8_t *bytes; 1.2335 + 1.2336 + int32_t sourceIndex, nextSourceIndex; 1.2337 + 1.2338 + int32_t entry; 1.2339 + UChar c; 1.2340 + uint8_t action; 1.2341 + 1.2342 + /* use optimized function if possible */ 1.2343 + cnv=pArgs->converter; 1.2344 + 1.2345 + if(cnv->preToULength>0) { 1.2346 + /* 1.2347 + * pass 
sourceIndex=-1 because we continue from an earlier buffer 1.2348 + * in the future, this may change with continuous offsets 1.2349 + */ 1.2350 + ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 1.2351 + 1.2352 + if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 1.2353 + return; 1.2354 + } 1.2355 + } 1.2356 + 1.2357 + if(cnv->sharedData->mbcs.countStates==1) { 1.2358 + if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1.2359 + ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 1.2360 + } else { 1.2361 + ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 1.2362 + } 1.2363 + return; 1.2364 + } 1.2365 + 1.2366 + /* set up the local pointers */ 1.2367 + source=(const uint8_t *)pArgs->source; 1.2368 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.2369 + target=pArgs->target; 1.2370 + targetLimit=pArgs->targetLimit; 1.2371 + offsets=pArgs->offsets; 1.2372 + 1.2373 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.2374 + stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1.2375 + } else { 1.2376 + stateTable=cnv->sharedData->mbcs.stateTable; 1.2377 + } 1.2378 + unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 1.2379 + 1.2380 + /* get the converter state from UConverter */ 1.2381 + offset=cnv->toUnicodeStatus; 1.2382 + byteIndex=cnv->toULength; 1.2383 + bytes=cnv->toUBytes; 1.2384 + 1.2385 + /* 1.2386 + * if we are in the SBCS state for a DBCS-only converter, 1.2387 + * then load the DBCS state from the MBCS data 1.2388 + * (dbcsOnlyState==0 if it is not a DBCS-only converter) 1.2389 + */ 1.2390 + if((state=(uint8_t)(cnv->mode))==0) { 1.2391 + state=cnv->sharedData->mbcs.dbcsOnlyState; 1.2392 + } 1.2393 + 1.2394 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.2395 + sourceIndex=byteIndex==0 ? 
0 : -1; 1.2396 + nextSourceIndex=0; 1.2397 + 1.2398 + /* conversion loop */ 1.2399 + while(source<sourceLimit) { 1.2400 + /* 1.2401 + * This following test is to see if available input would overflow the output. 1.2402 + * It does not catch output of more than one code unit that 1.2403 + * overflows as a result of a surrogate pair or callback output 1.2404 + * from the last source byte. 1.2405 + * Therefore, those situations also test for overflows and will 1.2406 + * then break the loop, too. 1.2407 + */ 1.2408 + if(target>=targetLimit) { 1.2409 + /* target is full */ 1.2410 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.2411 + break; 1.2412 + } 1.2413 + 1.2414 + if(byteIndex==0) { 1.2415 + /* optimized loop for 1/2-byte input and BMP output */ 1.2416 + if(offsets==NULL) { 1.2417 + do { 1.2418 + entry=stateTable[state][*source]; 1.2419 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.2420 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.2421 + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.2422 + 1.2423 + ++source; 1.2424 + if( source<sourceLimit && 1.2425 + MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 1.2426 + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 1.2427 + (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 1.2428 + ) { 1.2429 + ++source; 1.2430 + *target++=c; 1.2431 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2432 + offset=0; 1.2433 + } else { 1.2434 + /* set the state and leave the optimized loop */ 1.2435 + bytes[0]=*(source-1); 1.2436 + byteIndex=1; 1.2437 + break; 1.2438 + } 1.2439 + } else { 1.2440 + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1.2441 + /* output BMP code point */ 1.2442 + ++source; 1.2443 + *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2444 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2445 + } else { 1.2446 + /* leave the optimized loop */ 1.2447 + break; 1.2448 + } 1.2449 + } 1.2450 + } while(source<sourceLimit && target<targetLimit); 1.2451 + 
} else /* offsets!=NULL */ { 1.2452 + do { 1.2453 + entry=stateTable[state][*source]; 1.2454 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.2455 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.2456 + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.2457 + 1.2458 + ++source; 1.2459 + if( source<sourceLimit && 1.2460 + MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 1.2461 + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 1.2462 + (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 1.2463 + ) { 1.2464 + ++source; 1.2465 + *target++=c; 1.2466 + if(offsets!=NULL) { 1.2467 + *offsets++=sourceIndex; 1.2468 + sourceIndex=(nextSourceIndex+=2); 1.2469 + } 1.2470 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2471 + offset=0; 1.2472 + } else { 1.2473 + /* set the state and leave the optimized loop */ 1.2474 + ++nextSourceIndex; 1.2475 + bytes[0]=*(source-1); 1.2476 + byteIndex=1; 1.2477 + break; 1.2478 + } 1.2479 + } else { 1.2480 + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1.2481 + /* output BMP code point */ 1.2482 + ++source; 1.2483 + *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2484 + if(offsets!=NULL) { 1.2485 + *offsets++=sourceIndex; 1.2486 + sourceIndex=++nextSourceIndex; 1.2487 + } 1.2488 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2489 + } else { 1.2490 + /* leave the optimized loop */ 1.2491 + break; 1.2492 + } 1.2493 + } 1.2494 + } while(source<sourceLimit && target<targetLimit); 1.2495 + } 1.2496 + 1.2497 + /* 1.2498 + * these tests and break statements could be put inside the loop 1.2499 + * if C had "break outerLoop" like Java 1.2500 + */ 1.2501 + if(source>=sourceLimit) { 1.2502 + break; 1.2503 + } 1.2504 + if(target>=targetLimit) { 1.2505 + /* target is full */ 1.2506 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.2507 + break; 1.2508 + } 1.2509 + 1.2510 + ++nextSourceIndex; 1.2511 + bytes[byteIndex++]=*source++; 1.2512 + } else /* byteIndex>0 */ { 1.2513 + 
++nextSourceIndex; 1.2514 + entry=stateTable[state][bytes[byteIndex++]=*source++]; 1.2515 + } 1.2516 + 1.2517 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.2518 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.2519 + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.2520 + continue; 1.2521 + } 1.2522 + 1.2523 + /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 1.2524 + cnv->mode=state; 1.2525 + 1.2526 + /* set the next state early so that we can reuse the entry variable */ 1.2527 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2528 + 1.2529 + /* 1.2530 + * An if-else-if chain provides more reliable performance for 1.2531 + * the most common cases compared to a switch. 1.2532 + */ 1.2533 + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.2534 + if(action==MBCS_STATE_VALID_16) { 1.2535 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2536 + c=unicodeCodeUnits[offset]; 1.2537 + if(c<0xfffe) { 1.2538 + /* output BMP code point */ 1.2539 + *target++=c; 1.2540 + if(offsets!=NULL) { 1.2541 + *offsets++=sourceIndex; 1.2542 + } 1.2543 + byteIndex=0; 1.2544 + } else if(c==0xfffe) { 1.2545 + if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 1.2546 + /* output fallback BMP code point */ 1.2547 + *target++=(UChar)entry; 1.2548 + if(offsets!=NULL) { 1.2549 + *offsets++=sourceIndex; 1.2550 + } 1.2551 + byteIndex=0; 1.2552 + } 1.2553 + } else { 1.2554 + /* callback(illegal) */ 1.2555 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2556 + } 1.2557 + } else if(action==MBCS_STATE_VALID_DIRECT_16) { 1.2558 + /* output BMP code point */ 1.2559 + *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2560 + if(offsets!=NULL) { 1.2561 + *offsets++=sourceIndex; 1.2562 + } 1.2563 + byteIndex=0; 1.2564 + } else if(action==MBCS_STATE_VALID_16_PAIR) { 1.2565 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2566 + c=unicodeCodeUnits[offset++]; 1.2567 + if(c<0xd800) { 1.2568 + /* output 
BMP code point below 0xd800 */ 1.2569 + *target++=c; 1.2570 + if(offsets!=NULL) { 1.2571 + *offsets++=sourceIndex; 1.2572 + } 1.2573 + byteIndex=0; 1.2574 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 1.2575 + /* output roundtrip or fallback surrogate pair */ 1.2576 + *target++=(UChar)(c&0xdbff); 1.2577 + if(offsets!=NULL) { 1.2578 + *offsets++=sourceIndex; 1.2579 + } 1.2580 + byteIndex=0; 1.2581 + if(target<targetLimit) { 1.2582 + *target++=unicodeCodeUnits[offset]; 1.2583 + if(offsets!=NULL) { 1.2584 + *offsets++=sourceIndex; 1.2585 + } 1.2586 + } else { 1.2587 + /* target overflow */ 1.2588 + cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 1.2589 + cnv->UCharErrorBufferLength=1; 1.2590 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.2591 + 1.2592 + offset=0; 1.2593 + break; 1.2594 + } 1.2595 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 1.2596 + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 1.2597 + *target++=unicodeCodeUnits[offset]; 1.2598 + if(offsets!=NULL) { 1.2599 + *offsets++=sourceIndex; 1.2600 + } 1.2601 + byteIndex=0; 1.2602 + } else if(c==0xffff) { 1.2603 + /* callback(illegal) */ 1.2604 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2605 + } 1.2606 + } else if(action==MBCS_STATE_VALID_DIRECT_20 || 1.2607 + (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1.2608 + ) { 1.2609 + entry=MBCS_ENTRY_FINAL_VALUE(entry); 1.2610 + /* output surrogate pair */ 1.2611 + *target++=(UChar)(0xd800|(UChar)(entry>>10)); 1.2612 + if(offsets!=NULL) { 1.2613 + *offsets++=sourceIndex; 1.2614 + } 1.2615 + byteIndex=0; 1.2616 + c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 1.2617 + if(target<targetLimit) { 1.2618 + *target++=c; 1.2619 + if(offsets!=NULL) { 1.2620 + *offsets++=sourceIndex; 1.2621 + } 1.2622 + } else { 1.2623 + /* target overflow */ 1.2624 + cnv->UCharErrorBuffer[0]=c; 1.2625 + cnv->UCharErrorBufferLength=1; 1.2626 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.2627 + 1.2628 + 
offset=0; 1.2629 + break; 1.2630 + } 1.2631 + } else if(action==MBCS_STATE_CHANGE_ONLY) { 1.2632 + /* 1.2633 + * This serves as a state change without any output. 1.2634 + * It is useful for reading simple stateful encodings, 1.2635 + * for example using just Shift-In/Shift-Out codes. 1.2636 + * The 21 unused bits may later be used for more sophisticated 1.2637 + * state transitions. 1.2638 + */ 1.2639 + if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 1.2640 + byteIndex=0; 1.2641 + } else { 1.2642 + /* SI/SO are illegal for DBCS-only conversion */ 1.2643 + state=(uint8_t)(cnv->mode); /* restore the previous state */ 1.2644 + 1.2645 + /* callback(illegal) */ 1.2646 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2647 + } 1.2648 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1.2649 + if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1.2650 + /* output BMP code point */ 1.2651 + *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2652 + if(offsets!=NULL) { 1.2653 + *offsets++=sourceIndex; 1.2654 + } 1.2655 + byteIndex=0; 1.2656 + } 1.2657 + } else if(action==MBCS_STATE_UNASSIGNED) { 1.2658 + /* just fall through */ 1.2659 + } else if(action==MBCS_STATE_ILLEGAL) { 1.2660 + /* callback(illegal) */ 1.2661 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2662 + } else { 1.2663 + /* reserved, must never occur */ 1.2664 + byteIndex=0; 1.2665 + } 1.2666 + 1.2667 + /* end of action codes: prepare for a new character */ 1.2668 + offset=0; 1.2669 + 1.2670 + if(byteIndex==0) { 1.2671 + sourceIndex=nextSourceIndex; 1.2672 + } else if(U_FAILURE(*pErrorCode)) { 1.2673 + /* callback(illegal) */ 1.2674 + if(byteIndex>1) { 1.2675 + /* 1.2676 + * Ticket 5691: consistent illegal sequences: 1.2677 + * - We include at least the first byte in the illegal sequence. 1.2678 + * - If any of the non-initial bytes could be the start of a character, 1.2679 + * we stop the illegal sequence before the first one of those. 
1.2680 + */ 1.2681 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 1.2682 + int8_t i; 1.2683 + for(i=1; 1.2684 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 1.2685 + ++i) {} 1.2686 + if(i<byteIndex) { 1.2687 + /* Back out some bytes. */ 1.2688 + int8_t backOutDistance=byteIndex-i; 1.2689 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 1.2690 + byteIndex=i; /* length of reported illegal byte sequence */ 1.2691 + if(backOutDistance<=bytesFromThisBuffer) { 1.2692 + source-=backOutDistance; 1.2693 + } else { 1.2694 + /* Back out bytes from the previous buffer: Need to replay them. */ 1.2695 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1.2696 + /* preToULength is negative! */ 1.2697 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 1.2698 + source=(const uint8_t *)pArgs->source; 1.2699 + } 1.2700 + } 1.2701 + } 1.2702 + break; 1.2703 + } else /* unassigned sequences indicated with byteIndex>0 */ { 1.2704 + /* try an extension mapping */ 1.2705 + pArgs->source=(const char *)source; 1.2706 + byteIndex=_extToU(cnv, cnv->sharedData, 1.2707 + byteIndex, &source, sourceLimit, 1.2708 + &target, targetLimit, 1.2709 + &offsets, sourceIndex, 1.2710 + pArgs->flush, 1.2711 + pErrorCode); 1.2712 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 1.2713 + 1.2714 + if(U_FAILURE(*pErrorCode)) { 1.2715 + /* not mappable or buffer overflow */ 1.2716 + break; 1.2717 + } 1.2718 + } 1.2719 + } 1.2720 + 1.2721 + /* set the converter state back into UConverter */ 1.2722 + cnv->toUnicodeStatus=offset; 1.2723 + cnv->mode=state; 1.2724 + cnv->toULength=byteIndex; 1.2725 + 1.2726 + /* write back the updated pointers */ 1.2727 + pArgs->source=(const char *)source; 1.2728 + pArgs->target=target; 1.2729 + pArgs->offsets=offsets; 1.2730 +} 1.2731 + 1.2732 +/* 1.2733 + * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 
1.2734 + * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 1.2735 + */ 1.2736 +static UChar32 1.2737 +ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 1.2738 + UErrorCode *pErrorCode) { 1.2739 + UConverter *cnv; 1.2740 + const int32_t (*stateTable)[256]; 1.2741 + const uint8_t *source, *sourceLimit; 1.2742 + 1.2743 + int32_t entry; 1.2744 + uint8_t action; 1.2745 + 1.2746 + /* set up the local pointers */ 1.2747 + cnv=pArgs->converter; 1.2748 + source=(const uint8_t *)pArgs->source; 1.2749 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.2750 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.2751 + stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1.2752 + } else { 1.2753 + stateTable=cnv->sharedData->mbcs.stateTable; 1.2754 + } 1.2755 + 1.2756 + /* conversion loop */ 1.2757 + while(source<sourceLimit) { 1.2758 + entry=stateTable[0][*source++]; 1.2759 + /* MBCS_ENTRY_IS_FINAL(entry) */ 1.2760 + 1.2761 + /* write back the updated pointer early so that we can return directly */ 1.2762 + pArgs->source=(const char *)source; 1.2763 + 1.2764 + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1.2765 + /* output BMP code point */ 1.2766 + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2767 + } 1.2768 + 1.2769 + /* 1.2770 + * An if-else-if chain provides more reliable performance for 1.2771 + * the most common cases compared to a switch. 
1.2772 + */ 1.2773 + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.2774 + if( action==MBCS_STATE_VALID_DIRECT_20 || 1.2775 + (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1.2776 + ) { 1.2777 + /* output supplementary code point */ 1.2778 + return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 1.2779 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1.2780 + if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1.2781 + /* output BMP code point */ 1.2782 + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2783 + } 1.2784 + } else if(action==MBCS_STATE_UNASSIGNED) { 1.2785 + /* just fall through */ 1.2786 + } else if(action==MBCS_STATE_ILLEGAL) { 1.2787 + /* callback(illegal) */ 1.2788 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2789 + } else { 1.2790 + /* reserved, must never occur */ 1.2791 + continue; 1.2792 + } 1.2793 + 1.2794 + if(U_FAILURE(*pErrorCode)) { 1.2795 + /* callback(illegal) */ 1.2796 + break; 1.2797 + } else /* unassigned sequence */ { 1.2798 + /* defer to the generic implementation */ 1.2799 + pArgs->source=(const char *)source-1; 1.2800 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.2801 + } 1.2802 + } 1.2803 + 1.2804 + /* no output because of empty input or only state changes */ 1.2805 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2806 + return 0xffff; 1.2807 +} 1.2808 + 1.2809 +/* 1.2810 + * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 1.2811 + * conversion without offset handling. 1.2812 + * 1.2813 + * When a character does not have a mapping to Unicode, then we return to the 1.2814 + * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 1.2815 + * handling. 1.2816 + * We also defer to the generic code in other complicated cases and have them 1.2817 + * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 1.2818 + * 1.2819 + * All normal mappings and errors are handled here. 
1.2820 + */ 1.2821 +static UChar32 1.2822 +ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 1.2823 + UErrorCode *pErrorCode) { 1.2824 + UConverter *cnv; 1.2825 + const uint8_t *source, *sourceLimit, *lastSource; 1.2826 + 1.2827 + const int32_t (*stateTable)[256]; 1.2828 + const uint16_t *unicodeCodeUnits; 1.2829 + 1.2830 + uint32_t offset; 1.2831 + uint8_t state; 1.2832 + 1.2833 + int32_t entry; 1.2834 + UChar32 c; 1.2835 + uint8_t action; 1.2836 + 1.2837 + /* use optimized function if possible */ 1.2838 + cnv=pArgs->converter; 1.2839 + 1.2840 + if(cnv->preToULength>0) { 1.2841 + /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 1.2842 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.2843 + } 1.2844 + 1.2845 + if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 1.2846 + /* 1.2847 + * Using the generic ucnv_getNextUChar() code lets us deal correctly 1.2848 + * with the rare case of a codepage that maps single surrogates 1.2849 + * without adding the complexity to this already complicated function here. 
1.2850 + */ 1.2851 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.2852 + } else if(cnv->sharedData->mbcs.countStates==1) { 1.2853 + return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 1.2854 + } 1.2855 + 1.2856 + /* set up the local pointers */ 1.2857 + source=lastSource=(const uint8_t *)pArgs->source; 1.2858 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.2859 + 1.2860 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.2861 + stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1.2862 + } else { 1.2863 + stateTable=cnv->sharedData->mbcs.stateTable; 1.2864 + } 1.2865 + unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 1.2866 + 1.2867 + /* get the converter state from UConverter */ 1.2868 + offset=cnv->toUnicodeStatus; 1.2869 + 1.2870 + /* 1.2871 + * if we are in the SBCS state for a DBCS-only converter, 1.2872 + * then load the DBCS state from the MBCS data 1.2873 + * (dbcsOnlyState==0 if it is not a DBCS-only converter) 1.2874 + */ 1.2875 + if((state=(uint8_t)(cnv->mode))==0) { 1.2876 + state=cnv->sharedData->mbcs.dbcsOnlyState; 1.2877 + } 1.2878 + 1.2879 + /* conversion loop */ 1.2880 + c=U_SENTINEL; 1.2881 + while(source<sourceLimit) { 1.2882 + entry=stateTable[state][*source++]; 1.2883 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.2884 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.2885 + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.2886 + 1.2887 + /* optimization for 1/2-byte input and BMP output */ 1.2888 + if( source<sourceLimit && 1.2889 + MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 1.2890 + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 1.2891 + (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 1.2892 + ) { 1.2893 + ++source; 1.2894 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2895 + /* output BMP code point */ 1.2896 + break; 1.2897 + } 1.2898 + } else { 1.2899 + /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 
1.2900 + cnv->mode=state; 1.2901 + 1.2902 + /* set the next state early so that we can reuse the entry variable */ 1.2903 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 1.2904 + 1.2905 + /* 1.2906 + * An if-else-if chain provides more reliable performance for 1.2907 + * the most common cases compared to a switch. 1.2908 + */ 1.2909 + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.2910 + if(action==MBCS_STATE_VALID_DIRECT_16) { 1.2911 + /* output BMP code point */ 1.2912 + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2913 + break; 1.2914 + } else if(action==MBCS_STATE_VALID_16) { 1.2915 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2916 + c=unicodeCodeUnits[offset]; 1.2917 + if(c<0xfffe) { 1.2918 + /* output BMP code point */ 1.2919 + break; 1.2920 + } else if(c==0xfffe) { 1.2921 + if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 1.2922 + break; 1.2923 + } 1.2924 + } else { 1.2925 + /* callback(illegal) */ 1.2926 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2927 + } 1.2928 + } else if(action==MBCS_STATE_VALID_16_PAIR) { 1.2929 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2930 + c=unicodeCodeUnits[offset++]; 1.2931 + if(c<0xd800) { 1.2932 + /* output BMP code point below 0xd800 */ 1.2933 + break; 1.2934 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 1.2935 + /* output roundtrip or fallback supplementary code point */ 1.2936 + c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 1.2937 + break; 1.2938 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 1.2939 + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 1.2940 + c=unicodeCodeUnits[offset]; 1.2941 + break; 1.2942 + } else if(c==0xffff) { 1.2943 + /* callback(illegal) */ 1.2944 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2945 + } 1.2946 + } else if(action==MBCS_STATE_VALID_DIRECT_20 || 1.2947 + (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1.2948 + ) { 1.2949 + /* output supplementary code point */ 1.2950 + c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 1.2951 + break; 1.2952 + } else if(action==MBCS_STATE_CHANGE_ONLY) { 1.2953 + /* 1.2954 + * This serves as a state change without any output. 1.2955 + * It is useful for reading simple stateful encodings, 1.2956 + * for example using just Shift-In/Shift-Out codes. 1.2957 + * The 21 unused bits may later be used for more sophisticated 1.2958 + * state transitions. 1.2959 + */ 1.2960 + if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 1.2961 + /* SI/SO are illegal for DBCS-only conversion */ 1.2962 + state=(uint8_t)(cnv->mode); /* restore the previous state */ 1.2963 + 1.2964 + /* callback(illegal) */ 1.2965 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2966 + } 1.2967 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1.2968 + if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1.2969 + /* output BMP code point */ 1.2970 + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.2971 + break; 1.2972 + } 1.2973 + } else if(action==MBCS_STATE_UNASSIGNED) { 1.2974 + /* just fall through */ 1.2975 + } else if(action==MBCS_STATE_ILLEGAL) { 1.2976 + /* callback(illegal) */ 1.2977 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.2978 + } else { 1.2979 + /* reserved (must never occur), or only state change */ 1.2980 + offset=0; 1.2981 + lastSource=source; 1.2982 + continue; 1.2983 + } 1.2984 + 1.2985 + /* end of action codes: prepare for a new character */ 1.2986 + offset=0; 1.2987 + 1.2988 + if(U_FAILURE(*pErrorCode)) { 1.2989 + /* callback(illegal) */ 1.2990 + break; 1.2991 + } else 
/* unassigned sequence */ { 1.2992 + /* defer to the generic implementation */ 1.2993 + cnv->toUnicodeStatus=0; 1.2994 + cnv->mode=state; 1.2995 + pArgs->source=(const char *)lastSource; 1.2996 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.2997 + } 1.2998 + } 1.2999 + } 1.3000 + 1.3001 + if(c<0) { 1.3002 + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 1.3003 + /* incomplete character byte sequence */ 1.3004 + uint8_t *bytes=cnv->toUBytes; 1.3005 + cnv->toULength=(int8_t)(source-lastSource); 1.3006 + do { 1.3007 + *bytes++=*lastSource++; 1.3008 + } while(lastSource<source); 1.3009 + *pErrorCode=U_TRUNCATED_CHAR_FOUND; 1.3010 + } else if(U_FAILURE(*pErrorCode)) { 1.3011 + /* callback(illegal) */ 1.3012 + /* 1.3013 + * Ticket 5691: consistent illegal sequences: 1.3014 + * - We include at least the first byte in the illegal sequence. 1.3015 + * - If any of the non-initial bytes could be the start of a character, 1.3016 + * we stop the illegal sequence before the first one of those. 
1.3017 + */ 1.3018 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 1.3019 + uint8_t *bytes=cnv->toUBytes; 1.3020 + *bytes++=*lastSource++; /* first byte */ 1.3021 + if(lastSource==source) { 1.3022 + cnv->toULength=1; 1.3023 + } else /* lastSource<source: multi-byte character */ { 1.3024 + int8_t i; 1.3025 + for(i=1; 1.3026 + lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 1.3027 + ++i 1.3028 + ) { 1.3029 + *bytes++=*lastSource++; 1.3030 + } 1.3031 + cnv->toULength=i; 1.3032 + source=lastSource; 1.3033 + } 1.3034 + } else { 1.3035 + /* no output because of empty input or only state changes */ 1.3036 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.3037 + } 1.3038 + c=0xffff; 1.3039 + } 1.3040 + 1.3041 + /* set the converter state back into UConverter, ready for a new character */ 1.3042 + cnv->toUnicodeStatus=0; 1.3043 + cnv->mode=state; 1.3044 + 1.3045 + /* write back the updated pointer */ 1.3046 + pArgs->source=(const char *)source; 1.3047 + return c; 1.3048 +} 1.3049 + 1.3050 +#if 0 1.3051 +/* 1.3052 + * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 1.3053 + * Removal improves code coverage. 1.3054 + */ 1.3055 +/** 1.3056 + * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 1.3057 + * It does not handle the EBCDIC swaplfnl option (set in UConverter). 1.3058 + * It does not handle conversion extensions (_extToU()). 
1.3059 + */ 1.3060 +U_CFUNC UChar32 1.3061 +ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 1.3062 + uint8_t b, UBool useFallback) { 1.3063 + int32_t entry; 1.3064 + uint8_t action; 1.3065 + 1.3066 + entry=sharedData->mbcs.stateTable[0][b]; 1.3067 + /* MBCS_ENTRY_IS_FINAL(entry) */ 1.3068 + 1.3069 + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1.3070 + /* output BMP code point */ 1.3071 + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3072 + } 1.3073 + 1.3074 + /* 1.3075 + * An if-else-if chain provides more reliable performance for 1.3076 + * the most common cases compared to a switch. 1.3077 + */ 1.3078 + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.3079 + if(action==MBCS_STATE_VALID_DIRECT_20) { 1.3080 + /* output supplementary code point */ 1.3081 + return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 1.3082 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1.3083 + if(!TO_U_USE_FALLBACK(useFallback)) { 1.3084 + return 0xfffe; 1.3085 + } 1.3086 + /* output BMP code point */ 1.3087 + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3088 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 1.3089 + if(!TO_U_USE_FALLBACK(useFallback)) { 1.3090 + return 0xfffe; 1.3091 + } 1.3092 + /* output supplementary code point */ 1.3093 + return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 1.3094 + } else if(action==MBCS_STATE_UNASSIGNED) { 1.3095 + return 0xfffe; 1.3096 + } else if(action==MBCS_STATE_ILLEGAL) { 1.3097 + return 0xffff; 1.3098 + } else { 1.3099 + /* reserved, must never occur */ 1.3100 + return 0xffff; 1.3101 + } 1.3102 +} 1.3103 +#endif 1.3104 + 1.3105 +/* 1.3106 + * This is a simple version of _MBCSGetNextUChar() that is used 1.3107 + * by other converter implementations. 1.3108 + * It only returns an "assigned" result if it consumes the entire input. 1.3109 + * It does not use state from the converter, nor error codes. 1.3110 + * It does not handle the EBCDIC swaplfnl option (set in UConverter). 
1.3111 + * It handles conversion extensions but not GB 18030. 1.3112 + * 1.3113 + * Return value: 1.3114 + * U+fffe unassigned 1.3115 + * U+ffff illegal 1.3116 + * otherwise the Unicode code point 1.3117 + */ 1.3118 +U_CFUNC UChar32 1.3119 +ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 1.3120 + const char *source, int32_t length, 1.3121 + UBool useFallback) { 1.3122 + const int32_t (*stateTable)[256]; 1.3123 + const uint16_t *unicodeCodeUnits; 1.3124 + 1.3125 + uint32_t offset; 1.3126 + uint8_t state, action; 1.3127 + 1.3128 + UChar32 c; 1.3129 + int32_t i, entry; 1.3130 + 1.3131 + if(length<=0) { 1.3132 + /* no input at all: "illegal" */ 1.3133 + return 0xffff; 1.3134 + } 1.3135 + 1.3136 +#if 0 1.3137 +/* 1.3138 + * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 1.3139 + * TODO In future releases, verify that this function is never called for SBCS 1.3140 + * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 1.3141 + * Removal improves code coverage. 
1.3142 + */ 1.3143 + /* use optimized function if possible */ 1.3144 + if(sharedData->mbcs.countStates==1) { 1.3145 + if(length==1) { 1.3146 + return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 1.3147 + } else { 1.3148 + return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 1.3149 + } 1.3150 + } 1.3151 +#endif 1.3152 + 1.3153 + /* set up the local pointers */ 1.3154 + stateTable=sharedData->mbcs.stateTable; 1.3155 + unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 1.3156 + 1.3157 + /* converter state */ 1.3158 + offset=0; 1.3159 + state=sharedData->mbcs.dbcsOnlyState; 1.3160 + 1.3161 + /* conversion loop */ 1.3162 + for(i=0;;) { 1.3163 + entry=stateTable[state][(uint8_t)source[i++]]; 1.3164 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.3165 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.3166 + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.3167 + 1.3168 + if(i==length) { 1.3169 + return 0xffff; /* truncated character */ 1.3170 + } 1.3171 + } else { 1.3172 + /* 1.3173 + * An if-else-if chain provides more reliable performance for 1.3174 + * the most common cases compared to a switch. 
1.3175 + */ 1.3176 + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1.3177 + if(action==MBCS_STATE_VALID_16) { 1.3178 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3179 + c=unicodeCodeUnits[offset]; 1.3180 + if(c!=0xfffe) { 1.3181 + /* done */ 1.3182 + } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1.3183 + c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 1.3184 + /* else done with 0xfffe */ 1.3185 + } 1.3186 + break; 1.3187 + } else if(action==MBCS_STATE_VALID_DIRECT_16) { 1.3188 + /* output BMP code point */ 1.3189 + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3190 + break; 1.3191 + } else if(action==MBCS_STATE_VALID_16_PAIR) { 1.3192 + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3193 + c=unicodeCodeUnits[offset++]; 1.3194 + if(c<0xd800) { 1.3195 + /* output BMP code point below 0xd800 */ 1.3196 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 1.3197 + /* output roundtrip or fallback supplementary code point */ 1.3198 + c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 1.3199 + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 1.3200 + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 1.3201 + c=unicodeCodeUnits[offset]; 1.3202 + } else if(c==0xffff) { 1.3203 + return 0xffff; 1.3204 + } else { 1.3205 + c=0xfffe; 1.3206 + } 1.3207 + break; 1.3208 + } else if(action==MBCS_STATE_VALID_DIRECT_20) { 1.3209 + /* output supplementary code point */ 1.3210 + c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 1.3211 + break; 1.3212 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1.3213 + if(!TO_U_USE_FALLBACK(useFallback)) { 1.3214 + c=0xfffe; 1.3215 + break; 1.3216 + } 1.3217 + /* output BMP code point */ 1.3218 + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1.3219 + break; 1.3220 + } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 1.3221 + if(!TO_U_USE_FALLBACK(useFallback)) { 1.3222 + c=0xfffe; 1.3223 + break; 1.3224 + } 1.3225 + /* output supplementary code point */ 1.3226 + c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 1.3227 + break; 1.3228 + } else if(action==MBCS_STATE_UNASSIGNED) { 1.3229 + c=0xfffe; 1.3230 + break; 1.3231 + } 1.3232 + 1.3233 + /* 1.3234 + * forbid MBCS_STATE_CHANGE_ONLY for this function, 1.3235 + * and MBCS_STATE_ILLEGAL and reserved action codes 1.3236 + */ 1.3237 + return 0xffff; 1.3238 + } 1.3239 + } 1.3240 + 1.3241 + if(i!=length) { 1.3242 + /* illegal for this function: not all input consumed */ 1.3243 + return 0xffff; 1.3244 + } 1.3245 + 1.3246 + if(c==0xfffe) { 1.3247 + /* try an extension mapping */ 1.3248 + const int32_t *cx=sharedData->mbcs.extIndexes; 1.3249 + if(cx!=NULL) { 1.3250 + return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 1.3251 + } 1.3252 + } 1.3253 + 1.3254 + return c; 1.3255 +} 1.3256 + 1.3257 +/* MBCS-from-Unicode conversion functions ----------------------------------- */ 1.3258 + 1.3259 +/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. 
*/ 1.3260 +static void 1.3261 +ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.3262 + UErrorCode *pErrorCode) { 1.3263 + UConverter *cnv; 1.3264 + const UChar *source, *sourceLimit; 1.3265 + uint8_t *target; 1.3266 + int32_t targetCapacity; 1.3267 + int32_t *offsets; 1.3268 + 1.3269 + const uint16_t *table; 1.3270 + const uint16_t *mbcsIndex; 1.3271 + const uint8_t *bytes; 1.3272 + 1.3273 + UChar32 c; 1.3274 + 1.3275 + int32_t sourceIndex, nextSourceIndex; 1.3276 + 1.3277 + uint32_t stage2Entry; 1.3278 + uint32_t asciiRoundtrips; 1.3279 + uint32_t value; 1.3280 + uint8_t unicodeMask; 1.3281 + 1.3282 + /* use optimized function if possible */ 1.3283 + cnv=pArgs->converter; 1.3284 + unicodeMask=cnv->sharedData->mbcs.unicodeMask; 1.3285 + 1.3286 + /* set up the local pointers */ 1.3287 + source=pArgs->source; 1.3288 + sourceLimit=pArgs->sourceLimit; 1.3289 + target=(uint8_t *)pArgs->target; 1.3290 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.3291 + offsets=pArgs->offsets; 1.3292 + 1.3293 + table=cnv->sharedData->mbcs.fromUnicodeTable; 1.3294 + mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 1.3295 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.3296 + bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.3297 + } else { 1.3298 + bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 1.3299 + } 1.3300 + asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 1.3301 + 1.3302 + /* get the converter state from UConverter */ 1.3303 + c=cnv->fromUChar32; 1.3304 + 1.3305 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.3306 + sourceIndex= c==0 ? 0 : -1; 1.3307 + nextSourceIndex=0; 1.3308 + 1.3309 + /* conversion loop */ 1.3310 + if(c!=0 && targetCapacity>0) { 1.3311 + goto getTrail; 1.3312 + } 1.3313 + 1.3314 + while(source<sourceLimit) { 1.3315 + /* 1.3316 + * This following test is to see if available input would overflow the output. 
1.3317 + * It does not catch output of more than one byte that 1.3318 + * overflows as a result of a multi-byte character or callback output 1.3319 + * from the last source character. 1.3320 + * Therefore, those situations also test for overflows and will 1.3321 + * then break the loop, too. 1.3322 + */ 1.3323 + if(targetCapacity>0) { 1.3324 + /* 1.3325 + * Get a correct Unicode code point: 1.3326 + * a single UChar for a BMP code point or 1.3327 + * a matched surrogate pair for a "supplementary code point". 1.3328 + */ 1.3329 + c=*source++; 1.3330 + ++nextSourceIndex; 1.3331 + if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 1.3332 + *target++=(uint8_t)c; 1.3333 + if(offsets!=NULL) { 1.3334 + *offsets++=sourceIndex; 1.3335 + sourceIndex=nextSourceIndex; 1.3336 + } 1.3337 + --targetCapacity; 1.3338 + c=0; 1.3339 + continue; 1.3340 + } 1.3341 + /* 1.3342 + * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 1.3343 + * to avoid dealing with surrogates. 1.3344 + * MBCS_FAST_MAX must be >=0xd7ff. 1.3345 + */ 1.3346 + if(c<=0xd7ff) { 1.3347 + value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 1.3348 + /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 1.3349 + if(value==0) { 1.3350 + goto unassigned; 1.3351 + } 1.3352 + /* output the value */ 1.3353 + } else { 1.3354 + /* 1.3355 + * This also tests if the codepage maps single surrogates. 1.3356 + * If it does, then surrogates are not paired but mapped separately. 1.3357 + * Note that in this case unmatched surrogates are not detected. 
1.3358 + */ 1.3359 + if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 1.3360 + if(U16_IS_SURROGATE_LEAD(c)) { 1.3361 +getTrail: 1.3362 + if(source<sourceLimit) { 1.3363 + /* test the following code unit */ 1.3364 + UChar trail=*source; 1.3365 + if(U16_IS_TRAIL(trail)) { 1.3366 + ++source; 1.3367 + ++nextSourceIndex; 1.3368 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.3369 + if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1.3370 + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1.3371 + /* callback(unassigned) */ 1.3372 + goto unassigned; 1.3373 + } 1.3374 + /* convert this supplementary code point */ 1.3375 + /* exit this condition tree */ 1.3376 + } else { 1.3377 + /* this is an unmatched lead code unit (1st surrogate) */ 1.3378 + /* callback(illegal) */ 1.3379 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3380 + break; 1.3381 + } 1.3382 + } else { 1.3383 + /* no more input */ 1.3384 + break; 1.3385 + } 1.3386 + } else { 1.3387 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.3388 + /* callback(illegal) */ 1.3389 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3390 + break; 1.3391 + } 1.3392 + } 1.3393 + 1.3394 + /* convert the Unicode code point in c into codepage bytes */ 1.3395 + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1.3396 + 1.3397 + /* get the bytes and the length for the output */ 1.3398 + /* MBCS_OUTPUT_2 */ 1.3399 + value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 1.3400 + 1.3401 + /* is this code point assigned, or do we use fallbacks? */ 1.3402 + if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 1.3403 + (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 1.3404 + ) { 1.3405 + /* 1.3406 + * We allow a 0 byte output if the "assigned" bit is set for this entry. 1.3407 + * There is no way with this data structure for fallback output 1.3408 + * to be a zero byte. 
1.3409 + */ 1.3410 + 1.3411 +unassigned: 1.3412 + /* try an extension mapping */ 1.3413 + pArgs->source=source; 1.3414 + c=_extFromU(cnv, cnv->sharedData, 1.3415 + c, &source, sourceLimit, 1.3416 + &target, target+targetCapacity, 1.3417 + &offsets, sourceIndex, 1.3418 + pArgs->flush, 1.3419 + pErrorCode); 1.3420 + nextSourceIndex+=(int32_t)(source-pArgs->source); 1.3421 + 1.3422 + if(U_FAILURE(*pErrorCode)) { 1.3423 + /* not mappable or buffer overflow */ 1.3424 + break; 1.3425 + } else { 1.3426 + /* a mapping was written to the target, continue */ 1.3427 + 1.3428 + /* recalculate the targetCapacity after an extension mapping */ 1.3429 + targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 1.3430 + 1.3431 + /* normal end of conversion: prepare for a new character */ 1.3432 + sourceIndex=nextSourceIndex; 1.3433 + continue; 1.3434 + } 1.3435 + } 1.3436 + } 1.3437 + 1.3438 + /* write the output character bytes from value and length */ 1.3439 + /* from the first if in the loop we know that targetCapacity>0 */ 1.3440 + if(value<=0xff) { 1.3441 + /* this is easy because we know that there is enough space */ 1.3442 + *target++=(uint8_t)value; 1.3443 + if(offsets!=NULL) { 1.3444 + *offsets++=sourceIndex; 1.3445 + } 1.3446 + --targetCapacity; 1.3447 + } else /* length==2 */ { 1.3448 + *target++=(uint8_t)(value>>8); 1.3449 + if(2<=targetCapacity) { 1.3450 + *target++=(uint8_t)value; 1.3451 + if(offsets!=NULL) { 1.3452 + *offsets++=sourceIndex; 1.3453 + *offsets++=sourceIndex; 1.3454 + } 1.3455 + targetCapacity-=2; 1.3456 + } else { 1.3457 + if(offsets!=NULL) { 1.3458 + *offsets++=sourceIndex; 1.3459 + } 1.3460 + cnv->charErrorBuffer[0]=(char)value; 1.3461 + cnv->charErrorBufferLength=1; 1.3462 + 1.3463 + /* target overflow */ 1.3464 + targetCapacity=0; 1.3465 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.3466 + c=0; 1.3467 + break; 1.3468 + } 1.3469 + } 1.3470 + 1.3471 + /* normal end of conversion: prepare for a new character */ 1.3472 + c=0; 1.3473 + 
sourceIndex=nextSourceIndex; 1.3474 + continue; 1.3475 + } else { 1.3476 + /* target is full */ 1.3477 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.3478 + break; 1.3479 + } 1.3480 + } 1.3481 + 1.3482 + /* set the converter state back into UConverter */ 1.3483 + cnv->fromUChar32=c; 1.3484 + 1.3485 + /* write back the updated pointers */ 1.3486 + pArgs->source=source; 1.3487 + pArgs->target=(char *)target; 1.3488 + pArgs->offsets=offsets; 1.3489 +} 1.3490 + 1.3491 +/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. It reads UTF-16 code units from pArgs->source and writes one byte per table-mapped code point to pArgs->target, recording the source index of each output byte when pArgs->offsets is not NULL. Code points whose table result is below minValue, and supplementary code points when the codepage has none, are handed to _extFromU(). *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for a lead surrogate followed by a non-trail unit or for a lone trail surrogate, and to U_BUFFER_OVERFLOW_ERROR when input remains but targetCapacity is 0. A lead surrogate at the very end of the input is not an error here: it stays in c. On return c is stored in cnv->fromUChar32 and the advanced source, target and offsets pointers are written back to pArgs. */ 1.3492 +static void 1.3493 +ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.3494 + UErrorCode *pErrorCode) { 1.3495 + UConverter *cnv; 1.3496 + const UChar *source, *sourceLimit; 1.3497 + uint8_t *target; 1.3498 + int32_t targetCapacity; 1.3499 + int32_t *offsets; 1.3500 + 1.3501 + const uint16_t *table; 1.3502 + const uint16_t *results; 1.3503 + 1.3504 + UChar32 c; 1.3505 + 1.3506 + int32_t sourceIndex, nextSourceIndex; 1.3507 + 1.3508 + uint16_t value, minValue; 1.3509 + UBool hasSupplementary; 1.3510 + 1.3511 + /* set up the local pointers */ 1.3512 + cnv=pArgs->converter; 1.3513 + source=pArgs->source; 1.3514 + sourceLimit=pArgs->sourceLimit; 1.3515 + target=(uint8_t *)pArgs->target; 1.3516 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.3517 + offsets=pArgs->offsets; 1.3518 + 1.3519 + table=cnv->sharedData->mbcs.fromUnicodeTable; /* the result array depends on whether the UCNV_OPTION_SWAP_LFNL option bit is set */ 1.3520 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.3521 + results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.3522 + } else { 1.3523 + results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 1.3524 + } 1.3525 + /* minValue is the smallest table result that is accepted as a mapping below. NOTE(review): the thresholds suggest that results carry flag bits above the low byte; confirm the layout in ucnvmbcs.h */ 1.3526 + if(cnv->useFallback) { 1.3527 + /* use all roundtrip and fallback results */ 1.3528 + minValue=0x800; 1.3529 + } else { 1.3530 + /* use only roundtrips and fallbacks from private-use characters */ 1.3531 + minValue=0xc00; 1.3532 + } 1.3533 + 
hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 1.3534 + 1.3535 + /* get the converter state from UConverter */ 1.3536 + c=cnv->fromUChar32; 1.3537 + 1.3538 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.3539 + sourceIndex= c==0 ? 0 : -1; 1.3540 + nextSourceIndex=0; 1.3541 + 1.3542 + /* conversion loop; a non-zero c was left pending by an earlier call, so jump straight to the trail surrogate check inside the loop */ 1.3543 + if(c!=0 && targetCapacity>0) { 1.3544 + goto getTrail; 1.3545 + } 1.3546 + 1.3547 + while(source<sourceLimit) { 1.3548 + /* 1.3549 + * The following test is to see if available input would overflow the output. 1.3550 + * It does not catch output of more than one byte that 1.3551 + * overflows as a result of a multi-byte character or callback output 1.3552 + * from the last source character. 1.3553 + * Therefore, those situations also test for overflows and will 1.3554 + * then break the loop, too. 1.3555 + */ 1.3556 + if(targetCapacity>0) { 1.3557 + /* 1.3558 + * Get a correct Unicode code point: 1.3559 + * a single UChar for a BMP code point or 1.3560 + * a matched surrogate pair for a "supplementary code point". 
1.3561 + */ 1.3562 + c=*source++; 1.3563 + ++nextSourceIndex; 1.3564 + if(U16_IS_SURROGATE(c)) { 1.3565 + if(U16_IS_SURROGATE_LEAD(c)) { 1.3566 +getTrail: /* also entered by the goto before the loop, with c taken from cnv->fromUChar32 */ 1.3567 + if(source<sourceLimit) { 1.3568 + /* test the following code unit */ 1.3569 + UChar trail=*source; 1.3570 + if(U16_IS_TRAIL(trail)) { 1.3571 + ++source; 1.3572 + ++nextSourceIndex; 1.3573 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.3574 + if(!hasSupplementary) { 1.3575 + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1.3576 + /* callback(unassigned) */ 1.3577 + goto unassigned; 1.3578 + } 1.3579 + /* convert this supplementary code point */ 1.3580 + /* exit this condition tree */ 1.3581 + } else { 1.3582 + /* this is an unmatched lead code unit (1st surrogate) */ 1.3583 + /* callback(illegal) */ 1.3584 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3585 + break; 1.3586 + } 1.3587 + } else { 1.3588 + /* no more input: c keeps the lead surrogate and is saved in cnv->fromUChar32 after the loop */ 1.3589 + break; 1.3590 + } 1.3591 + } else { 1.3592 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.3593 + /* callback(illegal) */ 1.3594 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3595 + break; 1.3596 + } 1.3597 + } 1.3598 + 1.3599 + /* convert the Unicode code point in c into codepage bytes */ 1.3600 + value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3601 + 1.3602 + /* is this code point assigned, or do we use fallbacks? 
*/ 1.3603 + if(value>=minValue) { 1.3604 + /* assigned, write the output character bytes from value and length */ 1.3605 + /* length==1 */ 1.3606 + /* this is easy because we know that there is enough space */ 1.3607 + *target++=(uint8_t)value; 1.3608 + if(offsets!=NULL) { 1.3609 + *offsets++=sourceIndex; 1.3610 + } 1.3611 + --targetCapacity; 1.3612 + 1.3613 + /* normal end of conversion: prepare for a new character */ 1.3614 + c=0; 1.3615 + sourceIndex=nextSourceIndex; 1.3616 + } else { /* unassigned */ 1.3617 +unassigned: 1.3618 + /* try an extension mapping; pArgs->source is set first so that the number of extra code units consumed by _extFromU can be computed afterwards */ 1.3619 + pArgs->source=source; 1.3620 + c=_extFromU(cnv, cnv->sharedData, 1.3621 + c, &source, sourceLimit, 1.3622 + &target, target+targetCapacity, 1.3623 + &offsets, sourceIndex, 1.3624 + pArgs->flush, 1.3625 + pErrorCode); 1.3626 + nextSourceIndex+=(int32_t)(source-pArgs->source); 1.3627 + 1.3628 + if(U_FAILURE(*pErrorCode)) { 1.3629 + /* not mappable or buffer overflow */ 1.3630 + break; 1.3631 + } else { 1.3632 + /* a mapping was written to the target, continue */ /* NOTE(review): c is not reset here, it holds the return value of _extFromU; presumably 0 after a successful mapping, confirm in ucnv_ext */ 1.3633 + 1.3634 + /* recalculate the targetCapacity after an extension mapping */ 1.3635 + targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 1.3636 + 1.3637 + /* normal end of conversion: prepare for a new character */ 1.3638 + sourceIndex=nextSourceIndex; 1.3639 + } 1.3640 + } 1.3641 + } else { 1.3642 + /* target is full */ 1.3643 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.3644 + break; 1.3645 + } 1.3646 + } 1.3647 + 1.3648 + /* set the converter state back into UConverter */ 1.3649 + cnv->fromUChar32=c; 1.3650 + 1.3651 + /* write back the updated pointers */ 1.3652 + pArgs->source=source; 1.3653 + pArgs->target=(char *)target; 1.3654 + pArgs->offsets=offsets; 1.3655 +} 1.3656 + 1.3657 +/* 1.3658 + * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 1.3659 + * that map only to and from the BMP. 
1.3660 + * In addition to single-byte/state optimizations, the offset calculations 1.3661 + * become much easier. 1.3662 + * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 1.3663 + * but measurements have shown that this diminishes performance 1.3664 + * in more cases than it improves it. 1.3665 + * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 1.3666 + * for various MBCS and SBCS optimizations. 1.3667 + */ 1.3668 +static void 1.3669 +ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.3670 + UErrorCode *pErrorCode) { /* Converts UTF-16 code units from pArgs->source into one byte per table-mapped code unit at pArgs->target. The loop counts down a single targetCapacity that is clamped to the remaining source length, and offsets are not written per character: they are filled in from lastSource up to source before each _extFromU() call and once more after the loop. Surrogates are only examined after the table lookup yields a result below minValue. *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for a lead surrogate followed by a non-trail unit or for a lone trail surrogate, to U_TRUNCATED_CHAR_FOUND for a lead surrogate at the end of the input when pArgs->flush is set, and to U_BUFFER_OVERFLOW_ERROR when input remains and target has reached pArgs->targetLimit. On return c is stored in cnv->fromUChar32 and the advanced pointers are written back to pArgs. */ 1.3671 + UConverter *cnv; 1.3672 + const UChar *source, *sourceLimit, *lastSource; 1.3673 + uint8_t *target; 1.3674 + int32_t targetCapacity, length; 1.3675 + int32_t *offsets; 1.3676 + 1.3677 + const uint16_t *table; 1.3678 + const uint16_t *results; 1.3679 + 1.3680 + UChar32 c; 1.3681 + 1.3682 + int32_t sourceIndex; 1.3683 + 1.3684 + uint32_t asciiRoundtrips; 1.3685 + uint16_t value, minValue; 1.3686 + 1.3687 + /* set up the local pointers */ 1.3688 + cnv=pArgs->converter; 1.3689 + source=pArgs->source; 1.3690 + sourceLimit=pArgs->sourceLimit; 1.3691 + target=(uint8_t *)pArgs->target; 1.3692 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.3693 + offsets=pArgs->offsets; 1.3694 + 1.3695 + table=cnv->sharedData->mbcs.fromUnicodeTable; /* the result array depends on whether the UCNV_OPTION_SWAP_LFNL option bit is set */ 1.3696 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.3697 + results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.3698 + } else { 1.3699 + results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 1.3700 + } 1.3701 + asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 1.3702 + /* minValue is the smallest table result that is accepted as a mapping below */ 1.3703 + if(cnv->useFallback) { 1.3704 + /* use all roundtrip and fallback results */ 1.3705 + minValue=0x800; 1.3706 + } else { 1.3707 + /* use only roundtrips and fallbacks from private-use characters */ 1.3708 + minValue=0xc00; 1.3709 + } 1.3710 + 1.3711 + /* get the converter state from UConverter */ 1.3712 + c=cnv->fromUChar32; 1.3713 
+ 1.3714 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.3715 + sourceIndex= c==0 ? 0 : -1; 1.3716 + lastSource=source; /* lastSource marks the first code unit whose offset has not been written yet */ 1.3717 + 1.3718 + /* 1.3719 + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 1.3720 + * for the minimum of the sourceLength and targetCapacity 1.3721 + */ 1.3722 + length=(int32_t)(sourceLimit-source); 1.3723 + if(length<targetCapacity) { 1.3724 + targetCapacity=length; 1.3725 + } 1.3726 + 1.3727 + /* conversion loop; a non-zero c was left pending by an earlier call, so jump straight to the trail surrogate check inside the loop */ 1.3728 + if(c!=0 && targetCapacity>0) { 1.3729 + goto getTrail; 1.3730 + } 1.3731 + 1.3732 +#if MBCS_UNROLL_SINGLE_FROM_BMP 1.3733 + /* unrolling makes it slower on Pentium III/Windows 2000?! */ /* this section is compiled out while MBCS_UNROLL_SINGLE_FROM_BMP is defined as 0 near the top of this file */ 1.3734 + /* unroll the loop with the most common case */ 1.3735 +unrolled: 1.3736 + if(targetCapacity>=4) { 1.3737 + int32_t count, loops; 1.3738 + uint16_t andedValues; 1.3739 + 1.3740 + loops=count=targetCapacity>>2; 1.3741 + do { /* the four output bytes are stored before they are validated; andedValues accumulates the bitwise AND of the four results */ 1.3742 + c=*source++; 1.3743 + andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3744 + *target++=(uint8_t)value; 1.3745 + c=*source++; 1.3746 + andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3747 + *target++=(uint8_t)value; 1.3748 + c=*source++; 1.3749 + andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3750 + *target++=(uint8_t)value; 1.3751 + c=*source++; 1.3752 + andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3753 + *target++=(uint8_t)value; 1.3754 + 1.3755 + /* were all 4 entries really valid? 
*/ 1.3756 + if(andedValues<minValue) { 1.3757 + /* no, return to the first of these 4 */ 1.3758 + source-=4; 1.3759 + target-=4; 1.3760 + break; 1.3761 + } 1.3762 + } while(--count>0); 1.3763 + count=loops-count; /* number of groups of 4 that were fully accepted */ 1.3764 + targetCapacity-=4*count; 1.3765 + 1.3766 + if(offsets!=NULL) { 1.3767 + lastSource+=4*count; 1.3768 + while(count>0) { 1.3769 + *offsets++=sourceIndex++; 1.3770 + *offsets++=sourceIndex++; 1.3771 + *offsets++=sourceIndex++; 1.3772 + *offsets++=sourceIndex++; 1.3773 + --count; 1.3774 + } 1.3775 + } 1.3776 + 1.3777 + c=0; 1.3778 + } 1.3779 +#endif 1.3780 + /* targetCapacity never exceeds the remaining source length at the top of this loop (it is clamped before the loop and after every extension call), so reading *source needs no separate bounds check */ 1.3781 + while(targetCapacity>0) { 1.3782 + /* 1.3783 + * Get a correct Unicode code point: 1.3784 + * a single UChar for a BMP code point or 1.3785 + * a matched surrogate pair for a "supplementary code point". 1.3786 + */ 1.3787 + c=*source++; 1.3788 + /* 1.3789 + * Do not immediately check for single surrogates: 1.3790 + * Assume that they are unassigned and check for them in that case. 1.3791 + * This speeds up the conversion of assigned characters. 1.3792 + */ 1.3793 + /* convert the Unicode code point in c into codepage bytes */ /* fast path: when IS_ASCII_ROUNDTRIP() is true for c, the code unit itself is the output byte and the table lookup is skipped */ 1.3794 + if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 1.3795 + *target++=(uint8_t)c; 1.3796 + --targetCapacity; 1.3797 + c=0; 1.3798 + continue; 1.3799 + } 1.3800 + value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.3801 + /* is this code point assigned, or do we use fallbacks? 
*/ 1.3802 + if(value>=minValue) { 1.3803 + /* assigned, write the output character bytes from value and length */ 1.3804 + /* length==1 */ 1.3805 + /* this is easy because we know that there is enough space */ 1.3806 + *target++=(uint8_t)value; 1.3807 + --targetCapacity; 1.3808 + 1.3809 + /* normal end of conversion: prepare for a new character */ 1.3810 + c=0; 1.3811 + continue; 1.3812 + } else if(!U16_IS_SURROGATE(c)) { 1.3813 + /* normal, unassigned BMP character; falls through to the extension lookup below */ 1.3814 + } else if(U16_IS_SURROGATE_LEAD(c)) { 1.3815 +getTrail: /* also entered by the goto before the loop, with c taken from cnv->fromUChar32 */ 1.3816 + if(source<sourceLimit) { 1.3817 + /* test the following code unit */ 1.3818 + UChar trail=*source; 1.3819 + if(U16_IS_TRAIL(trail)) { 1.3820 + ++source; 1.3821 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.3822 + /* this codepage does not map supplementary code points */ 1.3823 + /* callback(unassigned) */ 1.3824 + } else { 1.3825 + /* this is an unmatched lead code unit (1st surrogate) */ 1.3826 + /* callback(illegal) */ 1.3827 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3828 + break; 1.3829 + } 1.3830 + } else { 1.3831 + /* no more input: c keeps the lead surrogate; it is reported as truncated only when the caller is flushing */ 1.3832 + if (pArgs->flush) { 1.3833 + *pErrorCode=U_TRUNCATED_CHAR_FOUND; 1.3834 + } 1.3835 + break; 1.3836 + } 1.3837 + } else { 1.3838 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.3839 + /* callback(illegal) */ 1.3840 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.3841 + break; 1.3842 + } 1.3843 + 1.3844 + /* c does not have a mapping */ 1.3845 + 1.3846 + /* get the number of code units for c to correctly advance sourceIndex */ 1.3847 + length=U16_LENGTH(c); 1.3848 + 1.3849 + /* set offsets since the start or the last extension */ 1.3850 + if(offsets!=NULL) { 1.3851 + int32_t count=(int32_t)(source-lastSource); /* code units consumed since offsets were last written */ 1.3852 + 1.3853 + /* do not set the offset for this character */ 1.3854 + count-=length; 1.3855 + 1.3856 + while(count>0) { 1.3857 + *offsets++=sourceIndex++; 1.3858 + --count; 1.3859 + } 1.3860 + /* offsets and sourceIndex are now set for the current character */ 1.3861 + } 1.3862 + 
1.3863 + /* try an extension mapping; the real target limit is passed here, not the clamped targetCapacity */ 1.3864 + lastSource=source; 1.3865 + c=_extFromU(cnv, cnv->sharedData, 1.3866 + c, &source, sourceLimit, 1.3867 + &target, (const uint8_t *)(pArgs->targetLimit), 1.3868 + &offsets, sourceIndex, 1.3869 + pArgs->flush, 1.3870 + pErrorCode); /* advance sourceIndex past the current character (length units) plus any further units that _extFromU consumed */ 1.3871 + sourceIndex+=length+(int32_t)(source-lastSource); 1.3872 + lastSource=source; 1.3873 + 1.3874 + if(U_FAILURE(*pErrorCode)) { 1.3875 + /* not mappable or buffer overflow */ 1.3876 + break; 1.3877 + } else { 1.3878 + /* a mapping was written to the target, continue */ 1.3879 + 1.3880 + /* recalculate the targetCapacity after an extension mapping */ 1.3881 + targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 1.3882 + length=(int32_t)(sourceLimit-source); 1.3883 + if(length<targetCapacity) { 1.3884 + targetCapacity=length; 1.3885 + } 1.3886 + } 1.3887 + 1.3888 +#if MBCS_UNROLL_SINGLE_FROM_BMP 1.3889 + /* unrolling makes it slower on Pentium III/Windows 2000?! */ 1.3890 + goto unrolled; 1.3891 +#endif 1.3892 + } 1.3893 + /* the loop counter was clamped to the source length, so overflow is detected here by comparing target with the real target limit while input remains */ 1.3894 + if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 1.3895 + /* target is full */ 1.3896 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.3897 + } 1.3898 + 1.3899 + /* set offsets since the start or the last callback */ 1.3900 + if(offsets!=NULL) { 1.3901 + size_t count=source-lastSource; 1.3902 + if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 1.3903 + /* 1.3904 + Caller gave us a partial supplementary character, 1.3905 + which this function couldn't convert in any case. 1.3906 + The callback will handle the offset. 
1.3907 + */ 1.3908 + count--; 1.3909 + } 1.3910 + while(count>0) { 1.3911 + *offsets++=sourceIndex++; 1.3912 + --count; 1.3913 + } 1.3914 + } 1.3915 + 1.3916 + /* set the converter state back into UConverter */ 1.3917 + cnv->fromUChar32=c; 1.3918 + 1.3919 + /* write back the updated pointers */ 1.3920 + pArgs->source=source; 1.3921 + pArgs->target=(char *)target; 1.3922 + pArgs->offsets=offsets; 1.3923 +} 1.3924 + 1.3925 +U_CFUNC void 1.3926 +ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.3927 + UErrorCode *pErrorCode) { 1.3928 + UConverter *cnv; 1.3929 + const UChar *source, *sourceLimit; 1.3930 + uint8_t *target; 1.3931 + int32_t targetCapacity; 1.3932 + int32_t *offsets; 1.3933 + 1.3934 + const uint16_t *table; 1.3935 + const uint16_t *mbcsIndex; 1.3936 + const uint8_t *p, *bytes; 1.3937 + uint8_t outputType; 1.3938 + 1.3939 + UChar32 c; 1.3940 + 1.3941 + int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 1.3942 + 1.3943 + uint32_t stage2Entry; 1.3944 + uint32_t asciiRoundtrips; 1.3945 + uint32_t value; 1.3946 + /* Shift-In and Shift-Out byte sequences differ by encoding scheme. 
*/ 1.3947 + uint8_t siBytes[2] = {0, 0}; 1.3948 + uint8_t soBytes[2] = {0, 0}; 1.3949 + uint8_t siLength, soLength; 1.3950 + int32_t length = 0, prevLength; 1.3951 + uint8_t unicodeMask; 1.3952 + 1.3953 + cnv=pArgs->converter; 1.3954 + 1.3955 + if(cnv->preFromUFirstCP>=0) { 1.3956 + /* 1.3957 + * pass sourceIndex=-1 because we continue from an earlier buffer 1.3958 + * in the future, this may change with continuous offsets 1.3959 + */ 1.3960 + ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 1.3961 + 1.3962 + if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 1.3963 + return; 1.3964 + } 1.3965 + } 1.3966 + 1.3967 + /* use optimized function if possible */ 1.3968 + outputType=cnv->sharedData->mbcs.outputType; 1.3969 + unicodeMask=cnv->sharedData->mbcs.unicodeMask; 1.3970 + if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 1.3971 + if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1.3972 + ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 1.3973 + } else { 1.3974 + ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 1.3975 + } 1.3976 + return; 1.3977 + } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) { 1.3978 + ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 1.3979 + return; 1.3980 + } 1.3981 + 1.3982 + /* set up the local pointers */ 1.3983 + source=pArgs->source; 1.3984 + sourceLimit=pArgs->sourceLimit; 1.3985 + target=(uint8_t *)pArgs->target; 1.3986 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.3987 + offsets=pArgs->offsets; 1.3988 + 1.3989 + table=cnv->sharedData->mbcs.fromUnicodeTable; 1.3990 + if(cnv->sharedData->mbcs.utf8Friendly) { 1.3991 + mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 1.3992 + } else { 1.3993 + mbcsIndex=NULL; 1.3994 + } 1.3995 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.3996 + bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.3997 + } else { 1.3998 + bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 1.3999 + } 1.4000 + 
asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 1.4001 + 1.4002 + /* get the converter state from UConverter */ 1.4003 + c=cnv->fromUChar32; 1.4004 + 1.4005 + if(outputType==MBCS_OUTPUT_2_SISO) { 1.4006 + prevLength=cnv->fromUnicodeStatus; 1.4007 + if(prevLength==0) { 1.4008 + /* set the real value */ 1.4009 + prevLength=1; 1.4010 + } 1.4011 + } else { 1.4012 + /* prevent fromUnicodeStatus from being set to something non-0 */ 1.4013 + prevLength=0; 1.4014 + } 1.4015 + 1.4016 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.4017 + prevSourceIndex=-1; 1.4018 + sourceIndex= c==0 ? 0 : -1; 1.4019 + nextSourceIndex=0; 1.4020 + 1.4021 + /* Get the SI/SO character for the converter */ 1.4022 + siLength = getSISOBytes(SI, cnv->options, siBytes); 1.4023 + soLength = getSISOBytes(SO, cnv->options, soBytes); 1.4024 + 1.4025 + /* conversion loop */ 1.4026 + /* 1.4027 + * This is another piece of ugly code: 1.4028 + * A goto into the loop if the converter state contains a first surrogate 1.4029 + * from the previous function call. 1.4030 + * It saves me from checking if(c==0) in each loop iteration 1.4031 + * and from duplicating the trail-surrogate-handling code in the else 1.4032 + * branch of that check. 1.4033 + * I could not find any other way to get around this other than 1.4034 + * using a function call for the conversion and callback, which would 1.4035 + * be even more inefficient. 1.4036 + * 1.4037 + * Markus Scherer 2000-jul-19 1.4038 + */ 1.4039 + if(c!=0 && targetCapacity>0) { 1.4040 + goto getTrail; 1.4041 + } 1.4042 + 1.4043 + while(source<sourceLimit) { 1.4044 + /* 1.4045 + * The following test is to see if available input would overflow the output. 1.4046 + * It does not catch output of more than one byte that 1.4047 + * overflows as a result of a multi-byte character or callback output 1.4048 + * from the last source character. 
1.4049 + * Therefore, those situations also test for overflows and will 1.4050 + * then break the loop, too. 1.4051 + */ 1.4052 + if(targetCapacity>0) { 1.4053 + /* 1.4054 + * Get a correct Unicode code point: 1.4055 + * a single UChar for a BMP code point or 1.4056 + * a matched surrogate pair for a "supplementary code point". 1.4057 + */ 1.4058 + c=*source++; 1.4059 + ++nextSourceIndex; 1.4060 + if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 1.4061 + *target++=(uint8_t)c; 1.4062 + if(offsets!=NULL) { 1.4063 + *offsets++=sourceIndex; 1.4064 + prevSourceIndex=sourceIndex; 1.4065 + sourceIndex=nextSourceIndex; 1.4066 + } 1.4067 + --targetCapacity; 1.4068 + c=0; 1.4069 + continue; 1.4070 + } 1.4071 + /* 1.4072 + * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 1.4073 + * to avoid dealing with surrogates. 1.4074 + * MBCS_FAST_MAX must be >=0xd7ff. 1.4075 + */ 1.4076 + if(c<=0xd7ff && mbcsIndex!=NULL) { 1.4077 + value=mbcsIndex[c>>6]; 1.4078 + 1.4079 + /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 1.4080 + /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 1.4081 + switch(outputType) { 1.4082 + case MBCS_OUTPUT_2: 1.4083 + value=((const uint16_t *)bytes)[value +(c&0x3f)]; 1.4084 + if(value<=0xff) { 1.4085 + if(value==0) { 1.4086 + goto unassigned; 1.4087 + } else { 1.4088 + length=1; 1.4089 + } 1.4090 + } else { 1.4091 + length=2; 1.4092 + } 1.4093 + break; 1.4094 + case MBCS_OUTPUT_2_SISO: 1.4095 + /* 1/2-byte stateful with Shift-In/Shift-Out */ 1.4096 + /* 1.4097 + * Save the old state in the converter object 1.4098 + * right here, then change the local prevLength state variable if necessary. 1.4099 + * Then, if this character turns out to be unassigned or a fallback that 1.4100 + * is not taken, the callback code must not save the new state in the converter 1.4101 + * because the new state is for a character that is not output. 
1.4102 + * However, the callback must still restore the state from the converter 1.4103 + * in case the callback function changed it for its output. 1.4104 + */ 1.4105 + cnv->fromUnicodeStatus=prevLength; /* save the old state */ 1.4106 + value=((const uint16_t *)bytes)[value +(c&0x3f)]; 1.4107 + if(value<=0xff) { 1.4108 + if(value==0) { 1.4109 + goto unassigned; 1.4110 + } else if(prevLength<=1) { 1.4111 + length=1; 1.4112 + } else { 1.4113 + /* change from double-byte mode to single-byte */ 1.4114 + if (siLength == 1) { 1.4115 + value|=(uint32_t)siBytes[0]<<8; 1.4116 + length = 2; 1.4117 + } else if (siLength == 2) { 1.4118 + value|=(uint32_t)siBytes[1]<<8; 1.4119 + value|=(uint32_t)siBytes[0]<<16; 1.4120 + length = 3; 1.4121 + } 1.4122 + prevLength=1; 1.4123 + } 1.4124 + } else { 1.4125 + if(prevLength==2) { 1.4126 + length=2; 1.4127 + } else { 1.4128 + /* change from single-byte mode to double-byte */ 1.4129 + if (soLength == 1) { 1.4130 + value|=(uint32_t)soBytes[0]<<16; 1.4131 + length = 3; 1.4132 + } else if (soLength == 2) { 1.4133 + value|=(uint32_t)soBytes[1]<<16; 1.4134 + value|=(uint32_t)soBytes[0]<<24; 1.4135 + length = 4; 1.4136 + } 1.4137 + prevLength=2; 1.4138 + } 1.4139 + } 1.4140 + break; 1.4141 + case MBCS_OUTPUT_DBCS_ONLY: 1.4142 + /* table with single-byte results, but only DBCS mappings used */ 1.4143 + value=((const uint16_t *)bytes)[value +(c&0x3f)]; 1.4144 + if(value<=0xff) { 1.4145 + /* no mapping or SBCS result, not taken for DBCS-only */ 1.4146 + goto unassigned; 1.4147 + } else { 1.4148 + length=2; 1.4149 + } 1.4150 + break; 1.4151 + case MBCS_OUTPUT_3: 1.4152 + p=bytes+(value+(c&0x3f))*3; 1.4153 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4154 + if(value<=0xff) { 1.4155 + if(value==0) { 1.4156 + goto unassigned; 1.4157 + } else { 1.4158 + length=1; 1.4159 + } 1.4160 + } else if(value<=0xffff) { 1.4161 + length=2; 1.4162 + } else { 1.4163 + length=3; 1.4164 + } 1.4165 + break; 1.4166 + case MBCS_OUTPUT_4: 1.4167 + 
value=((const uint32_t *)bytes)[value +(c&0x3f)]; 1.4168 + if(value<=0xff) { 1.4169 + if(value==0) { 1.4170 + goto unassigned; 1.4171 + } else { 1.4172 + length=1; 1.4173 + } 1.4174 + } else if(value<=0xffff) { 1.4175 + length=2; 1.4176 + } else if(value<=0xffffff) { 1.4177 + length=3; 1.4178 + } else { 1.4179 + length=4; 1.4180 + } 1.4181 + break; 1.4182 + case MBCS_OUTPUT_3_EUC: 1.4183 + value=((const uint16_t *)bytes)[value +(c&0x3f)]; 1.4184 + /* EUC 16-bit fixed-length representation */ 1.4185 + if(value<=0xff) { 1.4186 + if(value==0) { 1.4187 + goto unassigned; 1.4188 + } else { 1.4189 + length=1; 1.4190 + } 1.4191 + } else if((value&0x8000)==0) { 1.4192 + value|=0x8e8000; 1.4193 + length=3; 1.4194 + } else if((value&0x80)==0) { 1.4195 + value|=0x8f0080; 1.4196 + length=3; 1.4197 + } else { 1.4198 + length=2; 1.4199 + } 1.4200 + break; 1.4201 + case MBCS_OUTPUT_4_EUC: 1.4202 + p=bytes+(value+(c&0x3f))*3; 1.4203 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4204 + /* EUC 16-bit fixed-length representation applied to the first two bytes */ 1.4205 + if(value<=0xff) { 1.4206 + if(value==0) { 1.4207 + goto unassigned; 1.4208 + } else { 1.4209 + length=1; 1.4210 + } 1.4211 + } else if(value<=0xffff) { 1.4212 + length=2; 1.4213 + } else if((value&0x800000)==0) { 1.4214 + value|=0x8e800000; 1.4215 + length=4; 1.4216 + } else if((value&0x8000)==0) { 1.4217 + value|=0x8f008000; 1.4218 + length=4; 1.4219 + } else { 1.4220 + length=3; 1.4221 + } 1.4222 + break; 1.4223 + default: 1.4224 + /* must not occur */ 1.4225 + /* 1.4226 + * To avoid compiler warnings that value & length may be 1.4227 + * used without having been initialized, we set them here. 1.4228 + * In reality, this is unreachable code. 1.4229 + * Not having a default branch also causes warnings with 1.4230 + * some compilers. 
1.4231 + */ 1.4232 + value=0; 1.4233 + length=0; 1.4234 + break; 1.4235 + } 1.4236 + /* output the value */ 1.4237 + } else { 1.4238 + /* 1.4239 + * This also tests if the codepage maps single surrogates. 1.4240 + * If it does, then surrogates are not paired but mapped separately. 1.4241 + * Note that in this case unmatched surrogates are not detected. 1.4242 + */ 1.4243 + if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 1.4244 + if(U16_IS_SURROGATE_LEAD(c)) { 1.4245 +getTrail: 1.4246 + if(source<sourceLimit) { 1.4247 + /* test the following code unit */ 1.4248 + UChar trail=*source; 1.4249 + if(U16_IS_TRAIL(trail)) { 1.4250 + ++source; 1.4251 + ++nextSourceIndex; 1.4252 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.4253 + if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1.4254 + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1.4255 + cnv->fromUnicodeStatus=prevLength; /* save the old state */ 1.4256 + /* callback(unassigned) */ 1.4257 + goto unassigned; 1.4258 + } 1.4259 + /* convert this supplementary code point */ 1.4260 + /* exit this condition tree */ 1.4261 + } else { 1.4262 + /* this is an unmatched lead code unit (1st surrogate) */ 1.4263 + /* callback(illegal) */ 1.4264 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.4265 + break; 1.4266 + } 1.4267 + } else { 1.4268 + /* no more input */ 1.4269 + break; 1.4270 + } 1.4271 + } else { 1.4272 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.4273 + /* callback(illegal) */ 1.4274 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.4275 + break; 1.4276 + } 1.4277 + } 1.4278 + 1.4279 + /* convert the Unicode code point in c into codepage bytes */ 1.4280 + 1.4281 + /* 1.4282 + * The basic lookup is a triple-stage compact array (trie) lookup. 1.4283 + * For details see the beginning of this file. 1.4284 + * 1.4285 + * Single-byte codepages are handled with a different data structure 1.4286 + * by _MBCSSingle... functions. 
1.4287 + * 1.4288 + * The result consists of a 32-bit value from stage 2 and 1.4289 + * a pointer to as many bytes as are stored per character. 1.4290 + * The pointer points to the character's bytes in stage 3. 1.4291 + * Bits 15..0 of the stage 2 entry contain the stage 3 index 1.4292 + * for that pointer, while bits 31..16 are flags for which of 1.4293 + * the 16 characters in the block are roundtrip-assigned. 1.4294 + * 1.4295 + * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 1.4296 + * or uint32_t respectively, in the platform encoding. 1.4297 + * For 3-byte codepages, the bytes are always stored in big-endian order. 1.4298 + * 1.4299 + * For EUC encodings that use only either 0x8e or 0x8f as the first 1.4300 + * byte of their longest byte sequences, the first two bytes in 1.4301 + * this third stage indicate with their 7th bits whether these bytes 1.4302 + * are to be written directly or actually need to be preceded by 1.4303 + * one of the two Single-Shift codes. With this, the third stage 1.4304 + * stores one byte fewer per character than the actual maximum length of 1.4305 + * EUC byte sequences. 1.4306 + * 1.4307 + * Other than that, leading zero bytes are removed and the other 1.4308 + * bytes output. A single zero byte may be output if the "assigned" 1.4309 + * bit in stage 2 was on. 1.4310 + * The data structure does not support zero byte output as a fallback, 1.4311 + * and also does not allow output of leading zeros. 
1.4312 + */ 1.4313 + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1.4314 + 1.4315 + /* get the bytes and the length for the output */ 1.4316 + switch(outputType) { 1.4317 + case MBCS_OUTPUT_2: 1.4318 + value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 1.4319 + if(value<=0xff) { 1.4320 + length=1; 1.4321 + } else { 1.4322 + length=2; 1.4323 + } 1.4324 + break; 1.4325 + case MBCS_OUTPUT_2_SISO: 1.4326 + /* 1/2-byte stateful with Shift-In/Shift-Out */ 1.4327 + /* 1.4328 + * Save the old state in the converter object 1.4329 + * right here, then change the local prevLength state variable if necessary. 1.4330 + * Then, if this character turns out to be unassigned or a fallback that 1.4331 + * is not taken, the callback code must not save the new state in the converter 1.4332 + * because the new state is for a character that is not output. 1.4333 + * However, the callback must still restore the state from the converter 1.4334 + * in case the callback function changed it for its output. 1.4335 + */ 1.4336 + cnv->fromUnicodeStatus=prevLength; /* save the old state */ 1.4337 + value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 1.4338 + if(value<=0xff) { 1.4339 + if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 1.4340 + /* no mapping, leave value==0 */ 1.4341 + length=0; 1.4342 + } else if(prevLength<=1) { 1.4343 + length=1; 1.4344 + } else { 1.4345 + /* change from double-byte mode to single-byte */ 1.4346 + if (siLength == 1) { 1.4347 + value|=(uint32_t)siBytes[0]<<8; 1.4348 + length = 2; 1.4349 + } else if (siLength == 2) { 1.4350 + value|=(uint32_t)siBytes[1]<<8; 1.4351 + value|=(uint32_t)siBytes[0]<<16; 1.4352 + length = 3; 1.4353 + } 1.4354 + prevLength=1; 1.4355 + } 1.4356 + } else { 1.4357 + if(prevLength==2) { 1.4358 + length=2; 1.4359 + } else { 1.4360 + /* change from single-byte mode to double-byte */ 1.4361 + if (soLength == 1) { 1.4362 + value|=(uint32_t)soBytes[0]<<16; 1.4363 + length = 3; 1.4364 + } else if (soLength == 2) { 1.4365 + 
value|=(uint32_t)soBytes[1]<<16; 1.4366 + value|=(uint32_t)soBytes[0]<<24; 1.4367 + length = 4; 1.4368 + } 1.4369 + prevLength=2; 1.4370 + } 1.4371 + } 1.4372 + break; 1.4373 + case MBCS_OUTPUT_DBCS_ONLY: 1.4374 + /* table with single-byte results, but only DBCS mappings used */ 1.4375 + value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 1.4376 + if(value<=0xff) { 1.4377 + /* no mapping or SBCS result, not taken for DBCS-only */ 1.4378 + value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 1.4379 + length=0; 1.4380 + } else { 1.4381 + length=2; 1.4382 + } 1.4383 + break; 1.4384 + case MBCS_OUTPUT_3: 1.4385 + p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 1.4386 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4387 + if(value<=0xff) { 1.4388 + length=1; 1.4389 + } else if(value<=0xffff) { 1.4390 + length=2; 1.4391 + } else { 1.4392 + length=3; 1.4393 + } 1.4394 + break; 1.4395 + case MBCS_OUTPUT_4: 1.4396 + value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 1.4397 + if(value<=0xff) { 1.4398 + length=1; 1.4399 + } else if(value<=0xffff) { 1.4400 + length=2; 1.4401 + } else if(value<=0xffffff) { 1.4402 + length=3; 1.4403 + } else { 1.4404 + length=4; 1.4405 + } 1.4406 + break; 1.4407 + case MBCS_OUTPUT_3_EUC: 1.4408 + value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 1.4409 + /* EUC 16-bit fixed-length representation */ 1.4410 + if(value<=0xff) { 1.4411 + length=1; 1.4412 + } else if((value&0x8000)==0) { 1.4413 + value|=0x8e8000; 1.4414 + length=3; 1.4415 + } else if((value&0x80)==0) { 1.4416 + value|=0x8f0080; 1.4417 + length=3; 1.4418 + } else { 1.4419 + length=2; 1.4420 + } 1.4421 + break; 1.4422 + case MBCS_OUTPUT_4_EUC: 1.4423 + p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 1.4424 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4425 + /* EUC 16-bit fixed-length representation applied to the first two bytes */ 1.4426 + if(value<=0xff) { 1.4427 + length=1; 1.4428 + } else if(value<=0xffff) { 1.4429 + 
length=2; 1.4430 + } else if((value&0x800000)==0) { 1.4431 + value|=0x8e800000; 1.4432 + length=4; 1.4433 + } else if((value&0x8000)==0) { 1.4434 + value|=0x8f008000; 1.4435 + length=4; 1.4436 + } else { 1.4437 + length=3; 1.4438 + } 1.4439 + break; 1.4440 + default: 1.4441 + /* must not occur */ 1.4442 + /* 1.4443 + * To avoid compiler warnings that value & length may be 1.4444 + * used without having been initialized, we set them here. 1.4445 + * In reality, this is unreachable code. 1.4446 + * Not having a default branch also causes warnings with 1.4447 + * some compilers. 1.4448 + */ 1.4449 + value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 1.4450 + length=0; 1.4451 + break; 1.4452 + } 1.4453 + 1.4454 + /* is this code point assigned, or do we use fallbacks? */ 1.4455 + if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 1.4456 + (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 1.4457 + ) { 1.4458 + /* 1.4459 + * We allow a 0 byte output if the "assigned" bit is set for this entry. 1.4460 + * There is no way with this data structure for fallback output 1.4461 + * to be a zero byte. 
1.4462 + */ 1.4463 + 1.4464 +unassigned: 1.4465 + /* try an extension mapping */ 1.4466 + pArgs->source=source; 1.4467 + c=_extFromU(cnv, cnv->sharedData, 1.4468 + c, &source, sourceLimit, 1.4469 + &target, target+targetCapacity, 1.4470 + &offsets, sourceIndex, 1.4471 + pArgs->flush, 1.4472 + pErrorCode); 1.4473 + nextSourceIndex+=(int32_t)(source-pArgs->source); 1.4474 + prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 1.4475 + 1.4476 + if(U_FAILURE(*pErrorCode)) { 1.4477 + /* not mappable or buffer overflow */ 1.4478 + break; 1.4479 + } else { 1.4480 + /* a mapping was written to the target, continue */ 1.4481 + 1.4482 + /* recalculate the targetCapacity after an extension mapping */ 1.4483 + targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 1.4484 + 1.4485 + /* normal end of conversion: prepare for a new character */ 1.4486 + if(offsets!=NULL) { 1.4487 + prevSourceIndex=sourceIndex; 1.4488 + sourceIndex=nextSourceIndex; 1.4489 + } 1.4490 + continue; 1.4491 + } 1.4492 + } 1.4493 + } 1.4494 + 1.4495 + /* write the output character bytes from value and length */ 1.4496 + /* from the first if in the loop we know that targetCapacity>0 */ 1.4497 + if(length<=targetCapacity) { 1.4498 + if(offsets==NULL) { 1.4499 + switch(length) { 1.4500 + /* each branch falls through to the next one */ 1.4501 + case 4: 1.4502 + *target++=(uint8_t)(value>>24); 1.4503 + case 3: /*fall through*/ 1.4504 + *target++=(uint8_t)(value>>16); 1.4505 + case 2: /*fall through*/ 1.4506 + *target++=(uint8_t)(value>>8); 1.4507 + case 1: /*fall through*/ 1.4508 + *target++=(uint8_t)value; 1.4509 + default: 1.4510 + /* will never occur */ 1.4511 + break; 1.4512 + } 1.4513 + } else { 1.4514 + switch(length) { 1.4515 + /* each branch falls through to the next one */ 1.4516 + case 4: 1.4517 + *target++=(uint8_t)(value>>24); 1.4518 + *offsets++=sourceIndex; 1.4519 + case 3: /*fall through*/ 1.4520 + *target++=(uint8_t)(value>>16); 1.4521 + *offsets++=sourceIndex; 1.4522 + case 2: 
/*fall through*/ 1.4523 + *target++=(uint8_t)(value>>8); 1.4524 + *offsets++=sourceIndex; 1.4525 + case 1: /*fall through*/ 1.4526 + *target++=(uint8_t)value; 1.4527 + *offsets++=sourceIndex; 1.4528 + default: 1.4529 + /* will never occur */ 1.4530 + break; 1.4531 + } 1.4532 + } 1.4533 + targetCapacity-=length; 1.4534 + } else { 1.4535 + uint8_t *charErrorBuffer; 1.4536 + 1.4537 + /* 1.4538 + * We actually do this backwards here: 1.4539 + * In order to save an intermediate variable, we output 1.4540 + * first to the overflow buffer what does not fit into the 1.4541 + * regular target. 1.4542 + */ 1.4543 + /* we know that 1<=targetCapacity<length<=4 */ 1.4544 + length-=targetCapacity; 1.4545 + charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 1.4546 + switch(length) { 1.4547 + /* each branch falls through to the next one */ 1.4548 + case 3: 1.4549 + *charErrorBuffer++=(uint8_t)(value>>16); 1.4550 + case 2: /*fall through*/ 1.4551 + *charErrorBuffer++=(uint8_t)(value>>8); 1.4552 + case 1: /*fall through*/ 1.4553 + *charErrorBuffer=(uint8_t)value; 1.4554 + default: 1.4555 + /* will never occur */ 1.4556 + break; 1.4557 + } 1.4558 + cnv->charErrorBufferLength=(int8_t)length; 1.4559 + 1.4560 + /* now output what fits into the regular target */ 1.4561 + value>>=8*length; /* length was reduced by targetCapacity */ 1.4562 + switch(targetCapacity) { 1.4563 + /* each branch falls through to the next one */ 1.4564 + case 3: 1.4565 + *target++=(uint8_t)(value>>16); 1.4566 + if(offsets!=NULL) { 1.4567 + *offsets++=sourceIndex; 1.4568 + } 1.4569 + case 2: /*fall through*/ 1.4570 + *target++=(uint8_t)(value>>8); 1.4571 + if(offsets!=NULL) { 1.4572 + *offsets++=sourceIndex; 1.4573 + } 1.4574 + case 1: /*fall through*/ 1.4575 + *target++=(uint8_t)value; 1.4576 + if(offsets!=NULL) { 1.4577 + *offsets++=sourceIndex; 1.4578 + } 1.4579 + default: 1.4580 + /* will never occur */ 1.4581 + break; 1.4582 + } 1.4583 + 1.4584 + /* target overflow */ 1.4585 + targetCapacity=0; 1.4586 + 
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.4587 + c=0; 1.4588 + break; 1.4589 + } 1.4590 + 1.4591 + /* normal end of conversion: prepare for a new character */ 1.4592 + c=0; 1.4593 + if(offsets!=NULL) { 1.4594 + prevSourceIndex=sourceIndex; 1.4595 + sourceIndex=nextSourceIndex; 1.4596 + } 1.4597 + continue; 1.4598 + } else { 1.4599 + /* target is full */ 1.4600 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.4601 + break; 1.4602 + } 1.4603 + } 1.4604 + 1.4605 + /* 1.4606 + * the end of the input stream and detection of truncated input 1.4607 + * are handled by the framework, but for EBCDIC_STATEFUL conversion 1.4608 + * we need to emit an SI at the very end 1.4609 + * 1.4610 + * conditions: 1.4611 + * successful 1.4612 + * EBCDIC_STATEFUL in DBCS mode 1.4613 + * end of input and no truncated input 1.4614 + */ 1.4615 + if( U_SUCCESS(*pErrorCode) && 1.4616 + outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 1.4617 + pArgs->flush && source>=sourceLimit && c==0 1.4618 + ) { 1.4619 + /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 1.4620 + if(targetCapacity>0) { 1.4621 + *target++=(uint8_t)siBytes[0]; 1.4622 + if (siLength == 2) { 1.4623 + if (targetCapacity<2) { 1.4624 + cnv->charErrorBuffer[0]=(uint8_t)siBytes[1]; 1.4625 + cnv->charErrorBufferLength=1; 1.4626 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.4627 + } else { 1.4628 + *target++=(uint8_t)siBytes[1]; 1.4629 + } 1.4630 + } 1.4631 + if(offsets!=NULL) { 1.4632 + /* set the last source character's index (sourceIndex points at sourceLimit now) */ 1.4633 + *offsets++=prevSourceIndex; 1.4634 + } 1.4635 + } else { 1.4636 + /* target is full */ 1.4637 + cnv->charErrorBuffer[0]=(uint8_t)siBytes[0]; 1.4638 + if (siLength == 2) { 1.4639 + cnv->charErrorBuffer[1]=(uint8_t)siBytes[1]; 1.4640 + } 1.4641 + cnv->charErrorBufferLength=siLength; 1.4642 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.4643 + } 1.4644 + prevLength=1; /* we switched into SBCS */ 1.4645 + } 1.4646 + 1.4647 + /* set the converter 
state back into UConverter */ 1.4648 + cnv->fromUChar32=c; 1.4649 + cnv->fromUnicodeStatus=prevLength; 1.4650 + 1.4651 + /* write back the updated pointers */ 1.4652 + pArgs->source=source; 1.4653 + pArgs->target=(char *)target; 1.4654 + pArgs->offsets=offsets; 1.4655 +} 1.4656 + 1.4657 +/* 1.4658 + * This is another simple conversion function for internal use by other 1.4659 + * conversion implementations. 1.4660 + * It does not use the converter state nor call callbacks. 1.4661 + * It does not handle the EBCDIC swaplfnl option (set in UConverter). 1.4662 + * It handles conversion extensions but not GB 18030. 1.4663 + * 1.4664 + * It converts one single Unicode code point into codepage bytes, encoded 1.4665 + * as one 32-bit value. The function returns the number of bytes in *pValue: 1.4666 + * 1..4 the number of bytes in *pValue 1.4667 + * 0 unassigned (*pValue undefined) 1.4668 + * -1 illegal (currently not used, *pValue undefined) 1.4669 + * 1.4670 + * *pValue will contain the resulting bytes with the last byte in bits 7..0, 1.4671 + * the second to last byte in bits 15..8, etc. 1.4672 + * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 
1.4673 + */ 1.4674 +U_CFUNC int32_t 1.4675 +ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 1.4676 + UChar32 c, uint32_t *pValue, 1.4677 + UBool useFallback) { 1.4678 + const int32_t *cx; 1.4679 + const uint16_t *table; 1.4680 +#if 0 1.4681 +/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 1.4682 + const uint8_t *p; 1.4683 +#endif 1.4684 + uint32_t stage2Entry; 1.4685 + uint32_t value; 1.4686 + int32_t length; 1.4687 + 1.4688 + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1.4689 + if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1.4690 + table=sharedData->mbcs.fromUnicodeTable; 1.4691 + 1.4692 + /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1.4693 + if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 1.4694 + value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1.4695 + /* is this code point assigned, or do we use fallbacks? */ 1.4696 + if(useFallback ? 
value>=0x800 : value>=0xc00) { 1.4697 + *pValue=value&0xff; 1.4698 + return 1; 1.4699 + } 1.4700 + } else /* outputType!=MBCS_OUTPUT_1 */ { 1.4701 + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1.4702 + 1.4703 + /* get the bytes and the length for the output */ 1.4704 + switch(sharedData->mbcs.outputType) { 1.4705 + case MBCS_OUTPUT_2: 1.4706 + value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4707 + if(value<=0xff) { 1.4708 + length=1; 1.4709 + } else { 1.4710 + length=2; 1.4711 + } 1.4712 + break; 1.4713 +#if 0 1.4714 +/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 1.4715 + case MBCS_OUTPUT_DBCS_ONLY: 1.4716 + /* table with single-byte results, but only DBCS mappings used */ 1.4717 + value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4718 + if(value<=0xff) { 1.4719 + /* no mapping or SBCS result, not taken for DBCS-only */ 1.4720 + value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 1.4721 + length=0; 1.4722 + } else { 1.4723 + length=2; 1.4724 + } 1.4725 + break; 1.4726 + case MBCS_OUTPUT_3: 1.4727 + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4728 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4729 + if(value<=0xff) { 1.4730 + length=1; 1.4731 + } else if(value<=0xffff) { 1.4732 + length=2; 1.4733 + } else { 1.4734 + length=3; 1.4735 + } 1.4736 + break; 1.4737 + case MBCS_OUTPUT_4: 1.4738 + value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4739 + if(value<=0xff) { 1.4740 + length=1; 1.4741 + } else if(value<=0xffff) { 1.4742 + length=2; 1.4743 + } else if(value<=0xffffff) { 1.4744 + length=3; 1.4745 + } else { 1.4746 + length=4; 1.4747 + } 1.4748 + break; 1.4749 + case MBCS_OUTPUT_3_EUC: 1.4750 + value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4751 + /* EUC 16-bit fixed-length representation */ 1.4752 + 
if(value<=0xff) { 1.4753 + length=1; 1.4754 + } else if((value&0x8000)==0) { 1.4755 + value|=0x8e8000; 1.4756 + length=3; 1.4757 + } else if((value&0x80)==0) { 1.4758 + value|=0x8f0080; 1.4759 + length=3; 1.4760 + } else { 1.4761 + length=2; 1.4762 + } 1.4763 + break; 1.4764 + case MBCS_OUTPUT_4_EUC: 1.4765 + p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1.4766 + value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1.4767 + /* EUC 16-bit fixed-length representation applied to the first two bytes */ 1.4768 + if(value<=0xff) { 1.4769 + length=1; 1.4770 + } else if(value<=0xffff) { 1.4771 + length=2; 1.4772 + } else if((value&0x800000)==0) { 1.4773 + value|=0x8e800000; 1.4774 + length=4; 1.4775 + } else if((value&0x8000)==0) { 1.4776 + value|=0x8f008000; 1.4777 + length=4; 1.4778 + } else { 1.4779 + length=3; 1.4780 + } 1.4781 + break; 1.4782 +#endif 1.4783 + default: 1.4784 + /* must not occur */ 1.4785 + return -1; 1.4786 + } 1.4787 + 1.4788 + /* is this code point assigned, or do we use fallbacks? */ 1.4789 + if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 1.4790 + (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 1.4791 + ) { 1.4792 + /* 1.4793 + * We allow a 0 byte output if the "assigned" bit is set for this entry. 1.4794 + * There is no way with this data structure for fallback output 1.4795 + * to be a zero byte. 1.4796 + */ 1.4797 + /* assigned */ 1.4798 + *pValue=value; 1.4799 + return length; 1.4800 + } 1.4801 + } 1.4802 + } 1.4803 + 1.4804 + cx=sharedData->mbcs.extIndexes; 1.4805 + if(cx!=NULL) { 1.4806 + length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 1.4807 + return length>=0 ? length : -length; /* return abs(length); */ 1.4808 + } 1.4809 + 1.4810 + /* unassigned */ 1.4811 + return 0; 1.4812 +} 1.4813 + 1.4814 + 1.4815 +#if 0 1.4816 +/* 1.4817 + * This function has been moved to ucnv2022.c for inlining. 
 * This implementation is here only for documentation purposes
 */

/**
 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extFromU()).
 *
 * It returns the codepage byte for the code point, or -1 if it is unassigned.
 *
 * @param sharedData  converter data; its mbcs.fromUnicodeTable,
 *                    mbcs.fromUnicodeBytes and mbcs.unicodeMask are read
 * @param c           the code point to convert
 * @param useFallback selects the acceptance threshold for the table result:
 *                    0x800 when TRUE, 0xc00 when FALSE
 * @return the low 8 bits of the table result, or -1
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *table;
    int32_t value;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
    table=sharedData->mbcs.fromUnicodeTable;

    /* get the byte for the output */
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
    /* is this code point assigned, or do we use fallbacks? */
    if(useFallback ? value>=0x800 : value>=0xc00) {
        /* the codepage byte is the low byte of the result word */
        return value&0xff;
    } else {
        return -1;
    }
}
#endif

/* MBCS-from-UTF-8 conversion functions ------------------------------------- */

/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail...
*/ 1.4861 +static const UChar32 1.4862 +utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 1.4863 + 1.4864 +static void 1.4865 +ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 1.4866 + UConverterToUnicodeArgs *pToUArgs, 1.4867 + UErrorCode *pErrorCode) { 1.4868 + UConverter *utf8, *cnv; 1.4869 + const uint8_t *source, *sourceLimit; 1.4870 + uint8_t *target; 1.4871 + int32_t targetCapacity; 1.4872 + 1.4873 + const uint16_t *table, *sbcsIndex; 1.4874 + const uint16_t *results; 1.4875 + 1.4876 + int8_t oldToULength, toULength, toULimit; 1.4877 + 1.4878 + UChar32 c; 1.4879 + uint8_t b, t1, t2; 1.4880 + 1.4881 + uint32_t asciiRoundtrips; 1.4882 + uint16_t value, minValue; 1.4883 + UBool hasSupplementary; 1.4884 + 1.4885 + /* set up the local pointers */ 1.4886 + utf8=pToUArgs->converter; 1.4887 + cnv=pFromUArgs->converter; 1.4888 + source=(uint8_t *)pToUArgs->source; 1.4889 + sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 1.4890 + target=(uint8_t *)pFromUArgs->target; 1.4891 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 1.4892 + 1.4893 + table=cnv->sharedData->mbcs.fromUnicodeTable; 1.4894 + sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 1.4895 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.4896 + results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.4897 + } else { 1.4898 + results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 1.4899 + } 1.4900 + asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 1.4901 + 1.4902 + if(cnv->useFallback) { 1.4903 + /* use all roundtrip and fallback results */ 1.4904 + minValue=0x800; 1.4905 + } else { 1.4906 + /* use only roundtrips and fallbacks from private-use characters */ 1.4907 + minValue=0xc00; 1.4908 + } 1.4909 + hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 1.4910 + 1.4911 + /* get the converter state from the UTF-8 UConverter */ 1.4912 + c=(UChar32)utf8->toUnicodeStatus; 1.4913 + if(c!=0) { 1.4914 + 
toULength=oldToULength=utf8->toULength; 1.4915 + toULimit=(int8_t)utf8->mode; 1.4916 + } else { 1.4917 + toULength=oldToULength=toULimit=0; 1.4918 + } 1.4919 + 1.4920 + /* 1.4921 + * Make sure that the last byte sequence before sourceLimit is complete 1.4922 + * or runs into a lead byte. 1.4923 + * Do not go back into the bytes that will be read for finishing a partial 1.4924 + * sequence from the previous buffer. 1.4925 + * In the conversion loop compare source with sourceLimit only once 1.4926 + * per multi-byte character. 1.4927 + */ 1.4928 + { 1.4929 + int32_t i, length; 1.4930 + 1.4931 + length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 1.4932 + for(i=0; i<3 && i<length;) { 1.4933 + b=*(sourceLimit-i-1); 1.4934 + if(U8_IS_TRAIL(b)) { 1.4935 + ++i; 1.4936 + } else { 1.4937 + if(i<U8_COUNT_TRAIL_BYTES(b)) { 1.4938 + /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 1.4939 + sourceLimit-=i+1; 1.4940 + } 1.4941 + break; 1.4942 + } 1.4943 + } 1.4944 + } 1.4945 + 1.4946 + if(c!=0 && targetCapacity>0) { 1.4947 + utf8->toUnicodeStatus=0; 1.4948 + utf8->toULength=0; 1.4949 + goto moreBytes; 1.4950 + /* 1.4951 + * Note: We could avoid the goto by duplicating some of the moreBytes 1.4952 + * code, but only up to the point of collecting a complete UTF-8 1.4953 + * sequence; then recurse for the toUBytes[toULength] 1.4954 + * and then continue with normal conversion. 1.4955 + * 1.4956 + * If so, move this code to just after initializing the minimum 1.4957 + * set of local variables for reading the UTF-8 input 1.4958 + * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
1.4959 + * 1.4960 + * Potential advantages: 1.4961 + * - avoid the goto 1.4962 + * - oldToULength could become a local variable in just those code blocks 1.4963 + * that deal with buffer boundaries 1.4964 + * - possibly faster if the goto prevents some compiler optimizations 1.4965 + * (this would need measuring to confirm) 1.4966 + * Disadvantage: 1.4967 + * - code duplication 1.4968 + */ 1.4969 + } 1.4970 + 1.4971 + /* conversion loop */ 1.4972 + while(source<sourceLimit) { 1.4973 + if(targetCapacity>0) { 1.4974 + b=*source++; 1.4975 + if((int8_t)b>=0) { 1.4976 + /* convert ASCII */ 1.4977 + if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 1.4978 + *target++=(uint8_t)b; 1.4979 + --targetCapacity; 1.4980 + continue; 1.4981 + } else { 1.4982 + c=b; 1.4983 + value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 1.4984 + } 1.4985 + } else { 1.4986 + if(b<0xe0) { 1.4987 + if( /* handle U+0080..U+07FF inline */ 1.4988 + b>=0xc2 && 1.4989 + (t1=(uint8_t)(*source-0x80)) <= 0x3f 1.4990 + ) { 1.4991 + c=b&0x1f; 1.4992 + ++source; 1.4993 + value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 1.4994 + if(value>=minValue) { 1.4995 + *target++=(uint8_t)value; 1.4996 + --targetCapacity; 1.4997 + continue; 1.4998 + } else { 1.4999 + c=(c<<6)|t1; 1.5000 + } 1.5001 + } else { 1.5002 + c=-1; 1.5003 + } 1.5004 + } else if(b==0xe0) { 1.5005 + if( /* handle U+0800..U+0FFF inline */ 1.5006 + (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 1.5007 + (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 1.5008 + ) { 1.5009 + c=t1; 1.5010 + source+=2; 1.5011 + value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 1.5012 + if(value>=minValue) { 1.5013 + *target++=(uint8_t)value; 1.5014 + --targetCapacity; 1.5015 + continue; 1.5016 + } else { 1.5017 + c=(c<<6)|t2; 1.5018 + } 1.5019 + } else { 1.5020 + c=-1; 1.5021 + } 1.5022 + } else { 1.5023 + c=-1; 1.5024 + } 1.5025 + 1.5026 + if(c<0) { 1.5027 + /* handle "complicated" and error cases, and continuing partial characters */ 1.5028 + 
oldToULength=0; 1.5029 + toULength=1; 1.5030 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.5031 + c=b; 1.5032 +moreBytes: 1.5033 + while(toULength<toULimit) { 1.5034 + /* 1.5035 + * The sourceLimit may have been adjusted before the conversion loop 1.5036 + * to stop before a truncated sequence. 1.5037 + * Here we need to use the real limit in case we have two truncated 1.5038 + * sequences at the end. 1.5039 + * See ticket #7492. 1.5040 + */ 1.5041 + if(source<(uint8_t *)pToUArgs->sourceLimit) { 1.5042 + b=*source; 1.5043 + if(U8_IS_TRAIL(b)) { 1.5044 + ++source; 1.5045 + ++toULength; 1.5046 + c=(c<<6)+b; 1.5047 + } else { 1.5048 + break; /* sequence too short, stop with toULength<toULimit */ 1.5049 + } 1.5050 + } else { 1.5051 + /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 1.5052 + source-=(toULength-oldToULength); 1.5053 + while(oldToULength<toULength) { 1.5054 + utf8->toUBytes[oldToULength++]=*source++; 1.5055 + } 1.5056 + utf8->toUnicodeStatus=c; 1.5057 + utf8->toULength=toULength; 1.5058 + utf8->mode=toULimit; 1.5059 + pToUArgs->source=(char *)source; 1.5060 + pFromUArgs->target=(char *)target; 1.5061 + return; 1.5062 + } 1.5063 + } 1.5064 + 1.5065 + if( toULength==toULimit && /* consumed all trail bytes */ 1.5066 + (toULength==3 || toULength==2) && /* BMP */ 1.5067 + (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 1.5068 + (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 1.5069 + ) { 1.5070 + value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.5071 + } else if( 1.5072 + toULength==toULimit && toULength==4 && 1.5073 + (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 1.5074 + ) { 1.5075 + /* supplementary code point */ 1.5076 + if(!hasSupplementary) { 1.5077 + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1.5078 + value=0; 1.5079 + } else { 1.5080 + value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 1.5081 + } 1.5082 + } else { 1.5083 + /* error handling: illegal UTF-8 byte 
sequence */ 1.5084 + source-=(toULength-oldToULength); 1.5085 + while(oldToULength<toULength) { 1.5086 + utf8->toUBytes[oldToULength++]=*source++; 1.5087 + } 1.5088 + utf8->toULength=toULength; 1.5089 + pToUArgs->source=(char *)source; 1.5090 + pFromUArgs->target=(char *)target; 1.5091 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.5092 + return; 1.5093 + } 1.5094 + } 1.5095 + } 1.5096 + 1.5097 + if(value>=minValue) { 1.5098 + /* output the mapping for c */ 1.5099 + *target++=(uint8_t)value; 1.5100 + --targetCapacity; 1.5101 + } else { 1.5102 + /* value<minValue means c is unassigned (unmappable) */ 1.5103 + /* 1.5104 + * Try an extension mapping. 1.5105 + * Pass in no source because we don't have UTF-16 input. 1.5106 + * If we have a partial match on c, we will return and revert 1.5107 + * to UTF-8->UTF-16->charset conversion. 1.5108 + */ 1.5109 + static const UChar nul=0; 1.5110 + const UChar *noSource=&nul; 1.5111 + c=_extFromU(cnv, cnv->sharedData, 1.5112 + c, &noSource, noSource, 1.5113 + &target, target+targetCapacity, 1.5114 + NULL, -1, 1.5115 + pFromUArgs->flush, 1.5116 + pErrorCode); 1.5117 + 1.5118 + if(U_FAILURE(*pErrorCode)) { 1.5119 + /* not mappable or buffer overflow */ 1.5120 + cnv->fromUChar32=c; 1.5121 + break; 1.5122 + } else if(cnv->preFromUFirstCP>=0) { 1.5123 + /* 1.5124 + * Partial match, return and revert to pivoting. 1.5125 + * In normal from-UTF-16 conversion, we would just continue 1.5126 + * but then exit the loop because the extension match would 1.5127 + * have consumed the source. 
1.5128 + */ 1.5129 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.5130 + break; 1.5131 + } else { 1.5132 + /* a mapping was written to the target, continue */ 1.5133 + 1.5134 + /* recalculate the targetCapacity after an extension mapping */ 1.5135 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 1.5136 + } 1.5137 + } 1.5138 + } else { 1.5139 + /* target is full */ 1.5140 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.5141 + break; 1.5142 + } 1.5143 + } 1.5144 + 1.5145 + /* 1.5146 + * The sourceLimit may have been adjusted before the conversion loop 1.5147 + * to stop before a truncated sequence. 1.5148 + * If so, then collect the truncated sequence now. 1.5149 + */ 1.5150 + if(U_SUCCESS(*pErrorCode) && 1.5151 + cnv->preFromUFirstCP<0 && 1.5152 + source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 1.5153 + c=utf8->toUBytes[0]=b=*source++; 1.5154 + toULength=1; 1.5155 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.5156 + while(source<sourceLimit) { 1.5157 + utf8->toUBytes[toULength++]=b=*source++; 1.5158 + c=(c<<6)+b; 1.5159 + } 1.5160 + utf8->toUnicodeStatus=c; 1.5161 + utf8->toULength=toULength; 1.5162 + utf8->mode=toULimit; 1.5163 + } 1.5164 + 1.5165 + /* write back the updated pointers */ 1.5166 + pToUArgs->source=(char *)source; 1.5167 + pFromUArgs->target=(char *)target; 1.5168 +} 1.5169 + 1.5170 +static void 1.5171 +ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 1.5172 + UConverterToUnicodeArgs *pToUArgs, 1.5173 + UErrorCode *pErrorCode) { 1.5174 + UConverter *utf8, *cnv; 1.5175 + const uint8_t *source, *sourceLimit; 1.5176 + uint8_t *target; 1.5177 + int32_t targetCapacity; 1.5178 + 1.5179 + const uint16_t *table, *mbcsIndex; 1.5180 + const uint16_t *results; 1.5181 + 1.5182 + int8_t oldToULength, toULength, toULimit; 1.5183 + 1.5184 + UChar32 c; 1.5185 + uint8_t b, t1, t2; 1.5186 + 1.5187 + uint32_t stage2Entry; 1.5188 + uint32_t asciiRoundtrips; 1.5189 + uint16_t value; 1.5190 + UBool hasSupplementary; 1.5191 + 1.5192 + /* set up the local 
pointers */ 1.5193 + utf8=pToUArgs->converter; 1.5194 + cnv=pFromUArgs->converter; 1.5195 + source=(uint8_t *)pToUArgs->source; 1.5196 + sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 1.5197 + target=(uint8_t *)pFromUArgs->target; 1.5198 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 1.5199 + 1.5200 + table=cnv->sharedData->mbcs.fromUnicodeTable; 1.5201 + mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 1.5202 + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1.5203 + results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 1.5204 + } else { 1.5205 + results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 1.5206 + } 1.5207 + asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 1.5208 + 1.5209 + hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 1.5210 + 1.5211 + /* get the converter state from the UTF-8 UConverter */ 1.5212 + c=(UChar32)utf8->toUnicodeStatus; 1.5213 + if(c!=0) { 1.5214 + toULength=oldToULength=utf8->toULength; 1.5215 + toULimit=(int8_t)utf8->mode; 1.5216 + } else { 1.5217 + toULength=oldToULength=toULimit=0; 1.5218 + } 1.5219 + 1.5220 + /* 1.5221 + * Make sure that the last byte sequence before sourceLimit is complete 1.5222 + * or runs into a lead byte. 1.5223 + * Do not go back into the bytes that will be read for finishing a partial 1.5224 + * sequence from the previous buffer. 1.5225 + * In the conversion loop compare source with sourceLimit only once 1.5226 + * per multi-byte character. 
1.5227 + */ 1.5228 + { 1.5229 + int32_t i, length; 1.5230 + 1.5231 + length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 1.5232 + for(i=0; i<3 && i<length;) { 1.5233 + b=*(sourceLimit-i-1); 1.5234 + if(U8_IS_TRAIL(b)) { 1.5235 + ++i; 1.5236 + } else { 1.5237 + if(i<U8_COUNT_TRAIL_BYTES(b)) { 1.5238 + /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 1.5239 + sourceLimit-=i+1; 1.5240 + } 1.5241 + break; 1.5242 + } 1.5243 + } 1.5244 + } 1.5245 + 1.5246 + if(c!=0 && targetCapacity>0) { 1.5247 + utf8->toUnicodeStatus=0; 1.5248 + utf8->toULength=0; 1.5249 + goto moreBytes; 1.5250 + /* See note in ucnv_SBCSFromUTF8() about this goto. */ 1.5251 + } 1.5252 + 1.5253 + /* conversion loop */ 1.5254 + while(source<sourceLimit) { 1.5255 + if(targetCapacity>0) { 1.5256 + b=*source++; 1.5257 + if((int8_t)b>=0) { 1.5258 + /* convert ASCII */ 1.5259 + if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 1.5260 + *target++=b; 1.5261 + --targetCapacity; 1.5262 + continue; 1.5263 + } else { 1.5264 + value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b); 1.5265 + if(value==0) { 1.5266 + c=b; 1.5267 + goto unassigned; 1.5268 + } 1.5269 + } 1.5270 + } else { 1.5271 + if(b>0xe0) { 1.5272 + if( /* handle U+1000..U+D7FF inline */ 1.5273 + (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) || 1.5274 + (b==0xed && (t1 <= 0x1f))) && 1.5275 + (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 1.5276 + ) { 1.5277 + c=((b&0xf)<<6)|t1; 1.5278 + source+=2; 1.5279 + value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2); 1.5280 + if(value==0) { 1.5281 + c=(c<<6)|t2; 1.5282 + goto unassigned; 1.5283 + } 1.5284 + } else { 1.5285 + c=-1; 1.5286 + } 1.5287 + } else if(b<0xe0) { 1.5288 + if( /* handle U+0080..U+07FF inline */ 1.5289 + b>=0xc2 && 1.5290 + (t1=(uint8_t)(*source-0x80)) <= 0x3f 1.5291 + ) { 1.5292 + c=b&0x1f; 1.5293 + ++source; 1.5294 + value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1); 1.5295 + if(value==0) { 1.5296 + c=(c<<6)|t1; 1.5297 + 
goto unassigned; 1.5298 + } 1.5299 + } else { 1.5300 + c=-1; 1.5301 + } 1.5302 + } else { 1.5303 + c=-1; 1.5304 + } 1.5305 + 1.5306 + if(c<0) { 1.5307 + /* handle "complicated" and error cases, and continuing partial characters */ 1.5308 + oldToULength=0; 1.5309 + toULength=1; 1.5310 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.5311 + c=b; 1.5312 +moreBytes: 1.5313 + while(toULength<toULimit) { 1.5314 + /* 1.5315 + * The sourceLimit may have been adjusted before the conversion loop 1.5316 + * to stop before a truncated sequence. 1.5317 + * Here we need to use the real limit in case we have two truncated 1.5318 + * sequences at the end. 1.5319 + * See ticket #7492. 1.5320 + */ 1.5321 + if(source<(uint8_t *)pToUArgs->sourceLimit) { 1.5322 + b=*source; 1.5323 + if(U8_IS_TRAIL(b)) { 1.5324 + ++source; 1.5325 + ++toULength; 1.5326 + c=(c<<6)+b; 1.5327 + } else { 1.5328 + break; /* sequence too short, stop with toULength<toULimit */ 1.5329 + } 1.5330 + } else { 1.5331 + /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 1.5332 + source-=(toULength-oldToULength); 1.5333 + while(oldToULength<toULength) { 1.5334 + utf8->toUBytes[oldToULength++]=*source++; 1.5335 + } 1.5336 + utf8->toUnicodeStatus=c; 1.5337 + utf8->toULength=toULength; 1.5338 + utf8->mode=toULimit; 1.5339 + pToUArgs->source=(char *)source; 1.5340 + pFromUArgs->target=(char *)target; 1.5341 + return; 1.5342 + } 1.5343 + } 1.5344 + 1.5345 + if( toULength==toULimit && /* consumed all trail bytes */ 1.5346 + (toULength==3 || toULength==2) && /* BMP */ 1.5347 + (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 1.5348 + (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 1.5349 + ) { 1.5350 + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1.5351 + } else if( 1.5352 + toULength==toULimit && toULength==4 && 1.5353 + (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 1.5354 + ) { 1.5355 + /* supplementary code point */ 1.5356 + if(!hasSupplementary) { 1.5357 + /* BMP-only codepages are stored 
without stage 1 entries for supplementary code points */ 1.5358 + stage2Entry=0; 1.5359 + } else { 1.5360 + stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1.5361 + } 1.5362 + } else { 1.5363 + /* error handling: illegal UTF-8 byte sequence */ 1.5364 + source-=(toULength-oldToULength); 1.5365 + while(oldToULength<toULength) { 1.5366 + utf8->toUBytes[oldToULength++]=*source++; 1.5367 + } 1.5368 + utf8->toULength=toULength; 1.5369 + pToUArgs->source=(char *)source; 1.5370 + pFromUArgs->target=(char *)target; 1.5371 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.5372 + return; 1.5373 + } 1.5374 + 1.5375 + /* get the bytes and the length for the output */ 1.5376 + /* MBCS_OUTPUT_2 */ 1.5377 + value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c); 1.5378 + 1.5379 + /* is this code point assigned, or do we use fallbacks? */ 1.5380 + if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 1.5381 + (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 1.5382 + ) { 1.5383 + goto unassigned; 1.5384 + } 1.5385 + } 1.5386 + } 1.5387 + 1.5388 + /* write the output character bytes from value and length */ 1.5389 + /* from the first if in the loop we know that targetCapacity>0 */ 1.5390 + if(value<=0xff) { 1.5391 + /* this is easy because we know that there is enough space */ 1.5392 + *target++=(uint8_t)value; 1.5393 + --targetCapacity; 1.5394 + } else /* length==2 */ { 1.5395 + *target++=(uint8_t)(value>>8); 1.5396 + if(2<=targetCapacity) { 1.5397 + *target++=(uint8_t)value; 1.5398 + targetCapacity-=2; 1.5399 + } else { 1.5400 + cnv->charErrorBuffer[0]=(char)value; 1.5401 + cnv->charErrorBufferLength=1; 1.5402 + 1.5403 + /* target overflow */ 1.5404 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.5405 + break; 1.5406 + } 1.5407 + } 1.5408 + continue; 1.5409 + 1.5410 +unassigned: 1.5411 + { 1.5412 + /* 1.5413 + * Try an extension mapping. 1.5414 + * Pass in no source because we don't have UTF-16 input. 
1.5415 + * If we have a partial match on c, we will return and revert 1.5416 + * to UTF-8->UTF-16->charset conversion. 1.5417 + */ 1.5418 + static const UChar nul=0; 1.5419 + const UChar *noSource=&nul; 1.5420 + c=_extFromU(cnv, cnv->sharedData, 1.5421 + c, &noSource, noSource, 1.5422 + &target, target+targetCapacity, 1.5423 + NULL, -1, 1.5424 + pFromUArgs->flush, 1.5425 + pErrorCode); 1.5426 + 1.5427 + if(U_FAILURE(*pErrorCode)) { 1.5428 + /* not mappable or buffer overflow */ 1.5429 + cnv->fromUChar32=c; 1.5430 + break; 1.5431 + } else if(cnv->preFromUFirstCP>=0) { 1.5432 + /* 1.5433 + * Partial match, return and revert to pivoting. 1.5434 + * In normal from-UTF-16 conversion, we would just continue 1.5435 + * but then exit the loop because the extension match would 1.5436 + * have consumed the source. 1.5437 + */ 1.5438 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.5439 + break; 1.5440 + } else { 1.5441 + /* a mapping was written to the target, continue */ 1.5442 + 1.5443 + /* recalculate the targetCapacity after an extension mapping */ 1.5444 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 1.5445 + continue; 1.5446 + } 1.5447 + } 1.5448 + } else { 1.5449 + /* target is full */ 1.5450 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.5451 + break; 1.5452 + } 1.5453 + } 1.5454 + 1.5455 + /* 1.5456 + * The sourceLimit may have been adjusted before the conversion loop 1.5457 + * to stop before a truncated sequence. 1.5458 + * If so, then collect the truncated sequence now. 
1.5459 + */ 1.5460 + if(U_SUCCESS(*pErrorCode) && 1.5461 + cnv->preFromUFirstCP<0 && 1.5462 + source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 1.5463 + c=utf8->toUBytes[0]=b=*source++; 1.5464 + toULength=1; 1.5465 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.5466 + while(source<sourceLimit) { 1.5467 + utf8->toUBytes[toULength++]=b=*source++; 1.5468 + c=(c<<6)+b; 1.5469 + } 1.5470 + utf8->toUnicodeStatus=c; 1.5471 + utf8->toULength=toULength; 1.5472 + utf8->mode=toULimit; 1.5473 + } 1.5474 + 1.5475 + /* write back the updated pointers */ 1.5476 + pToUArgs->source=(char *)source; 1.5477 + pFromUArgs->target=(char *)target; 1.5478 +} 1.5479 + 1.5480 +/* miscellaneous ------------------------------------------------------------ */ 1.5481 + 1.5482 +static void 1.5483 +ucnv_MBCSGetStarters(const UConverter* cnv, 1.5484 + UBool starters[256], 1.5485 + UErrorCode *pErrorCode) { 1.5486 + const int32_t *state0; 1.5487 + int i; 1.5488 + 1.5489 + state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState]; 1.5490 + for(i=0; i<256; ++i) { 1.5491 + /* all bytes that cause a state transition from state 0 are lead bytes */ 1.5492 + starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]); 1.5493 + } 1.5494 +} 1.5495 + 1.5496 +/* 1.5497 + * This is an internal function that allows other converter implementations 1.5498 + * to check whether a byte is a lead byte. 
1.5499 + */ 1.5500 +U_CFUNC UBool 1.5501 +ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 1.5502 + return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 1.5503 +} 1.5504 + 1.5505 +static void 1.5506 +ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 1.5507 + int32_t offsetIndex, 1.5508 + UErrorCode *pErrorCode) { 1.5509 + UConverter *cnv=pArgs->converter; 1.5510 + char *p, *subchar; 1.5511 + char buffer[4]; 1.5512 + int32_t length; 1.5513 + 1.5514 + /* first, select between subChar and subChar1 */ 1.5515 + if( cnv->subChar1!=0 && 1.5516 + (cnv->sharedData->mbcs.extIndexes!=NULL ? 1.5517 + cnv->useSubChar1 : 1.5518 + (cnv->invalidUCharBuffer[0]<=0xff)) 1.5519 + ) { 1.5520 + /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 1.5521 + subchar=(char *)&cnv->subChar1; 1.5522 + length=1; 1.5523 + } else { 1.5524 + /* select subChar in all other cases */ 1.5525 + subchar=(char *)cnv->subChars; 1.5526 + length=cnv->subCharLen; 1.5527 + } 1.5528 + 1.5529 + /* reset the selector for the next code point */ 1.5530 + cnv->useSubChar1=FALSE; 1.5531 + 1.5532 + if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 1.5533 + p=buffer; 1.5534 + 1.5535 + /* fromUnicodeStatus contains prevLength */ 1.5536 + switch(length) { 1.5537 + case 1: 1.5538 + if(cnv->fromUnicodeStatus==2) { 1.5539 + /* DBCS mode and SBCS sub char: change to SBCS */ 1.5540 + cnv->fromUnicodeStatus=1; 1.5541 + *p++=UCNV_SI; 1.5542 + } 1.5543 + *p++=subchar[0]; 1.5544 + break; 1.5545 + case 2: 1.5546 + if(cnv->fromUnicodeStatus<=1) { 1.5547 + /* SBCS mode and DBCS sub char: change to DBCS */ 1.5548 + cnv->fromUnicodeStatus=2; 1.5549 + *p++=UCNV_SO; 1.5550 + } 1.5551 + *p++=subchar[0]; 1.5552 + *p++=subchar[1]; 1.5553 + break; 1.5554 + default: 1.5555 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.5556 + return; 1.5557 + } 1.5558 + subchar=buffer; 1.5559 + length=(int32_t)(p-buffer); 1.5560 + } 
1.5561 + 1.5562 + ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 1.5563 +} 1.5564 + 1.5565 +U_CFUNC UConverterType 1.5566 +ucnv_MBCSGetType(const UConverter* converter) { 1.5567 + /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */ 1.5568 + if(converter->sharedData->mbcs.countStates==1) { 1.5569 + return (UConverterType)UCNV_SBCS; 1.5570 + } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 1.5571 + return (UConverterType)UCNV_EBCDIC_STATEFUL; 1.5572 + } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 1.5573 + return (UConverterType)UCNV_DBCS; 1.5574 + } 1.5575 + return (UConverterType)UCNV_MBCS; 1.5576 +} 1.5577 + 1.5578 +static const UConverterImpl _SBCSUTF8Impl={ 1.5579 + UCNV_MBCS, 1.5580 + 1.5581 + ucnv_MBCSLoad, 1.5582 + ucnv_MBCSUnload, 1.5583 + 1.5584 + ucnv_MBCSOpen, 1.5585 + NULL, 1.5586 + NULL, 1.5587 + 1.5588 + ucnv_MBCSToUnicodeWithOffsets, 1.5589 + ucnv_MBCSToUnicodeWithOffsets, 1.5590 + ucnv_MBCSFromUnicodeWithOffsets, 1.5591 + ucnv_MBCSFromUnicodeWithOffsets, 1.5592 + ucnv_MBCSGetNextUChar, 1.5593 + 1.5594 + ucnv_MBCSGetStarters, 1.5595 + ucnv_MBCSGetName, 1.5596 + ucnv_MBCSWriteSub, 1.5597 + NULL, 1.5598 + ucnv_MBCSGetUnicodeSet, 1.5599 + 1.5600 + NULL, 1.5601 + ucnv_SBCSFromUTF8 1.5602 +}; 1.5603 + 1.5604 +static const UConverterImpl _DBCSUTF8Impl={ 1.5605 + UCNV_MBCS, 1.5606 + 1.5607 + ucnv_MBCSLoad, 1.5608 + ucnv_MBCSUnload, 1.5609 + 1.5610 + ucnv_MBCSOpen, 1.5611 + NULL, 1.5612 + NULL, 1.5613 + 1.5614 + ucnv_MBCSToUnicodeWithOffsets, 1.5615 + ucnv_MBCSToUnicodeWithOffsets, 1.5616 + ucnv_MBCSFromUnicodeWithOffsets, 1.5617 + ucnv_MBCSFromUnicodeWithOffsets, 1.5618 + ucnv_MBCSGetNextUChar, 1.5619 + 1.5620 + ucnv_MBCSGetStarters, 1.5621 + ucnv_MBCSGetName, 1.5622 + ucnv_MBCSWriteSub, 1.5623 + NULL, 1.5624 + ucnv_MBCSGetUnicodeSet, 1.5625 + 1.5626 + NULL, 1.5627 + ucnv_DBCSFromUTF8 1.5628 
+}; 1.5629 + 1.5630 +static const UConverterImpl _MBCSImpl={ 1.5631 + UCNV_MBCS, 1.5632 + 1.5633 + ucnv_MBCSLoad, 1.5634 + ucnv_MBCSUnload, 1.5635 + 1.5636 + ucnv_MBCSOpen, 1.5637 + NULL, 1.5638 + NULL, 1.5639 + 1.5640 + ucnv_MBCSToUnicodeWithOffsets, 1.5641 + ucnv_MBCSToUnicodeWithOffsets, 1.5642 + ucnv_MBCSFromUnicodeWithOffsets, 1.5643 + ucnv_MBCSFromUnicodeWithOffsets, 1.5644 + ucnv_MBCSGetNextUChar, 1.5645 + 1.5646 + ucnv_MBCSGetStarters, 1.5647 + ucnv_MBCSGetName, 1.5648 + ucnv_MBCSWriteSub, 1.5649 + NULL, 1.5650 + ucnv_MBCSGetUnicodeSet 1.5651 +}; 1.5652 + 1.5653 + 1.5654 +/* Static data is in tools/makeconv/ucnvstat.c for data-based 1.5655 + * converters. Be sure to update it as well. 1.5656 + */ 1.5657 + 1.5658 +const UConverterSharedData _MBCSData={ 1.5659 + sizeof(UConverterSharedData), 1, 1.5660 + NULL, NULL, NULL, FALSE, &_MBCSImpl, 1.5661 + 0 1.5662 +}; 1.5663 + 1.5664 +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */