intl/icu/source/common/ucnv_lmb.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2000-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucnv_lmb.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 4 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2000feb09
michael@0 12 * created by: Brendan Murray
michael@0 13 * extensively hacked up by: Jim Snyder-Grant
michael@0 14 *
michael@0 15 * Modification History:
michael@0 16 *
michael@0 17 * Date Name Description
michael@0 18 *
michael@0 19 * 06/20/2000 helena OS/400 port changes; mostly typecast.
michael@0 20 * 06/27/2000 Jim Snyder-Grant Deal with partial characters and small buffers.
michael@0 21 * Add comments to document LMBCS format and implementation
michael@0 22 * restructured order & breakdown of functions
michael@0 23 * 06/28/2000 helena Major rewrite for the callback API changes.
michael@0 24 */
michael@0 25
michael@0 26 #include "unicode/utypes.h"
michael@0 27
michael@0 28 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
michael@0 29
michael@0 30 #include "unicode/ucnv_err.h"
michael@0 31 #include "unicode/ucnv.h"
michael@0 32 #include "unicode/uset.h"
michael@0 33 #include "cmemory.h"
michael@0 34 #include "cstring.h"
michael@0 35 #include "uassert.h"
michael@0 36 #include "ucnv_imp.h"
michael@0 37 #include "ucnv_bld.h"
michael@0 38 #include "ucnv_cnv.h"
michael@0 39
michael@0 40 #ifdef EBCDIC_RTL
michael@0 41 #include "ascii_a.h"
michael@0 42 #endif
michael@0 43
michael@0 44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 45
michael@0 46 /*
michael@0 47 LMBCS
michael@0 48
michael@0 49 (Lotus Multi-Byte Character Set)
michael@0 50
michael@0 51 LMBCS was invented in the late 1980's and is primarily used in Lotus Notes
michael@0 52 databases and in Lotus 1-2-3 files. Programmers who work with the APIs
michael@0 53 into these products will sometimes need to deal with strings in this format.
michael@0 54
michael@0 55 The code in this file provides an implementation for an ICU converter of
michael@0 56 LMBCS to and from Unicode.
michael@0 57
michael@0 58 Since the LMBCS character set is only sparsely documented in existing
michael@0 59 printed or online material, we have added extensive annotation to this
michael@0 60 file to serve as a guide to understanding LMBCS.
michael@0 61
michael@0 62 LMBCS was originally designed with these four sometimes-competing design goals:
michael@0 63
michael@0 64 -Provide encodings for the characters in 12 existing national standards
michael@0 65 (plus a few other characters)
michael@0 66 -Minimal memory footprint
michael@0 67 -Maximal speed of conversion into the existing national character sets
michael@0 68 -No need to track a changing state as you interpret a string.
michael@0 69
michael@0 70
michael@0 71 All of the national character sets LMBCS was trying to encode are 'ANSI'
michael@0 72 based, in that the bytes from 0x20 - 0x7F are almost exactly the
michael@0 73 same common Latin unaccented characters and symbols in all character sets.
michael@0 74
michael@0 75 So, in order to help meet the speed & memory design goals, the common ANSI
michael@0 76 bytes from 0x20-0x7F are represented by the same single-byte values in LMBCS.
michael@0 77
michael@0 78 The general LMBCS code unit is from 1-3 bytes. We can describe the 3 bytes as
michael@0 79 follows:
michael@0 80
michael@0 81 [G] D1 [D2]
michael@0 82
michael@0 83 That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2
michael@0 84 data bytes. The maximum size of a LMBCS character is 3 bytes:
michael@0 85 */
michael@0 86 #define ULMBCS_CHARSIZE_MAX 3
michael@0 87 /*
michael@0 88 The single-byte values from 0x20 to 0x7F are examples of single D1 bytes.
michael@0 89 We often have to figure out if byte values are below or above this, so we
michael@0 90 use the ANSI nomenclature 'C0' and 'C1' to refer to the range of control
michael@0 91 characters just above & below the common lower-ANSI range */
michael@0 92 #define ULMBCS_C0END 0x1F
michael@0 93 #define ULMBCS_C1START 0x80
michael@0 94 /*
michael@0 95 Since LMBCS always deals in byte units, we create a local type here for
michael@0 96 these LMBCS code units:
michael@0 97
michael@0 98 */
michael@0 99 typedef uint8_t ulmbcs_byte_t;
michael@0 100
michael@0 101 /*
michael@0 102 Most of the values less than 0x20 are reserved in LMBCS to announce
michael@0 103 which national character standard is being used for the 'D' bytes.
michael@0 104 In the comments we show the common name and the IBM character-set ID
michael@0 105 for these character-set announcers:
michael@0 106 */
michael@0 107
michael@0 108 #define ULMBCS_GRP_L1 0x01 /* Latin-1 :ibm-850 */
michael@0 109 #define ULMBCS_GRP_GR 0x02 /* Greek :ibm-851 */
michael@0 110 #define ULMBCS_GRP_HE 0x03 /* Hebrew :ibm-1255 */
michael@0 111 #define ULMBCS_GRP_AR 0x04 /* Arabic :ibm-1256 */
michael@0 112 #define ULMBCS_GRP_RU 0x05 /* Cyrillic :ibm-1251 */
michael@0 113 #define ULMBCS_GRP_L2 0x06 /* Latin-2 :ibm-852 */
michael@0 114 #define ULMBCS_GRP_TR 0x08 /* Turkish :ibm-1254 */
michael@0 115 #define ULMBCS_GRP_TH 0x0B /* Thai :ibm-874 */
michael@0 116 #define ULMBCS_GRP_JA 0x10 /* Japanese :ibm-943 */
michael@0 117 #define ULMBCS_GRP_KO 0x11 /* Korean :ibm-1261 */
michael@0 118 #define ULMBCS_GRP_TW 0x12 /* Chinese TC (Traditional) :ibm-950 */
michael@0 119 #define ULMBCS_GRP_CN 0x13 /* Chinese SC (Simplified) :ibm-1386 */
michael@0 120
michael@0 121 /*
michael@0 122 So, the beginning of understanding LMBCS is that IF the first byte of a LMBCS
michael@0 123 character is one of those 12 values, you can interpret the remaining bytes of
michael@0 124 that character as coming from one of those character sets. Since the lower
michael@0 125 ANSI bytes already are represented in single bytes, one of the character
michael@0 126 set announcers is used to announce a character that starts with a byte of
michael@0 127 0x80 or greater.
michael@0 128
michael@0 129 The character sets are arranged so that the single byte sets all appear
michael@0 130 before the multi-byte character sets. When we need to tell whether a
michael@0 131 group byte is for a single byte char set or not we use this define: */
michael@0 132
michael@0 133 #define ULMBCS_DOUBLEOPTGROUP_START 0x10
michael@0 134
michael@0 135 /*
michael@0 136 However, to fully understand LMBCS, you must also understand a series of
michael@0 137 exceptions & optimizations made in service of the design goals.
michael@0 138
michael@0 139 First, those of you who are character set mavens may have noticed that
michael@0 140 the 'double-byte' character sets are actually multi-byte character sets
michael@0 141 that can have 1 or two bytes, even in the upper-ascii range. To force
michael@0 142 each group byte to introduce a fixed-width encoding (to make it faster to
michael@0 143 count characters), we use a convention of doubling up on the group byte
michael@0 144 to introduce any single-byte character > 0x80 in an otherwise double-byte
michael@0 145 character set. So, for example, the LMBCS sequence x10 x10 xAE is the
michael@0 146 same as '0xAE' in the Japanese code page 943.
michael@0 147
michael@0 148 Next, you will notice that the list of group bytes has some gaps.
michael@0 149 These are used in various ways.
michael@0 150
michael@0 151 We reserve a few special single byte values for common control
michael@0 152 characters. These are in the same place as their ANSI equivalents for speed.
michael@0 153 */
michael@0 154
michael@0 155 #define ULMBCS_HT 0x09 /* Fixed control char - Horizontal Tab */
michael@0 156 #define ULMBCS_LF 0x0A /* Fixed control char - Line Feed */
michael@0 157 #define ULMBCS_CR 0x0D /* Fixed control char - Carriage Return */
michael@0 158
michael@0 159 /* Then, 1-2-3 reserved a special single-byte character to put at the
michael@0 160 beginning of internal 'system' range names: */
michael@0 161
michael@0 162 #define ULMBCS_123SYSTEMRANGE 0x19
michael@0 163
michael@0 164 /* Then we needed a place to put all the other ansi control characters
michael@0 165 that must be moved to different values because LMBCS reserves those
michael@0 166 values for other purposes. To represent the control characters, we start
michael@0 167 with a first byte of 0xF & add the control character value as the
michael@0 168 second byte */
michael@0 169 #define ULMBCS_GRP_CTRL 0x0F
michael@0 170
michael@0 171 /* For the C0 controls (less than 0x20), we add 0x20 to preserve the
michael@0 172 useful doctrine that any byte less than 0x20 in a LMBCS char must be
michael@0 173 the first byte of a character:*/
michael@0 174 #define ULMBCS_CTRLOFFSET 0x20
michael@0 175
michael@0 176 /*
michael@0 177 Where to put the characters that aren't part of any of the 12 national
michael@0 178 character sets? The first thing that was done, in the earlier years of
michael@0 179 LMBCS, was to use up the spaces of the form
michael@0 180
michael@0 181 [G] D1,
michael@0 182
michael@0 183 where 'G' was one of the single-byte character groups, and
michael@0 184 D1 was less than 0x80. These sequences are gathered together
michael@0 185 into a Lotus-invented doublebyte character set to represent a
michael@0 186 lot of stray values. Internally, in this implementation, we track this
michael@0 187 as group '0', as a place to tuck this exceptions list.*/
michael@0 188
michael@0 189 #define ULMBCS_GRP_EXCEPT 0x00
michael@0 190 /*
michael@0 191 Finally, as the durability and usefulness of UNICODE became clear,
michael@0 192 LOTUS added a new group 0x14 to hold Unicode values not otherwise
michael@0 193 represented in LMBCS: */
michael@0 194 #define ULMBCS_GRP_UNICODE 0x14
michael@0 195 /* The two bytes appearing after a 0x14 are interpreted as UTF-16 BE
michael@0 196 (Big-Endian) characters. The exception comes when the UTF16
michael@0 197 representation would have a zero as the second byte. In that case,
michael@0 198 'F6' is used in its place, and the bytes are swapped. (This prevents
michael@0 199 LMBCS from encoding any Unicode values of the form U+F6xx, but that's OK:
michael@0 200 0xF6xx is in the middle of the Private Use Area.)*/
michael@0 201 #define ULMBCS_UNICOMPATZERO 0xF6
michael@0 202
michael@0 203 /* It is also useful in our code to have a constant for the size of
michael@0 204 a LMBCS char that holds a literal Unicode value */
michael@0 205 #define ULMBCS_UNICODE_SIZE 3
michael@0 206
michael@0 207 /*
michael@0 208 To squish the LMBCS representations down even further, and to make
michael@0 209 translations even faster, sometimes the optimization group byte can be dropped
michael@0 210 from a LMBCS character. This is decided on a process-by-process basis. The
michael@0 211 group byte that is dropped is called the 'optimization group'.
michael@0 212
michael@0 213 For Notes, the optimization group is always 0x1.*/
michael@0 214 #define ULMBCS_DEFAULTOPTGROUP 0x1
michael@0 215 /* For 1-2-3 files, the optimization group is stored in the header of the 1-2-3
michael@0 216 file.
michael@0 217
michael@0 218 In any case, when using ICU, you pass in the
michael@0 219 optimization group as part of the name of the converter (LMBCS-1, LMBCS-2,
michael@0 220 etc.). Using plain 'LMBCS' as the name of the converter will give you
michael@0 221 LMBCS-1.
michael@0 222
michael@0 223
michael@0 224 *** Implementation strategy ***
michael@0 225
michael@0 226
michael@0 227 Because of the extensive use of other character sets, the LMBCS converter
michael@0 228 keeps a mapping between optimization groups and IBM character sets, so that
michael@0 229 ICU converters can be created and used as needed. */
michael@0 230
michael@0 231 /* As you can see, even though any byte below 0x20 could be an optimization
michael@0 232 byte, only those at 0x13 or below can map to an actual converter. To limit
michael@0 233 some loops and searches, we define a value for that last group converter:*/
michael@0 234
michael@0 235 #define ULMBCS_GRP_LAST 0x13 /* last LMBCS group that has a converter */
michael@0 236
/* Indexed by LMBCS group byte (0 .. ULMBCS_GRP_LAST): the name of the converter
   that _LMBCSOpenWorker loads for that group. NULL entries are skipped there,
   so those group bytes get no sub-converter. */
michael@0 237 static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
michael@0 238 /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
michael@0 239 /* 0x0001 */ "ibm-850",
michael@0 240 /* 0x0002 */ "ibm-851",
michael@0 241 /* 0x0003 */ "windows-1255",
michael@0 242 /* 0x0004 */ "windows-1256",
michael@0 243 /* 0x0005 */ "windows-1251",
michael@0 244 /* 0x0006 */ "ibm-852",
michael@0 245 /* 0x0007 */ NULL, /* Unused */
michael@0 246 /* 0x0008 */ "windows-1254",
michael@0 247 /* 0x0009 */ NULL, /* Control char HT */
michael@0 248 /* 0x000A */ NULL, /* Control char LF */
michael@0 249 /* 0x000B */ "windows-874",
michael@0 250 /* 0x000C */ NULL, /* Unused */
michael@0 251 /* 0x000D */ NULL, /* Control char CR */
michael@0 252 /* 0x000E */ NULL, /* Unused */
michael@0 253 /* 0x000F */ NULL, /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
michael@0 254 /* 0x0010 */ "windows-932",
michael@0 255 /* 0x0011 */ "windows-949",
michael@0 256 /* 0x0012 */ "windows-950",
michael@0 257 /* 0x0013 */ "windows-936"
michael@0 258
michael@0 259 /* Group bytes above ULMBCS_GRP_LAST have no entry here (the array stops at 0x13);
michael@0 260 that includes 0x0014 (Unicode compatibility region) and 0x0019 (1-2-3 system range) */
michael@0 261 };
michael@0 262
michael@0 263
michael@0 264 /* That's approximately all the data that's needed for translating
michael@0 265 LMBCS to Unicode.
michael@0 266
michael@0 267
michael@0 268 However, to translate Unicode to LMBCS, we need some more support.
michael@0 269
michael@0 270 That's because there is often more than one possible mapping from a Unicode
michael@0 271 code point back into LMBCS. The first thing we do is look in a table
michael@0 272 to figure out whether there is more than one possible mapping. This table,
michael@0 273 arranged by Unicode values (including ranges) either lists which group
michael@0 274 to use, or says that it could go into one or more of the SBCS sets, or
michael@0 275 into one or more of the DBCS sets. (If the character exists in both DBCS &
michael@0 276 SBCS, the table will place it in the SBCS sets, to make the LMBCS code point
michael@0 277 length as small as possible.) Here are the two special markers we use to indicate
michael@0 278 ambiguous mappings: */
michael@0 279
michael@0 280 #define ULMBCS_AMBIGUOUS_SBCS 0x80 /* could fit in more than one
michael@0 281 LMBCS sbcs native encoding
michael@0 282 (example: most accented latin) */
michael@0 283 #define ULMBCS_AMBIGUOUS_MBCS 0x81 /* could fit in more than one
michael@0 284 LMBCS mbcs native encoding
michael@0 285 (example: Unihan) */
michael@0 286 #define ULMBCS_AMBIGUOUS_ALL 0x82
/* Evaluates to nonzero when group byte `xgroup` is an acceptable home for a
   character whose table entry is the ambiguity marker `agroup`:
     ULMBCS_AMBIGUOUS_SBCS - xgroup must be below ULMBCS_DOUBLEOPTGROUP_START
     ULMBCS_AMBIGUOUS_MBCS - xgroup must be at or above ULMBCS_DOUBLEOPTGROUP_START
     ULMBCS_AMBIGUOUS_ALL  - any xgroup matches
   Any other `agroup` value yields zero.
   The whole expansion is parenthesized so the macro behaves as a single
   operand when negated or combined with && and other operators.
   Note: `agroup` may be evaluated up to three times and `xgroup` up to twice,
   so do not pass arguments with side effects. */
#define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \
    ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \
      ((xgroup) < ULMBCS_DOUBLEOPTGROUP_START)) || \
     (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \
      ((xgroup) >= ULMBCS_DOUBLEOPTGROUP_START)) || \
     ((agroup) == ULMBCS_AMBIGUOUS_ALL))
michael@0 294
michael@0 295
michael@0 296 /* The table & some code to use it: */
michael@0 297
michael@0 298
/* Table used by FindLMBCSUniRange to classify a UChar value.
   Each entry covers the inclusive range uniStartRange..uniEndRange and names
   either a specific LMBCS group (ULMBCS_GRP_*) or one of the
   ULMBCS_AMBIGUOUS_* markers.
   Entries are listed in ascending order: the linear scan in FindLMBCSUniRange
   stops at the first entry whose uniEndRange is not below the value looked up,
   and relies on the final entry ending at 0xFFFF to terminate.
   Values that fall into a gap between entries are reported as
   ULMBCS_GRP_UNICODE by FindLMBCSUniRange. */
michael@0 299 static const struct _UniLMBCSGrpMap
michael@0 300 {
michael@0 301 const UChar uniStartRange; /* first value of the range (inclusive) */
michael@0 302 const UChar uniEndRange; /* last value of the range (inclusive) */
michael@0 303 const ulmbcs_byte_t GrpType; /* ULMBCS_GRP_* or ULMBCS_AMBIGUOUS_* */
michael@0 304 } UniLMBCSGrpMap[]
michael@0 305 =
michael@0 306 {
michael@0 307
michael@0 308 {0x0001, 0x001F, ULMBCS_GRP_CTRL},
michael@0 309 {0x0080, 0x009F, ULMBCS_GRP_CTRL},
michael@0 310 {0x00A0, 0x00A6, ULMBCS_AMBIGUOUS_SBCS},
michael@0 311 {0x00A7, 0x00A8, ULMBCS_AMBIGUOUS_ALL},
michael@0 312 {0x00A9, 0x00AF, ULMBCS_AMBIGUOUS_SBCS},
michael@0 313 {0x00B0, 0x00B1, ULMBCS_AMBIGUOUS_ALL},
michael@0 314 {0x00B2, 0x00B3, ULMBCS_AMBIGUOUS_SBCS},
michael@0 315 {0x00B4, 0x00B4, ULMBCS_AMBIGUOUS_ALL},
michael@0 316 {0x00B5, 0x00B5, ULMBCS_AMBIGUOUS_SBCS},
michael@0 317 {0x00B6, 0x00B6, ULMBCS_AMBIGUOUS_ALL},
michael@0 318 {0x00B7, 0x00D6, ULMBCS_AMBIGUOUS_SBCS},
michael@0 319 {0x00D7, 0x00D7, ULMBCS_AMBIGUOUS_ALL},
michael@0 320 {0x00D8, 0x00F6, ULMBCS_AMBIGUOUS_SBCS},
michael@0 321 {0x00F7, 0x00F7, ULMBCS_AMBIGUOUS_ALL},
michael@0 322 {0x00F8, 0x01CD, ULMBCS_AMBIGUOUS_SBCS},
michael@0 323 {0x01CE, 0x01CE, ULMBCS_GRP_TW },
michael@0 324 {0x01CF, 0x02B9, ULMBCS_AMBIGUOUS_SBCS},
michael@0 325 {0x02BA, 0x02BA, ULMBCS_GRP_CN},
michael@0 326 {0x02BC, 0x02C8, ULMBCS_AMBIGUOUS_SBCS},
michael@0 327 {0x02C9, 0x02D0, ULMBCS_AMBIGUOUS_MBCS},
michael@0 328 {0x02D8, 0x02DD, ULMBCS_AMBIGUOUS_SBCS},
michael@0 329 {0x0384, 0x0390, ULMBCS_AMBIGUOUS_SBCS},
michael@0 330 {0x0391, 0x03A9, ULMBCS_AMBIGUOUS_ALL},
michael@0 331 {0x03AA, 0x03B0, ULMBCS_AMBIGUOUS_SBCS},
michael@0 332 {0x03B1, 0x03C9, ULMBCS_AMBIGUOUS_ALL},
michael@0 333 {0x03CA, 0x03CE, ULMBCS_AMBIGUOUS_SBCS},
michael@0 334 {0x0400, 0x0400, ULMBCS_GRP_RU},
michael@0 335 {0x0401, 0x0401, ULMBCS_AMBIGUOUS_ALL},
michael@0 336 {0x0402, 0x040F, ULMBCS_GRP_RU},
michael@0 337 {0x0410, 0x0431, ULMBCS_AMBIGUOUS_ALL},
michael@0 338 {0x0432, 0x044E, ULMBCS_GRP_RU},
michael@0 339 {0x044F, 0x044F, ULMBCS_AMBIGUOUS_ALL},
michael@0 340 {0x0450, 0x0491, ULMBCS_GRP_RU},
michael@0 341 {0x05B0, 0x05F2, ULMBCS_GRP_HE},
michael@0 342 {0x060C, 0x06AF, ULMBCS_GRP_AR},
michael@0 343 {0x0E01, 0x0E5B, ULMBCS_GRP_TH},
michael@0 344 {0x200C, 0x200F, ULMBCS_AMBIGUOUS_SBCS},
michael@0 345 {0x2010, 0x2010, ULMBCS_AMBIGUOUS_MBCS},
michael@0 346 {0x2013, 0x2014, ULMBCS_AMBIGUOUS_SBCS},
michael@0 347 {0x2015, 0x2015, ULMBCS_AMBIGUOUS_MBCS},
michael@0 348 {0x2016, 0x2016, ULMBCS_AMBIGUOUS_MBCS},
michael@0 349 {0x2017, 0x2017, ULMBCS_AMBIGUOUS_SBCS},
michael@0 350 {0x2018, 0x2019, ULMBCS_AMBIGUOUS_ALL},
michael@0 351 {0x201A, 0x201B, ULMBCS_AMBIGUOUS_SBCS},
michael@0 352 {0x201C, 0x201D, ULMBCS_AMBIGUOUS_ALL},
michael@0 353 {0x201E, 0x201F, ULMBCS_AMBIGUOUS_SBCS},
michael@0 354 {0x2020, 0x2021, ULMBCS_AMBIGUOUS_ALL},
michael@0 355 {0x2022, 0x2024, ULMBCS_AMBIGUOUS_SBCS},
michael@0 356 {0x2025, 0x2025, ULMBCS_AMBIGUOUS_MBCS},
michael@0 357 {0x2026, 0x2026, ULMBCS_AMBIGUOUS_ALL},
michael@0 358 {0x2027, 0x2027, ULMBCS_GRP_TW},
michael@0 359 {0x2030, 0x2030, ULMBCS_AMBIGUOUS_ALL},
michael@0 360 {0x2031, 0x2031, ULMBCS_AMBIGUOUS_SBCS},
michael@0 361 {0x2032, 0x2033, ULMBCS_AMBIGUOUS_MBCS},
michael@0 362 {0x2035, 0x2035, ULMBCS_AMBIGUOUS_MBCS},
michael@0 363 {0x2039, 0x203A, ULMBCS_AMBIGUOUS_SBCS},
michael@0 364 {0x203B, 0x203B, ULMBCS_AMBIGUOUS_MBCS},
michael@0 365 {0x203C, 0x203C, ULMBCS_GRP_EXCEPT},
michael@0 366 {0x2074, 0x2074, ULMBCS_GRP_KO},
michael@0 367 {0x207F, 0x207F, ULMBCS_GRP_EXCEPT},
michael@0 368 {0x2081, 0x2084, ULMBCS_GRP_KO},
michael@0 369 {0x20A4, 0x20AC, ULMBCS_AMBIGUOUS_SBCS},
michael@0 370 {0x2103, 0x2109, ULMBCS_AMBIGUOUS_MBCS},
michael@0 371 {0x2111, 0x2120, ULMBCS_AMBIGUOUS_SBCS},
michael@0 372 /*zhujin: upgrade, for regression test, spr HKIA4YHTSU*/
michael@0 373 {0x2121, 0x2121, ULMBCS_AMBIGUOUS_MBCS},
michael@0 374 {0x2122, 0x2126, ULMBCS_AMBIGUOUS_SBCS},
michael@0 375 {0x212B, 0x212B, ULMBCS_AMBIGUOUS_MBCS},
michael@0 376 {0x2135, 0x2135, ULMBCS_AMBIGUOUS_SBCS},
michael@0 377 {0x2153, 0x2154, ULMBCS_GRP_KO},
michael@0 378 {0x215B, 0x215E, ULMBCS_GRP_EXCEPT},
michael@0 379 {0x2160, 0x2179, ULMBCS_AMBIGUOUS_MBCS},
michael@0 380 {0x2190, 0x2193, ULMBCS_AMBIGUOUS_ALL},
michael@0 381 {0x2194, 0x2195, ULMBCS_GRP_EXCEPT},
michael@0 382 {0x2196, 0x2199, ULMBCS_AMBIGUOUS_MBCS},
michael@0 383 {0x21A8, 0x21A8, ULMBCS_GRP_EXCEPT},
michael@0 384 {0x21B8, 0x21B9, ULMBCS_GRP_CN},
michael@0 385 {0x21D0, 0x21D1, ULMBCS_GRP_EXCEPT},
michael@0 386 {0x21D2, 0x21D2, ULMBCS_AMBIGUOUS_MBCS},
michael@0 387 {0x21D3, 0x21D3, ULMBCS_GRP_EXCEPT},
michael@0 388 {0x21D4, 0x21D4, ULMBCS_AMBIGUOUS_MBCS},
michael@0 389 {0x21D5, 0x21D5, ULMBCS_GRP_EXCEPT},
michael@0 390 {0x21E7, 0x21E7, ULMBCS_GRP_CN},
michael@0 391 {0x2200, 0x2200, ULMBCS_AMBIGUOUS_MBCS},
michael@0 392 {0x2201, 0x2201, ULMBCS_GRP_EXCEPT},
michael@0 393 {0x2202, 0x2202, ULMBCS_AMBIGUOUS_MBCS},
michael@0 394 {0x2203, 0x2203, ULMBCS_AMBIGUOUS_MBCS},
michael@0 395 {0x2204, 0x2206, ULMBCS_GRP_EXCEPT},
michael@0 396 {0x2207, 0x2208, ULMBCS_AMBIGUOUS_MBCS},
michael@0 397 {0x2209, 0x220A, ULMBCS_GRP_EXCEPT},
michael@0 398 {0x220B, 0x220B, ULMBCS_AMBIGUOUS_MBCS},
michael@0 399 {0x220F, 0x2215, ULMBCS_AMBIGUOUS_MBCS},
michael@0 400 {0x2219, 0x2219, ULMBCS_GRP_EXCEPT},
michael@0 401 {0x221A, 0x221A, ULMBCS_AMBIGUOUS_MBCS},
michael@0 402 {0x221B, 0x221C, ULMBCS_GRP_EXCEPT},
michael@0 403 {0x221D, 0x221E, ULMBCS_AMBIGUOUS_MBCS},
michael@0 404 {0x221F, 0x221F, ULMBCS_GRP_EXCEPT},
michael@0 405 {0x2220, 0x2220, ULMBCS_AMBIGUOUS_MBCS},
michael@0 406 {0x2223, 0x222A, ULMBCS_AMBIGUOUS_MBCS},
michael@0 407 {0x222B, 0x223D, ULMBCS_AMBIGUOUS_MBCS},
michael@0 408 {0x2245, 0x2248, ULMBCS_GRP_EXCEPT},
michael@0 409 {0x224C, 0x224C, ULMBCS_GRP_TW},
michael@0 410 {0x2252, 0x2252, ULMBCS_AMBIGUOUS_MBCS},
michael@0 411 {0x2260, 0x2261, ULMBCS_AMBIGUOUS_MBCS},
michael@0 412 {0x2262, 0x2265, ULMBCS_GRP_EXCEPT},
michael@0 413 {0x2266, 0x226F, ULMBCS_AMBIGUOUS_MBCS},
michael@0 414 {0x2282, 0x2283, ULMBCS_AMBIGUOUS_MBCS},
michael@0 415 {0x2284, 0x2285, ULMBCS_GRP_EXCEPT},
michael@0 416 {0x2286, 0x2287, ULMBCS_AMBIGUOUS_MBCS},
michael@0 417 {0x2288, 0x2297, ULMBCS_GRP_EXCEPT},
michael@0 418 {0x2299, 0x22BF, ULMBCS_AMBIGUOUS_MBCS},
michael@0 419 {0x22C0, 0x22C0, ULMBCS_GRP_EXCEPT},
michael@0 420 {0x2310, 0x2310, ULMBCS_GRP_EXCEPT},
michael@0 421 {0x2312, 0x2312, ULMBCS_AMBIGUOUS_MBCS},
michael@0 422 {0x2318, 0x2321, ULMBCS_GRP_EXCEPT},
michael@0 423 {0x2318, 0x2321, ULMBCS_GRP_CN}, /* NOTE(review): same range as the previous entry; FindLMBCSUniRange stops at the first match, so this entry is never selected */
michael@0 424 {0x2460, 0x24E9, ULMBCS_AMBIGUOUS_MBCS},
michael@0 425 {0x2500, 0x2500, ULMBCS_AMBIGUOUS_SBCS},
michael@0 426 {0x2501, 0x2501, ULMBCS_AMBIGUOUS_MBCS},
michael@0 427 {0x2502, 0x2502, ULMBCS_AMBIGUOUS_ALL},
michael@0 428 {0x2503, 0x2503, ULMBCS_AMBIGUOUS_MBCS},
michael@0 429 {0x2504, 0x2505, ULMBCS_GRP_TW},
michael@0 430 {0x2506, 0x2665, ULMBCS_AMBIGUOUS_ALL},
michael@0 431 {0x2666, 0x2666, ULMBCS_GRP_EXCEPT},
michael@0 432 {0x2667, 0x2669, ULMBCS_AMBIGUOUS_SBCS},
michael@0 433 {0x266A, 0x266A, ULMBCS_AMBIGUOUS_ALL},
michael@0 434 {0x266B, 0x266C, ULMBCS_AMBIGUOUS_SBCS},
michael@0 435 {0x266D, 0x266D, ULMBCS_AMBIGUOUS_MBCS},
michael@0 436 {0x266E, 0x266E, ULMBCS_AMBIGUOUS_SBCS},
michael@0 437 {0x266F, 0x266F, ULMBCS_GRP_JA},
michael@0 438 {0x2670, 0x2E7F, ULMBCS_AMBIGUOUS_SBCS},
michael@0 439 {0x2E80, 0xF861, ULMBCS_AMBIGUOUS_MBCS},
michael@0 440 {0xF862, 0xF8FF, ULMBCS_GRP_EXCEPT},
michael@0 441 {0xF900, 0xFA2D, ULMBCS_AMBIGUOUS_MBCS},
michael@0 442 {0xFB00, 0xFEFF, ULMBCS_AMBIGUOUS_SBCS},
michael@0 443 {0xFF01, 0xFFEE, ULMBCS_AMBIGUOUS_MBCS},
michael@0 444 {0xFFFF, 0xFFFF, ULMBCS_GRP_UNICODE} /* must stay last: stops the scan in FindLMBCSUniRange */
michael@0 445 };
michael@0 446
michael@0 447 static ulmbcs_byte_t
michael@0 448 FindLMBCSUniRange(UChar uniChar)
michael@0 449 {
michael@0 450 const struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap;
michael@0 451
michael@0 452 while (uniChar > pTable->uniEndRange)
michael@0 453 {
michael@0 454 pTable++;
michael@0 455 }
michael@0 456
michael@0 457 if (uniChar >= pTable->uniStartRange)
michael@0 458 {
michael@0 459 return pTable->GrpType;
michael@0 460 }
michael@0 461 return ULMBCS_GRP_UNICODE;
michael@0 462 }
michael@0 463
michael@0 464 /*
michael@0 465 We also ask the creator of a converter to send in a preferred locale
michael@0 466 that we can use in resolving ambiguous mappings. They send the locale
michael@0 467 in as a string, and we map it, if possible, to one of the
michael@0 468 LMBCS groups. We use this table, and the associated code, to
michael@0 469 do the lookup: */
michael@0 470
michael@0 471 /**************************************************
michael@0 472 This table maps locale ID's to LMBCS opt groups.
michael@0 473 The default return is group 0x01. Note that for
michael@0 474 performance reasons, the table is sorted in
michael@0 475 increasing alphabetic order, with the notable
michael@0 476 exception of zhTW. This is to force the check
michael@0 477 for Traditional Chinese before dropping back to
michael@0 478 Simplified.
michael@0 479
michael@0 480 Note too that the Latin-1 groups have been
michael@0 481 commented out because it's the default, and
michael@0 482 this shortens the table, allowing a serial
michael@0 483 search to go quickly.
michael@0 484 *************************************************/
michael@0 485
/* Lookup table for FindLMBCSLocale: each LocaleID is compared as a prefix of
   the caller's locale string, and OptGroup is the group returned on a match.
   The list ends with a NULL LocaleID sentinel. FindLMBCSLocale stops scanning
   as soon as an entry's first character is greater than the first character
   of the locale being looked up, so entries must stay ordered by first
   character. */
michael@0 486 static const struct _LocaleLMBCSGrpMap
michael@0 487 {
michael@0 488 const char *LocaleID; /* prefix to match; NULL marks the end of the table */
michael@0 489 const ulmbcs_byte_t OptGroup; /* group returned when LocaleID matches */
michael@0 490 } LocaleLMBCSGrpMap[] =
michael@0 491 {
michael@0 492 {"ar", ULMBCS_GRP_AR},
michael@0 493 {"be", ULMBCS_GRP_RU},
michael@0 494 {"bg", ULMBCS_GRP_L2},
michael@0 495 /* {"ca", ULMBCS_GRP_L1}, */
michael@0 496 {"cs", ULMBCS_GRP_L2},
michael@0 497 /* {"da", ULMBCS_GRP_L1}, */
michael@0 498 /* {"de", ULMBCS_GRP_L1}, */
michael@0 499 {"el", ULMBCS_GRP_GR},
michael@0 500 /* {"en", ULMBCS_GRP_L1}, */
michael@0 501 /* {"es", ULMBCS_GRP_L1}, */
michael@0 502 /* {"et", ULMBCS_GRP_L1}, */
michael@0 503 /* {"fi", ULMBCS_GRP_L1}, */
michael@0 504 /* {"fr", ULMBCS_GRP_L1}, */
michael@0 505 {"he", ULMBCS_GRP_HE},
michael@0 506 {"hu", ULMBCS_GRP_L2},
michael@0 507 /* {"is", ULMBCS_GRP_L1}, */
michael@0 508 /* {"it", ULMBCS_GRP_L1}, */
michael@0 509 {"iw", ULMBCS_GRP_HE},
michael@0 510 {"ja", ULMBCS_GRP_JA},
michael@0 511 {"ko", ULMBCS_GRP_KO},
michael@0 512 /* {"lt", ULMBCS_GRP_L1}, */
michael@0 513 /* {"lv", ULMBCS_GRP_L1}, */
michael@0 514 {"mk", ULMBCS_GRP_RU},
michael@0 515 /* {"nl", ULMBCS_GRP_L1}, */
michael@0 516 /* {"no", ULMBCS_GRP_L1}, */
michael@0 517 {"pl", ULMBCS_GRP_L2},
michael@0 518 /* {"pt", ULMBCS_GRP_L1}, */
michael@0 519 {"ro", ULMBCS_GRP_L2},
michael@0 520 {"ru", ULMBCS_GRP_RU},
michael@0 521 {"sh", ULMBCS_GRP_L2},
michael@0 522 {"sk", ULMBCS_GRP_L2},
michael@0 523 {"sl", ULMBCS_GRP_L2},
michael@0 524 {"sq", ULMBCS_GRP_L2},
michael@0 525 {"sr", ULMBCS_GRP_RU},
michael@0 526 /* {"sv", ULMBCS_GRP_L1}, */
michael@0 527 {"th", ULMBCS_GRP_TH},
michael@0 528 {"tr", ULMBCS_GRP_TR},
michael@0 529 {"uk", ULMBCS_GRP_RU},
michael@0 530 /* {"vi", ULMBCS_GRP_L1}, */
/* NOTE(review): this key only matches locale strings that literally begin
   with "zhTW"; a locale spelled "zh_TW" would not match it and would fall
   through to the "zh" entry below. Confirm which spelling callers pass. */
michael@0 531 {"zhTW", ULMBCS_GRP_TW},
michael@0 532 {"zh", ULMBCS_GRP_CN},
michael@0 533 {NULL, ULMBCS_GRP_L1}
michael@0 534 };
michael@0 535
michael@0 536
michael@0 537 static ulmbcs_byte_t
michael@0 538 FindLMBCSLocale(const char *LocaleID)
michael@0 539 {
michael@0 540 const struct _LocaleLMBCSGrpMap *pTable = LocaleLMBCSGrpMap;
michael@0 541
michael@0 542 if ((!LocaleID) || (!*LocaleID))
michael@0 543 {
michael@0 544 return 0;
michael@0 545 }
michael@0 546
michael@0 547 while (pTable->LocaleID)
michael@0 548 {
michael@0 549 if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */
michael@0 550 {
michael@0 551 /* First char matches - check whole name, for entry-length */
michael@0 552 if (uprv_strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
michael@0 553 return pTable->OptGroup;
michael@0 554 }
michael@0 555 else
michael@0 556 if (*pTable->LocaleID > *LocaleID) /* Sorted alphabetically - exit */
michael@0 557 break;
michael@0 558 pTable++;
michael@0 559 }
michael@0 560 return ULMBCS_GRP_L1;
michael@0 561 }
michael@0 562
michael@0 563
michael@0 564 /*
michael@0 565 Before we get to the main body of code, here's how we hook up to the rest
michael@0 566 of ICU. ICU converters are required to define a structure that includes
michael@0 567 some function pointers, and some common data, in the style of a C++
michael@0 568 vtable. There is also room in there for converter-specific data. LMBCS
michael@0 569 uses that converter-specific data to keep track of the 12 subconverters
michael@0 570 we use, the optimization group, and the group (if any) that matches the
michael@0 571 locale. We have one structure instantiated for each of the 12 possible
michael@0 572 optimization groups. To avoid typos & to avoid boring the reader, we
michael@0 573 put the declarations of these structures and functions into macros. To see
michael@0 574 the definitions of these structures, see unicode\ucnv_bld.h
michael@0 575 */
michael@0 576
/* LMBCS-specific state stored in UConverter.extraInfo.
   Allocated and filled in by _LMBCSOpenWorker; released by _LMBCSClose. */
michael@0 577 typedef struct
michael@0 578 {
michael@0 579 UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1]; /* Shared data of the sub-converter for each group byte; NULL where none was loaded */
michael@0 580 uint8_t OptGroup; /* optimization group this converter was opened with */
michael@0 581 uint8_t localeConverterIndex; /* result of FindLMBCSLocale() for the locale given at open time */
michael@0 582 }
michael@0 583 UConverterDataLMBCS;
michael@0 584
michael@0 585 static void _LMBCSClose(UConverter * _this);
michael@0 586
/* DECLARE_LMBCS_DATA(n) defines the three objects that describe the converter
   for optimization group n:
     _LMBCSImpl##n       - static function table; uses _LMBCSOpen##n as its open
                           function and the shared _LMBCSClose,
                           _LMBCSToUnicodeWithOffsets, _LMBCSFromUnicode and
                           _LMBCSSafeClone for the remaining non-NULL slots
     _LMBCSStaticData##n - static data whose name string is "LMBCS-" followed by n
     _LMBCSData##n       - non-static UConverterSharedData pointing at the two
                           objects above
   All functions named in the expansion must already be declared at the point
   where the macro is invoked. No comments can be placed inside the body
   because every line ends in a continuation backslash. */
michael@0 587 #define DECLARE_LMBCS_DATA(n) \
michael@0 588 static const UConverterImpl _LMBCSImpl##n={\
michael@0 589 UCNV_LMBCS_##n,\
michael@0 590 NULL,NULL,\
michael@0 591 _LMBCSOpen##n,\
michael@0 592 _LMBCSClose,\
michael@0 593 NULL,\
michael@0 594 _LMBCSToUnicodeWithOffsets,\
michael@0 595 _LMBCSToUnicodeWithOffsets,\
michael@0 596 _LMBCSFromUnicode,\
michael@0 597 _LMBCSFromUnicode,\
michael@0 598 NULL,\
michael@0 599 NULL,\
michael@0 600 NULL,\
michael@0 601 NULL,\
michael@0 602 _LMBCSSafeClone,\
michael@0 603 ucnv_getCompleteUnicodeSet\
michael@0 604 };\
michael@0 605 static const UConverterStaticData _LMBCSStaticData##n={\
michael@0 606 sizeof(UConverterStaticData),\
michael@0 607 "LMBCS-" #n,\
michael@0 608 0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\
michael@0 609 { 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
michael@0 610 };\
michael@0 611 const UConverterSharedData _LMBCSData##n={\
michael@0 612 sizeof(UConverterSharedData), ~((uint32_t) 0),\
michael@0 613 NULL, NULL, &_LMBCSStaticData##n, FALSE, &_LMBCSImpl##n, \
michael@0 614 0 \
michael@0 615 };
michael@0 616
michael@0 617 /* The only function we needed to duplicate 12 times was the 'open'
michael@0 618 function, which will do basically the same thing except set a different
michael@0 619 optimization group. So, we put the common stuff into a worker function,
michael@0 620 and set up another macro to stamp out the 12 open functions:*/
/* DEFINE_LMBCS_OPEN(n) defines the static function _LMBCSOpen##n, which simply
   forwards its three arguments to _LMBCSOpenWorker with n as the optimization
   group. _LMBCSOpenWorker must be declared before the macro is invoked. */
michael@0 621 #define DEFINE_LMBCS_OPEN(n) \
michael@0 622 static void \
michael@0 623 _LMBCSOpen##n(UConverter* _this, UConverterLoadArgs* pArgs, UErrorCode* err) \
michael@0 624 { _LMBCSOpenWorker(_this, pArgs, err, n); }
michael@0 625
michael@0 626
michael@0 627
michael@0 628 /* Here's the open worker & the common close function */
michael@0 629 static void
michael@0 630 _LMBCSOpenWorker(UConverter* _this,
michael@0 631 UConverterLoadArgs *pArgs,
michael@0 632 UErrorCode* err,
michael@0 633 ulmbcs_byte_t OptGroup)
michael@0 634 {
michael@0 635 UConverterDataLMBCS * extraInfo = _this->extraInfo =
michael@0 636 (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
michael@0 637 if(extraInfo != NULL)
michael@0 638 {
michael@0 639 UConverterNamePieces stackPieces;
michael@0 640 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
michael@0 641 ulmbcs_byte_t i;
michael@0 642
michael@0 643 uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS));
michael@0 644
michael@0 645 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
michael@0 646
michael@0 647 for (i=0; i <= ULMBCS_GRP_LAST && U_SUCCESS(*err); i++)
michael@0 648 {
michael@0 649 if(OptGroupByteToCPName[i] != NULL) {
michael@0 650 extraInfo->OptGrpConverter[i] = ucnv_loadSharedData(OptGroupByteToCPName[i], &stackPieces, &stackArgs, err);
michael@0 651 }
michael@0 652 }
michael@0 653
michael@0 654 if(U_FAILURE(*err) || pArgs->onlyTestIsLoadable) {
michael@0 655 _LMBCSClose(_this);
michael@0 656 return;
michael@0 657 }
michael@0 658 extraInfo->OptGroup = OptGroup;
michael@0 659 extraInfo->localeConverterIndex = FindLMBCSLocale(pArgs->locale);
michael@0 660 }
michael@0 661 else
michael@0 662 {
michael@0 663 *err = U_MEMORY_ALLOCATION_ERROR;
michael@0 664 }
michael@0 665 }
michael@0 666
michael@0 667 static void
michael@0 668 _LMBCSClose(UConverter * _this)
michael@0 669 {
michael@0 670 if (_this->extraInfo != NULL)
michael@0 671 {
michael@0 672 ulmbcs_byte_t Ix;
michael@0 673 UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
michael@0 674
michael@0 675 for (Ix=0; Ix <= ULMBCS_GRP_LAST; Ix++)
michael@0 676 {
michael@0 677 if (extraInfo->OptGrpConverter[Ix] != NULL)
michael@0 678 ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[Ix]);
michael@0 679 }
michael@0 680 if (!_this->isExtraLocal) {
michael@0 681 uprv_free (_this->extraInfo);
michael@0 682 _this->extraInfo = NULL;
michael@0 683 }
michael@0 684 }
michael@0 685 }
michael@0 686
michael@0 687 typedef struct LMBCSClone {
michael@0 688 UConverter cnv;
michael@0 689 UConverterDataLMBCS lmbcs;
michael@0 690 } LMBCSClone;
michael@0 691
michael@0 692 static UConverter *
michael@0 693 _LMBCSSafeClone(const UConverter *cnv,
michael@0 694 void *stackBuffer,
michael@0 695 int32_t *pBufferSize,
michael@0 696 UErrorCode *status) {
michael@0 697 LMBCSClone *newLMBCS;
michael@0 698 UConverterDataLMBCS *extraInfo;
michael@0 699 int32_t i;
michael@0 700
michael@0 701 if(*pBufferSize<=0) {
michael@0 702 *pBufferSize=(int32_t)sizeof(LMBCSClone);
michael@0 703 return NULL;
michael@0 704 }
michael@0 705
michael@0 706 extraInfo=(UConverterDataLMBCS *)cnv->extraInfo;
michael@0 707 newLMBCS=(LMBCSClone *)stackBuffer;
michael@0 708
michael@0 709 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
michael@0 710
michael@0 711 uprv_memcpy(&newLMBCS->lmbcs, extraInfo, sizeof(UConverterDataLMBCS));
michael@0 712
michael@0 713 /* share the subconverters */
michael@0 714 for(i = 0; i <= ULMBCS_GRP_LAST; ++i) {
michael@0 715 if(extraInfo->OptGrpConverter[i] != NULL) {
michael@0 716 ucnv_incrementRefCount(extraInfo->OptGrpConverter[i]);
michael@0 717 }
michael@0 718 }
michael@0 719
michael@0 720 newLMBCS->cnv.extraInfo = &newLMBCS->lmbcs;
michael@0 721 newLMBCS->cnv.isExtraLocal = TRUE;
michael@0 722 return &newLMBCS->cnv;
michael@0 723 }
michael@0 724
michael@0 725 /*
michael@0 726 * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
michael@0 727 * which added all code points except for U+F6xx
michael@0 728 * because those cannot be represented in the Unicode group.
michael@0 729 * However, it turns out that windows-950 has roundtrips for all of U+F6xx
michael@0 730 * which means that LMBCS can convert all Unicode code points after all.
michael@0 731 * We now simply use ucnv_getCompleteUnicodeSet().
michael@0 732 *
michael@0 733 * This may need to be looked at again as Lotus uses _LMBCSGetUnicodeSet(). (091216)
michael@0 734 */
michael@0 735
michael@0 736 /*
michael@0 737 Here's the basic helper function that we use when converting from
michael@0 738 Unicode to LMBCS, and we suspect that a Unicode character will fit into
michael@0 739 one of the 12 groups. The return value is the number of bytes written
michael@0 740 starting at pStartLMBCS (if any).
michael@0 741 */
michael@0 742
michael@0 743 static size_t
michael@0 744 LMBCSConversionWorker (
michael@0 745 UConverterDataLMBCS * extraInfo, /* subconverters, opt & locale groups */
michael@0 746 ulmbcs_byte_t group, /* The group to try */
michael@0 747 ulmbcs_byte_t * pStartLMBCS, /* where to put the results */
michael@0 748 UChar * pUniChar, /* The input unicode character */
michael@0 749 ulmbcs_byte_t * lastConverterIndex, /* output: track last successful group used */
michael@0 750 UBool * groups_tried /* output: track any unsuccessful groups */
michael@0 751 )
michael@0 752 {
michael@0 753 ulmbcs_byte_t * pLMBCS = pStartLMBCS;
michael@0 754 UConverterSharedData * xcnv = extraInfo->OptGrpConverter[group];
michael@0 755
michael@0 756 int bytesConverted;
michael@0 757 uint32_t value;
michael@0 758 ulmbcs_byte_t firstByte;
michael@0 759
michael@0 760 U_ASSERT(xcnv);
michael@0 761 U_ASSERT(group<ULMBCS_GRP_UNICODE);
michael@0 762
michael@0 763 bytesConverted = ucnv_MBCSFromUChar32(xcnv, *pUniChar, &value, FALSE);
michael@0 764
michael@0 765 /* get the first result byte */
michael@0 766 if(bytesConverted > 0) {
michael@0 767 firstByte = (ulmbcs_byte_t)(value >> ((bytesConverted - 1) * 8));
michael@0 768 } else {
michael@0 769 /* most common failure mode is an unassigned character */
michael@0 770 groups_tried[group] = TRUE;
michael@0 771 return 0;
michael@0 772 }
michael@0 773
michael@0 774 *lastConverterIndex = group;
michael@0 775
michael@0 776 /* All initial byte values in lower ascii range should have been caught by now,
michael@0 777 except with the exception group.
michael@0 778 */
michael@0 779 U_ASSERT((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
michael@0 780
michael@0 781 /* use converted data: first write 0, 1 or two group bytes */
michael@0 782 if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
michael@0 783 {
michael@0 784 *pLMBCS++ = group;
michael@0 785 if (bytesConverted == 1 && group >= ULMBCS_DOUBLEOPTGROUP_START)
michael@0 786 {
michael@0 787 *pLMBCS++ = group;
michael@0 788 }
michael@0 789 }
michael@0 790
michael@0 791 /* don't emit control chars */
michael@0 792 if ( bytesConverted == 1 && firstByte < 0x20 )
michael@0 793 return 0;
michael@0 794
michael@0 795
michael@0 796 /* then move over the converted data */
michael@0 797 switch(bytesConverted)
michael@0 798 {
michael@0 799 case 4:
michael@0 800 *pLMBCS++ = (ulmbcs_byte_t)(value >> 24);
michael@0 801 case 3: /*fall through*/
michael@0 802 *pLMBCS++ = (ulmbcs_byte_t)(value >> 16);
michael@0 803 case 2: /*fall through*/
michael@0 804 *pLMBCS++ = (ulmbcs_byte_t)(value >> 8);
michael@0 805 case 1: /*fall through*/
michael@0 806 *pLMBCS++ = (ulmbcs_byte_t)value;
michael@0 807 default:
michael@0 808 /* will never occur */
michael@0 809 break;
michael@0 810 }
michael@0 811
michael@0 812 return (pLMBCS - pStartLMBCS);
michael@0 813 }
michael@0 814
michael@0 815
michael@0 816 /* This is a much simpler version of above, when we
michael@0 817 know we are writing LMBCS using the Unicode group
michael@0 818 */
michael@0 819 static size_t
michael@0 820 LMBCSConvertUni(ulmbcs_byte_t * pLMBCS, UChar uniChar)
michael@0 821 {
michael@0 822 /* encode into LMBCS Unicode range */
michael@0 823 uint8_t LowCh = (uint8_t)(uniChar & 0x00FF);
michael@0 824 uint8_t HighCh = (uint8_t)(uniChar >> 8);
michael@0 825
michael@0 826 *pLMBCS++ = ULMBCS_GRP_UNICODE;
michael@0 827
michael@0 828 if (LowCh == 0)
michael@0 829 {
michael@0 830 *pLMBCS++ = ULMBCS_UNICOMPATZERO;
michael@0 831 *pLMBCS++ = HighCh;
michael@0 832 }
michael@0 833 else
michael@0 834 {
michael@0 835 *pLMBCS++ = HighCh;
michael@0 836 *pLMBCS++ = LowCh;
michael@0 837 }
michael@0 838 return ULMBCS_UNICODE_SIZE;
michael@0 839 }
michael@0 840
michael@0 841
michael@0 842
michael@0 843 /* The main Unicode to LMBCS conversion function */
/*
 * Convert UTF-16 code units from args->source into LMBCS bytes at args->target.
 *
 * args  conversion arguments: source/sourceLimit, target/targetLimit, the
 *       optional offsets array, and the converter whose extraInfo holds the
 *       LMBCS state (UConverterDataLMBCS).
 * err   in/out error code. Set to U_BUFFER_OVERFLOW_ERROR when the target is
 *       already full at the start of a character, or when a character's
 *       bytes only partly fit; in the latter case the remaining bytes are
 *       stored in the converter's charErrorBuffer.
 *
 * Each code unit is first converted into the local buffer LMBCS[] and then
 * copied out. Every output byte written to the target records the index of
 * its source code unit in args->offsets, when that array is supplied.
 *
 * extraInfo->localeConverterIndex is temporarily changed while a character
 * is processed and restored at the end of each loop iteration.
 */
michael@0 844 static void
michael@0 845 _LMBCSFromUnicode(UConverterFromUnicodeArgs* args,
michael@0 846 UErrorCode* err)
michael@0 847 {
michael@0 848 ulmbcs_byte_t lastConverterIndex = 0;
michael@0 849 UChar uniChar;
michael@0 850 ulmbcs_byte_t LMBCS[ULMBCS_CHARSIZE_MAX];
michael@0 851 ulmbcs_byte_t * pLMBCS;
michael@0 852 int32_t bytes_written;
michael@0 853 UBool groups_tried[ULMBCS_GRP_LAST+1];
michael@0 854 UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0 855 int sourceIndex = 0;
michael@0 856
michael@0 857 /* Basic strategy: attempt to fill in local LMBCS 1-char buffer.(LMBCS)
michael@0 858 If that succeeds, see if it will all fit into the target & copy it over
michael@0 859 if it does.
michael@0 860
michael@0 861 We try conversions in the following order:
michael@0 862
michael@0 863 1. Single-byte ascii & special fixed control chars (&null)
michael@0 864 2. Look up group in table & try that (could be
michael@0 865 A) Unicode group
michael@0 866 B) control group,
michael@0 867 C) national encoding,
michael@0 868 or ambiguous SBCS or MBCS group (on to step 3...)
michael@0 869
michael@0 870 3. If its ambiguous, try this order:
michael@0 871 A) The optimization group
michael@0 872 B) The locale group
michael@0 873 C) The last group that succeeded with this string.
michael@0 874 D) every other group that's relevant (single or double)
michael@0 875 E) If its single-byte ambiguous, try the exceptions group
michael@0 876
michael@0 877 4. And as a grand fallback: Unicode
michael@0 878 */
michael@0 879
michael@0 880 /*Fix for SPR#DJOE66JFN3 (Lotus)*/
michael@0 881 ulmbcs_byte_t OldConverterIndex = 0;
michael@0 882
michael@0 883 while (args->source < args->sourceLimit && !U_FAILURE(*err))
michael@0 884 {
michael@0 885 /*Fix for SPR#DJOE66JFN3 (Lotus)*/
/* remember the locale group so it can be restored at the end of this iteration */
michael@0 886 OldConverterIndex = extraInfo->localeConverterIndex;
michael@0 887
michael@0 888 if (args->target >= args->targetLimit)
michael@0 889 {
michael@0 890 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 891 break;
michael@0 892 }
michael@0 893 uniChar = *(args->source);
michael@0 894 bytes_written = 0;
michael@0 895 pLMBCS = LMBCS;
michael@0 896
michael@0 897 /* check cases in rough order of how common they are, for speed */
michael@0 898
michael@0 899 /* single byte matches: strategy 1 */
michael@0 900 /*Fix for SPR#DJOE66JFN3 (Lotus)*/
/* For U+0080..U+00FF, apart from the code points excluded below, the locale
   group is forced to ULMBCS_GRP_L1 for the rest of this iteration. */
michael@0 901 if((uniChar>=0x80) && (uniChar<=0xff)
michael@0 902 /*Fix for SPR#JUYA6XAERU and TSAO7GL5NK (Lotus)*/ &&(uniChar!=0xB1) &&(uniChar!=0xD7) &&(uniChar!=0xF7)
michael@0 903 &&(uniChar!=0xB0) &&(uniChar!=0xB4) &&(uniChar!=0xB6) &&(uniChar!=0xA7) &&(uniChar!=0xA8))
michael@0 904 {
michael@0 905 extraInfo->localeConverterIndex = ULMBCS_GRP_L1;
michael@0 906 }
/* these values are copied to the output unchanged as a single byte */
michael@0 907 if (((uniChar > ULMBCS_C0END) && (uniChar < ULMBCS_C1START)) ||
michael@0 908 uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR ||
michael@0 909 uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE
michael@0 910 )
michael@0 911 {
michael@0 912 *pLMBCS++ = (ulmbcs_byte_t ) uniChar;
michael@0 913 bytes_written = 1;
michael@0 914 }
michael@0 915
michael@0 916
michael@0 917 if (!bytes_written)
michael@0 918 {
michael@0 919 /* Check by UNICODE range (Strategy 2) */
michael@0 920 ulmbcs_byte_t group = FindLMBCSUniRange(uniChar);
michael@0 921
michael@0 922 if (group == ULMBCS_GRP_UNICODE) /* (Strategy 2A) */
michael@0 923 {
michael@0 924 pLMBCS += LMBCSConvertUni(pLMBCS,uniChar);
michael@0 925
michael@0 926 bytes_written = (int32_t)(pLMBCS - LMBCS);
michael@0 927 }
michael@0 928 else if (group == ULMBCS_GRP_CTRL) /* (Strategy 2B) */
michael@0 929 {
michael@0 930 /* Handle control characters here */
/* If neither range test below matches, nothing is written, bytes_written
   stays 0 and the character falls through to Strategy 3. */
michael@0 931 if (uniChar <= ULMBCS_C0END)
michael@0 932 {
michael@0 933 *pLMBCS++ = ULMBCS_GRP_CTRL;
michael@0 934 *pLMBCS++ = (ulmbcs_byte_t)(ULMBCS_CTRLOFFSET + uniChar);
michael@0 935 }
michael@0 936 else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET)
michael@0 937 {
michael@0 938 *pLMBCS++ = ULMBCS_GRP_CTRL;
michael@0 939 *pLMBCS++ = (ulmbcs_byte_t ) (uniChar & 0x00FF);
michael@0 940 }
michael@0 941 bytes_written = (int32_t)(pLMBCS - LMBCS);
michael@0 942 }
michael@0 943 else if (group < ULMBCS_GRP_UNICODE) /* (Strategy 2C) */
michael@0 944 {
michael@0 945 /* a specific converter has been identified - use it */
/* NOTE(review): on failure LMBCSConversionWorker() sets groups_tried[group],
   but groups_tried is only cleared below at the start of Strategy 3, so that
   mark is discarded and Strategy 3D may try the same group again. */
michael@0 946 bytes_written = (int32_t)LMBCSConversionWorker (
michael@0 947 extraInfo, group, pLMBCS, &uniChar,
michael@0 948 &lastConverterIndex, groups_tried);
michael@0 949 }
michael@0 950 if (!bytes_written) /* the ambiguous group cases (Strategy 3) */
michael@0 951 {
michael@0 952 uprv_memset(groups_tried, 0, sizeof(groups_tried));
michael@0 953
michael@0 954 /* check for non-default optimization group (Strategy 3A )*/
/* NOTE(review): the condition tests OptGroup, but the conversions in this
   branch use ULMBCS_GRP_L1, ULMBCS_GRP_EXCEPT and localeConverterIndex, never
   OptGroup itself. Unlike Strategy 3B, localeConverterIndex is not tested for
   zero before being used as a group - confirm a sub-converter always exists
   for it. */
michael@0 955 if ((extraInfo->OptGroup != 1) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)))
michael@0 956 {
michael@0 957 /*zhujin: upgrade, merge #39299 here (Lotus) */
michael@0 958 /*To make R5 compatible translation, look for exceptional group first for non-DBCS*/
michael@0 959
michael@0 960 if(extraInfo->localeConverterIndex < ULMBCS_DOUBLEOPTGROUP_START)
michael@0 961 {
michael@0 962 bytes_written = LMBCSConversionWorker (extraInfo,
michael@0 963 ULMBCS_GRP_L1, pLMBCS, &uniChar,
michael@0 964 &lastConverterIndex, groups_tried);
michael@0 965
michael@0 966 if(!bytes_written)
michael@0 967 {
michael@0 968 bytes_written = LMBCSConversionWorker (extraInfo,
michael@0 969 ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar,
michael@0 970 &lastConverterIndex, groups_tried);
michael@0 971 }
michael@0 972 if(!bytes_written)
michael@0 973 {
michael@0 974 bytes_written = LMBCSConversionWorker (extraInfo,
michael@0 975 extraInfo->localeConverterIndex, pLMBCS, &uniChar,
michael@0 976 &lastConverterIndex, groups_tried);
michael@0 977 }
michael@0 978 }
michael@0 979 else
michael@0 980 {
michael@0 981 bytes_written = LMBCSConversionWorker (extraInfo,
michael@0 982 extraInfo->localeConverterIndex, pLMBCS, &uniChar,
michael@0 983 &lastConverterIndex, groups_tried);
michael@0 984 }
michael@0 985 }
michael@0 986 /* check for locale optimization group (Strategy 3B) */
michael@0 987 if (!bytes_written && (extraInfo->localeConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex)))
michael@0 988 {
michael@0 989 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0 990 extraInfo->localeConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
michael@0 991 }
michael@0 992 /* check for last optimization group used for this string (Strategy 3C) */
michael@0 993 if (!bytes_written && (lastConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex)))
michael@0 994 {
michael@0 995 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0 996 lastConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
michael@0 997 }
michael@0 998 if (!bytes_written)
michael@0 999 {
michael@0 1000 /* just check every possible matching converter (Strategy 3D) */
michael@0 1001 ulmbcs_byte_t grp_start;
michael@0 1002 ulmbcs_byte_t grp_end;
michael@0 1003 ulmbcs_byte_t grp_ix;
/* range of groups to scan: the double-byte groups for an ambiguous MBCS
   character, ULMBCS_GRP_L1..ULMBCS_GRP_TH otherwise, and every group from
   ULMBCS_GRP_L1 to ULMBCS_GRP_LAST for ULMBCS_AMBIGUOUS_ALL */
michael@0 1004 grp_start = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS)
michael@0 1005 ? ULMBCS_DOUBLEOPTGROUP_START
michael@0 1006 : ULMBCS_GRP_L1);
michael@0 1007 grp_end = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS)
michael@0 1008 ? ULMBCS_GRP_LAST
michael@0 1009 : ULMBCS_GRP_TH);
michael@0 1010 if(group == ULMBCS_AMBIGUOUS_ALL)
michael@0 1011 {
michael@0 1012 grp_start = ULMBCS_GRP_L1;
michael@0 1013 grp_end = ULMBCS_GRP_LAST;
michael@0 1014 }
michael@0 1015 for (grp_ix = grp_start;
michael@0 1016 grp_ix <= grp_end && !bytes_written;
michael@0 1017 grp_ix++)
michael@0 1018 {
/* skip groups without a loaded sub-converter and groups that already failed */
michael@0 1019 if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix])
michael@0 1020 {
michael@0 1021 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0 1022 grp_ix, pLMBCS, &uniChar,
michael@0 1023 &lastConverterIndex, groups_tried);
michael@0 1024 }
michael@0 1025 }
michael@0 1026 /* a final conversion fallback to the exceptions group if its likely
michael@0 1027 to be single byte (Strategy 3E) */
michael@0 1028 if (!bytes_written && grp_start == ULMBCS_GRP_L1)
michael@0 1029 {
michael@0 1030 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0 1031 ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar,
michael@0 1032 &lastConverterIndex, groups_tried);
michael@0 1033 }
michael@0 1034 }
michael@0 1035 /* all of our other strategies failed. Fallback to Unicode. (Strategy 4)*/
michael@0 1036 if (!bytes_written)
michael@0 1037 {
michael@0 1038
michael@0 1039 pLMBCS += LMBCSConvertUni(pLMBCS, uniChar);
michael@0 1040 bytes_written = (int32_t)(pLMBCS - LMBCS);
michael@0 1041 }
michael@0 1042 }
michael@0 1043 }
michael@0 1044
michael@0 1045 /* we have a translation. increment source and write as much as possible to target */
michael@0 1046 args->source++;
michael@0 1047 pLMBCS = LMBCS;
/* The target test comes first so that bytes_written is not decremented once
   the target is full; whatever count remains is handled below. When all bytes
   fit, the final decrement leaves bytes_written at -1. */
michael@0 1048 while (args->target < args->targetLimit && bytes_written--)
michael@0 1049 {
michael@0 1050 *(args->target)++ = *pLMBCS++;
michael@0 1051 if (args->offsets)
michael@0 1052 {
michael@0 1053 *(args->offsets)++ = sourceIndex;
michael@0 1054 }
michael@0 1055 }
michael@0 1056 sourceIndex++;
michael@0 1057 if (bytes_written > 0)
michael@0 1058 {
michael@0 1059 /* write any bytes that didn't fit in target to the error buffer,
michael@0 1060 common code will move this to target if we get called back with
michael@0 1061 enough target room
michael@0 1062 */
michael@0 1063 uint8_t * pErrorBuffer = args->converter->charErrorBuffer;
michael@0 1064 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 1065 args->converter->charErrorBufferLength = (int8_t)bytes_written;
michael@0 1066 while (bytes_written--)
michael@0 1067 {
michael@0 1068 *pErrorBuffer++ = *pLMBCS++;
michael@0 1069 }
michael@0 1070 }
michael@0 1071 /*Fix for SPR#DJOE66JFN3 (Lotus)*/
/* undo the temporary per-character override of the locale group */
michael@0 1072 extraInfo->localeConverterIndex = OldConverterIndex;
michael@0 1073 }
michael@0 1074 }
michael@0 1075
michael@0 1076
michael@0 1077 /* Now, the Unicode from LMBCS section */
michael@0 1078
michael@0 1079
michael@0 1080 /* A function to call when we are looking at the Unicode group byte in LMBCS */
michael@0 1081 static UChar
michael@0 1082 GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte stream */
michael@0 1083 {
michael@0 1084 uint8_t HighCh = *(*ppLMBCSin)++; /* Big-endian Unicode in LMBCS compatibility group*/
michael@0 1085 uint8_t LowCh = *(*ppLMBCSin)++;
michael@0 1086
michael@0 1087 if (HighCh == ULMBCS_UNICOMPATZERO )
michael@0 1088 {
michael@0 1089 HighCh = LowCh;
michael@0 1090 LowCh = 0; /* zero-byte in LSB special character */
michael@0 1091 }
michael@0 1092 return (UChar)((HighCh << 8) | LowCh);
michael@0 1093 }
michael@0 1094
michael@0 1095
michael@0 1096
/*
 * CHECK_SOURCE_LIMIT: helper macro that verifies that at least 'index'
 * bytes are left in the source before sourceLimit.
 *
 * Expects 'args' (with source and sourceLimit members) and 'err' to be in
 * scope, and may only be used inside a function that can return 0xffff.
 * When fewer than 'index' bytes remain it sets *err to
 * U_TRUNCATED_CHAR_FOUND, moves args->source to args->sourceLimit so that
 * all input is consumed as required by ICU converter semantics, and makes
 * the enclosing function return 0xffff.
 *
 * The test compares 'index' with the remaining length instead of forming
 * args->source + index, which could point more than one element past the
 * end of the buffer.  The do/while wrapper makes the macro behave as one
 * statement, so it is safe in an unbraced if/else.
 */
#define CHECK_SOURCE_LIMIT(index) \
    do { \
        if ((index) > (args->sourceLimit - args->source)) { \
            *err = U_TRUNCATED_CHAR_FOUND; \
            args->source = args->sourceLimit; \
            return 0xffff; \
        } \
    } while (0)
michael@0 1108
michael@0 1109 /* Return the Unicode representation for the current LMBCS character */
michael@0 1110
/*
 * Decode the LMBCS character at args->source and advance args->source past it.
 *
 * args  conversion arguments; source/sourceLimit and converter->extraInfo
 *       (the LMBCS state) are used here
 * err   in/out error code
 *
 * Returns the decoded value. Error cases visible in this function:
 *  - no input available: *err = U_ILLEGAL_ARGUMENT_ERROR, returns 0xffff
 *  - character cut off by sourceLimit: CHECK_SOURCE_LIMIT sets
 *    *err = U_TRUNCATED_CHAR_FOUND, moves source to sourceLimit and makes
 *    this function return 0xffff
 *  - group byte without a sub-converter: *err = U_INVALID_CHAR_FOUND and the
 *    initial value of uniChar (0) is returned
 * Values produced by the sub-converter lookups are returned as they are;
 * the caller decides how to treat 0xfffe / 0xffff.
 */
michael@0 1111 static UChar32
michael@0 1112 _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
michael@0 1113 UErrorCode* err)
michael@0 1114 {
michael@0 1115 UChar32 uniChar = 0; /* an output UNICODE char */
michael@0 1116 ulmbcs_byte_t CurByte; /* A byte from the input stream */
michael@0 1117
michael@0 1118 /* error check */
michael@0 1119 if (args->source >= args->sourceLimit)
michael@0 1120 {
michael@0 1121 *err = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1122 return 0xffff;
michael@0 1123 }
michael@0 1124 /* Grab first byte & save address for error recovery */
michael@0 1125 CurByte = *((ulmbcs_byte_t *) (args->source++));
michael@0 1126
michael@0 1127 /*
michael@0 1128 * at entry of each if clause:
michael@0 1129 * 1. 'CurByte' holds the first byte of a LMBCS character
michael@0 1130 * 2. '*source'points to the next byte of the source stream after 'CurByte'
michael@0 1131 *
michael@0 1132 * the job of each if clause is:
michael@0 1133 * 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte)
michael@0 1134 * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately
michael@0 1135 */
michael@0 1136
michael@0 1137 /* First lets check the simple fixed values. */
michael@0 1138
michael@0 1139 if(((CurByte > ULMBCS_C0END) && (CurByte < ULMBCS_C1START)) /* ascii range */
michael@0 1140 || (CurByte == 0)
michael@0 1141 || CurByte == ULMBCS_HT || CurByte == ULMBCS_CR
michael@0 1142 || CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE)
michael@0 1143 {
michael@0 1144 uniChar = CurByte;
michael@0 1145 }
michael@0 1146 else
michael@0 1147 {
michael@0 1148 UConverterDataLMBCS * extraInfo;
michael@0 1149 ulmbcs_byte_t group;
michael@0 1150 UConverterSharedData *cnv;
michael@0 1151
michael@0 1152 if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */
michael@0 1153 {
michael@0 1154 ulmbcs_byte_t C0C1byte;
michael@0 1155 CHECK_SOURCE_LIMIT(1);
michael@0 1156 C0C1byte = *(args->source)++;
/* bytes below ULMBCS_C1START have ULMBCS_CTRLOFFSET subtracted; bytes at or
   above it are used unchanged */
michael@0 1157 uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte;
michael@0 1158 }
michael@0 1159 else
michael@0 1160 if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
michael@0 1161 {
michael@0 1162 CHECK_SOURCE_LIMIT(2);
michael@0 1163
michael@0 1164 /* don't check for error indicators fffe/ffff below */
michael@0 1165 return GetUniFromLMBCSUni(&(args->source));
michael@0 1166 }
michael@0 1167 else if (CurByte <= ULMBCS_CTRLOFFSET)
michael@0 1168 {
michael@0 1169 group = CurByte; /* group byte is in the source */
michael@0 1170 extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
/* the range test comes first so OptGrpConverter[] is never indexed beyond
   ULMBCS_GRP_LAST; cnv is assigned only when the group is in range */
michael@0 1171 if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == NULL)
michael@0 1172 {
michael@0 1173 /* this is not a valid group byte - no converter*/
michael@0 1174 *err = U_INVALID_CHAR_FOUND;
michael@0 1175 }
michael@0 1176 else if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
michael@0 1177 {
michael@0 1178
/* both forms below consume exactly two more bytes after the group byte */
michael@0 1179 CHECK_SOURCE_LIMIT(2);
michael@0 1180
michael@0 1181 /* check for LMBCS doubled-group-byte case */
michael@0 1182 if (*args->source == group) {
michael@0 1183 /* single byte */
michael@0 1184 ++args->source;
michael@0 1185 uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 1, FALSE);
michael@0 1186 ++args->source;
michael@0 1187 } else {
michael@0 1188 /* double byte */
michael@0 1189 uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 2, FALSE);
michael@0 1190 args->source += 2;
michael@0 1191 }
michael@0 1192 }
michael@0 1193 else { /* single byte conversion */
michael@0 1194 CHECK_SOURCE_LIMIT(1);
michael@0 1195 CurByte = *(args->source)++;
michael@0 1196
michael@0 1197 if (CurByte >= ULMBCS_C1START)
michael@0 1198 {
michael@0 1199 uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
michael@0 1200 }
michael@0 1201 else
michael@0 1202 {
michael@0 1203 /* The non-optimizable oddballs where there is an explicit byte
michael@0 1204 * AND the second byte is not in the upper ascii range
michael@0 1205 */
michael@0 1206 char bytes[2];
michael@0 1207
michael@0 1208 extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
/* NOTE(review): the exception-group converter is used without a NULL check
   here - presumably it is always loaded; confirm. */
michael@0 1209 cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];
michael@0 1210
michael@0 1211 /* Lookup value must include opt group */
michael@0 1212 bytes[0] = group;
michael@0 1213 bytes[1] = CurByte;
michael@0 1214 uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, bytes, 2, FALSE);
michael@0 1215 }
michael@0 1216 }
michael@0 1217 }
michael@0 1218 else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */
michael@0 1219 {
michael@0 1220 extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0 1221 group = extraInfo->OptGroup;
/* NOTE(review): the optimization group's converter is used without a NULL
   check - presumably guaranteed by a successful open; confirm. */
michael@0 1222 cnv = extraInfo->OptGrpConverter[group];
michael@0 1223 if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
michael@0 1224 {
michael@0 1225 if (!ucnv_MBCSIsLeadByte(cnv, CurByte))
michael@0 1226 {
/* a zero-byte requirement: cannot trigger while source <= sourceLimit */
michael@0 1227 CHECK_SOURCE_LIMIT(0);
michael@0 1228
michael@0 1229 /* let the MBCS conversion consume CurByte again */
michael@0 1230 uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, FALSE);
michael@0 1231 }
michael@0 1232 else
michael@0 1233 {
michael@0 1234 CHECK_SOURCE_LIMIT(1);
michael@0 1235 /* let the MBCS conversion consume CurByte again */
michael@0 1236 uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, FALSE);
michael@0 1237 ++args->source;
michael@0 1238 }
michael@0 1239 }
michael@0 1240 else /* single byte conversion */
michael@0 1241 {
michael@0 1242 uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
michael@0 1243 }
michael@0 1244 }
michael@0 1245 }
michael@0 1246 return uniChar;
michael@0 1247 }
michael@0 1248
michael@0 1249
michael@0 1250 /* The exported function that converts lmbcs to one or more
michael@0 1251 UChars - currently UTF-16
michael@0 1252 */
/*
 * Convert LMBCS bytes from args->source into UChars at args->target.
 *
 * args  conversion arguments: source/sourceLimit, target/targetLimit, the
 *       optional offsets array, and the converter, whose toUBytes/toULength
 *       hold the bytes of a character that was incomplete or in error
 * err   in/out error code
 *
 * Characters are decoded one at a time with _LMBCSGetNextUCharWorker().
 * A character that is cut off at sourceLimit is not reported as an error:
 * its bytes are saved in converter->toUBytes, *err is reset to U_ZERO_ERROR,
 * and the character is reassembled on the next call. Decoded values 0xfffe
 * and 0xffff are turned into U_INVALID_CHAR_FOUND and U_ILLEGAL_CHAR_FOUND.
 * U_BUFFER_OVERFLOW_ERROR is set when the target fills up before the source
 * is exhausted.
 */
michael@0 1253 static void
michael@0 1254 _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
michael@0 1255 UErrorCode* err)
michael@0 1256 {
michael@0 1257 char LMBCS [ULMBCS_CHARSIZE_MAX];
michael@0 1258 UChar uniChar; /* one output UNICODE char */
michael@0 1259 const char * saveSource; /* beginning of current code point */
michael@0 1260 const char * pStartLMBCS = args->source; /* beginning of whole string */
michael@0 1261 const char * errSource = NULL; /* pointer to actual input in case an error occurs */
michael@0 1262 int8_t savebytes = 0;
michael@0 1263
michael@0 1264 /* Process from source to limit, or until error */
michael@0 1265 while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
michael@0 1266 {
michael@0 1267 saveSource = args->source; /* beginning of current code point */
michael@0 1268
michael@0 1269 if (args->converter->toULength) /* reassemble char from previous call */
michael@0 1270 {
michael@0 1271 const char *saveSourceLimit;
michael@0 1272 size_t size_old = args->converter->toULength;
michael@0 1273
michael@0 1274 /* limit from source is either remainder of temp buffer, or user limit on source */
michael@0 1275 size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
michael@0 1276 size_t size_new_maybe_2 = args->sourceLimit - args->source;
michael@0 1277 size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
michael@0 1278
michael@0 1279
/* build "saved bytes + new bytes" in the local buffer and temporarily point
   args->source / args->sourceLimit at it so the worker can decode from there */
michael@0 1280 uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
michael@0 1281 uprv_memcpy(LMBCS + size_old, args->source, size_new);
michael@0 1282 saveSourceLimit = args->sourceLimit;
michael@0 1283 args->source = errSource = LMBCS;
michael@0 1284 args->sourceLimit = LMBCS+size_old+size_new;
michael@0 1285 savebytes = (int8_t)(size_old+size_new);
michael@0 1286 uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
/* translate the position reached in the local buffer back into the real
   source: bytes consumed beyond the size_old saved ones came from saveSource.
   NOTE(review): if the worker consumed fewer than size_old bytes, this would
   place args->source before saveSource - confirm that cannot happen. */
michael@0 1287 args->source = saveSource + ((args->source - LMBCS) - size_old);
michael@0 1288 args->sourceLimit = saveSourceLimit;
michael@0 1289
michael@0 1290 if (*err == U_TRUNCATED_CHAR_FOUND)
michael@0 1291 {
michael@0 1292 /* evil special case: source buffers so small a char spans more than 2 buffers */
michael@0 1293 args->converter->toULength = savebytes;
michael@0 1294 uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
michael@0 1295 args->source = args->sourceLimit;
michael@0 1296 *err = U_ZERO_ERROR;
michael@0 1297 return;
michael@0 1298 }
michael@0 1299 else
michael@0 1300 {
michael@0 1301 /* clear the partial-char marker */
michael@0 1302 args->converter->toULength = 0;
michael@0 1303 }
michael@0 1304 }
michael@0 1305 else
michael@0 1306 {
/* normal case: decode directly from the caller's source buffer */
michael@0 1307 errSource = saveSource;
michael@0 1308 uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
michael@0 1309 savebytes = (int8_t)(args->source - saveSource);
michael@0 1310 }
michael@0 1311 if (U_SUCCESS(*err))
michael@0 1312 {
michael@0 1313 if (uniChar < 0xfffe)
michael@0 1314 {
michael@0 1315 *(args->target)++ = uniChar;
michael@0 1316 if(args->offsets)
michael@0 1317 {
michael@0 1318 *(args->offsets)++ = (int32_t)(saveSource - pStartLMBCS);
michael@0 1319 }
michael@0 1320 }
michael@0 1321 else if (uniChar == 0xfffe)
michael@0 1322 {
michael@0 1323 *err = U_INVALID_CHAR_FOUND;
michael@0 1324 }
michael@0 1325 else /* if (uniChar == 0xffff) */
michael@0 1326 {
michael@0 1327 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 1328 }
michael@0 1329 }
michael@0 1330 }
michael@0 1331 /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
michael@0 1332 if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
michael@0 1333 {
michael@0 1334 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 1335 }
michael@0 1336 else if (U_FAILURE(*err))
michael@0 1337 {
michael@0 1338 /* If character incomplete or unmappable/illegal, store it in toUBytes[] */
michael@0 1339 args->converter->toULength = savebytes;
michael@0 1340 if (savebytes > 0) {
michael@0 1341 uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
michael@0 1342 }
/* a truncated character is not an error for the caller: its bytes were saved
   above and will be completed on the next call */
michael@0 1343 if (*err == U_TRUNCATED_CHAR_FOUND) {
michael@0 1344 *err = U_ZERO_ERROR;
michael@0 1345 }
michael@0 1346 }
michael@0 1347 }
michael@0 1348
michael@0 1349 /* And now, the macroized declarations of data & functions: */
michael@0 1350 DEFINE_LMBCS_OPEN(1)
michael@0 1351 DEFINE_LMBCS_OPEN(2)
michael@0 1352 DEFINE_LMBCS_OPEN(3)
michael@0 1353 DEFINE_LMBCS_OPEN(4)
michael@0 1354 DEFINE_LMBCS_OPEN(5)
michael@0 1355 DEFINE_LMBCS_OPEN(6)
michael@0 1356 DEFINE_LMBCS_OPEN(8)
michael@0 1357 DEFINE_LMBCS_OPEN(11)
michael@0 1358 DEFINE_LMBCS_OPEN(16)
michael@0 1359 DEFINE_LMBCS_OPEN(17)
michael@0 1360 DEFINE_LMBCS_OPEN(18)
michael@0 1361 DEFINE_LMBCS_OPEN(19)
michael@0 1362
michael@0 1363
michael@0 1364 DECLARE_LMBCS_DATA(1)
michael@0 1365 DECLARE_LMBCS_DATA(2)
michael@0 1366 DECLARE_LMBCS_DATA(3)
michael@0 1367 DECLARE_LMBCS_DATA(4)
michael@0 1368 DECLARE_LMBCS_DATA(5)
michael@0 1369 DECLARE_LMBCS_DATA(6)
michael@0 1370 DECLARE_LMBCS_DATA(8)
michael@0 1371 DECLARE_LMBCS_DATA(11)
michael@0 1372 DECLARE_LMBCS_DATA(16)
michael@0 1373 DECLARE_LMBCS_DATA(17)
michael@0 1374 DECLARE_LMBCS_DATA(18)
michael@0 1375 DECLARE_LMBCS_DATA(19)
michael@0 1376
michael@0 1377 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

mercurial