michael@0: /*  
michael@0: **********************************************************************
michael@0: *   Copyright (C) 2000-2011, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: **********************************************************************
michael@0: *   file name:  ucnv_lmb.cpp
michael@0: *   encoding:   US-ASCII
michael@0: *   tab size:   4 (not used)
michael@0: *   indentation:4
michael@0: *
michael@0: *   created on: 2000feb09
michael@0: *   created by: Brendan Murray
michael@0: *   extensively hacked up by: Jim Snyder-Grant
michael@0: *
michael@0: * Modification History:
michael@0: * 
michael@0: *   Date        Name             Description
michael@0: * 
michael@0: *   06/20/2000  helena           OS/400 port changes; mostly typecast.
michael@0: *   06/27/2000  Jim Snyder-Grant Deal with partial characters and small buffers.
michael@0: *                                Add comments to document LMBCS format and implementation
michael@0: *                                restructured order & breakdown of functions
michael@0: *   06/28/2000  helena           Major rewrite for the callback API changes.
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
michael@0: 
michael@0: #include "unicode/ucnv_err.h"
michael@0: #include "unicode/ucnv.h"
michael@0: #include "unicode/uset.h"
michael@0: #include "cmemory.h"
michael@0: #include "cstring.h"
michael@0: #include "uassert.h"
michael@0: #include "ucnv_imp.h"
michael@0: #include "ucnv_bld.h"
michael@0: #include "ucnv_cnv.h"
michael@0: 
michael@0: #ifdef EBCDIC_RTL
michael@0:     #include "ascii_a.h"
michael@0: #endif
michael@0: 
michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) /* element count of a real array, as int32_t; the result is meaningless if 'array' is a pointer */
michael@0: 
michael@0: /*
michael@0:   LMBCS
michael@0: 
michael@0:   (Lotus Multi-Byte Character Set)
michael@0: 
michael@0:   LMBCS was invented in the late 1980's and is primarily used in Lotus Notes 
michael@0:   databases and in Lotus 1-2-3 files. Programmers who work with the APIs 
michael@0:   into these products will sometimes need to deal with strings in this format.
michael@0: 
michael@0:   The code in this file provides an implementation for an ICU converter of 
michael@0:   LMBCS to and from Unicode. 
michael@0: 
michael@0:   Since the LMBCS character set is only sparsely documented in existing 
michael@0:   printed or online material, we have added  extensive annotation to this 
michael@0:   file to serve as a guide to understanding LMBCS. 
michael@0: 
michael@0:   LMBCS was originally designed with these four sometimes-competing design goals:
michael@0: 
michael@0:   -Provide encodings for the characters in 12 existing national standards
michael@0:    (plus a few other characters)
michael@0:   -Minimal memory footprint
michael@0:   -Maximal speed of conversion into the existing national character sets
michael@0:   -No need to track a changing state as you interpret a string.
michael@0: 
michael@0: 
michael@0:   All of the national character sets LMBCS was trying to encode are 'ANSI'
michael@0:   based, in that the bytes from 0x20 - 0x7F are almost exactly the 
michael@0:   same common Latin unaccented characters and symbols in all character sets. 
michael@0: 
michael@0:   So, in order to help meet the speed & memory design goals, the common ANSI 
michael@0:   bytes from 0x20-0x7F are represented by the same single-byte values in LMBCS. 
michael@0: 
michael@0:   The general LMBCS code unit is from 1-3 bytes. We can describe the 3 bytes as
michael@0:   follows:
michael@0: 
michael@0:   [G] D1 [D2]
michael@0: 
michael@0:   That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2
michael@0:   data bytes. The maximum size of an LMBCS character is 3 bytes:
michael@0: */
michael@0: #define ULMBCS_CHARSIZE_MAX      3
michael@0: /*
michael@0:   The single-byte values from 0x20 to 0x7F are examples of single D1 bytes.
michael@0:   We often have to figure out if byte values are below or above this, so we 
michael@0:   use the ANSI nomenclature 'C0' and 'C1' to refer to the range of control 
michael@0:   characters just above & below the common lower-ANSI  range */
michael@0: #define ULMBCS_C0END           0x1F   
michael@0: #define ULMBCS_C1START         0x80   
michael@0: /*
michael@0:   Since LMBCS always deals in byte units, we create a local type here for 
michael@0:   these LMBCS code units:
michael@0: 
michael@0: */  
michael@0: typedef uint8_t ulmbcs_byte_t;
michael@0: 
michael@0: /* 
michael@0:    Most of the values less than 0x20 are reserved in LMBCS to announce 
michael@0:    which national  character standard is being used for the 'D' bytes. 
michael@0:    In the comments we show the common name and the IBM character-set ID
michael@0:    for these character-set announcers:
michael@0: */
michael@0: 
michael@0: #define ULMBCS_GRP_L1         0x01   /* Latin-1    :ibm-850  */
michael@0: #define ULMBCS_GRP_GR         0x02   /* Greek      :ibm-851  */
michael@0: #define ULMBCS_GRP_HE         0x03   /* Hebrew     :ibm-1255 */
michael@0: #define ULMBCS_GRP_AR         0x04   /* Arabic     :ibm-1256 */
michael@0: #define ULMBCS_GRP_RU         0x05   /* Cyrillic   :ibm-1251 */
michael@0: #define ULMBCS_GRP_L2         0x06   /* Latin-2    :ibm-852  */
michael@0: #define ULMBCS_GRP_TR         0x08   /* Turkish    :ibm-1254 */
michael@0: #define ULMBCS_GRP_TH         0x0B   /* Thai       :ibm-874  */
michael@0: #define ULMBCS_GRP_JA         0x10   /* Japanese   :ibm-943  */
michael@0: #define ULMBCS_GRP_KO         0x11   /* Korean     :ibm-1261 */
michael@0: #define ULMBCS_GRP_TW         0x12   /* Chinese TC :ibm-950  (Traditional; the "zhTW" locale maps here) */
michael@0: #define ULMBCS_GRP_CN         0x13   /* Chinese SC :ibm-1386 (Simplified; the plain "zh" locale maps here) */
michael@0: 
michael@0: /*
michael@0:    So, the beginning of understanding LMBCS is that IF the first byte of a LMBCS 
michael@0:    character is one of those 12 values, you can interpret the remaining bytes of 
michael@0:    that character as coming from one of those character sets. Since the lower 
michael@0:    ANSI bytes are already represented as single bytes, one of the character 
michael@0:    set announcers is used to announce a character that starts with a byte of 
michael@0:    0x80 or greater.
michael@0: 
michael@0:    The character sets are  arranged so that the single byte sets all appear 
michael@0:    before the multi-byte character sets. When we need to tell whether a 
michael@0:    group byte is for a single byte char set or not we use this define: */
michael@0: 
michael@0: #define ULMBCS_DOUBLEOPTGROUP_START  0x10   
michael@0: 
michael@0: /* 
michael@0: However, to fully understand LMBCS, you must also understand a series of 
michael@0: exceptions & optimizations made in service of the design goals. 
michael@0: 
michael@0: First, those of you who are character set mavens may have noticed that
michael@0: the 'double-byte' character sets are actually multi-byte character sets 
michael@0: that can have 1 or two bytes, even in the upper-ascii range. To force
michael@0: each group byte to introduce a fixed-width encoding (to make it faster to 
michael@0: count characters), we use a convention of doubling up on the group byte 
michael@0: to introduce any single-byte character > 0x80 in an otherwise double-byte
michael@0: character set. So, for example, the LMBCS sequence x10 x10 xAE is the 
michael@0: same as '0xAE' in the Japanese code page 943.
michael@0: 
michael@0: Next, you will notice that the list of group bytes has some gaps. 
michael@0: These are used in various ways.
michael@0: 
michael@0: We reserve a few special single byte values for common control 
michael@0: characters. These are in the same place as their ANSI equivalents for speed.
michael@0: */
michael@0:                      
michael@0: #define ULMBCS_HT    0x09   /* Fixed control char - Horizontal Tab */
michael@0: #define ULMBCS_LF    0x0A   /* Fixed control char - Line Feed */
michael@0: #define ULMBCS_CR    0x0D   /* Fixed control char - Carriage Return */
michael@0: 
michael@0: /* Then, 1-2-3 reserved a special single-byte character to put at the 
michael@0: beginning of internal 'system' range names: */
michael@0: 
michael@0: #define ULMBCS_123SYSTEMRANGE  0x19   
michael@0: 
michael@0: /* Then we needed a place to put all the other ANSI control characters 
michael@0: that must be moved to different values because LMBCS reserves those 
michael@0: values for other purposes. To represent the control characters, we start 
michael@0: with a first byte of 0x0F & add the control character value as the 
michael@0: second byte */
michael@0: #define ULMBCS_GRP_CTRL       0x0F   
michael@0: 
michael@0: /* For the C0 controls (less than 0x20), we add 0x20 to preserve the 
michael@0: useful doctrine that any byte less than 0x20 in a LMBCS char must be 
michael@0: the first byte of a character:*/
michael@0: #define ULMBCS_CTRLOFFSET      0x20   
michael@0: 
michael@0: /* 
michael@0: Where to put the characters that aren't part of any of the 12 national 
michael@0: character sets? The first thing that was done, in the earlier years of 
michael@0: LMBCS, was to use up the spaces of the form
michael@0: 
michael@0:   [G] D1, 
michael@0:   
michael@0:  where  'G' was one of the single-byte character groups, and
michael@0:  D1 was less than 0x80. These sequences are gathered together 
michael@0:  into a Lotus-invented doublebyte character set to represent a 
michael@0:  lot of stray values. Internally, in this implementation, we track this 
michael@0:  as group '0', as a place to tuck this exceptions list.*/
michael@0: 
michael@0: #define ULMBCS_GRP_EXCEPT     0x00    
michael@0: /*
michael@0:  Finally, as the durability and usefulness of UNICODE became clear, 
michael@0:  LOTUS added a new group 0x14 to hold Unicode values not otherwise 
michael@0:  represented in LMBCS: */
michael@0: #define ULMBCS_GRP_UNICODE    0x14   
michael@0: /* The two bytes appearing after a 0x14 are interpreted as UTF-16 BE
michael@0: (Big-Endian) characters. The exception comes when the UTF-16 
michael@0: representation would have a zero as the second byte. In that case,
michael@0: 'F6' is used in its place, and the bytes are swapped. (This prevents 
michael@0: LMBCS from encoding any Unicode values of the form U+F6xx, but that's OK:
michael@0: 0xF6xx is in the middle of the Private Use Area.)*/
michael@0: #define ULMBCS_UNICOMPATZERO   0xF6   
michael@0: 
michael@0: /* It is also useful in our code to have a constant for the size of 
michael@0: a LMBCS char that holds a literal Unicode value */
michael@0: #define ULMBCS_UNICODE_SIZE      3    
michael@0: 
michael@0: /* 
michael@0: To squish the LMBCS representations down even further, and to make 
michael@0: translations even faster,sometimes the optimization group byte can be dropped 
michael@0: from a LMBCS character. This is decided on a process-by-process basis. The 
michael@0: group byte that is dropped is called the 'optimization group'.
michael@0: 
michael@0: For Notes, the optimization group is always 0x1.*/
michael@0: #define ULMBCS_DEFAULTOPTGROUP 0x1    
michael@0: /* For 1-2-3 files, the optimization group is stored in the header of the 1-2-3 
michael@0: file. 
michael@0: 
michael@0:  In any case, when using ICU, you pass in the 
michael@0: optimization group as part of the name of the converter (LMBCS-1, LMBCS-2, 
michael@0: etc.). Using plain 'LMBCS' as the name of the converter will give you 
michael@0: LMBCS-1.
michael@0: 
michael@0: 
michael@0: *** Implementation strategy ***
michael@0: 
michael@0: 
michael@0: Because of the extensive use of other character sets, the LMBCS converter
michael@0: keeps a mapping between optimization groups and IBM character sets, so that
michael@0: ICU converters can be created and used as needed. */
michael@0: 
michael@0: /* Even though any byte below 0x20 could be an optimization 
michael@0: byte, only those at 0x13 or below can map to an actual converter. To limit
michael@0: some loops and searches, we define a value for that last group converter.
michael@0: 
michael@0: OptGroupByteToCPName, below, is indexed by group byte (0 through 
michael@0: ULMBCS_GRP_LAST) and holds the name of the code page that is loaded for 
michael@0: that group. A NULL entry means there is no code page to load for that byte: 
michael@0: it is unused, it is one of the fixed control characters, or it is the 
michael@0: algorithmic control group 0x0F. */
michael@0: 
michael@0: #define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */
michael@0: 
michael@0: static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
michael@0:    /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
michael@0:    /* 0x0001 */ "ibm-850",
michael@0:    /* 0x0002 */ "ibm-851",
michael@0:    /* 0x0003 */ "windows-1255",
michael@0:    /* 0x0004 */ "windows-1256",
michael@0:    /* 0x0005 */ "windows-1251",
michael@0:    /* 0x0006 */ "ibm-852",
michael@0:    /* 0x0007 */ NULL,      /* Unused */
michael@0:    /* 0x0008 */ "windows-1254",
michael@0:    /* 0x0009 */ NULL,      /* Control char HT */
michael@0:    /* 0x000A */ NULL,      /* Control char LF */
michael@0:    /* 0x000B */ "windows-874",
michael@0:    /* 0x000C */ NULL,      /* Unused */
michael@0:    /* 0x000D */ NULL,      /* Control char CR */
michael@0:    /* 0x000E */ NULL,      /* Unused */
michael@0:    /* 0x000F */ NULL,      /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
michael@0:    /* 0x0010 */ "windows-932",
michael@0:    /* 0x0011 */ "windows-949",
michael@0:    /* 0x0012 */ "windows-950",
michael@0:    /* 0x0013 */ "windows-936"
michael@0: 
michael@0:    /* The array stops at ULMBCS_GRP_LAST: higher group bytes, including the 
michael@0:    0x0014 Unicode compatibility region and 0x0019, the 1-2-3 system range 
michael@0:    control char, have no slot here and must not be used as an index */
michael@0: };
michael@0: 
michael@0: 
michael@0: /* That's approximately all the data that's needed for translating 
michael@0:   LMBCS to Unicode. 
michael@0: 
michael@0: 
michael@0: However, to translate Unicode to LMBCS, we need some more support.
michael@0: 
michael@0: That's because there is often more than one possible mapping from a Unicode
michael@0: code point back into LMBCS. The first thing we do is look up into a table
michael@0: to figure out if there is more than one possible mapping. This table,
michael@0: arranged by Unicode values (including ranges), either lists which group 
michael@0: to use, or says that it could go into one or more of the SBCS sets, or
michael@0: into one or more of the DBCS sets.  (If the character exists in both DBCS & 
michael@0: SBCS, the table will place it in the SBCS sets, to make the LMBCS code point 
michael@0: length as small as possible.) Here are the special markers we use to indicate
michael@0: ambiguous mappings: */
michael@0: 
michael@0: #define ULMBCS_AMBIGUOUS_SBCS   0x80   /* could fit in more than one 
michael@0:                                           LMBCS sbcs native encoding 
michael@0:                                           (example: most accented latin) */
michael@0: #define ULMBCS_AMBIGUOUS_MBCS   0x81   /* could fit in more than one 
michael@0:                                           LMBCS mbcs native encoding 
michael@0:                                           (example: Unihan) */
michael@0: #define ULMBCS_AMBIGUOUS_ALL    0x82   /* could fit in sbcs as well as mbcs 
michael@0:                                           native encodings: accepted for 
michael@0:                                           every group by the match macro */
michael@0: /* And here's a simple way to see if a group falls in an appropriate range:
michael@0:    the result is nonzero when converter group 'xgroup' is a candidate for a
michael@0:    character whose table marker is 'agroup'. 'agroup' is evaluated up to three
michael@0:    times, so pass expressions without side effects. The whole expansion is
michael@0:    wrapped in parentheses so that it stays a single operand when it is used
michael@0:    under '!', '&&', '?:' and similar operators. */
michael@0: #define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \
michael@0:                   (((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \
michael@0:                   ((xgroup) < ULMBCS_DOUBLEOPTGROUP_START)) || \
michael@0:                   (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \
michael@0:                   ((xgroup) >= ULMBCS_DOUBLEOPTGROUP_START))) || \
michael@0:                   ((agroup) == ULMBCS_AMBIGUOUS_ALL))
michael@0: 
michael@0: 
michael@0: /* The table & some code to use it:
michael@0: 
michael@0:    Each row of UniLMBCSGrpMap covers the inclusive range of UChar values
michael@0:    uniStartRange..uniEndRange and names either one LMBCS group or one of the
michael@0:    ULMBCS_AMBIGUOUS_* markers. FindLMBCSUniRange walks the rows from the top
michael@0:    and stops at the first one whose uniEndRange is not below the character,
michael@0:    so the rows have to stay in ascending order and the last row has to end
michael@0:    at 0xFFFF to stop the walk. A character that lands in a gap between rows
michael@0:    is reported as ULMBCS_GRP_UNICODE. */
michael@0: 
michael@0: 
michael@0: static const struct _UniLMBCSGrpMap  
michael@0: {
michael@0:    const UChar uniStartRange;
michael@0:    const UChar uniEndRange;
michael@0:    const ulmbcs_byte_t  GrpType;
michael@0: } UniLMBCSGrpMap[]
michael@0: =
michael@0: {
michael@0: 
michael@0:     {0x0001, 0x001F,  ULMBCS_GRP_CTRL},
michael@0:     {0x0080, 0x009F,  ULMBCS_GRP_CTRL},
michael@0:     {0x00A0, 0x00A6,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00A7, 0x00A8,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00A9, 0x00AF,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00B0, 0x00B1,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00B2, 0x00B3,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00B4, 0x00B4,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00B5, 0x00B5,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00B6, 0x00B6,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00B7, 0x00D6,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00D7, 0x00D7,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00D8, 0x00F6,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x00F7, 0x00F7,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x00F8, 0x01CD,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x01CE, 0x01CE,  ULMBCS_GRP_TW },
michael@0:     {0x01CF, 0x02B9,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x02BA, 0x02BA,  ULMBCS_GRP_CN},
michael@0:     {0x02BC, 0x02C8,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x02C9, 0x02D0,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x02D8, 0x02DD,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x0384, 0x0390,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x0391, 0x03A9,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x03AA, 0x03B0,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x03B1, 0x03C9,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x03CA, 0x03CE,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x0400, 0x0400,  ULMBCS_GRP_RU},
michael@0:     {0x0401, 0x0401,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x0402, 0x040F,  ULMBCS_GRP_RU},
michael@0:     {0x0410, 0x0431,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x0432, 0x044E,  ULMBCS_GRP_RU},
michael@0:     {0x044F, 0x044F,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x0450, 0x0491,  ULMBCS_GRP_RU},
michael@0:     {0x05B0, 0x05F2,  ULMBCS_GRP_HE},
michael@0:     {0x060C, 0x06AF,  ULMBCS_GRP_AR},
michael@0:     {0x0E01, 0x0E5B,  ULMBCS_GRP_TH},
michael@0:     {0x200C, 0x200F,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2010, 0x2010,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2013, 0x2014,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2015, 0x2015,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2016, 0x2016,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2017, 0x2017,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2018, 0x2019,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x201A, 0x201B,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x201C, 0x201D,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x201E, 0x201F,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2020, 0x2021,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2022, 0x2024,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2025, 0x2025,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2026, 0x2026,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2027, 0x2027,  ULMBCS_GRP_TW},
michael@0:     {0x2030, 0x2030,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2031, 0x2031,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2032, 0x2033,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2035, 0x2035,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2039, 0x203A,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x203B, 0x203B,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x203C, 0x203C,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2074, 0x2074,  ULMBCS_GRP_KO},
michael@0:     {0x207F, 0x207F,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2081, 0x2084,  ULMBCS_GRP_KO},
michael@0:     {0x20A4, 0x20AC,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2103, 0x2109,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2111, 0x2120,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     /*zhujin: upgrade, for regressiont test, spr HKIA4YHTSU*/
michael@0:     {0x2121, 0x2121,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2122, 0x2126,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x212B, 0x212B,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2135, 0x2135,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2153, 0x2154,  ULMBCS_GRP_KO},
michael@0:     {0x215B, 0x215E,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2160, 0x2179,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2190, 0x2193,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2194, 0x2195,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2196, 0x2199,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x21A8, 0x21A8,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x21B8, 0x21B9,  ULMBCS_GRP_CN},
michael@0:     {0x21D0, 0x21D1,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x21D2, 0x21D2,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x21D3, 0x21D3,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x21D4, 0x21D4,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x21D5, 0x21D5,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x21E7, 0x21E7,  ULMBCS_GRP_CN},
michael@0:     {0x2200, 0x2200,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2201, 0x2201,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2202, 0x2202,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2203, 0x2203,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2204, 0x2206,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2207, 0x2208,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2209, 0x220A,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x220B, 0x220B,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x220F, 0x2215,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2219, 0x2219,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x221A, 0x221A,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x221B, 0x221C,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x221D, 0x221E,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x221F, 0x221F,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2220, 0x2220,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2223, 0x222A,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x222B, 0x223D,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2245, 0x2248,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x224C, 0x224C,  ULMBCS_GRP_TW},
michael@0:     {0x2252, 0x2252,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2260, 0x2261,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2262, 0x2265,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2266, 0x226F,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2282, 0x2283,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2284, 0x2285,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2286, 0x2287,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2288, 0x2297,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2299, 0x22BF,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x22C0, 0x22C0,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2310, 0x2310,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2312, 0x2312,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2318, 0x2321,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2318, 0x2321,  ULMBCS_GRP_CN}, /* NOTE(review): repeats the range of the row above, so the first-match scan in FindLMBCSUniRange never selects this row - confirm which group was intended */
michael@0:     {0x2460, 0x24E9,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2500, 0x2500,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2501, 0x2501,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2502, 0x2502,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2503, 0x2503,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x2504, 0x2505,  ULMBCS_GRP_TW},
michael@0:     {0x2506, 0x2665,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x2666, 0x2666,  ULMBCS_GRP_EXCEPT},
michael@0:     {0x2667, 0x2669,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x266A, 0x266A,  ULMBCS_AMBIGUOUS_ALL},
michael@0:     {0x266B, 0x266C,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x266D, 0x266D,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0x266E, 0x266E,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x266F, 0x266F,  ULMBCS_GRP_JA},
michael@0:     {0x2670, 0x2E7F,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0x2E80, 0xF861,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0xF862, 0xF8FF,  ULMBCS_GRP_EXCEPT},
michael@0:     {0xF900, 0xFA2D,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0xFB00, 0xFEFF,  ULMBCS_AMBIGUOUS_SBCS},
michael@0:     {0xFF01, 0xFFEE,  ULMBCS_AMBIGUOUS_MBCS},
michael@0:     {0xFFFF, 0xFFFF,  ULMBCS_GRP_UNICODE}
michael@0: };
michael@0:    
michael@0: /* Look up the LMBCS group, or ULMBCS_AMBIGUOUS_* marker, that UniLMBCSGrpMap
michael@0:    assigns to uniChar. Returns ULMBCS_GRP_UNICODE when uniChar falls in a gap
michael@0:    between two table rows. */
michael@0: static ulmbcs_byte_t 
michael@0: FindLMBCSUniRange(UChar uniChar)
michael@0: {
michael@0:    const struct _UniLMBCSGrpMap * row;
michael@0:    ulmbcs_byte_t group = ULMBCS_GRP_UNICODE;
michael@0: 
michael@0:    /* Step over every row that ends below uniChar; the final row ends at
michael@0:       0xFFFF, so the scan always stops inside the table. */
michael@0:    for (row = UniLMBCSGrpMap; row->uniEndRange < uniChar; ++row)
michael@0:    {
michael@0:    }
michael@0: 
michael@0:    /* The row we stopped on only applies if uniChar is not below its start. */
michael@0:    if (row->uniStartRange <= uniChar) 
michael@0:    {
michael@0:       group = row->GrpType;
michael@0:    }
michael@0:    return group;
michael@0: }
michael@0: 
michael@0: /* 
michael@0: We also ask the creator of a converter to send in a preferred locale 
michael@0: that we can use in resolving ambiguous mappings. They send the locale
michael@0: in as a string, and we map it, if possible, to one of the 
michael@0: LMBCS groups. We use this table, and the associated code, to 
michael@0: do the lookup: */
michael@0: 
michael@0: /**************************************************
michael@0:   This table maps locale ID's to LMBCS opt groups.
michael@0:   The default return is group 0x01. Note that for
michael@0:   performance reasons, the table is sorted in
michael@0:   increasing alphabetic order, with the notable
michael@0:   exception of zhTW. This is to force the check
michael@0:   for Traditional Chinese before dropping back to
michael@0:   Simplified.
michael@0: 
michael@0:   Note too that the Latin-1 groups have been
michael@0:   commented out because it's the default, and
michael@0:   this shortens the table, allowing a serial
michael@0:   search to go quickly.
michael@0: 
michael@0:   The last row has a NULL LocaleID; FindLMBCSLocale
michael@0:   uses it to detect the end of the table.
michael@0:  *************************************************/
michael@0: 
michael@0: static const struct _LocaleLMBCSGrpMap
michael@0: {
michael@0:    const char    *LocaleID;
michael@0:    const ulmbcs_byte_t OptGroup;
michael@0: } LocaleLMBCSGrpMap[] =
michael@0: {
michael@0:     {"ar", ULMBCS_GRP_AR},
michael@0:     {"be", ULMBCS_GRP_RU},
michael@0:     {"bg", ULMBCS_GRP_L2},
michael@0:    /* {"ca", ULMBCS_GRP_L1}, */
michael@0:     {"cs", ULMBCS_GRP_L2},
michael@0:    /* {"da", ULMBCS_GRP_L1}, */
michael@0:    /* {"de", ULMBCS_GRP_L1}, */
michael@0:     {"el", ULMBCS_GRP_GR},
michael@0:    /* {"en", ULMBCS_GRP_L1}, */
michael@0:    /* {"es", ULMBCS_GRP_L1}, */
michael@0:    /* {"et", ULMBCS_GRP_L1}, */
michael@0:    /* {"fi", ULMBCS_GRP_L1}, */
michael@0:    /* {"fr", ULMBCS_GRP_L1}, */
michael@0:     {"he", ULMBCS_GRP_HE},
michael@0:     {"hu", ULMBCS_GRP_L2},
michael@0:    /* {"is", ULMBCS_GRP_L1}, */
michael@0:    /* {"it", ULMBCS_GRP_L1}, */
michael@0:     {"iw", ULMBCS_GRP_HE},
michael@0:     {"ja", ULMBCS_GRP_JA},
michael@0:     {"ko", ULMBCS_GRP_KO},
michael@0:    /* {"lt", ULMBCS_GRP_L1}, */
michael@0:    /* {"lv", ULMBCS_GRP_L1}, */
michael@0:     {"mk", ULMBCS_GRP_RU},
michael@0:    /* {"nl", ULMBCS_GRP_L1}, */
michael@0:    /* {"no", ULMBCS_GRP_L1}, */
michael@0:     {"pl", ULMBCS_GRP_L2},
michael@0:    /* {"pt", ULMBCS_GRP_L1}, */
michael@0:     {"ro", ULMBCS_GRP_L2},
michael@0:     {"ru", ULMBCS_GRP_RU},
michael@0:     {"sh", ULMBCS_GRP_L2},
michael@0:     {"sk", ULMBCS_GRP_L2},
michael@0:     {"sl", ULMBCS_GRP_L2},
michael@0:     {"sq", ULMBCS_GRP_L2},
michael@0:     {"sr", ULMBCS_GRP_RU},
michael@0:    /* {"sv", ULMBCS_GRP_L1}, */
michael@0:     {"th", ULMBCS_GRP_TH},
michael@0:     {"tr", ULMBCS_GRP_TR},
michael@0:     {"uk", ULMBCS_GRP_RU},
michael@0:    /* {"vi", ULMBCS_GRP_L1}, */
michael@0:     {"zhTW", ULMBCS_GRP_TW},
michael@0:     {"zh", ULMBCS_GRP_CN},
michael@0:     {NULL, ULMBCS_GRP_L1}
michael@0: };
michael@0: 
michael@0: 
michael@0: /* Map a locale ID string to an LMBCS optimization group.
michael@0:    Returns 0 for a NULL or empty LocaleID, the OptGroup of the first
michael@0:    LocaleLMBCSGrpMap row whose whole LocaleID is a prefix of the argument,
michael@0:    and ULMBCS_GRP_L1 when no row matches. */
michael@0: static ulmbcs_byte_t 
michael@0: FindLMBCSLocale(const char *LocaleID)
michael@0: {
michael@0:    const struct _LocaleLMBCSGrpMap *row;
michael@0: 
michael@0:    if (LocaleID == NULL || *LocaleID == 0) 
michael@0:    {
michael@0:       return 0;
michael@0:    }
michael@0: 
michael@0:    for (row = LocaleLMBCSGrpMap; row->LocaleID != NULL; ++row)
michael@0:    {
michael@0:       const char *candidate = row->LocaleID;
michael@0: 
michael@0:       /* Rows are sorted by first letter: once we are past it, give up. */
michael@0:       if (*candidate > *LocaleID)
michael@0:       {
michael@0:          break;
michael@0:       }
michael@0: 
michael@0:       /* Cheap first-character test before comparing the row's full length. */
michael@0:       if (*candidate == *LocaleID &&
michael@0:           uprv_strncmp(candidate, LocaleID, strlen(candidate)) == 0)
michael@0:       {
michael@0:          return row->OptGroup;
michael@0:       }
michael@0:    }
michael@0:    return ULMBCS_GRP_L1;
michael@0: }
michael@0: 
michael@0: 
michael@0: /* 
michael@0:   Before we get to the main body of code, here's how we hook up to the rest 
michael@0:   of ICU. ICU converters are required to define a structure that includes 
michael@0:   some function pointers, and some common data, in the style of a C++
michael@0:   vtable. There is also room in there for converter-specific data. LMBCS
michael@0:   uses that converter-specific data to keep track of the 12 subconverters
michael@0:   we use, the optimization group, and the group (if any) that matches the 
michael@0:   locale. We have one structure instantiated for each of the 12 possible
michael@0:   optimization groups. To avoid typos & to avoid boring the reader, we 
michael@0:   put the declarations of these structures and functions into macros. To see 
michael@0:   the definitions of these structures, see unicode\ucnv_bld.h
michael@0: 
michael@0:   DECLARE_LMBCS_DATA(n) emits three objects for optimization group n: the
michael@0:   function table _LMBCSImpl##n, the static description _LMBCSStaticData##n
michael@0:   (whose converter name is "LMBCS-" followed by n), and the externally
michael@0:   visible _LMBCSData##n, which points at the other two. Every initializer
michael@0:   is positional, so the values have to follow the member order of
michael@0:   UConverterImpl, UConverterStaticData and UConverterSharedData, all of
michael@0:   which are declared outside this file. The macro refers to _LMBCSOpen##n,
michael@0:   _LMBCSToUnicodeWithOffsets, _LMBCSFromUnicode and _LMBCSSafeClone, so
michael@0:   those have to be declared before the macro is expanded.
michael@0: */
michael@0: 
michael@0: typedef struct
michael@0:   {
michael@0:     UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1];    /* Converter per Opt. grp., indexed by group byte; stays NULL where OptGroupByteToCPName has no name */
michael@0:     uint8_t    OptGroup;                  /* default Opt. grp. for this LMBCS session */
michael@0:     uint8_t    localeConverterIndex;      /* group returned by FindLMBCSLocale for the locale given at open time */
michael@0:   }
michael@0: UConverterDataLMBCS;
michael@0: 
michael@0: static void _LMBCSClose(UConverter * _this); /* forward declaration: _LMBCSOpenWorker calls it before its definition */
michael@0: 
michael@0: #define DECLARE_LMBCS_DATA(n) \
michael@0: static const UConverterImpl _LMBCSImpl##n={\
michael@0:     UCNV_LMBCS_##n,\
michael@0:     NULL,NULL,\
michael@0:     _LMBCSOpen##n,\
michael@0:     _LMBCSClose,\
michael@0:     NULL,\
michael@0:     _LMBCSToUnicodeWithOffsets,\
michael@0:     _LMBCSToUnicodeWithOffsets,\
michael@0:     _LMBCSFromUnicode,\
michael@0:     _LMBCSFromUnicode,\
michael@0:     NULL,\
michael@0:     NULL,\
michael@0:     NULL,\
michael@0:     NULL,\
michael@0:     _LMBCSSafeClone,\
michael@0:     ucnv_getCompleteUnicodeSet\
michael@0: };\
michael@0: static const UConverterStaticData _LMBCSStaticData##n={\
michael@0:   sizeof(UConverterStaticData),\
michael@0:  "LMBCS-"  #n,\
michael@0:     0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\
michael@0:     { 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
michael@0: };\
michael@0: const UConverterSharedData _LMBCSData##n={\
michael@0:     sizeof(UConverterSharedData), ~((uint32_t) 0),\
michael@0:     NULL, NULL, &_LMBCSStaticData##n, FALSE, &_LMBCSImpl##n, \
michael@0:     0 \
michael@0: };
michael@0: 
michael@0:  /* The only function we needed to duplicate 12 times was the 'open'
michael@0: function, which will do basically the same thing except set a  different
michael@0: optimization group. So, we put the common stuff into a worker function, 
michael@0: and set up another macro to stamp out the 12 open functions.
michael@0: 
michael@0: DEFINE_LMBCS_OPEN(n) defines the static function _LMBCSOpen##n, which 
michael@0: forwards its three arguments to _LMBCSOpenWorker and passes n as the 
michael@0: optimization group. _LMBCSOpenWorker therefore has to be declared before 
michael@0: the macro is expanded. */
michael@0: #define DEFINE_LMBCS_OPEN(n) \
michael@0: static void \
michael@0:    _LMBCSOpen##n(UConverter* _this, UConverterLoadArgs* pArgs, UErrorCode* err) \
michael@0: { _LMBCSOpenWorker(_this, pArgs, err, n); }
michael@0: 
michael@0: 
michael@0: 
michael@0: /* Here's the open worker & the common close function.
michael@0: 
michael@0:    _LMBCSOpenWorker allocates the UConverterDataLMBCS for this converter,
michael@0:    stores it in _this->extraInfo, and loads the shared data of every code
michael@0:    page named in OptGroupByteToCPName. If a load fails, or if the caller
michael@0:    only asked whether the converter is loadable (pArgs->onlyTestIsLoadable),
michael@0:    the converter is handed to _LMBCSClose for cleanup and nothing else is
michael@0:    recorded. Otherwise the optimization group and the group matching
michael@0:    pArgs->locale are stored. *err is set to U_MEMORY_ALLOCATION_ERROR when
michael@0:    the allocation fails. */
michael@0: static void 
michael@0: _LMBCSOpenWorker(UConverter*  _this,
michael@0:                  UConverterLoadArgs *pArgs,
michael@0:                  UErrorCode*  err,
michael@0:                  ulmbcs_byte_t OptGroup)
michael@0: {
michael@0:     UConverterNamePieces stackPieces;
michael@0:     UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
michael@0:     ulmbcs_byte_t group;
michael@0:     UConverterDataLMBCS * extraInfo =
michael@0:         (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
michael@0: 
michael@0:     _this->extraInfo = extraInfo;
michael@0:     if(extraInfo == NULL)
michael@0:     {
michael@0:         *err = U_MEMORY_ALLOCATION_ERROR;
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     /* Start with every sub-converter slot NULL so that _LMBCSClose can tell
michael@0:        which ones were actually loaded. */
michael@0:     uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS));
michael@0:     stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
michael@0: 
michael@0:     /* Stop loading as soon as one of the sub-converters fails. */
michael@0:     for (group=0; group <= ULMBCS_GRP_LAST && U_SUCCESS(*err); group++)
michael@0:     {
michael@0:         const char *cpName = OptGroupByteToCPName[group];
michael@0: 
michael@0:         if(cpName == NULL) {
michael@0:             continue;
michael@0:         }
michael@0:         extraInfo->OptGrpConverter[group] = ucnv_loadSharedData(cpName, &stackPieces, &stackArgs, err);
michael@0:     }
michael@0: 
michael@0:     if(U_FAILURE(*err) || pArgs->onlyTestIsLoadable) {
michael@0:         _LMBCSClose(_this);
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     extraInfo->OptGroup = OptGroup;
michael@0:     extraInfo->localeConverterIndex = FindLMBCSLocale(pArgs->locale);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Common close function for all LMBCS variants.
michael@0:  *
michael@0:  * Releases every loaded sub-converter and then frees the extra-info block,
michael@0:  * unless that block is marked as living inside the converter's own storage
michael@0:  * (isExtraLocal). Does nothing when no extra info is attached.
michael@0:  */
michael@0: static void
michael@0: _LMBCSClose(UConverter *   _this)
michael@0: {
michael@0:     UConverterDataLMBCS *info = (UConverterDataLMBCS *) _this->extraInfo;
michael@0:     ulmbcs_byte_t grp;
michael@0: 
michael@0:     if (info == NULL)
michael@0:     {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     for (grp = 0; grp <= ULMBCS_GRP_LAST; grp++)
michael@0:     {
michael@0:         UConverterSharedData *shared = info->OptGrpConverter[grp];
michael@0:         if (shared != NULL) {
michael@0:             ucnv_unloadSharedDataIfReady(shared);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     if (_this->isExtraLocal) {
michael@0:         return;     /* the block is not a separate heap allocation */
michael@0:     }
michael@0:     uprv_free (_this->extraInfo);
michael@0:     _this->extraInfo = NULL;
michael@0: }
michael@0: 
michael@0: typedef struct LMBCSClone { /* layout that _LMBCSSafeClone() imposes on the clone buffer */
michael@0:     UConverter cnv;             /* the cloned converter; first member, so it starts the buffer */
michael@0:     UConverterDataLMBCS lmbcs;  /* the clone's own copy of the LMBCS extra info */
michael@0: } LMBCSClone;
michael@0: 
michael@0: /*
michael@0:  * Safe-clone hook for LMBCS converters.
michael@0:  *
michael@0:  * cnv          converter being cloned; its extraInfo is the source data
michael@0:  * stackBuffer  destination storage, treated as an LMBCSClone
michael@0:  * pBufferSize  when <= 0 this is a size query: the required size is
michael@0:  *              written back and NULL is returned
michael@0:  * status       not used by this implementation
michael@0:  *
michael@0:  * Returns the UConverter embedded in the clone buffer. The clone gets its
michael@0:  * own copy of the LMBCS extra info (marked isExtraLocal) and shares the
michael@0:  * sub-converters, whose reference counts are bumped.
michael@0:  */
michael@0: static UConverter *
michael@0: _LMBCSSafeClone(const UConverter *cnv,
michael@0:                 void *stackBuffer,
michael@0:                 int32_t *pBufferSize,
michael@0:                 UErrorCode *status) {
michael@0:     LMBCSClone *clone;
michael@0:     UConverterDataLMBCS *srcInfo;
michael@0:     int32_t grp = 0;
michael@0: 
michael@0:     /* preflighting: only report how much room a clone needs */
michael@0:     if(*pBufferSize<=0) {
michael@0:         *pBufferSize=(int32_t)sizeof(LMBCSClone);
michael@0:         return NULL;
michael@0:     }
michael@0: 
michael@0:     clone=(LMBCSClone *)stackBuffer;
michael@0:     srcInfo=(UConverterDataLMBCS *)cnv->extraInfo;
michael@0: 
michael@0:     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
michael@0:     uprv_memcpy(&clone->lmbcs, srcInfo, sizeof(UConverterDataLMBCS));
michael@0: 
michael@0:     /* the copied pointers now reference the same sub-converters: count them */
michael@0:     while(grp <= ULMBCS_GRP_LAST) {
michael@0:         UConverterSharedData *shared = srcInfo->OptGrpConverter[grp];
michael@0:         if(shared != NULL) {
michael@0:             ucnv_incrementRefCount(shared);
michael@0:         }
michael@0:         ++grp;
michael@0:     }
michael@0: 
michael@0:     clone->cnv.isExtraLocal = TRUE;
michael@0:     clone->cnv.extraInfo = &clone->lmbcs;
michael@0:     return &clone->cnv;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
michael@0:  * which added all code points except for U+F6xx
michael@0:  * because those cannot be represented in the Unicode group.
michael@0:  * However, it turns out that windows-950 has roundtrips for all of U+F6xx
michael@0:  * which means that LMBCS can convert all Unicode code points after all.
michael@0:  * We now simply use ucnv_getCompleteUnicodeSet().
michael@0:  *
michael@0:  * This may need to be looked at again as Lotus uses _LMBCSGetUnicodeSet(). (091216)
michael@0:  */
michael@0: 
michael@0: /*
michael@0:    Basic helper for the Unicode-to-LMBCS direction: try to convert one
michael@0:    Unicode character with the sub-converter of a single group.
michael@0: 
michael@0:    On success the LMBCS bytes (zero, one or two group-prefix bytes followed
michael@0:    by the converted bytes) are stored at pStartLMBCS and their count is
michael@0:    returned. A return value of 0 means nothing usable was produced:
michael@0:      - the sub-converter had no mapping; the group is then flagged in
michael@0:        groups_tried, or
michael@0:      - the mapping is a single byte below 0x20, which is never emitted.
michael@0:        In this case *lastConverterIndex has already been set to the group
michael@0:        and prefix bytes may already sit in the scratch buffer; the zero
michael@0:        return tells the caller to disregard them.
michael@0: */
michael@0: 
michael@0: static size_t
michael@0: LMBCSConversionWorker (
michael@0:    UConverterDataLMBCS * extraInfo,    /* subconverters, opt & locale groups */
michael@0:    ulmbcs_byte_t group,                /* The group to try */
michael@0:    ulmbcs_byte_t  * pStartLMBCS,              /* where to put the results */
michael@0:    UChar * pUniChar,                   /* The input unicode character */
michael@0:    ulmbcs_byte_t * lastConverterIndex, /* output: track last successful group used */
michael@0:    UBool * groups_tried                /* output: track any unsuccessful groups */
michael@0: )
michael@0: {
michael@0:    UConverterSharedData * subCnv = extraInfo->OptGrpConverter[group];
michael@0:    ulmbcs_byte_t  * out = pStartLMBCS;
michael@0:    uint32_t packed;          /* converted bytes, last byte in the low 8 bits */
michael@0:    int nBytes;               /* how many bytes of 'packed' are significant */
michael@0:    int shift;
michael@0:    ulmbcs_byte_t leadByte;
michael@0: 
michael@0:    U_ASSERT(subCnv);
michael@0:    U_ASSERT(group<ULMBCS_GRP_UNICODE);
michael@0: 
michael@0:    nBytes = ucnv_MBCSFromUChar32(subCnv, *pUniChar, &packed, FALSE);
michael@0:    if (nBytes <= 0)
michael@0:    {
michael@0:       /* most common failure mode is an unassigned character */
michael@0:       groups_tried[group] = TRUE;
michael@0:       return 0;
michael@0:    }
michael@0: 
michael@0:    /* isolate the first result byte */
michael@0:    leadByte = (ulmbcs_byte_t)(packed >> ((nBytes - 1) * 8));
michael@0: 
michael@0:    *lastConverterIndex = group;
michael@0: 
michael@0:    /* All initial byte values in lower ascii range should have been caught by now,
michael@0:       except with the exception group.
michael@0:     */
michael@0:    U_ASSERT((leadByte <= ULMBCS_C0END) || (leadByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
michael@0: 
michael@0:    /* group prefix: none for the exception group or the optimization group,
michael@0:       otherwise one group byte, doubled for a single-byte result from a
michael@0:       double-byte group */
michael@0:    if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
michael@0:    {
michael@0:       *out++ = group;
michael@0:       if (nBytes == 1 && group >= ULMBCS_DOUBLEOPTGROUP_START)
michael@0:       {
michael@0:          *out++ = group;
michael@0:       }
michael@0:    }
michael@0: 
michael@0:    /* don't emit control chars */
michael@0:    if (nBytes == 1 && leadByte < 0x20)
michael@0:    {
michael@0:       return 0;
michael@0:    }
michael@0: 
michael@0:    /* append the converted bytes, most significant first; counts above 4
michael@0:       are not expected and append nothing */
michael@0:    if (nBytes <= 4)
michael@0:    {
michael@0:       for (shift = (nBytes - 1) * 8; shift >= 0; shift -= 8)
michael@0:       {
michael@0:          *out++ = (ulmbcs_byte_t)(packed >> shift);
michael@0:       }
michael@0:    }
michael@0: 
michael@0:    return (out - pStartLMBCS);
michael@0: }
michael@0: 
michael@0: 
michael@0: /* Simpler sibling of LMBCSConversionWorker for the case where the
michael@0:    character is written with the Unicode group: stores the group byte
michael@0:    followed by two payload bytes at pLMBCS and returns
michael@0:    ULMBCS_UNICODE_SIZE. Normally the payload is the high byte then the low
michael@0:    byte of uniChar; when the low byte is zero, ULMBCS_UNICOMPATZERO is
michael@0:    stored first and the high byte second.
michael@0: */
michael@0: static size_t
michael@0: LMBCSConvertUni(ulmbcs_byte_t * pLMBCS, UChar uniChar)
michael@0: {
michael@0:    const uint8_t lo = (uint8_t)(uniChar & 0x00FF);
michael@0:    const uint8_t hi = (uint8_t)(uniChar >> 8);
michael@0:    const UBool lowIsZero = (UBool)(lo == 0);
michael@0: 
michael@0:    pLMBCS[0] = ULMBCS_GRP_UNICODE;
michael@0:    pLMBCS[1] = (ulmbcs_byte_t)(lowIsZero ? ULMBCS_UNICOMPATZERO : hi);
michael@0:    pLMBCS[2] = (ulmbcs_byte_t)(lowIsZero ? hi : lo);
michael@0: 
michael@0:    return ULMBCS_UNICODE_SIZE;
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: /* The main Unicode to LMBCS conversion function.
michael@0: 
michael@0:    Converts UChars from args->source (up to args->sourceLimit) one at a
michael@0:    time. Each character is first encoded into the local scratch buffer
michael@0:    LMBCS and then copied to args->target; when args->offsets is set, one
michael@0:    source index is recorded per target byte written.
michael@0: 
michael@0:    *err is set to U_BUFFER_OVERFLOW_ERROR when the target is already full
michael@0:    before a character is processed, or when a character only partly fits;
michael@0:    in the latter case the remaining bytes are parked in the converter's
michael@0:    charErrorBuffer. The loop stops as soon as *err holds a failure.
michael@0: */
michael@0: static void 
michael@0: _LMBCSFromUnicode(UConverterFromUnicodeArgs*     args,
michael@0:                   UErrorCode*     err)
michael@0: {
michael@0:    ulmbcs_byte_t lastConverterIndex = 0;      /* last group that produced a conversion in this call */
michael@0:    UChar uniChar;                             /* the character currently being converted */
michael@0:    ulmbcs_byte_t  LMBCS[ULMBCS_CHARSIZE_MAX]; /* scratch buffer holding one LMBCS character */
michael@0:    ulmbcs_byte_t  * pLMBCS;                   /* cursor into the scratch buffer */
michael@0:    int32_t bytes_written;                     /* bytes in the scratch buffer; 0 means "not converted yet" */
michael@0:    UBool groups_tried[ULMBCS_GRP_LAST+1];     /* groups that already failed for the current character */
michael@0:    UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0:    int sourceIndex = 0; 
michael@0: 
michael@0:    /* Basic strategy: attempt to fill in local LMBCS 1-char buffer.(LMBCS)
michael@0:       If that succeeds, see if it will all fit into the target & copy it over 
michael@0:       if it does.
michael@0: 
michael@0:       We try conversions in the following order:
michael@0: 
michael@0:       1. Single-byte ascii & special fixed control chars (&null)
michael@0:       2. Look up group in table & try that (could be 
michael@0:             A) Unicode group
michael@0:             B) control group,
michael@0:             C) national encoding, 
michael@0:                or ambiguous SBCS or MBCS group (on to step 3...)
michael@0:         
michael@0:       3. If it's ambiguous, try this order:
michael@0:          A) The optimization group
michael@0:          B) The locale group
michael@0:          C) The last group that succeeded with this string.
michael@0:          D) every other group that's relevant (single or double)
michael@0:          E) If it's single-byte ambiguous, try the exceptions group
michael@0: 
michael@0:       4. And as a grand fallback: Unicode
michael@0:    */
michael@0: 
michael@0:     /*Fix for SPR#DJOE66JFN3 (Lotus)*/
michael@0:     ulmbcs_byte_t OldConverterIndex = 0;
michael@0: 
michael@0:    while (args->source < args->sourceLimit && !U_FAILURE(*err))
michael@0:    {
michael@0:       /*Fix for SPR#DJOE66JFN3 (Lotus)*/
michael@0:       OldConverterIndex = extraInfo->localeConverterIndex; /* saved here, restored at the end of the iteration */
michael@0: 
michael@0:       if (args->target >= args->targetLimit)
michael@0:       {
michael@0:          *err = U_BUFFER_OVERFLOW_ERROR;
michael@0:          break;
michael@0:       }
michael@0:       uniChar = *(args->source);
michael@0:       bytes_written = 0;
michael@0:       pLMBCS = LMBCS;
michael@0: 
michael@0:       /* check cases in rough order of how common they are, for speed */
michael@0: 
michael@0:       /* single byte matches: strategy 1 */
michael@0:       /*Fix for SPR#DJOE66JFN3 (Lotus)*/
michael@0:       if((uniChar>=0x80) && (uniChar<=0xff)
michael@0:       /*Fix for SPR#JUYA6XAERU and TSAO7GL5NK (Lotus)*/ &&(uniChar!=0xB1) &&(uniChar!=0xD7) &&(uniChar!=0xF7)
michael@0:         &&(uniChar!=0xB0) &&(uniChar!=0xB4) &&(uniChar!=0xB6) &&(uniChar!=0xA7) &&(uniChar!=0xA8))
michael@0:       {
michael@0:             extraInfo->localeConverterIndex = ULMBCS_GRP_L1; /* temporary override for this character only */
michael@0:       }
michael@0:       if (((uniChar > ULMBCS_C0END) && (uniChar < ULMBCS_C1START)) ||
michael@0:           uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || 
michael@0:           uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE 
michael@0:           )
michael@0:       {
michael@0:          *pLMBCS++ = (ulmbcs_byte_t ) uniChar;
michael@0:          bytes_written = 1;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       if (!bytes_written) 
michael@0:       {
michael@0:          /* Check by UNICODE range (Strategy 2) */
michael@0:          ulmbcs_byte_t group = FindLMBCSUniRange(uniChar);
michael@0:          
michael@0:          if (group == ULMBCS_GRP_UNICODE)  /* (Strategy 2A) */
michael@0:          {
michael@0:             pLMBCS += LMBCSConvertUni(pLMBCS,uniChar);
michael@0:             
michael@0:             bytes_written = (int32_t)(pLMBCS - LMBCS);
michael@0:          }
michael@0:          else if (group == ULMBCS_GRP_CTRL)  /* (Strategy 2B) */
michael@0:          {
michael@0:             /* Handle control characters here */
michael@0:             if (uniChar <= ULMBCS_C0END)
michael@0:             {
michael@0:                *pLMBCS++ = ULMBCS_GRP_CTRL;
michael@0:                *pLMBCS++ = (ulmbcs_byte_t)(ULMBCS_CTRLOFFSET + uniChar);
michael@0:             }
michael@0:             else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET)
michael@0:             {
michael@0:                *pLMBCS++ = ULMBCS_GRP_CTRL;
michael@0:                *pLMBCS++ = (ulmbcs_byte_t ) (uniChar & 0x00FF);
michael@0:             }
michael@0:             bytes_written = (int32_t)(pLMBCS - LMBCS); /* stays 0 when neither range matched */
michael@0:          }
michael@0:          else if (group < ULMBCS_GRP_UNICODE)  /* (Strategy 2C) */
michael@0:          {
michael@0:             /* a specific converter has been identified - use it */
michael@0:             bytes_written = (int32_t)LMBCSConversionWorker (
michael@0:                               extraInfo, group, pLMBCS, &uniChar, 
michael@0:                               &lastConverterIndex, groups_tried);
michael@0:          }
michael@0:          if (!bytes_written)    /* the ambiguous group cases  (Strategy 3) */
michael@0:          {
michael@0:             uprv_memset(groups_tried, 0, sizeof(groups_tried)); /* start the "already tried" bookkeeping afresh */
michael@0: 
michael@0:             /* check for non-default optimization group (Strategy 3A )*/
michael@0:             if ((extraInfo->OptGroup != 1) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)))
michael@0:             {
michael@0:                 /*zhujin: upgrade, merge #39299 here (Lotus) */
michael@0:                 /*To make R5 compatible translation, look for exceptional group first for non-DBCS*/
michael@0: 
michael@0:                 if(extraInfo->localeConverterIndex < ULMBCS_DOUBLEOPTGROUP_START)
michael@0:                 {
michael@0:                   /* single-byte locale group: try L1, then the exceptions group, then the locale group */
michael@0:                   bytes_written = LMBCSConversionWorker (extraInfo,
michael@0:                      ULMBCS_GRP_L1, pLMBCS, &uniChar,
michael@0:                      &lastConverterIndex, groups_tried);
michael@0: 
michael@0:                   if(!bytes_written)
michael@0:                   {
michael@0:                      bytes_written = LMBCSConversionWorker (extraInfo,
michael@0:                          ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar,
michael@0:                          &lastConverterIndex, groups_tried);
michael@0:                   }
michael@0:                   if(!bytes_written)
michael@0:                   {
michael@0:                       bytes_written = LMBCSConversionWorker (extraInfo,
michael@0:                           extraInfo->localeConverterIndex, pLMBCS, &uniChar,
michael@0:                           &lastConverterIndex, groups_tried);
michael@0:                   }
michael@0:                 }
michael@0:                 else
michael@0:                 {
michael@0:                      /* double-byte locale group: go straight to the locale group */
michael@0:                      bytes_written = LMBCSConversionWorker (extraInfo,
michael@0:                          extraInfo->localeConverterIndex, pLMBCS, &uniChar,
michael@0:                          &lastConverterIndex, groups_tried);
michael@0:                 }
michael@0:             }
michael@0:             /* check for locale optimization group (Strategy 3B) */
michael@0:             if (!bytes_written && (extraInfo->localeConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex)))
michael@0:             {
michael@0:                 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0:                         extraInfo->localeConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
michael@0:             }
michael@0:             /* check for last optimization group used for this string (Strategy 3C) */
michael@0:             if (!bytes_written && (lastConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex)))
michael@0:             {
michael@0:                 bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0:                         lastConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
michael@0:             }
michael@0:             if (!bytes_written)
michael@0:             {
michael@0:                /* just check every possible matching converter (Strategy 3D) */ 
michael@0:                ulmbcs_byte_t grp_start;
michael@0:                ulmbcs_byte_t grp_end;  
michael@0:                ulmbcs_byte_t grp_ix;
michael@0:                grp_start = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
michael@0:                         ? ULMBCS_DOUBLEOPTGROUP_START 
michael@0:                         :  ULMBCS_GRP_L1);
michael@0:                grp_end = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
michael@0:                         ? ULMBCS_GRP_LAST 
michael@0:                         :  ULMBCS_GRP_TH);
michael@0:                if(group == ULMBCS_AMBIGUOUS_ALL)
michael@0:                {
michael@0:                    grp_start = ULMBCS_GRP_L1;
michael@0:                    grp_end = ULMBCS_GRP_LAST;
michael@0:                }
michael@0:                for (grp_ix = grp_start;
michael@0:                    grp_ix <= grp_end && !bytes_written; 
michael@0:                     grp_ix++)
michael@0:                {
michael@0:                   /* skip groups without a loaded converter and groups that already failed */
michael@0:                   if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix])
michael@0:                   {
michael@0:                      bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0:                        grp_ix, pLMBCS, &uniChar, 
michael@0:                        &lastConverterIndex, groups_tried);
michael@0:                   }
michael@0:                }
michael@0:                 /* a final conversion fallback to the exceptions group if it's likely 
michael@0:                      to be single byte  (Strategy 3E) */
michael@0:                if (!bytes_written && grp_start == ULMBCS_GRP_L1)
michael@0:                {
michael@0:                   bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
michael@0:                      ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, 
michael@0:                      &lastConverterIndex, groups_tried);
michael@0:                }
michael@0:             }
michael@0:             /* all of our other strategies failed. Fallback to Unicode. (Strategy 4)*/
michael@0:             if (!bytes_written)
michael@0:             {
michael@0: 
michael@0:                pLMBCS += LMBCSConvertUni(pLMBCS, uniChar);
michael@0:                bytes_written = (int32_t)(pLMBCS - LMBCS);
michael@0:             }
michael@0:          }
michael@0:       }
michael@0:   
michael@0:       /* we have a translation. increment source and write as much as possible to target */
michael@0:       args->source++;
michael@0:       pLMBCS = LMBCS;
michael@0:       /* the target check comes first, so bytes_written is only decremented for bytes actually copied */
michael@0:       while (args->target < args->targetLimit && bytes_written--)
michael@0:       {
michael@0:          *(args->target)++ = *pLMBCS++;
michael@0:          if (args->offsets)
michael@0:          {
michael@0:             *(args->offsets)++ = sourceIndex;
michael@0:          }
michael@0:       }
michael@0:       sourceIndex++;
michael@0:       if (bytes_written > 0)
michael@0:       {
michael@0:          /* write any bytes that didn't fit in target to the error buffer,
michael@0:             common code will move this to target if we get called back with
michael@0:             enough target room
michael@0:          */
michael@0:          uint8_t * pErrorBuffer = args->converter->charErrorBuffer;
michael@0:          *err = U_BUFFER_OVERFLOW_ERROR;
michael@0:          args->converter->charErrorBufferLength = (int8_t)bytes_written;
michael@0:          while (bytes_written--)
michael@0:          {
michael@0:             *pErrorBuffer++ = *pLMBCS++;
michael@0:          }
michael@0:       }
michael@0:       /*Fix for SPR#DJOE66JFN3 (Lotus)*/
michael@0:       extraInfo->localeConverterIndex = OldConverterIndex;
michael@0:    }     
michael@0: }
michael@0: 
michael@0: 
michael@0: /* Now, the Unicode from LMBCS section */
michael@0: 
michael@0: 
michael@0: /* Decodes the two payload bytes that follow the Unicode group byte.
michael@0:    *ppLMBCSin is read at the first payload byte and advanced past both
michael@0:    bytes. The payload is big-endian, except that a first byte equal to
michael@0:    ULMBCS_UNICOMPATZERO means the second byte is the high byte of a
michael@0:    character whose low byte is zero. */
michael@0: static UChar
michael@0: GetUniFromLMBCSUni(char const ** ppLMBCSin)  /* Called with LMBCS-style Unicode byte stream */
michael@0: {
michael@0:    char const * in = *ppLMBCSin;
michael@0:    uint8_t first  = (uint8_t)in[0];
michael@0:    uint8_t second = (uint8_t)in[1];
michael@0: 
michael@0:    *ppLMBCSin = in + 2;
michael@0: 
michael@0:    if (first == ULMBCS_UNICOMPATZERO)
michael@0:    {
michael@0:       /* zero-byte in LSB special character */
michael@0:       return (UChar)(second << 8);
michael@0:    }
michael@0:    return (UChar)((first << 8) | second);
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: /* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index' 
michael@0:    bytes left in source up to  sourceLimit.Errors appropriately if not.
michael@0:    If we reach the limit, then update the source pointer to there to consume
michael@0:    all input as required by ICU converter semantics.
michael@0: */
michael@0: 
michael@0: #define CHECK_SOURCE_LIMIT(index) \
michael@0:      if (args->source+index > args->sourceLimit){\
michael@0:          *err = U_TRUNCATED_CHAR_FOUND;\
michael@0:          args->source = args->sourceLimit;\
michael@0:          return 0xffff;}
michael@0: 
michael@0: /* Return the Unicode representation for the current LMBCS character.
michael@0: 
michael@0:    Reads one LMBCS character starting at args->source and advances
michael@0:    args->source past the bytes it consumed.
michael@0: 
michael@0:    Returns 0xffff with *err set to U_ILLEGAL_ARGUMENT_ERROR when there is
michael@0:    no input at all, and 0xffff with *err set to U_TRUNCATED_CHAR_FOUND
michael@0:    (via CHECK_SOURCE_LIMIT, which also moves source to sourceLimit) when
michael@0:    the character is cut off by sourceLimit. An invalid group byte sets
michael@0:    *err to U_INVALID_CHAR_FOUND; the value returned in that case is the
michael@0:    initial 0. */
michael@0: 
michael@0: static UChar32 
michael@0: _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
michael@0:                          UErrorCode*   err)
michael@0: {
michael@0:     UChar32 uniChar = 0;    /* an output UNICODE char */
michael@0:     ulmbcs_byte_t   CurByte; /* A byte from the input stream */
michael@0: 
michael@0:     /* error check */
michael@0:     if (args->source >= args->sourceLimit)
michael@0:     {
michael@0:         *err = U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return 0xffff;
michael@0:     }
michael@0:     /* Grab first byte & save address for error recovery */
michael@0:     CurByte = *((ulmbcs_byte_t  *) (args->source++));
michael@0:    
michael@0:     /*
michael@0:     * at entry of each if clause:
michael@0:     * 1. 'CurByte' holds the first byte of a LMBCS character
michael@0:     * 2. '*source' points to the next byte of the source stream after 'CurByte' 
michael@0:     *
michael@0:     * the job of each if clause is:
michael@0:     * 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte)
michael@0:     * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately
michael@0:     */
michael@0:    
michael@0:     /* First let's check the simple fixed values. */
michael@0: 
michael@0:     if(((CurByte > ULMBCS_C0END) && (CurByte < ULMBCS_C1START)) /* ascii range */
michael@0:     ||  (CurByte == 0) 
michael@0:     ||  CurByte == ULMBCS_HT || CurByte == ULMBCS_CR 
michael@0:     ||  CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE)
michael@0:     {
michael@0:         uniChar = CurByte;
michael@0:     }
michael@0:     else  
michael@0:     {
michael@0:         UConverterDataLMBCS * extraInfo;
michael@0:         ulmbcs_byte_t group; 
michael@0:         UConverterSharedData *cnv; 
michael@0:         
michael@0:         if (CurByte == ULMBCS_GRP_CTRL)  /* Control character group - no opt group update */
michael@0:         {
michael@0:             ulmbcs_byte_t  C0C1byte;
michael@0:             CHECK_SOURCE_LIMIT(1);
michael@0:             C0C1byte = *(args->source)++;
michael@0:             /* bytes below C1START carry the control code shifted up by ULMBCS_CTRLOFFSET; others are taken as is */
michael@0:             uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte;
michael@0:         }
michael@0:         else 
michael@0:         if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
michael@0:         {
michael@0:             CHECK_SOURCE_LIMIT(2);
michael@0:      
michael@0:             /* don't check for error indicators fffe/ffff below */
michael@0:             return GetUniFromLMBCSUni(&(args->source));
michael@0:         }
michael@0:         else if (CurByte <= ULMBCS_CTRLOFFSET)  
michael@0:         {
michael@0:             group = CurByte;                   /* group byte is in the source */
michael@0:             extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0:             if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == NULL)
michael@0:             {
michael@0:                 /* this is not a valid group byte - no converter*/
michael@0:                 *err = U_INVALID_CHAR_FOUND;
michael@0:             }      
michael@0:             else if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
michael@0:             {
michael@0: 
michael@0:                 CHECK_SOURCE_LIMIT(2);
michael@0: 
michael@0:                 /* check for LMBCS doubled-group-byte case */
michael@0:                 if (*args->source == group) {
michael@0:                     /* single byte: skip the repeated group byte, then convert the one byte after it */
michael@0:                     ++args->source;
michael@0:                     uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 1, FALSE);
michael@0:                     ++args->source;
michael@0:                 } else {
michael@0:                     /* double byte */
michael@0:                     uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 2, FALSE);
michael@0:                     args->source += 2;
michael@0:                 }
michael@0:             }
michael@0:             else {                                  /* single byte conversion */
michael@0:                 CHECK_SOURCE_LIMIT(1);
michael@0:                 CurByte = *(args->source)++;
michael@0:         
michael@0:                 if (CurByte >= ULMBCS_C1START)
michael@0:                 {
michael@0:                     uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
michael@0:                 }
michael@0:                 else
michael@0:                 {
michael@0:                     /* The non-optimizable oddballs where there is an explicit byte 
michael@0:                     * AND the second byte is not in the upper ascii range
michael@0:                     */
michael@0:                     char bytes[2];
michael@0: 
michael@0:                     extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0:                     cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];  
michael@0:         
michael@0:                     /* Lookup value must include opt group */
michael@0:                     bytes[0] = group;
michael@0:                     bytes[1] = CurByte;
michael@0:                     uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, bytes, 2, FALSE);
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0:         else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */
michael@0:         {
michael@0:             extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
michael@0:             group = extraInfo->OptGroup;
michael@0:             cnv = extraInfo->OptGrpConverter[group];
michael@0:             if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
michael@0:             {
michael@0:                 if (!ucnv_MBCSIsLeadByte(cnv, CurByte))
michael@0:                 {
michael@0:                     /* NOTE(review): with index 0 this check cannot fire while source <= sourceLimit */
michael@0:                     CHECK_SOURCE_LIMIT(0);
michael@0: 
michael@0:                     /* let the MBCS conversion consume CurByte again */
michael@0:                     uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, FALSE);
michael@0:                 }
michael@0:                 else
michael@0:                 {
michael@0:                     CHECK_SOURCE_LIMIT(1);
michael@0:                     /* let the MBCS conversion consume CurByte again */
michael@0:                     uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, FALSE);
michael@0:                     ++args->source;
michael@0:                 }
michael@0:             }
michael@0:             else                                   /* single byte conversion */
michael@0:             {
michael@0:                 uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     return uniChar;
michael@0: }
michael@0: 
michael@0: 
michael@0: /* The exported function that converts lmbcs to one or more
michael@0:    UChars - currently UTF-16
michael@0: 
michael@0:    Converts one LMBCS character per loop iteration with
michael@0:    _LMBCSGetNextUCharWorker() until the source or the target is used up or
michael@0:    an error is set. When args->offsets is set, the offset of each
michael@0:    character's first byte relative to the start of this call's source is
michael@0:    recorded for every UChar written.
michael@0: 
michael@0:    A character cut off by the end of the source is not reported as an
michael@0:    error: its bytes are kept in the converter's toUBytes/toULength and the
michael@0:    error code is reset, so the next call can complete it. The bytes of an
michael@0:    invalid or illegal character are stored the same way, with the error
michael@0:    left in *err.
michael@0: */
michael@0: static void 
michael@0: _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs*    args,
michael@0:                      UErrorCode*    err)
michael@0: {
michael@0:    char LMBCS [ULMBCS_CHARSIZE_MAX]; /* scratch buffer for reassembling a split character */
michael@0:    UChar uniChar;    /* one output UNICODE char */
michael@0:    const char * saveSource; /* beginning of current code point */
michael@0:    const char * pStartLMBCS = args->source;  /* beginning of whole string */
michael@0:    const char * errSource = NULL; /* pointer to actual input in case an error occurs */
michael@0:    int8_t savebytes = 0; /* number of bytes belonging to the current character */
michael@0: 
michael@0:    /* Process from source to limit, or until error */
michael@0:    while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
michael@0:    {
michael@0:       saveSource = args->source; /* beginning of current code point */
michael@0: 
michael@0:       if (args->converter->toULength) /* reassemble char from previous call */
michael@0:       {
michael@0:         const char *saveSourceLimit; 
michael@0:         size_t size_old = args->converter->toULength;
michael@0: 
michael@0:          /* limit from source is either remainder of temp buffer, or user limit on source */
michael@0:         size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
michael@0:         size_t size_new_maybe_2 = args->sourceLimit - args->source;
michael@0:         size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
michael@0:          
michael@0:       
michael@0:         /* scratch buffer = saved bytes followed by as much new input as fits */
michael@0:         uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
michael@0:         uprv_memcpy(LMBCS + size_old, args->source, size_new);
michael@0:         saveSourceLimit = args->sourceLimit;
michael@0:         /* point args at the scratch buffer for the duration of the worker call */
michael@0:         args->source = errSource = LMBCS;
michael@0:         args->sourceLimit = LMBCS+size_old+size_new;
michael@0:         savebytes = (int8_t)(size_old+size_new);
michael@0:         uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
michael@0:         /* translate the scratch-buffer position back into the real source, discounting the saved bytes */
michael@0:         args->source = saveSource + ((args->source - LMBCS) - size_old);
michael@0:         args->sourceLimit = saveSourceLimit;
michael@0: 
michael@0:         if (*err == U_TRUNCATED_CHAR_FOUND)
michael@0:         {
michael@0:             /* evil special case: source buffers so small a char spans more than 2 buffers */
michael@0:             args->converter->toULength = savebytes;
michael@0:             uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
michael@0:             args->source = args->sourceLimit;
michael@0:             *err = U_ZERO_ERROR;
michael@0:             return;
michael@0:          }
michael@0:          else
michael@0:          {
michael@0:             /* clear the partial-char marker */
michael@0:             args->converter->toULength = 0;
michael@0:          }
michael@0:       }
michael@0:       else
michael@0:       {
michael@0:          errSource = saveSource;
michael@0:          uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
michael@0:          savebytes = (int8_t)(args->source - saveSource);
michael@0:       }
michael@0:       if (U_SUCCESS(*err))
michael@0:       {
michael@0:          if (uniChar < 0xfffe)
michael@0:          {
michael@0:             *(args->target)++ = uniChar;
michael@0:             if(args->offsets)
michael@0:             {
michael@0:                *(args->offsets)++ = (int32_t)(saveSource - pStartLMBCS);
michael@0:             }
michael@0:          }
michael@0:          else if (uniChar == 0xfffe)
michael@0:          {
michael@0:             *err = U_INVALID_CHAR_FOUND;
michael@0:          }
michael@0:          else /* if (uniChar == 0xffff) */
michael@0:          {
michael@0:             *err = U_ILLEGAL_CHAR_FOUND;
michael@0:          }
michael@0:       }
michael@0:    }
michael@0:    /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
michael@0:    if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
michael@0:    {
michael@0:       *err = U_BUFFER_OVERFLOW_ERROR;
michael@0:    }
michael@0:    else if (U_FAILURE(*err)) 
michael@0:    {
michael@0:       /* If character incomplete or unmappable/illegal, store it in toUBytes[] */
michael@0:       args->converter->toULength = savebytes;
michael@0:       if (savebytes > 0) {
michael@0:          uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
michael@0:       }
michael@0:       if (*err == U_TRUNCATED_CHAR_FOUND) {
michael@0:          /* a truncated character is not an error here: the saved bytes are picked up by the next call */
michael@0:          *err = U_ZERO_ERROR;
michael@0:       }
michael@0:    }
michael@0: }
michael@0: 
michael@0: /* And now, the macroized declarations of data & functions:
michael@0:    one _LMBCSOpen<n>() function and one set of converter data for each of
michael@0:    the twelve optimization groups 1-6, 8, 11 and 16-19. The open functions
michael@0:    are stamped out first and the data declarations follow them. */
michael@0: DEFINE_LMBCS_OPEN(1)
michael@0: DEFINE_LMBCS_OPEN(2)
michael@0: DEFINE_LMBCS_OPEN(3)
michael@0: DEFINE_LMBCS_OPEN(4)
michael@0: DEFINE_LMBCS_OPEN(5)
michael@0: DEFINE_LMBCS_OPEN(6)
michael@0: DEFINE_LMBCS_OPEN(8)
michael@0: DEFINE_LMBCS_OPEN(11)
michael@0: DEFINE_LMBCS_OPEN(16)
michael@0: DEFINE_LMBCS_OPEN(17)
michael@0: DEFINE_LMBCS_OPEN(18)
michael@0: DEFINE_LMBCS_OPEN(19)
michael@0: 
michael@0: 
michael@0: DECLARE_LMBCS_DATA(1)
michael@0: DECLARE_LMBCS_DATA(2)
michael@0: DECLARE_LMBCS_DATA(3)
michael@0: DECLARE_LMBCS_DATA(4)
michael@0: DECLARE_LMBCS_DATA(5)
michael@0: DECLARE_LMBCS_DATA(6)
michael@0: DECLARE_LMBCS_DATA(8)
michael@0: DECLARE_LMBCS_DATA(11)
michael@0: DECLARE_LMBCS_DATA(16)
michael@0: DECLARE_LMBCS_DATA(17)
michael@0: DECLARE_LMBCS_DATA(18)
michael@0: DECLARE_LMBCS_DATA(19)
michael@0: 
michael@0: #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */