The Tor Browser: diff intl/icu/source/common/ucnv

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnv_lmb.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1377 @@
     1.4 +/*  
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2000-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnv_lmb.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   4 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2000feb09
    1.15 +*   created by: Brendan Murray
    1.16 +*   extensively hacked up by: Jim Snyder-Grant
    1.17 +*
    1.18 +* Modification History:
    1.19 +* 
    1.20 +*   Date        Name             Description
    1.21 +* 
    1.22 +*   06/20/2000  helena           OS/400 port changes; mostly typecast.
    1.23 +*   06/27/2000  Jim Snyder-Grant Deal with partial characters and small buffers.
    1.24 +*                                Add comments to document LMBCS format and implementation
    1.25 +*                                restructured order & breakdown of functions
    1.26 +*   06/28/2000  helena           Major rewrite for the callback API changes.
    1.27 +*/
    1.28 +
    1.29 +#include "unicode/utypes.h"
    1.30 +
    1.31 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
    1.32 +
    1.33 +#include "unicode/ucnv_err.h"
    1.34 +#include "unicode/ucnv.h"
    1.35 +#include "unicode/uset.h"
    1.36 +#include "cmemory.h"
    1.37 +#include "cstring.h"
    1.38 +#include "uassert.h"
    1.39 +#include "ucnv_imp.h"
    1.40 +#include "ucnv_bld.h"
    1.41 +#include "ucnv_cnv.h"
    1.42 +
    1.43 +#ifdef EBCDIC_RTL
    1.44 +    #include "ascii_a.h"
    1.45 +#endif
    1.46 +
    1.47 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) /* element count of a true array (not a pointer), cast to int32_t */
    1.48 +
    1.49 +/*
    1.50 +  LMBCS
    1.51 +
    1.52 +  (Lotus Multi-Byte Character Set)
    1.53 +
    1.54 +  LMBCS was invented in the late 1980's and is primarily used in Lotus Notes 
    1.55 +  databases and in Lotus 1-2-3 files. Programmers who work with the APIs 
    1.56 +  into these products will sometimes need to deal with strings in this format.
    1.57 +
    1.58 +  The code in this file provides an implementation for an ICU converter of 
    1.59 +  LMBCS to and from Unicode. 
    1.60 +
    1.61 +  Since the LMBCS character set is only sparsely documented in existing 
    1.62 +  printed or online material, we have added  extensive annotation to this 
    1.63 +  file to serve as a guide to understanding LMBCS. 
    1.64 +
    1.65 +  LMBCS was originally designed with these four sometimes-competing design goals:
    1.66 +
    1.67 +  -Provide encodings for the characters in 12 existing national standards
    1.68 +   (plus a few other characters)
    1.69 +  -Minimal memory footprint
    1.70 +  -Maximal speed of conversion into the existing national character sets
    1.71 +  -No need to track a changing state as you interpret a string.
    1.72 +
    1.73 +
    1.74 +  All of the national character sets LMBCS was trying to encode are 'ANSI'
    1.75 +  based, in that the bytes from 0x20 - 0x7F are almost exactly the 
    1.76 +  same common Latin unaccented characters and symbols in all character sets. 
    1.77 +
    1.78 +  So, in order to help meet the speed & memory design goals, the common ANSI 
    1.79 +  bytes from 0x20-0x7F are represented by the same single-byte values in LMBCS. 
    1.80 +
    1.81 +  The general LMBCS code unit is from 1-3 bytes. We can describe the 3 bytes as
    1.82 +  follows:
    1.83 +
    1.84 +  [G] D1 [D2]
    1.85 +
    1.86 +  That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2
    1.87 +  data bytes. The maximum size of a LMBCS character is 3 bytes:
    1.88 +*/
    1.89 +#define ULMBCS_CHARSIZE_MAX      3   /* [G] D1 [D2]: a group byte plus at most two data bytes */
    1.90 +/*
    1.91 +  The single-byte values from 0x20 to 0x7F are examples of single D1 bytes.
    1.92 +  We often have to figure out if byte values are below or above this range, 
    1.93 +  so we borrow the ANSI nomenclature 'C0' and 'C1' for the ranges of control 
    1.94 +  characters just below & just above the common lower-ANSI range */
    1.95 +#define ULMBCS_C0END           0x1F   /* last byte value below the 0x20-0x7F range */
    1.96 +#define ULMBCS_C1START         0x80   /* first byte value above the 0x20-0x7F range */
    1.97 +/*
    1.98 +  Since LMBCS always deals in byte units, we create a local type here for 
    1.99 +  the individual bytes that make up LMBCS code units:
   1.100 +
   1.101 +*/  
   1.102 +typedef uint8_t ulmbcs_byte_t;
   1.103 +
   1.104 +/* 
   1.105 +   Most of the values less than 0x20 are reserved in LMBCS to announce 
   1.106 +   which national character standard is being used for the 'D' bytes. 
   1.107 +   In the comments we show the common name and the IBM character-set ID
   1.108 +   for these character-set announcers:
   1.109 +*/
   1.110 +
   1.111 +#define ULMBCS_GRP_L1         0x01   /* Latin-1    :ibm-850  */
   1.112 +#define ULMBCS_GRP_GR         0x02   /* Greek      :ibm-851  */
   1.113 +#define ULMBCS_GRP_HE         0x03   /* Hebrew     :ibm-1255 */
   1.114 +#define ULMBCS_GRP_AR         0x04   /* Arabic     :ibm-1256 */
   1.115 +#define ULMBCS_GRP_RU         0x05   /* Cyrillic   :ibm-1251 */
   1.116 +#define ULMBCS_GRP_L2         0x06   /* Latin-2    :ibm-852  */
   1.117 +#define ULMBCS_GRP_TR         0x08   /* Turkish    :ibm-1254 */
   1.118 +#define ULMBCS_GRP_TH         0x0B   /* Thai       :ibm-874  */
   1.119 +#define ULMBCS_GRP_JA         0x10   /* Japanese   :ibm-943  */
   1.120 +#define ULMBCS_GRP_KO         0x11   /* Korean     :ibm-1261 */
   1.121 +#define ULMBCS_GRP_TW         0x12   /* Chinese TC :ibm-950  */
   1.122 +#define ULMBCS_GRP_CN         0x13   /* Chinese SC :ibm-1386 */
   1.123 +
   1.124 +/*
   1.125 +   So, the beginning of understanding LMBCS is that IF the first byte of a LMBCS 
   1.126 +   character is one of those 12 values, you can interpret the remaining bytes of 
   1.127 +   that character as coming from one of those character sets. Since the lower 
   1.128 +   ANSI bytes already are represented in single bytes, a character set 
   1.129 +   announcer is only needed to announce a character that starts with a byte of 
   1.130 +   0x80 or greater.
   1.131 +
   1.132 +   The character sets are arranged so that the single byte sets all appear 
   1.133 +   before the multi-byte character sets. When we need to tell whether a 
   1.134 +   group byte is for a single byte char set or not we use this define: */
   1.135 +
   1.136 +#define ULMBCS_DOUBLEOPTGROUP_START  0x10   /* group bytes >= this announce the multi-byte sets */
   1.137 +
   1.138 +/* 
   1.139 +However, to fully understand LMBCS, you must also understand a series of 
   1.140 +exceptions & optimizations made in service of the design goals. 
   1.141 +
   1.142 +First, those of you who are character set mavens may have noticed that
   1.143 +the 'double-byte' character sets are actually multi-byte character sets 
   1.144 +that can have 1 or two bytes, even in the upper-ascii range. To force
   1.145 +each group byte to introduce a fixed-width encoding (to make it faster to 
   1.146 +count characters), we use a convention of doubling up on the group byte 
   1.147 +to introduce any single-byte character > 0x80 in an otherwise double-byte
   1.148 +character set. So, for example, the LMBCS sequence x10 x10 xAE is the 
   1.149 +same as '0xAE' in the Japanese code page 943.
   1.150 +
   1.151 +Next, you will notice that the list of group bytes has some gaps. 
   1.152 +These are used in various ways.
   1.153 +
   1.154 +We reserve a few special single byte values for common control 
   1.155 +characters. These are in the same place as their ANSI equivalents for speed.
   1.156 +*/
   1.157 +                     
   1.158 +#define ULMBCS_HT    0x09   /* Fixed control char - Horizontal Tab */
   1.159 +#define ULMBCS_LF    0x0A   /* Fixed control char - Line Feed */
   1.160 +#define ULMBCS_CR    0x0D   /* Fixed control char - Carriage Return */
   1.161 +
   1.162 +/* Then, 1-2-3 reserved a special single-byte character to put at the 
   1.163 +beginning of internal 'system' range names: */
   1.164 +
   1.165 +#define ULMBCS_123SYSTEMRANGE  0x19   
   1.166 +
   1.167 +/* Then we needed a place to put all the other ANSI control characters 
   1.168 +that must be moved to different values because LMBCS reserves those 
   1.169 +values for other purposes. To represent the control characters, we start 
   1.170 +with a first byte of 0xF & add the control character value as the 
   1.171 +second byte */
   1.172 +#define ULMBCS_GRP_CTRL       0x0F   
   1.173 +
   1.174 +/* For the C0 controls (less than 0x20), we add 0x20 to preserve the 
   1.175 +useful doctrine that any byte less than 0x20 in a LMBCS char must be 
   1.176 +the first byte of a character:*/
   1.177 +#define ULMBCS_CTRLOFFSET      0x20   
   1.178 +
   1.179 +/* 
   1.180 +Where to put the characters that aren't part of any of the 12 national 
   1.181 +character sets? The first thing that was done, in the earlier years of 
   1.182 +LMBCS, was to use up the spaces of the form
   1.183 +
   1.184 +  [G] D1, 
   1.185 +  
   1.186 + where  'G' was one of the single-byte character groups, and
   1.187 + D1 was less than 0x80. These sequences are gathered together 
   1.188 + into a Lotus-invented doublebyte character set to represent a 
   1.189 + lot of stray values. Internally, in this implementation, we track this 
   1.190 + as group '0', as a place to tuck this exceptions list.*/
   1.191 +
   1.192 +#define ULMBCS_GRP_EXCEPT     0x00    
   1.193 +/*
   1.194 + Finally, as the durability and usefulness of UNICODE became clear, 
   1.195 + LOTUS added a new group 0x14 to hold Unicode values not otherwise 
   1.196 + represented in LMBCS: */
   1.197 +#define ULMBCS_GRP_UNICODE    0x14   
   1.198 +/* The two bytes appearing after a 0x14 are interpreted as UTF-16 BE
   1.199 +(Big-Endian) characters. The exception comes when the UTF-16 
   1.200 +representation would have a zero as the second byte. In that case,
   1.201 +'F6' is used in its place, and the bytes are swapped. (This prevents 
   1.202 +LMBCS from encoding any Unicode values of the form U+F6xx, but that's OK:
   1.203 +0xF6xx is in the middle of the Private Use Area.)*/
   1.204 +#define ULMBCS_UNICOMPATZERO   0xF6   
   1.205 +
   1.206 +/* It is also useful in our code to have a constant for the size of 
   1.207 +a LMBCS char that holds a literal Unicode value */
   1.208 +#define ULMBCS_UNICODE_SIZE      3    
   1.209 +
   1.210 +/* 
   1.211 +To squish the LMBCS representations down even further, and to make 
   1.212 +translations even faster, sometimes the group byte can be dropped 
   1.213 +from a LMBCS character. This is decided on a process-by-process basis. The 
   1.214 +group byte that is dropped is called the 'optimization group'.
   1.215 +
   1.216 +For Notes, the optimization group is always 0x1.*/
   1.217 +#define ULMBCS_DEFAULTOPTGROUP 0x1    
   1.218 +/* For 1-2-3 files, the optimization group is stored in the header of the 1-2-3 
   1.219 +file. 
   1.220 +
   1.221 + In any case, when using ICU, you pass in the 
   1.222 +optimization group as part of the name of the converter (LMBCS-1, LMBCS-2, 
   1.223 +etc.). Using plain 'LMBCS' as the name of the converter will give you 
   1.224 +LMBCS-1.
   1.225 +
   1.226 +
   1.227 +*** Implementation strategy ***
   1.228 +
   1.229 +
   1.230 +Because of the extensive use of other character sets, the LMBCS converter
   1.231 +keeps a mapping between optimization groups and IBM character sets, so that
   1.232 +ICU converters can be created and used as needed. */
   1.233 +
   1.234 +/* As the table below shows, even though any byte below 0x20 could be an 
   1.235 +optimization byte, only those at 0x13 or below can map to an actual converter. 
   1.236 +To limit some loops and searches, we define a value for that last group converter:*/
   1.237 +
   1.238 +#define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */
   1.239 +
   1.240 +static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
   1.241 +   /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
   1.242 +   /* 0x0001 */ "ibm-850",
   1.243 +   /* 0x0002 */ "ibm-851",
   1.244 +   /* 0x0003 */ "windows-1255",
   1.245 +   /* 0x0004 */ "windows-1256",
   1.246 +   /* 0x0005 */ "windows-1251",
   1.247 +   /* 0x0006 */ "ibm-852",
   1.248 +   /* 0x0007 */ NULL,      /* Unused */
   1.249 +   /* 0x0008 */ "windows-1254",
   1.250 +   /* 0x0009 */ NULL,      /* Control char HT */
   1.251 +   /* 0x000A */ NULL,      /* Control char LF */
   1.252 +   /* 0x000B */ "windows-874",
   1.253 +   /* 0x000C */ NULL,      /* Unused */
   1.254 +   /* 0x000D */ NULL,      /* Control char CR */
   1.255 +   /* 0x000E */ NULL,      /* Unused */
   1.256 +   /* 0x000F */ NULL,      /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
   1.257 +   /* 0x0010 */ "windows-932",
   1.258 +   /* 0x0011 */ "windows-949",
   1.259 +   /* 0x0012 */ "windows-950",
   1.260 +   /* 0x0013 */ "windows-936"
   1.261 +
   1.262 +   /* Group bytes above ULMBCS_GRP_LAST have no slot in this array at all (do not index with them):
   1.263 +   that includes the 0x0014 Unicode compatibility region and 0x0019, the 1-2-3 system range control char */      
   1.264 +};
   1.265 +
   1.266 +
   1.267 +/* That's approximately all the data that's needed for translating 
   1.268 +  LMBCS to Unicode. 
   1.269 +
   1.270 +
   1.271 +However, to translate Unicode to LMBCS, we need some more support.
   1.272 +
   1.273 +That's because there is often more than one possible mapping from a Unicode
   1.274 +code point back into LMBCS. The first thing we do is look up into a table
   1.275 +to figure out if there is more than one possible mapping. This table,
   1.276 +arranged by Unicode values (including ranges) either lists which group 
   1.277 +to use, or says that it could go into one or more of the SBCS sets, or
   1.278 +into one or more of the DBCS sets.  (If the character exists in both DBCS & 
   1.279 +SBCS, the table will place it in the SBCS sets, to make the LMBCS code point 
   1.280 +length as small as possible.) Here are the special markers we use to indicate
   1.281 +ambiguous mappings: */
   1.282 +
   1.283 +#define ULMBCS_AMBIGUOUS_SBCS   0x80   /* could fit in more than one 
   1.284 +                                          LMBCS sbcs native encoding 
   1.285 +                                          (example: most accented latin) */
   1.286 +#define ULMBCS_AMBIGUOUS_MBCS   0x81   /* could fit in more than one 
   1.287 +                                          LMBCS mbcs native encoding 
   1.288 +                                          (example: Unihan) */
   1.289 +#define ULMBCS_AMBIGUOUS_ALL   0x82    /* matches any group, sbcs or mbcs, in ULMBCS_AMBIGUOUS_MATCH */
   1.290 +/* True when ambiguity marker agroup admits group byte xgroup. NOTE(review): the expansion has no outer parentheses, so wrap it at the call site (e.g. before applying ! or &&). */
   1.291 +#define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \
   1.292 +                  ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \
   1.293 +                  (xgroup) < ULMBCS_DOUBLEOPTGROUP_START) || \
   1.294 +                  (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \
   1.295 +                  (xgroup) >= ULMBCS_DOUBLEOPTGROUP_START)) || \
   1.296 +                  ((agroup) == ULMBCS_AMBIGUOUS_ALL)
   1.297 +
   1.298 +
   1.299 +/* Unicode range -> LMBCS group (or AMBIGUOUS marker) table, listed in ascending order, followed by the code that scans it: */
   1.300 +
   1.301 +
   1.302 +static const struct _UniLMBCSGrpMap  
   1.303 +{
   1.304 +   const UChar uniStartRange;
   1.305 +   const UChar uniEndRange;
   1.306 +   const ulmbcs_byte_t  GrpType;
   1.307 +} UniLMBCSGrpMap[]
   1.308 +=
   1.309 +{
   1.310 +
   1.311 +    {0x0001, 0x001F,  ULMBCS_GRP_CTRL},
   1.312 +    {0x0080, 0x009F,  ULMBCS_GRP_CTRL},
   1.313 +    {0x00A0, 0x00A6,  ULMBCS_AMBIGUOUS_SBCS},
   1.314 +    {0x00A7, 0x00A8,  ULMBCS_AMBIGUOUS_ALL},
   1.315 +    {0x00A9, 0x00AF,  ULMBCS_AMBIGUOUS_SBCS},
   1.316 +    {0x00B0, 0x00B1,  ULMBCS_AMBIGUOUS_ALL},
   1.317 +    {0x00B2, 0x00B3,  ULMBCS_AMBIGUOUS_SBCS},
   1.318 +    {0x00B4, 0x00B4,  ULMBCS_AMBIGUOUS_ALL},
   1.319 +    {0x00B5, 0x00B5,  ULMBCS_AMBIGUOUS_SBCS},
   1.320 +    {0x00B6, 0x00B6,  ULMBCS_AMBIGUOUS_ALL},
   1.321 +    {0x00B7, 0x00D6,  ULMBCS_AMBIGUOUS_SBCS},
   1.322 +    {0x00D7, 0x00D7,  ULMBCS_AMBIGUOUS_ALL},
   1.323 +    {0x00D8, 0x00F6,  ULMBCS_AMBIGUOUS_SBCS},
   1.324 +    {0x00F7, 0x00F7,  ULMBCS_AMBIGUOUS_ALL},
   1.325 +    {0x00F8, 0x01CD,  ULMBCS_AMBIGUOUS_SBCS},
   1.326 +    {0x01CE, 0x01CE,  ULMBCS_GRP_TW },
   1.327 +    {0x01CF, 0x02B9,  ULMBCS_AMBIGUOUS_SBCS},
   1.328 +    {0x02BA, 0x02BA,  ULMBCS_GRP_CN},
   1.329 +    {0x02BC, 0x02C8,  ULMBCS_AMBIGUOUS_SBCS},
   1.330 +    {0x02C9, 0x02D0,  ULMBCS_AMBIGUOUS_MBCS},
   1.331 +    {0x02D8, 0x02DD,  ULMBCS_AMBIGUOUS_SBCS},
   1.332 +    {0x0384, 0x0390,  ULMBCS_AMBIGUOUS_SBCS},
   1.333 +    {0x0391, 0x03A9,  ULMBCS_AMBIGUOUS_ALL},
   1.334 +    {0x03AA, 0x03B0,  ULMBCS_AMBIGUOUS_SBCS},
   1.335 +    {0x03B1, 0x03C9,  ULMBCS_AMBIGUOUS_ALL},
   1.336 +    {0x03CA, 0x03CE,  ULMBCS_AMBIGUOUS_SBCS},
   1.337 +    {0x0400, 0x0400,  ULMBCS_GRP_RU},
   1.338 +    {0x0401, 0x0401,  ULMBCS_AMBIGUOUS_ALL},
   1.339 +    {0x0402, 0x040F,  ULMBCS_GRP_RU},
   1.340 +    {0x0410, 0x0431,  ULMBCS_AMBIGUOUS_ALL},
   1.341 +    {0x0432, 0x044E,  ULMBCS_GRP_RU},
   1.342 +    {0x044F, 0x044F,  ULMBCS_AMBIGUOUS_ALL},
   1.343 +    {0x0450, 0x0491,  ULMBCS_GRP_RU},
   1.344 +    {0x05B0, 0x05F2,  ULMBCS_GRP_HE},
   1.345 +    {0x060C, 0x06AF,  ULMBCS_GRP_AR},
   1.346 +    {0x0E01, 0x0E5B,  ULMBCS_GRP_TH},
   1.347 +    {0x200C, 0x200F,  ULMBCS_AMBIGUOUS_SBCS},
   1.348 +    {0x2010, 0x2010,  ULMBCS_AMBIGUOUS_MBCS},
   1.349 +    {0x2013, 0x2014,  ULMBCS_AMBIGUOUS_SBCS},
   1.350 +    {0x2015, 0x2015,  ULMBCS_AMBIGUOUS_MBCS},
   1.351 +    {0x2016, 0x2016,  ULMBCS_AMBIGUOUS_MBCS},
   1.352 +    {0x2017, 0x2017,  ULMBCS_AMBIGUOUS_SBCS},
   1.353 +    {0x2018, 0x2019,  ULMBCS_AMBIGUOUS_ALL},
   1.354 +    {0x201A, 0x201B,  ULMBCS_AMBIGUOUS_SBCS},
   1.355 +    {0x201C, 0x201D,  ULMBCS_AMBIGUOUS_ALL},
   1.356 +    {0x201E, 0x201F,  ULMBCS_AMBIGUOUS_SBCS},
   1.357 +    {0x2020, 0x2021,  ULMBCS_AMBIGUOUS_ALL},
   1.358 +    {0x2022, 0x2024,  ULMBCS_AMBIGUOUS_SBCS},
   1.359 +    {0x2025, 0x2025,  ULMBCS_AMBIGUOUS_MBCS},
   1.360 +    {0x2026, 0x2026,  ULMBCS_AMBIGUOUS_ALL},
   1.361 +    {0x2027, 0x2027,  ULMBCS_GRP_TW},
   1.362 +    {0x2030, 0x2030,  ULMBCS_AMBIGUOUS_ALL},
   1.363 +    {0x2031, 0x2031,  ULMBCS_AMBIGUOUS_SBCS},
   1.364 +    {0x2032, 0x2033,  ULMBCS_AMBIGUOUS_MBCS},
   1.365 +    {0x2035, 0x2035,  ULMBCS_AMBIGUOUS_MBCS},
   1.366 +    {0x2039, 0x203A,  ULMBCS_AMBIGUOUS_SBCS},
   1.367 +    {0x203B, 0x203B,  ULMBCS_AMBIGUOUS_MBCS},
   1.368 +    {0x203C, 0x203C,  ULMBCS_GRP_EXCEPT},
   1.369 +    {0x2074, 0x2074,  ULMBCS_GRP_KO},
   1.370 +    {0x207F, 0x207F,  ULMBCS_GRP_EXCEPT},
   1.371 +    {0x2081, 0x2084,  ULMBCS_GRP_KO},
   1.372 +    {0x20A4, 0x20AC,  ULMBCS_AMBIGUOUS_SBCS},
   1.373 +    {0x2103, 0x2109,  ULMBCS_AMBIGUOUS_MBCS},
   1.374 +    {0x2111, 0x2120,  ULMBCS_AMBIGUOUS_SBCS},
   1.375 +    /*zhujin: upgrade, for regression test, spr HKIA4YHTSU*/
   1.376 +    {0x2121, 0x2121,  ULMBCS_AMBIGUOUS_MBCS},
   1.377 +    {0x2122, 0x2126,  ULMBCS_AMBIGUOUS_SBCS},
   1.378 +    {0x212B, 0x212B,  ULMBCS_AMBIGUOUS_MBCS},
   1.379 +    {0x2135, 0x2135,  ULMBCS_AMBIGUOUS_SBCS},
   1.380 +    {0x2153, 0x2154,  ULMBCS_GRP_KO},
   1.381 +    {0x215B, 0x215E,  ULMBCS_GRP_EXCEPT},
   1.382 +    {0x2160, 0x2179,  ULMBCS_AMBIGUOUS_MBCS},
   1.383 +    {0x2190, 0x2193,  ULMBCS_AMBIGUOUS_ALL},
   1.384 +    {0x2194, 0x2195,  ULMBCS_GRP_EXCEPT},
   1.385 +    {0x2196, 0x2199,  ULMBCS_AMBIGUOUS_MBCS},
   1.386 +    {0x21A8, 0x21A8,  ULMBCS_GRP_EXCEPT},
   1.387 +    {0x21B8, 0x21B9,  ULMBCS_GRP_CN},
   1.388 +    {0x21D0, 0x21D1,  ULMBCS_GRP_EXCEPT},
   1.389 +    {0x21D2, 0x21D2,  ULMBCS_AMBIGUOUS_MBCS},
   1.390 +    {0x21D3, 0x21D3,  ULMBCS_GRP_EXCEPT},
   1.391 +    {0x21D4, 0x21D4,  ULMBCS_AMBIGUOUS_MBCS},
   1.392 +    {0x21D5, 0x21D5,  ULMBCS_GRP_EXCEPT},
   1.393 +    {0x21E7, 0x21E7,  ULMBCS_GRP_CN},
   1.394 +    {0x2200, 0x2200,  ULMBCS_AMBIGUOUS_MBCS},
   1.395 +    {0x2201, 0x2201,  ULMBCS_GRP_EXCEPT},
   1.396 +    {0x2202, 0x2202,  ULMBCS_AMBIGUOUS_MBCS},
   1.397 +    {0x2203, 0x2203,  ULMBCS_AMBIGUOUS_MBCS},
   1.398 +    {0x2204, 0x2206,  ULMBCS_GRP_EXCEPT},
   1.399 +    {0x2207, 0x2208,  ULMBCS_AMBIGUOUS_MBCS},
   1.400 +    {0x2209, 0x220A,  ULMBCS_GRP_EXCEPT},
   1.401 +    {0x220B, 0x220B,  ULMBCS_AMBIGUOUS_MBCS},
   1.402 +    {0x220F, 0x2215,  ULMBCS_AMBIGUOUS_MBCS},
   1.403 +    {0x2219, 0x2219,  ULMBCS_GRP_EXCEPT},
   1.404 +    {0x221A, 0x221A,  ULMBCS_AMBIGUOUS_MBCS},
   1.405 +    {0x221B, 0x221C,  ULMBCS_GRP_EXCEPT},
   1.406 +    {0x221D, 0x221E,  ULMBCS_AMBIGUOUS_MBCS},
   1.407 +    {0x221F, 0x221F,  ULMBCS_GRP_EXCEPT},
   1.408 +    {0x2220, 0x2220,  ULMBCS_AMBIGUOUS_MBCS},
   1.409 +    {0x2223, 0x222A,  ULMBCS_AMBIGUOUS_MBCS},
   1.410 +    {0x222B, 0x223D,  ULMBCS_AMBIGUOUS_MBCS},
   1.411 +    {0x2245, 0x2248,  ULMBCS_GRP_EXCEPT},
   1.412 +    {0x224C, 0x224C,  ULMBCS_GRP_TW},
   1.413 +    {0x2252, 0x2252,  ULMBCS_AMBIGUOUS_MBCS},
   1.414 +    {0x2260, 0x2261,  ULMBCS_AMBIGUOUS_MBCS},
   1.415 +    {0x2262, 0x2265,  ULMBCS_GRP_EXCEPT},
   1.416 +    {0x2266, 0x226F,  ULMBCS_AMBIGUOUS_MBCS},
   1.417 +    {0x2282, 0x2283,  ULMBCS_AMBIGUOUS_MBCS},
   1.418 +    {0x2284, 0x2285,  ULMBCS_GRP_EXCEPT},
   1.419 +    {0x2286, 0x2287,  ULMBCS_AMBIGUOUS_MBCS},
   1.420 +    {0x2288, 0x2297,  ULMBCS_GRP_EXCEPT},
   1.421 +    {0x2299, 0x22BF,  ULMBCS_AMBIGUOUS_MBCS},
   1.422 +    {0x22C0, 0x22C0,  ULMBCS_GRP_EXCEPT},
   1.423 +    {0x2310, 0x2310,  ULMBCS_GRP_EXCEPT},
   1.424 +    {0x2312, 0x2312,  ULMBCS_AMBIGUOUS_MBCS},
   1.425 +    {0x2318, 0x2321,  ULMBCS_GRP_EXCEPT},
   1.426 +    {0x2318, 0x2321,  ULMBCS_GRP_CN}, /* NOTE(review): same range as the entry above; FindLMBCSUniRange stops at the first entry whose end is >= the char, so this one is never selected */
   1.427 +    {0x2460, 0x24E9,  ULMBCS_AMBIGUOUS_MBCS},
   1.428 +    {0x2500, 0x2500,  ULMBCS_AMBIGUOUS_SBCS},
   1.429 +    {0x2501, 0x2501,  ULMBCS_AMBIGUOUS_MBCS},
   1.430 +    {0x2502, 0x2502,  ULMBCS_AMBIGUOUS_ALL},
   1.431 +    {0x2503, 0x2503,  ULMBCS_AMBIGUOUS_MBCS},
   1.432 +    {0x2504, 0x2505,  ULMBCS_GRP_TW},
   1.433 +    {0x2506, 0x2665,  ULMBCS_AMBIGUOUS_ALL},
   1.434 +    {0x2666, 0x2666,  ULMBCS_GRP_EXCEPT},
   1.435 +    {0x2667, 0x2669,  ULMBCS_AMBIGUOUS_SBCS},
   1.436 +    {0x266A, 0x266A,  ULMBCS_AMBIGUOUS_ALL},
   1.437 +    {0x266B, 0x266C,  ULMBCS_AMBIGUOUS_SBCS},
   1.438 +    {0x266D, 0x266D,  ULMBCS_AMBIGUOUS_MBCS},
   1.439 +    {0x266E, 0x266E,  ULMBCS_AMBIGUOUS_SBCS},
   1.440 +    {0x266F, 0x266F,  ULMBCS_GRP_JA},
   1.441 +    {0x2670, 0x2E7F,  ULMBCS_AMBIGUOUS_SBCS},
   1.442 +    {0x2E80, 0xF861,  ULMBCS_AMBIGUOUS_MBCS},
   1.443 +    {0xF862, 0xF8FF,  ULMBCS_GRP_EXCEPT},
   1.444 +    {0xF900, 0xFA2D,  ULMBCS_AMBIGUOUS_MBCS},
   1.445 +    {0xFB00, 0xFEFF,  ULMBCS_AMBIGUOUS_SBCS},
   1.446 +    {0xFF01, 0xFFEE,  ULMBCS_AMBIGUOUS_MBCS},
   1.447 +    {0xFFFF, 0xFFFF,  ULMBCS_GRP_UNICODE} /* last entry: its 0xFFFF end is the upper bound for FindLMBCSUniRange's scan */
   1.448 +};
   1.449 +/* Return the GrpType of the UniLMBCSGrpMap range containing uniChar, or ULMBCS_GRP_UNICODE when uniChar falls in a gap between ranges. */
   1.450 +static ulmbcs_byte_t 
   1.451 +FindLMBCSUniRange(UChar uniChar)
   1.452 +{
   1.453 +   const struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap;
   1.454 +
   1.455 +   while (uniChar > pTable->uniEndRange) /* skip every range that ends before uniChar; no explicit bound: relies on the table's last entry ending at 0xFFFF (assumes UChar is 16 bits - TODO confirm) */
   1.456 +   {
   1.457 +      pTable++;
   1.458 +   }
   1.459 +
   1.460 +   if (uniChar >= pTable->uniStartRange) /* first range whose end is >= uniChar: a hit only if it also starts at or before uniChar */
   1.461 +   {
   1.462 +      return pTable->GrpType;
   1.463 +   }
   1.464 +   return ULMBCS_GRP_UNICODE; /* in a gap between table ranges */
   1.465 +}
   1.466 +
   1.467 +/* 
   1.468 +We also ask the creator of a converter to send in a preferred locale 
   1.469 +that we can use in resolving ambiguous mappings. They send the locale
   1.470 +in as a string, and we map it, if possible, to one of the 
   1.471 +LMBCS groups. We use this table, and the associated code, to 
   1.472 +do the lookup: */
   1.473 +
   1.474 +/**************************************************
   1.475 +  This table maps locale ID's to LMBCS opt groups.
   1.476 +  The default return is group 0x01. Note that for
   1.477 +  performance reasons, the table is sorted in
   1.478 +  increasing alphabetic order, with the notable
   1.479 +  exception of zhTW. This is to force the check
   1.480 +  for Traditional Chinese before dropping back to
   1.481 +  Simplified.
   1.482 +
   1.483 +  Note too that the Latin-1 groups have been
   1.484 +  commented out because it's the default, and
   1.485 +  this shortens the table, allowing a serial
   1.486 +  search to go quickly.
   1.487 + *************************************************/
   1.488 +
   1.489 +static const struct _LocaleLMBCSGrpMap
   1.490 +{
   1.491 +   const char    *LocaleID;
   1.492 +   const ulmbcs_byte_t OptGroup;
   1.493 +} LocaleLMBCSGrpMap[] =
   1.494 +{
   1.495 +    {"ar", ULMBCS_GRP_AR},
   1.496 +    {"be", ULMBCS_GRP_RU},
   1.497 +    {"bg", ULMBCS_GRP_L2},
   1.498 +   /* {"ca", ULMBCS_GRP_L1}, */
   1.499 +    {"cs", ULMBCS_GRP_L2},
   1.500 +   /* {"da", ULMBCS_GRP_L1}, */
   1.501 +   /* {"de", ULMBCS_GRP_L1}, */
   1.502 +    {"el", ULMBCS_GRP_GR},
   1.503 +   /* {"en", ULMBCS_GRP_L1}, */
   1.504 +   /* {"es", ULMBCS_GRP_L1}, */
   1.505 +   /* {"et", ULMBCS_GRP_L1}, */
   1.506 +   /* {"fi", ULMBCS_GRP_L1}, */
   1.507 +   /* {"fr", ULMBCS_GRP_L1}, */
   1.508 +    {"he", ULMBCS_GRP_HE},
   1.509 +    {"hu", ULMBCS_GRP_L2},
   1.510 +   /* {"is", ULMBCS_GRP_L1}, */
   1.511 +   /* {"it", ULMBCS_GRP_L1}, */
   1.512 +    {"iw", ULMBCS_GRP_HE},
   1.513 +    {"ja", ULMBCS_GRP_JA},
   1.514 +    {"ko", ULMBCS_GRP_KO},
   1.515 +   /* {"lt", ULMBCS_GRP_L1}, */
   1.516 +   /* {"lv", ULMBCS_GRP_L1}, */
   1.517 +    {"mk", ULMBCS_GRP_RU},
   1.518 +   /* {"nl", ULMBCS_GRP_L1}, */
   1.519 +   /* {"no", ULMBCS_GRP_L1}, */
   1.520 +    {"pl", ULMBCS_GRP_L2},
   1.521 +   /* {"pt", ULMBCS_GRP_L1}, */
   1.522 +    {"ro", ULMBCS_GRP_L2},
   1.523 +    {"ru", ULMBCS_GRP_RU},
   1.524 +    {"sh", ULMBCS_GRP_L2},
   1.525 +    {"sk", ULMBCS_GRP_L2},
   1.526 +    {"sl", ULMBCS_GRP_L2},
   1.527 +    {"sq", ULMBCS_GRP_L2},
   1.528 +    {"sr", ULMBCS_GRP_RU},
   1.529 +   /* {"sv", ULMBCS_GRP_L1}, */
   1.530 +    {"th", ULMBCS_GRP_TH},
   1.531 +    {"tr", ULMBCS_GRP_TR},
   1.532 +    {"uk", ULMBCS_GRP_RU},
   1.533 +   /* {"vi", ULMBCS_GRP_L1}, */
   1.534 +    {"zhTW", ULMBCS_GRP_TW}, /* NOTE(review): an ID spelled "zh_TW" does not prefix-match "zhTW" and would fall through to "zh" - confirm the form callers pass */
   1.535 +    {"zh", ULMBCS_GRP_CN},
   1.536 +    {NULL, ULMBCS_GRP_L1} /* terminator: FindLMBCSLocale stops at the NULL LocaleID */
   1.537 +};
   1.538 +
   1.539 +/* Map a locale ID string to an LMBCS group byte by matching it against the entries of LocaleLMBCSGrpMap; ULMBCS_GRP_L1 when nothing matches. */
   1.540 +static ulmbcs_byte_t 
   1.541 +FindLMBCSLocale(const char *LocaleID)
   1.542 +{
   1.543 +   const struct _LocaleLMBCSGrpMap *pTable = LocaleLMBCSGrpMap;
   1.544 +
   1.545 +   if ((!LocaleID) || (!*LocaleID)) 
   1.546 +   {
   1.547 +      return 0; /* NULL or empty locale: returns 0 (the value of ULMBCS_GRP_EXCEPT), not the ULMBCS_GRP_L1 default used below */
   1.548 +   }
   1.549 +
   1.550 +   while (pTable->LocaleID)
   1.551 +   {
   1.552 +      if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */
   1.553 +      {
   1.554 +         /* First char matches - compare only the first strlen(entry) chars, i.e. a prefix match (assuming uprv_strncmp mirrors strncmp). NOTE(review): bare strlen is used here next to the uprv_ wrapper */
   1.555 +         if (uprv_strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
   1.556 +            return pTable->OptGroup;
   1.557 +      }
   1.558 +      else
   1.559 +      if (*pTable->LocaleID > *LocaleID) /* Sorted alphabetically - exit */
   1.560 +         break;
   1.561 +      pTable++;
   1.562 +   }
   1.563 +   return ULMBCS_GRP_L1; /* no entry matched: Latin-1 default */
   1.564 +}
   1.565 +
   1.566 +
   1.567 +/* 
   1.568 +  Before we get to the main body of code, here's how we hook up to the rest 
   1.569 +  of ICU. ICU converters are required to define a structure that includes 
   1.570 +  some function pointers, and some common data, in the style of a C++
   1.571 +  vtable. There is also room in there for converter-specific data. LMBCS
   1.572 +  uses that converter-specific data to keep track of the 12 subconverters
   1.573 +  we use, the optimization group, and the group (if any) that matches the 
   1.574 +  locale. We have one structure instantiated for each of the 12 possible
   1.575 +  optimization groups. To avoid typos & to avoid boring the reader, we 
   1.576 +  put the declarations of these structures and functions into macros. To see 
   1.577 +  the definitions of these structures, see unicode\ucnv_bld.h
   1.578 +*/
   1.579 +
   1.580 +typedef struct
   1.581 +  {
   1.582 +    UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1];    /* shared data per group byte; stays NULL where OptGroupByteToCPName has no name */
   1.583 +    uint8_t    OptGroup;                  /* default Opt. grp. for this LMBCS session (passed to _LMBCSOpenWorker) */
   1.584 +    uint8_t    localeConverterIndex;      /* group byte FindLMBCSLocale derived from the requested locale */
   1.585 +  }
   1.586 +UConverterDataLMBCS;
   1.587 +
   1.588 +static void _LMBCSClose(UConverter * _this); /* forward declaration: _LMBCSOpenWorker calls it before its definition */
   1.589 +/* For optimization group n, stamps out _LMBCSImpl##n (function table using _LMBCSOpen##n, _LMBCSClose, the to/from-Unicode functions and _LMBCSSafeClone), _LMBCSStaticData##n (named "LMBCS-" #n) and the non-static _LMBCSData##n that points at both. */
   1.590 +#define DECLARE_LMBCS_DATA(n) \
   1.591 +static const UConverterImpl _LMBCSImpl##n={\
   1.592 +    UCNV_LMBCS_##n,\
   1.593 +    NULL,NULL,\
   1.594 +    _LMBCSOpen##n,\
   1.595 +    _LMBCSClose,\
   1.596 +    NULL,\
   1.597 +    _LMBCSToUnicodeWithOffsets,\
   1.598 +    _LMBCSToUnicodeWithOffsets,\
   1.599 +    _LMBCSFromUnicode,\
   1.600 +    _LMBCSFromUnicode,\
   1.601 +    NULL,\
   1.602 +    NULL,\
   1.603 +    NULL,\
   1.604 +    NULL,\
   1.605 +    _LMBCSSafeClone,\
   1.606 +    ucnv_getCompleteUnicodeSet\
   1.607 +};\
   1.608 +static const UConverterStaticData _LMBCSStaticData##n={\
   1.609 +  sizeof(UConverterStaticData),\
   1.610 + "LMBCS-"  #n,\
   1.611 +    0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\
   1.612 +    { 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
   1.613 +};\
   1.614 +const UConverterSharedData _LMBCSData##n={\
   1.615 +    sizeof(UConverterSharedData), ~((uint32_t) 0),\
   1.616 +    NULL, NULL, &_LMBCSStaticData##n, FALSE, &_LMBCSImpl##n, \
   1.617 +    0 \
   1.618 +};
   1.619 +
   1.620 + /* The only function we needed to duplicate 12 times was the 'open'
   1.621 +function, which does the same thing each time except for setting a different
   1.622 +optimization group. So, we put the common stuff into a worker function, and 
   1.623 +this macro stamps out _LMBCSOpen##n, which just forwards to it with n as OptGroup:*/
   1.624 +#define DEFINE_LMBCS_OPEN(n) \
   1.625 +static void \
   1.626 +   _LMBCSOpen##n(UConverter* _this, UConverterLoadArgs* pArgs, UErrorCode* err) \
   1.627 +{ _LMBCSOpenWorker(_this, pArgs, err, n); }
   1.628 +
   1.629 +
   1.630 +
   1.631 +/* Open worker shared by every _LMBCSOpen##n: allocates the LMBCS extra data, loads one subconverter per named group, then records OptGroup and the locale's group. Sets *err to U_MEMORY_ALLOCATION_ERROR if the allocation fails. */
   1.632 +static void 
   1.633 +_LMBCSOpenWorker(UConverter*  _this,
   1.634 +                 UConverterLoadArgs *pArgs,
   1.635 +                 UErrorCode*  err,
   1.636 +                 ulmbcs_byte_t OptGroup)
   1.637 +{
   1.638 +    UConverterDataLMBCS * extraInfo = _this->extraInfo =
   1.639 +        (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
   1.640 +    if(extraInfo != NULL)
   1.641 +    {
   1.642 +        UConverterNamePieces stackPieces;
   1.643 +        UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
   1.644 +        ulmbcs_byte_t i;
   1.645 +
   1.646 +        uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS)); /* zero first, so slots that are never loaded read as NULL in _LMBCSClose */
   1.647 +
   1.648 +        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
   1.649 +
   1.650 +        for (i=0; i <= ULMBCS_GRP_LAST && U_SUCCESS(*err); i++)         /* stops at the first load failure */
   1.651 +        {
   1.652 +            if(OptGroupByteToCPName[i] != NULL) { /* group bytes without a code page name are skipped */
   1.653 +                extraInfo->OptGrpConverter[i] = ucnv_loadSharedData(OptGroupByteToCPName[i], &stackPieces, &stackArgs, err);
   1.654 +            }
   1.655 +        }
   1.656 +
   1.657 +        if(U_FAILURE(*err) || pArgs->onlyTestIsLoadable) { /* on failure, or when only probing loadability, undo everything via _LMBCSClose */
   1.658 +            _LMBCSClose(_this);
   1.659 +            return;
   1.660 +        }
   1.661 +        extraInfo->OptGroup = OptGroup;
   1.662 +        extraInfo->localeConverterIndex = FindLMBCSLocale(pArgs->locale); /* 0 when the locale is NULL or empty */
   1.663 +    }
   1.664 +    else
   1.665 +    {
   1.666 +        *err = U_MEMORY_ALLOCATION_ERROR;
   1.667 +    }
   1.668 +}
   1.669 +
   1.670 +static void 
   1.671 +_LMBCSClose(UConverter *   _this) 
   1.672 +{
   1.673 +    if (_this->extraInfo != NULL)
   1.674 +    {
   1.675 +        ulmbcs_byte_t Ix;
   1.676 +        UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
   1.677 +
   1.678 +        for (Ix=0; Ix <= ULMBCS_GRP_LAST; Ix++)
   1.679 +        {
   1.680 +           if (extraInfo->OptGrpConverter[Ix] != NULL)
   1.681 +              ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[Ix]);
   1.682 +        }
   1.683 +        if (!_this->isExtraLocal) {
   1.684 +            uprv_free (_this->extraInfo);
   1.685 +            _this->extraInfo = NULL;
   1.686 +        }
   1.687 +    }
   1.688 +}
   1.689 +
   1.690 +typedef struct LMBCSClone {
   1.691 +    UConverter cnv;
   1.692 +    UConverterDataLMBCS lmbcs;
   1.693 +} LMBCSClone;
   1.694 +
   1.695 +static UConverter * 
   1.696 +_LMBCSSafeClone(const UConverter *cnv, 
   1.697 +                void *stackBuffer, 
   1.698 +                int32_t *pBufferSize, 
   1.699 +                UErrorCode *status) {
   1.700 +    LMBCSClone *newLMBCS;
   1.701 +    UConverterDataLMBCS *extraInfo;
   1.702 +    int32_t i;
   1.703 +
   1.704 +    if(*pBufferSize<=0) {
   1.705 +        *pBufferSize=(int32_t)sizeof(LMBCSClone);
   1.706 +        return NULL;
   1.707 +    }
   1.708 +
   1.709 +    extraInfo=(UConverterDataLMBCS *)cnv->extraInfo;
   1.710 +    newLMBCS=(LMBCSClone *)stackBuffer;
   1.711 +
   1.712 +    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
   1.713 +
   1.714 +    uprv_memcpy(&newLMBCS->lmbcs, extraInfo, sizeof(UConverterDataLMBCS));
   1.715 +
   1.716 +    /* share the subconverters */
   1.717 +    for(i = 0; i <= ULMBCS_GRP_LAST; ++i) {
   1.718 +        if(extraInfo->OptGrpConverter[i] != NULL) {
   1.719 +            ucnv_incrementRefCount(extraInfo->OptGrpConverter[i]);
   1.720 +        }
   1.721 +    }
   1.722 +
   1.723 +    newLMBCS->cnv.extraInfo = &newLMBCS->lmbcs;
   1.724 +    newLMBCS->cnv.isExtraLocal = TRUE;
   1.725 +    return &newLMBCS->cnv;
   1.726 +}
   1.727 +
   1.728 +/*
   1.729 + * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
   1.730 + * which added all code points except for U+F6xx
   1.731 + * because those cannot be represented in the Unicode group.
   1.732 + * However, it turns out that windows-950 has roundtrips for all of U+F6xx
   1.733 + * which means that LMBCS can convert all Unicode code points after all.
   1.734 + * We now simply use ucnv_getCompleteUnicodeSet().
   1.735 + *
   1.736 + * This may need to be looked at again as Lotus uses _LMBCSGetUnicodeSet(). (091216)
   1.737 + */
   1.738 +
   1.739 +/* 
   1.740 +   Here's the basic helper function that we use when converting from
   1.741 +   Unicode to LMBCS, and we suspect that a Unicode character will fit into 
   1.742 +   one of the 12 groups. The return value is the number of bytes written 
   1.743 +   starting at pStartLMBCS (if any).
   1.744 +*/
   1.745 +
   1.746 +static size_t
   1.747 +LMBCSConversionWorker (
   1.748 +   UConverterDataLMBCS * extraInfo,    /* subconverters, opt & locale groups */
   1.749 +   ulmbcs_byte_t group,                /* The group to try */
   1.750 +   ulmbcs_byte_t  * pStartLMBCS,              /* where to put the results */
   1.751 +   UChar * pUniChar,                   /* The input unicode character */
   1.752 +   ulmbcs_byte_t * lastConverterIndex, /* output: track last successful group used */
   1.753 +   UBool * groups_tried                /* output: track any unsuccessful groups */
   1.754 +)   
   1.755 +{
   1.756 +   ulmbcs_byte_t  * pLMBCS = pStartLMBCS;
   1.757 +   UConverterSharedData * xcnv = extraInfo->OptGrpConverter[group];
   1.758 +
   1.759 +   int bytesConverted;
   1.760 +   uint32_t value;
   1.761 +   ulmbcs_byte_t firstByte;
   1.762 +
   1.763 +   U_ASSERT(xcnv);
   1.764 +   U_ASSERT(group<ULMBCS_GRP_UNICODE);
   1.765 +
   1.766 +   bytesConverted = ucnv_MBCSFromUChar32(xcnv, *pUniChar, &value, FALSE);
   1.767 +
   1.768 +   /* get the first result byte */
   1.769 +   if(bytesConverted > 0) {
   1.770 +      firstByte = (ulmbcs_byte_t)(value >> ((bytesConverted - 1) * 8));
   1.771 +   } else {
   1.772 +      /* most common failure mode is an unassigned character */
   1.773 +      groups_tried[group] = TRUE;
   1.774 +      return 0;
   1.775 +   }
   1.776 +
   1.777 +   *lastConverterIndex = group;
   1.778 +
   1.779 +   /* All initial byte values in lower ascii range should have been caught by now,
   1.780 +      except with the exception group.
   1.781 +    */
   1.782 +   U_ASSERT((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
   1.783 +   
   1.784 +   /* use converted data: first write 0, 1 or two group bytes */
   1.785 +   if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
   1.786 +   {
   1.787 +      *pLMBCS++ = group;
   1.788 +      if (bytesConverted == 1 && group >= ULMBCS_DOUBLEOPTGROUP_START)
   1.789 +      {
   1.790 +         *pLMBCS++ = group;
   1.791 +      }
   1.792 +   }
   1.793 +
   1.794 +  /* don't emit control chars */
   1.795 +   if ( bytesConverted == 1 && firstByte < 0x20 )
   1.796 +      return 0;
   1.797 +
   1.798 +
   1.799 +   /* then move over the converted data */
   1.800 +   switch(bytesConverted)
   1.801 +   {
   1.802 +   case 4:
   1.803 +      *pLMBCS++ = (ulmbcs_byte_t)(value >> 24);
   1.804 +   case 3: /*fall through*/
   1.805 +      *pLMBCS++ = (ulmbcs_byte_t)(value >> 16);
   1.806 +   case 2: /*fall through*/
   1.807 +      *pLMBCS++ = (ulmbcs_byte_t)(value >> 8);
   1.808 +   case 1: /*fall through*/
   1.809 +      *pLMBCS++ = (ulmbcs_byte_t)value;
   1.810 +   default:
   1.811 +      /* will never occur */
   1.812 +      break;
   1.813 +   }
   1.814 +
   1.815 +   return (pLMBCS - pStartLMBCS);
   1.816 +}
   1.817 +
   1.818 +
   1.819 +/* This is a much simpler version of above, when we 
   1.820 +know we are writing LMBCS using the Unicode group
   1.821 +*/
   1.822 +static size_t 
   1.823 +LMBCSConvertUni(ulmbcs_byte_t * pLMBCS, UChar uniChar)  
   1.824 +{
   1.825 +     /* encode into LMBCS Unicode range */
   1.826 +   uint8_t LowCh =   (uint8_t)(uniChar & 0x00FF);
   1.827 +   uint8_t HighCh  = (uint8_t)(uniChar >> 8);
   1.828 +
   1.829 +   *pLMBCS++ = ULMBCS_GRP_UNICODE;
   1.830 +
   1.831 +   if (LowCh == 0)
   1.832 +   {
   1.833 +      *pLMBCS++ = ULMBCS_UNICOMPATZERO;
   1.834 +      *pLMBCS++ = HighCh;
   1.835 +   }
   1.836 +   else
   1.837 +   {
   1.838 +      *pLMBCS++ = HighCh;
   1.839 +      *pLMBCS++ = LowCh;
   1.840 +   }
   1.841 +   return ULMBCS_UNICODE_SIZE;
   1.842 +}
   1.843 +
   1.844 +
   1.845 +
   1.846 +/* The main Unicode to LMBCS conversion function */
   1.847 +static void 
   1.848 +_LMBCSFromUnicode(UConverterFromUnicodeArgs*     args,
   1.849 +                  UErrorCode*     err)
   1.850 +{
   1.851 +   ulmbcs_byte_t lastConverterIndex = 0;
   1.852 +   UChar uniChar;
   1.853 +   ulmbcs_byte_t  LMBCS[ULMBCS_CHARSIZE_MAX];
   1.854 +   ulmbcs_byte_t  * pLMBCS;
   1.855 +   int32_t bytes_written;
   1.856 +   UBool groups_tried[ULMBCS_GRP_LAST+1];
   1.857 +   UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
   1.858 +   int sourceIndex = 0; 
   1.859 +
   1.860 +   /* Basic strategy: attempt to fill in local LMBCS 1-char buffer.(LMBCS)
   1.861 +      If that succeeds, see if it will all fit into the target & copy it over 
   1.862 +      if it does.
   1.863 +
   1.864 +      We try conversions in the following order:
   1.865 +
   1.866 +      1. Single-byte ascii & special fixed control chars (&null)
   1.867 +      2. Look up group in table & try that (could be 
   1.868 +            A) Unicode group
   1.869 +            B) control group,
   1.870 +            C) national encoding, 
   1.871 +               or ambiguous SBCS or MBCS group (on to step 4...)
   1.872 +        
   1.873 +      3. If its ambiguous, try this order:
   1.874 +         A) The optimization group
   1.875 +         B) The locale group
   1.876 +         C) The last group that succeeded with this string.
   1.877 +         D) every other group that's relevent (single or double)
   1.878 +         E) If its single-byte ambiguous, try the exceptions group
   1.879 +
   1.880 +      4. And as a grand fallback: Unicode
   1.881 +   */
   1.882 +
   1.883 +    /*Fix for SPR#DJOE66JFN3 (Lotus)*/
   1.884 +    ulmbcs_byte_t OldConverterIndex = 0;
   1.885 +
   1.886 +   while (args->source < args->sourceLimit && !U_FAILURE(*err))
   1.887 +   {
   1.888 +      /*Fix for SPR#DJOE66JFN3 (Lotus)*/
   1.889 +      OldConverterIndex = extraInfo->localeConverterIndex;
   1.890 +
   1.891 +      if (args->target >= args->targetLimit)
   1.892 +      {
   1.893 +         *err = U_BUFFER_OVERFLOW_ERROR;
   1.894 +         break;
   1.895 +      }
   1.896 +      uniChar = *(args->source);
   1.897 +      bytes_written = 0;
   1.898 +      pLMBCS = LMBCS;
   1.899 +
   1.900 +      /* check cases in rough order of how common they are, for speed */
   1.901 +
   1.902 +      /* single byte matches: strategy 1 */
   1.903 +      /*Fix for SPR#DJOE66JFN3 (Lotus)*/
   1.904 +      if((uniChar>=0x80) && (uniChar<=0xff)
   1.905 +      /*Fix for SPR#JUYA6XAERU and TSAO7GL5NK (Lotus)*/ &&(uniChar!=0xB1) &&(uniChar!=0xD7) &&(uniChar!=0xF7)
   1.906 +        &&(uniChar!=0xB0) &&(uniChar!=0xB4) &&(uniChar!=0xB6) &&(uniChar!=0xA7) &&(uniChar!=0xA8))
   1.907 +      {
   1.908 +            extraInfo->localeConverterIndex = ULMBCS_GRP_L1;
   1.909 +      }
   1.910 +      if (((uniChar > ULMBCS_C0END) && (uniChar < ULMBCS_C1START)) ||
   1.911 +          uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || 
   1.912 +          uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE 
   1.913 +          )
   1.914 +      {
   1.915 +         *pLMBCS++ = (ulmbcs_byte_t ) uniChar;
   1.916 +         bytes_written = 1;
   1.917 +      }
   1.918 +
   1.919 +
   1.920 +      if (!bytes_written) 
   1.921 +      {
   1.922 +         /* Check by UNICODE range (Strategy 2) */
   1.923 +         ulmbcs_byte_t group = FindLMBCSUniRange(uniChar);
   1.924 +         
   1.925 +         if (group == ULMBCS_GRP_UNICODE)  /* (Strategy 2A) */
   1.926 +         {
   1.927 +            pLMBCS += LMBCSConvertUni(pLMBCS,uniChar);
   1.928 +            
   1.929 +            bytes_written = (int32_t)(pLMBCS - LMBCS);
   1.930 +         }
   1.931 +         else if (group == ULMBCS_GRP_CTRL)  /* (Strategy 2B) */
   1.932 +         {
   1.933 +            /* Handle control characters here */
   1.934 +            if (uniChar <= ULMBCS_C0END)
   1.935 +            {
   1.936 +               *pLMBCS++ = ULMBCS_GRP_CTRL;
   1.937 +               *pLMBCS++ = (ulmbcs_byte_t)(ULMBCS_CTRLOFFSET + uniChar);
   1.938 +            }
   1.939 +            else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET)
   1.940 +            {
   1.941 +               *pLMBCS++ = ULMBCS_GRP_CTRL;
   1.942 +               *pLMBCS++ = (ulmbcs_byte_t ) (uniChar & 0x00FF);
   1.943 +            }
   1.944 +            bytes_written = (int32_t)(pLMBCS - LMBCS);
   1.945 +         }
   1.946 +         else if (group < ULMBCS_GRP_UNICODE)  /* (Strategy 2C) */
   1.947 +         {
   1.948 +            /* a specific converter has been identified - use it */
   1.949 +            bytes_written = (int32_t)LMBCSConversionWorker (
   1.950 +                              extraInfo, group, pLMBCS, &uniChar, 
   1.951 +                              &lastConverterIndex, groups_tried);
   1.952 +         }
   1.953 +         if (!bytes_written)    /* the ambiguous group cases  (Strategy 3) */
   1.954 +         {
   1.955 +            uprv_memset(groups_tried, 0, sizeof(groups_tried));
   1.956 +
   1.957 +            /* check for non-default optimization group (Strategy 3A )*/
   1.958 +            if ((extraInfo->OptGroup != 1) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)))
   1.959 +            {
   1.960 +                /*zhujin: upgrade, merge #39299 here (Lotus) */
   1.961 +                /*To make R5 compatible translation, look for exceptional group first for non-DBCS*/
   1.962 +
   1.963 +                if(extraInfo->localeConverterIndex < ULMBCS_DOUBLEOPTGROUP_START)
   1.964 +                {
   1.965 +                  bytes_written = LMBCSConversionWorker (extraInfo,
   1.966 +                     ULMBCS_GRP_L1, pLMBCS, &uniChar,
   1.967 +                     &lastConverterIndex, groups_tried);
   1.968 +
   1.969 +                  if(!bytes_written)
   1.970 +                  {
   1.971 +                     bytes_written = LMBCSConversionWorker (extraInfo,
   1.972 +                         ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar,
   1.973 +                         &lastConverterIndex, groups_tried);
   1.974 +                  }
   1.975 +                  if(!bytes_written)
   1.976 +                  {
   1.977 +                      bytes_written = LMBCSConversionWorker (extraInfo,
   1.978 +                          extraInfo->localeConverterIndex, pLMBCS, &uniChar,
   1.979 +                          &lastConverterIndex, groups_tried);
   1.980 +                  }
   1.981 +                }
   1.982 +                else
   1.983 +                {
   1.984 +                     bytes_written = LMBCSConversionWorker (extraInfo,
   1.985 +                         extraInfo->localeConverterIndex, pLMBCS, &uniChar,
   1.986 +                         &lastConverterIndex, groups_tried);
   1.987 +                }
   1.988 +            }
   1.989 +            /* check for locale optimization group (Strategy 3B) */
   1.990 +            if (!bytes_written && (extraInfo->localeConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex)))
   1.991 +            {
   1.992 +                bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
   1.993 +                        extraInfo->localeConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
   1.994 +            }
   1.995 +            /* check for last optimization group used for this string (Strategy 3C) */
   1.996 +            if (!bytes_written && (lastConverterIndex) && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex)))
   1.997 +            {
   1.998 +                bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
   1.999 +                        lastConverterIndex, pLMBCS, &uniChar, &lastConverterIndex, groups_tried);
  1.1000 +            }
  1.1001 +            if (!bytes_written)
  1.1002 +            {
  1.1003 +               /* just check every possible matching converter (Strategy 3D) */ 
  1.1004 +               ulmbcs_byte_t grp_start;
  1.1005 +               ulmbcs_byte_t grp_end;  
  1.1006 +               ulmbcs_byte_t grp_ix;
  1.1007 +               grp_start = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
  1.1008 +                        ? ULMBCS_DOUBLEOPTGROUP_START 
  1.1009 +                        :  ULMBCS_GRP_L1);
  1.1010 +               grp_end = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
  1.1011 +                        ? ULMBCS_GRP_LAST 
  1.1012 +                        :  ULMBCS_GRP_TH);
  1.1013 +               if(group == ULMBCS_AMBIGUOUS_ALL)
  1.1014 +               {
  1.1015 +                   grp_start = ULMBCS_GRP_L1;
  1.1016 +                   grp_end = ULMBCS_GRP_LAST;
  1.1017 +               }
  1.1018 +               for (grp_ix = grp_start;
  1.1019 +                   grp_ix <= grp_end && !bytes_written; 
  1.1020 +                    grp_ix++)
  1.1021 +               {
  1.1022 +                  if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix])
  1.1023 +                  {
  1.1024 +                     bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
  1.1025 +                       grp_ix, pLMBCS, &uniChar, 
  1.1026 +                       &lastConverterIndex, groups_tried);
  1.1027 +                  }
  1.1028 +               }
  1.1029 +                /* a final conversion fallback to the exceptions group if its likely 
  1.1030 +                     to be single byte  (Strategy 3E) */
  1.1031 +               if (!bytes_written && grp_start == ULMBCS_GRP_L1)
  1.1032 +               {
  1.1033 +                  bytes_written = (int32_t)LMBCSConversionWorker (extraInfo,
  1.1034 +                     ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, 
  1.1035 +                     &lastConverterIndex, groups_tried);
  1.1036 +               }
  1.1037 +            }
  1.1038 +            /* all of our other strategies failed. Fallback to Unicode. (Strategy 4)*/
  1.1039 +            if (!bytes_written)
  1.1040 +            {
  1.1041 +
  1.1042 +               pLMBCS += LMBCSConvertUni(pLMBCS, uniChar);
  1.1043 +               bytes_written = (int32_t)(pLMBCS - LMBCS);
  1.1044 +            }
  1.1045 +         }
  1.1046 +      }
  1.1047 +  
  1.1048 +      /* we have a translation. increment source and write as much as posible to target */
  1.1049 +      args->source++;
  1.1050 +      pLMBCS = LMBCS;
  1.1051 +      while (args->target < args->targetLimit && bytes_written--)
  1.1052 +      {
  1.1053 +         *(args->target)++ = *pLMBCS++;
  1.1054 +         if (args->offsets)
  1.1055 +         {
  1.1056 +            *(args->offsets)++ = sourceIndex;
  1.1057 +         }
  1.1058 +      }
  1.1059 +      sourceIndex++;
  1.1060 +      if (bytes_written > 0)
  1.1061 +      {
  1.1062 +         /* write any bytes that didn't fit in target to the error buffer,
  1.1063 +            common code will move this to target if we get called back with
  1.1064 +            enough target room
  1.1065 +         */
  1.1066 +         uint8_t * pErrorBuffer = args->converter->charErrorBuffer;
  1.1067 +         *err = U_BUFFER_OVERFLOW_ERROR;
  1.1068 +         args->converter->charErrorBufferLength = (int8_t)bytes_written;
  1.1069 +         while (bytes_written--)
  1.1070 +         {
  1.1071 +            *pErrorBuffer++ = *pLMBCS++;
  1.1072 +         }
  1.1073 +      }
  1.1074 +      /*Fix for SPR#DJOE66JFN3 (Lotus)*/
  1.1075 +      extraInfo->localeConverterIndex = OldConverterIndex;
  1.1076 +   }     
  1.1077 +}
  1.1078 +
  1.1079 +
  1.1080 +/* Now, the Unicode from LMBCS section */
  1.1081 +
  1.1082 +
  1.1083 +/* A function to call when we are looking at the Unicode group byte in LMBCS */
  1.1084 +static UChar
  1.1085 +GetUniFromLMBCSUni(char const ** ppLMBCSin)  /* Called with LMBCS-style Unicode byte stream */
  1.1086 +{
  1.1087 +   uint8_t  HighCh = *(*ppLMBCSin)++;  /* Big-endian Unicode in LMBCS compatibility group*/
  1.1088 +   uint8_t  LowCh  = *(*ppLMBCSin)++;
  1.1089 +
  1.1090 +   if (HighCh == ULMBCS_UNICOMPATZERO ) 
  1.1091 +   {
  1.1092 +      HighCh = LowCh;
  1.1093 +      LowCh = 0; /* zero-byte in LSB special character */
  1.1094 +   }
  1.1095 +   return (UChar)((HighCh << 8) | LowCh);
  1.1096 +}
  1.1097 +
  1.1098 +
  1.1099 +
  1.1100 +/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index' 
  1.1101 +   bytes left in source up to  sourceLimit.Errors appropriately if not.
  1.1102 +   If we reach the limit, then update the source pointer to there to consume
  1.1103 +   all input as required by ICU converter semantics.
  1.1104 +*/
  1.1105 +
  1.1106 +#define CHECK_SOURCE_LIMIT(index) \
  1.1107 +     if (args->source+index > args->sourceLimit){\
  1.1108 +         *err = U_TRUNCATED_CHAR_FOUND;\
  1.1109 +         args->source = args->sourceLimit;\
  1.1110 +         return 0xffff;}
  1.1111 +
  1.1112 +/* Return the Unicode representation for the current LMBCS character */
  1.1113 +
  1.1114 +static UChar32 
  1.1115 +_LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
  1.1116 +                         UErrorCode*   err)
  1.1117 +{
  1.1118 +    UChar32 uniChar = 0;    /* an output UNICODE char */
  1.1119 +    ulmbcs_byte_t   CurByte; /* A byte from the input stream */
  1.1120 +
  1.1121 +    /* error check */
  1.1122 +    if (args->source >= args->sourceLimit)
  1.1123 +    {
  1.1124 +        *err = U_ILLEGAL_ARGUMENT_ERROR;
  1.1125 +        return 0xffff;
  1.1126 +    }
  1.1127 +    /* Grab first byte & save address for error recovery */
  1.1128 +    CurByte = *((ulmbcs_byte_t  *) (args->source++));
  1.1129 +   
  1.1130 +    /*
  1.1131 +    * at entry of each if clause:
  1.1132 +    * 1. 'CurByte' points at the first byte of a LMBCS character
  1.1133 +    * 2. '*source'points to the next byte of the source stream after 'CurByte' 
  1.1134 +    *
  1.1135 +    * the job of each if clause is:
  1.1136 +    * 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte)
  1.1137 +    * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately
  1.1138 +    */
  1.1139 +   
  1.1140 +    /* First lets check the simple fixed values. */
  1.1141 +
  1.1142 +    if(((CurByte > ULMBCS_C0END) && (CurByte < ULMBCS_C1START)) /* ascii range */
  1.1143 +    ||  (CurByte == 0) 
  1.1144 +    ||  CurByte == ULMBCS_HT || CurByte == ULMBCS_CR 
  1.1145 +    ||  CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE)
  1.1146 +    {
  1.1147 +        uniChar = CurByte;
  1.1148 +    }
  1.1149 +    else  
  1.1150 +    {
  1.1151 +        UConverterDataLMBCS * extraInfo;
  1.1152 +        ulmbcs_byte_t group; 
  1.1153 +        UConverterSharedData *cnv; 
  1.1154 +        
  1.1155 +        if (CurByte == ULMBCS_GRP_CTRL)  /* Control character group - no opt group update */
  1.1156 +        {
  1.1157 +            ulmbcs_byte_t  C0C1byte;
  1.1158 +            CHECK_SOURCE_LIMIT(1);
  1.1159 +            C0C1byte = *(args->source)++;
  1.1160 +            uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte;
  1.1161 +        }
  1.1162 +        else 
  1.1163 +        if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
  1.1164 +        {
  1.1165 +            CHECK_SOURCE_LIMIT(2);
  1.1166 +     
  1.1167 +            /* don't check for error indicators fffe/ffff below */
  1.1168 +            return GetUniFromLMBCSUni(&(args->source));
  1.1169 +        }
  1.1170 +        else if (CurByte <= ULMBCS_CTRLOFFSET)  
  1.1171 +        {
  1.1172 +            group = CurByte;                   /* group byte is in the source */
  1.1173 +            extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
  1.1174 +            if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == NULL)
  1.1175 +            {
  1.1176 +                /* this is not a valid group byte - no converter*/
  1.1177 +                *err = U_INVALID_CHAR_FOUND;
  1.1178 +            }      
  1.1179 +            else if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
  1.1180 +            {
  1.1181 +
  1.1182 +                CHECK_SOURCE_LIMIT(2);
  1.1183 +
  1.1184 +                /* check for LMBCS doubled-group-byte case */
  1.1185 +                if (*args->source == group) {
  1.1186 +                    /* single byte */
  1.1187 +                    ++args->source;
  1.1188 +                    uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 1, FALSE);
  1.1189 +                    ++args->source;
  1.1190 +                } else {
  1.1191 +                    /* double byte */
  1.1192 +                    uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 2, FALSE);
  1.1193 +                    args->source += 2;
  1.1194 +                }
  1.1195 +            }
  1.1196 +            else {                                  /* single byte conversion */
  1.1197 +                CHECK_SOURCE_LIMIT(1);
  1.1198 +                CurByte = *(args->source)++;
  1.1199 +        
  1.1200 +                if (CurByte >= ULMBCS_C1START)
  1.1201 +                {
  1.1202 +                    uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
  1.1203 +                }
  1.1204 +                else
  1.1205 +                {
  1.1206 +                    /* The non-optimizable oddballs where there is an explicit byte 
  1.1207 +                    * AND the second byte is not in the upper ascii range
  1.1208 +                    */
  1.1209 +                    char bytes[2];
  1.1210 +
  1.1211 +                    extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
  1.1212 +                    cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];  
  1.1213 +        
  1.1214 +                    /* Lookup value must include opt group */
  1.1215 +                    bytes[0] = group;
  1.1216 +                    bytes[1] = CurByte;
  1.1217 +                    uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, bytes, 2, FALSE);
  1.1218 +                }
  1.1219 +            }
  1.1220 +        }
  1.1221 +        else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */
  1.1222 +        {
  1.1223 +            extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
  1.1224 +            group = extraInfo->OptGroup;
  1.1225 +            cnv = extraInfo->OptGrpConverter[group];
  1.1226 +            if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
  1.1227 +            {
  1.1228 +                if (!ucnv_MBCSIsLeadByte(cnv, CurByte))
  1.1229 +                {
  1.1230 +                    CHECK_SOURCE_LIMIT(0);
  1.1231 +
  1.1232 +                    /* let the MBCS conversion consume CurByte again */
  1.1233 +                    uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, FALSE);
  1.1234 +                }
  1.1235 +                else
  1.1236 +                {
  1.1237 +                    CHECK_SOURCE_LIMIT(1);
  1.1238 +                    /* let the MBCS conversion consume CurByte again */
  1.1239 +                    uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, FALSE);
  1.1240 +                    ++args->source;
  1.1241 +                }
  1.1242 +            }
  1.1243 +            else                                   /* single byte conversion */
  1.1244 +            {
  1.1245 +                uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
  1.1246 +            }
  1.1247 +        }
  1.1248 +    }
  1.1249 +    return uniChar;
  1.1250 +}
  1.1251 +
  1.1252 +
  1.1253 +/* The exported function that converts lmbcs to one or more
  1.1254 +   UChars - currently UTF-16
  1.1255 +*/
  1.1256 +static void 
  1.1257 +_LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs*    args,
  1.1258 +                     UErrorCode*    err)
  1.1259 +{
  1.1260 +   char LMBCS [ULMBCS_CHARSIZE_MAX];
  1.1261 +   UChar uniChar;    /* one output UNICODE char */
  1.1262 +   const char * saveSource; /* beginning of current code point */
  1.1263 +   const char * pStartLMBCS = args->source;  /* beginning of whole string */
  1.1264 +   const char * errSource = NULL; /* pointer to actual input in case an error occurs */
  1.1265 +   int8_t savebytes = 0;
  1.1266 +
  1.1267 +   /* Process from source to limit, or until error */
  1.1268 +   while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
  1.1269 +   {
  1.1270 +      saveSource = args->source; /* beginning of current code point */
  1.1271 +
  1.1272 +      if (args->converter->toULength) /* reassemble char from previous call */
  1.1273 +      {
  1.1274 +        const char *saveSourceLimit; 
  1.1275 +        size_t size_old = args->converter->toULength;
  1.1276 +
  1.1277 +         /* limit from source is either remainder of temp buffer, or user limit on source */
  1.1278 +        size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
  1.1279 +        size_t size_new_maybe_2 = args->sourceLimit - args->source;
  1.1280 +        size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
  1.1281 +         
  1.1282 +      
  1.1283 +        uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
  1.1284 +        uprv_memcpy(LMBCS + size_old, args->source, size_new);
  1.1285 +        saveSourceLimit = args->sourceLimit;
  1.1286 +        args->source = errSource = LMBCS;
  1.1287 +        args->sourceLimit = LMBCS+size_old+size_new;
  1.1288 +        savebytes = (int8_t)(size_old+size_new);
  1.1289 +        uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
  1.1290 +        args->source = saveSource + ((args->source - LMBCS) - size_old);
  1.1291 +        args->sourceLimit = saveSourceLimit;
  1.1292 +
  1.1293 +        if (*err == U_TRUNCATED_CHAR_FOUND)
  1.1294 +        {
  1.1295 +            /* evil special case: source buffers so small a char spans more than 2 buffers */
  1.1296 +            args->converter->toULength = savebytes;
  1.1297 +            uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
  1.1298 +            args->source = args->sourceLimit;
  1.1299 +            *err = U_ZERO_ERROR;
  1.1300 +            return;
  1.1301 +         }
  1.1302 +         else
  1.1303 +         {
  1.1304 +            /* clear the partial-char marker */
  1.1305 +            args->converter->toULength = 0;
  1.1306 +         }
  1.1307 +      }
  1.1308 +      else
  1.1309 +      {
  1.1310 +         errSource = saveSource;
  1.1311 +         uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
  1.1312 +         savebytes = (int8_t)(args->source - saveSource);
  1.1313 +      }
  1.1314 +      if (U_SUCCESS(*err))
  1.1315 +      {
  1.1316 +         if (uniChar < 0xfffe)
  1.1317 +         {
  1.1318 +            *(args->target)++ = uniChar;
  1.1319 +            if(args->offsets)
  1.1320 +            {
  1.1321 +               *(args->offsets)++ = (int32_t)(saveSource - pStartLMBCS);
  1.1322 +            }
  1.1323 +         }
  1.1324 +         else if (uniChar == 0xfffe)
  1.1325 +         {
  1.1326 +            *err = U_INVALID_CHAR_FOUND;
  1.1327 +         }
  1.1328 +         else /* if (uniChar == 0xffff) */
  1.1329 +         {
  1.1330 +            *err = U_ILLEGAL_CHAR_FOUND;
  1.1331 +         }
  1.1332 +      }
  1.1333 +   }
  1.1334 +   /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
  1.1335 +   if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
  1.1336 +   {
  1.1337 +      *err = U_BUFFER_OVERFLOW_ERROR;
  1.1338 +   }
  1.1339 +   else if (U_FAILURE(*err)) 
  1.1340 +   {
  1.1341 +      /* If character incomplete or unmappable/illegal, store it in toUBytes[] */
  1.1342 +      args->converter->toULength = savebytes;
  1.1343 +      if (savebytes > 0) {
  1.1344 +         uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
  1.1345 +      }
  1.1346 +      if (*err == U_TRUNCATED_CHAR_FOUND) {
  1.1347 +         *err = U_ZERO_ERROR;
  1.1348 +      }
  1.1349 +   }
  1.1350 +}
  1.1351 +
/* And now, the macroized declarations of data & functions.
   DEFINE_LMBCS_OPEN(n) expands to the static function _LMBCSOpen<n>(), which
   calls _LMBCSOpenWorker() with n as the optimization group.
   DECLARE_LMBCS_DATA(n) instantiates the per-variant data; its expansion
   ends with references to _LMBCSStaticData<n> and _LMBCSImpl<n>.
   NOTE(review): both lists use the same group numbers; presumably each data
   declaration refers to the open function with the same number, which would
   be why the open functions come first - confirm against
   DECLARE_LMBCS_DATA. */
DEFINE_LMBCS_OPEN(1)
DEFINE_LMBCS_OPEN(2)
DEFINE_LMBCS_OPEN(3)
DEFINE_LMBCS_OPEN(4)
DEFINE_LMBCS_OPEN(5)
DEFINE_LMBCS_OPEN(6)
DEFINE_LMBCS_OPEN(8)
DEFINE_LMBCS_OPEN(11)
DEFINE_LMBCS_OPEN(16)
DEFINE_LMBCS_OPEN(17)
DEFINE_LMBCS_OPEN(18)
DEFINE_LMBCS_OPEN(19)


DECLARE_LMBCS_DATA(1)
DECLARE_LMBCS_DATA(2)
DECLARE_LMBCS_DATA(3)
DECLARE_LMBCS_DATA(4)
DECLARE_LMBCS_DATA(5)
DECLARE_LMBCS_DATA(6)
DECLARE_LMBCS_DATA(8)
DECLARE_LMBCS_DATA(11)
DECLARE_LMBCS_DATA(16)
DECLARE_LMBCS_DATA(17)
DECLARE_LMBCS_DATA(18)
DECLARE_LMBCS_DATA(19)
  1.1379 +
  1.1380 +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
The Tor Browser / file diff

diff: intl/icu/source/common/ucnv_lmb.c

intl/icu/source/common/ucnv_lmb.c