1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/makeconv/genmbcs.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,124 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2000-2008, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: genmbcs.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2000jul10 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#ifndef __GENMBCS_H__ 1.21 +#define __GENMBCS_H__ 1.22 + 1.23 +#include "makeconv.h" 1.24 + 1.25 +enum { 1.26 + /* 1.27 + * TODO: Consider using ucnvmbcs.h constants. 1.28 + * However, not all values need to be exactly the same, for example 1.29 + * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX 1.30 + * may be higher in makeconv than in the runtime code because that 1.31 + * affects only a small number of .cnv files [if any] but all 1.32 + * runtime UConverterSharedData objects.) 
1.33 + */ 1.34 + MBCS_STAGE_2_SHIFT=4, /* =4 bits, matching log2(MBCS_STAGE_3_BLOCK_SIZE) */ 1.35 + MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ 1.36 + MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ 1.37 + MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */ 1.38 + MBCS_STAGE_1_SHIFT=10, /* =MBCS_STAGE_2_SHIFT+MBCS_STAGE_2_BLOCK_SIZE_SHIFT: one stage 1 entry per 1k code points */ 1.39 + MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one entry per 1k code points on the BMP */ 1.40 + MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ 1.41 + MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ 1.42 + MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, /* =0xfbc0 */ 1.43 + MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, /* =0x3ef */ 1.44 + 1.45 + MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ 1.46 + MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ 1.47 + 1.48 + MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ 1.49 + MBCS_STAGE_3_BLOCK_MASK=0xf, /* =MBCS_STAGE_3_BLOCK_SIZE-1 */ 1.50 + MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ 1.51 + 1.52 + MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ 1.53 + MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ 1.54 + MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */ 1.55 + 1.56 + /* 1.57 + * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. 1.58 + * Possible values are 0x01ff..0xffff, in steps of 0x100. 1.59 + * 1.60 + * Unlike for MBCS, this constant only affects the stage 3 block allocation size; 1.61 + * there is no additional stage 1/2 table stored in the .cnv file. 1.62 + * The max value should be at least 0x7ff to cover 2-byte UTF-8. 
1.63 + * 0xfff also covers a number of other small scripts which have legacy charsets 1.64 + * (like Thai). 1.65 + * Higher values up to 0x1fff are harmless and potentially useful because 1.66 + * that covers small-script blocks which usually have either dense mappings 1.67 + * or no mappings at all. 1.68 + * Starting at U+2000, there are mostly symbols and format characters 1.69 + * with a low density of SBCS mappings, which would result in more wasted 1.70 + * stage 3 entries with the larger block size. 1.71 + */ 1.72 + SBCS_UTF8_MAX=0x1fff, 1.73 + 1.74 + /* 1.75 + * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. 1.76 + * Possible values are 0x01ff..0xffff, in steps of 0x100. 1.77 + * 1.78 + * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table 1.79 + * with extreme input data. The function checks for this overflow. 1.80 + * 1.81 + * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. 1.82 + * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. 1.83 + * Larger values cause slightly larger MBCS .cnv files. 
1.84 + */ 1.85 + MBCS_UTF8_MAX=0xd7ff, 1.86 + MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ 1.87 + 1.88 + MBCS_UTF8_STAGE_SHIFT=6, /* log2(MBCS_UTF8_STAGE_3_BLOCK_SIZE) */ 1.89 + MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ 1.90 + MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, /* =MBCS_UTF8_STAGE_3_BLOCK_SIZE-1 */ 1.91 + 1.92 + /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ 1.93 + MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ 1.94 + 1.95 + MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */ 1.96 + MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */ 1.97 + 1.98 + /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ 1.99 + MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, 1.100 + 1.101 + MBCS_MAX_FALLBACK_COUNT=8192 /* NOTE(review): presumably an upper bound on the number of fallback mappings makeconv handles; confirm at the use sites */ 1.102 +}; 1.103 + 1.104 +/* Returns a NewConverter for the given UCMFile. NOTE(review): presumably the entry point for building MBCS base tables; ownership of the result is not visible in this header - confirm in the implementation. */ U_CFUNC NewConverter * 1.105 +MBCSOpen(UCMFile *ucm); 1.106 + 1.107 +/* Only forward-declared in this header; the definition is not visible here. */ struct MBCSData; 1.108 +typedef struct MBCSData MBCSData; 1.109 + 1.110 +/* 1.111 + * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() 1.112 + * for creating an extension-only file. 1.113 + * Assume maxCharLength>1. 1.114 + */ 1.115 +U_CFUNC const MBCSData * 1.116 +MBCSGetDummy(void); 1.117 + 1.118 +/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. NOTE(review): bytes/length presumably give the charset byte sequence mapped to code point c, and flag presumably carries the UCMapping.f value - confirm against callers. */ 1.119 +U_CFUNC UBool 1.120 +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, 1.121 + const uint8_t *bytes, int32_t length, 1.122 + UChar32 c, int8_t flag); 1.123 + 1.124 +/* Returns a NewConverter for the given UCMFile. NOTE(review): presumably the extension-table counterpart of MBCSOpen(); confirm in the implementation. */ U_CFUNC NewConverter * 1.125 +CnvExtOpen(UCMFile *ucm); 1.126 + 1.127 +#endif /* __GENMBCS_H__ */