1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/toolutil/ucm.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,300 @@ 1.4 +/* 1.5 + ******************************************************************************* 1.6 + * Copyright (C) 2003-2013, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ******************************************************************************* 1.9 + * file name: ucm.h 1.10 + * encoding: US-ASCII 1.11 + * tab size: 8 (not used) 1.12 + * indentation:4 1.13 + * 1.14 + * created on: 2003jun20 1.15 + * created by: Markus W. Scherer 1.16 + * 1.17 + * Definitions for the .ucm file parser and handler module ucm.c. 1.18 + */ 1.19 + 1.20 +#ifndef __UCM_H__ 1.21 +#define __UCM_H__ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 +#include "ucnvmbcs.h" 1.25 +#include "ucnv_ext.h" 1.26 +#include "filestrm.h" 1.27 +#include <stdio.h> 1.28 + 1.29 +#if !UCONFIG_NO_CONVERSION 1.30 + 1.31 +U_CDECL_BEGIN 1.32 + 1.33 +/* constants for UCMapping.moveFlag */ 1.34 +enum { 1.35 + UCM_MOVE_TO_EXT=1, 1.36 + UCM_REMOVE_MAPPING=2 1.37 +}; 1.38 + 1.39 +/* 1.40 + * Per-mapping data structure 1.41 + * 1.42 + * u if uLen==1: Unicode code point 1.43 + * else index to uLen code points 1.44 + * b if bLen<=4: up to 4 bytes 1.45 + * else index to bLen bytes 1.46 + * uLen number of code points 1.47 + * bLen number of words containing left-justified bytes 1.48 + * bIsMultipleChars indicates that the bytes contain more than one sequence 1.49 + * according to the state table 1.50 + * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 1.51 + * or "good one-way" mapping (4). 1.52 + * Same values as in the source file after | 1.53 + */ 1.54 +typedef struct UCMapping { 1.55 + UChar32 u; 1.56 + union { 1.57 + uint32_t idx; 1.58 + uint8_t bytes[4]; 1.59 + } b; 1.60 + int8_t uLen, bLen, f, moveFlag; 1.61 +} UCMapping; 1.62 + 1.63 +/* constants for UCMTable.flagsType */ 1.64 +enum { 1.65 + UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 1.66 + UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 1.67 + UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 1.68 + UCM_FLAGS_MIXED /* both implicit and explicit */ 1.69 +}; 1.70 + 1.71 +typedef struct UCMTable { 1.72 + UCMapping *mappings; 1.73 + int32_t mappingsCapacity, mappingsLength; 1.74 + 1.75 + UChar32 *codePoints; 1.76 + int32_t codePointsCapacity, codePointsLength; 1.77 + 1.78 + uint8_t *bytes; 1.79 + int32_t bytesCapacity, bytesLength; 1.80 + 1.81 + /* index map for mapping by bytes first */ 1.82 + int32_t *reverseMap; 1.83 + 1.84 + uint8_t unicodeMask; 1.85 + int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 1.86 + UBool isSorted; 1.87 +} UCMTable; 1.88 + 1.89 +enum { 1.90 + MBCS_STATE_FLAG_DIRECT=1, 1.91 + MBCS_STATE_FLAG_SURROGATES, 1.92 + 1.93 + MBCS_STATE_FLAG_READY=16 1.94 +}; 1.95 + 1.96 +typedef struct UCMStates { 1.97 + int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 1.98 + uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 1.99 + stateOffsetSum[MBCS_MAX_STATE_COUNT]; 1.100 + 1.101 + int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 1.102 + int8_t conversionType, outputType; 1.103 +} UCMStates; 1.104 + 1.105 +typedef struct UCMFile { 1.106 + UCMTable *base, *ext; 1.107 + UCMStates states; 1.108 + 1.109 + char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 1.110 +} UCMFile; 1.111 + 1.112 +/* simple accesses ---------------------------------------------------------- */ 1.113 + 1.114 +#define UCM_GET_CODE_POINTS(t, m) \ 1.115 + (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 1.116 + 1.117 +#define UCM_GET_BYTES(t, m) \ 1.118 + (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) 1.119 + 1.120 +/* APIs --------------------------------------------------------------------- */ 1.121 + 1.122 +U_CAPI UCMFile * U_EXPORT2 1.123 +ucm_open(void); 1.124 + 1.125 +U_CAPI void U_EXPORT2 1.126 +ucm_close(UCMFile *ucm); 1.127 + 1.128 +U_CAPI UBool U_EXPORT2 1.129 +ucm_parseHeaderLine(UCMFile *ucm, 1.130 + char *line, char **pKey, char **pValue); 1.131 + 1.132 +/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 1.133 +U_CAPI int32_t U_EXPORT2 1.134 +ucm_mappingType(UCMStates *baseStates, 1.135 + UCMapping *m, 1.136 + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1.137 + uint8_t bytes[UCNV_EXT_MAX_BYTES]); 1.138 + 1.139 +/* add a mapping to the base or extension table as appropriate */ 1.140 +U_CAPI UBool U_EXPORT2 1.141 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 1.142 + UCMapping *m, 1.143 + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1.144 + uint8_t bytes[UCNV_EXT_MAX_BYTES]); 1.145 + 1.146 +U_CAPI UBool U_EXPORT2 1.147 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 1.148 + 1.149 + 1.150 +U_CAPI UCMTable * U_EXPORT2 1.151 +ucm_openTable(void); 1.152 + 1.153 +U_CAPI void U_EXPORT2 1.154 +ucm_closeTable(UCMTable *table); 1.155 + 1.156 +U_CAPI void U_EXPORT2 1.157 +ucm_resetTable(UCMTable *table); 1.158 + 1.159 +U_CAPI void U_EXPORT2 1.160 +ucm_sortTable(UCMTable *t); 1.161 + 1.162 +/* 1.163 + * Remove mappings with their move flag set from the base table 1.164 + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 1.165 + */ 1.166 +U_CAPI void U_EXPORT2 1.167 +ucm_moveMappings(UCMTable *base, UCMTable *ext); 1.168 + 1.169 +/** 1.170 + * Read a table from a .ucm file, from after the CHARMAP line to 1.171 + * including the END CHARMAP line. 1.172 + */ 1.173 +U_CAPI void U_EXPORT2 1.174 +ucm_readTable(UCMFile *ucm, FileStream* convFile, 1.175 + UBool forBase, UCMStates *baseStates, 1.176 + UErrorCode *pErrorCode); 1.177 + 1.178 +/** 1.179 + * Check the validity of mappings against a base table's states; 1.180 + * necessary for extension-only tables that were read before their base tables. 1.181 + */ 1.182 +U_CAPI UBool U_EXPORT2 1.183 +ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 1.184 + 1.185 +/** 1.186 + * Check a base table against an extension table. 1.187 + * Set the moveTarget!=NULL if it is possible to move mappings from the base. 1.188 + * This is the case where base and extension tables are parsed from a single file 1.189 + * (moveTarget==ext) 1.190 + * or when delta file mappings are subtracted from a base table. 1.191 + * 1.192 + * When a base table cannot be modified because a delta file is parsed in makeconv, 1.193 + * then set moveTarget=NULL. 1.194 + * 1.195 + * if(intersectBase) then mappings that exist in the base table but not in 1.196 + * the extension table are moved to moveTarget instead of showing an error. 1.197 + * 1.198 + * Special mode: 1.199 + * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 1.200 + * not moved out of the base unless their Unicode input requires it. 1.201 + * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 1.202 + * 1.203 + * For both tables in the same file, the extension table is automatically 1.204 + * built. 1.205 + * For separate files, the extension file can use a complete mapping table (.ucm file), 1.206 + * so that common mappings need not be stripped out manually. 1.207 + * 1.208 + * 1.209 + * Sort both tables, and then for each mapping direction: 1.210 + * 1.211 + * If intersectBase is TRUE and the base table contains a mapping 1.212 + * that does not exist in the extension table, then this mapping is moved 1.213 + * to moveTarget. 1.214 + * 1.215 + * - otherwise - 1.216 + * 1.217 + * If the base table contains a mapping for which the input sequence is 1.218 + * the same as the extension input, then 1.219 + * - if the output is the same: remove the extension mapping 1.220 + * - else: error 1.221 + * 1.222 + * If the base table contains a mapping for which the input sequence is 1.223 + * a prefix of the extension input, then 1.224 + * - if moveTarget!=NULL: move the base mapping to the moveTarget table 1.225 + * - else: error 1.226 + * 1.227 + * @return FALSE in case of an irreparable error 1.228 + */ 1.229 +U_CAPI UBool U_EXPORT2 1.230 +ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 1.231 + UCMTable *moveTarget, UBool intersectBase); 1.232 + 1.233 +U_CAPI void U_EXPORT2 1.234 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 1.235 + 1.236 +U_CAPI void U_EXPORT2 1.237 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 1.238 + 1.239 + 1.240 +U_CAPI void U_EXPORT2 1.241 +ucm_addState(UCMStates *states, const char *s); 1.242 + 1.243 +U_CAPI void U_EXPORT2 1.244 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); 1.245 + 1.246 +U_CAPI int32_t U_EXPORT2 1.247 +ucm_countChars(UCMStates *states, 1.248 + const uint8_t *bytes, int32_t length); 1.249 + 1.250 + 1.251 +U_CAPI int8_t U_EXPORT2 1.252 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 1.253 + 1.254 +U_CAPI UBool U_EXPORT2 1.255 +ucm_parseMappingLine(UCMapping *m, 1.256 + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1.257 + uint8_t bytes[UCNV_EXT_MAX_BYTES], 1.258 + const char *line); 1.259 + 1.260 +U_CAPI void U_EXPORT2 1.261 +ucm_addMapping(UCMTable *table, 1.262 + UCMapping *m, 1.263 + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1.264 + uint8_t bytes[UCNV_EXT_MAX_BYTES]); 1.265 + 1.266 +/* very makeconv-specific functions ----------------------------------------- */ 1.267 + 1.268 +/* finalize and optimize states after the toUnicode mappings are processed */ 1.269 +U_CAPI void U_EXPORT2 1.270 +ucm_optimizeStates(UCMStates *states, 1.271 + uint16_t **pUnicodeCodeUnits, 1.272 + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.273 + UBool verbose); 1.274 + 1.275 +/* moved here because it is used inside ucmstate.c */ 1.276 +U_CAPI int32_t U_EXPORT2 1.277 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.278 + uint32_t offset); 1.279 + 1.280 +/* very rptp2ucm-specific functions ----------------------------------------- */ 1.281 + 1.282 +/* 1.283 + * Input: Separate tables with mappings from/to Unicode, 1.284 + * subchar and subchar1 (0 if none). 1.285 + * All mappings must have flag 0. 1.286 + * 1.287 + * Output: fromUTable will contain the union of mappings with the correct 1.288 + * precision flags, and be sorted. 1.289 + */ 1.290 +U_CAPI void U_EXPORT2 1.291 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 1.292 + const uint8_t *subchar, int32_t subcharLength, 1.293 + uint8_t subchar1); 1.294 + 1.295 +U_CAPI UBool U_EXPORT2 1.296 +ucm_separateMappings(UCMFile *ucm, UBool isSISO); 1.297 + 1.298 +U_CDECL_END 1.299 + 1.300 +#endif 1.301 + 1.302 +#endif 1.303 +