michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2003-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: ucm.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2003jun20 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Definitions for the .ucm file parser and handler module ucm.c. michael@0: */ michael@0: michael@0: #ifndef __UCM_H__ michael@0: #define __UCM_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "ucnvmbcs.h" michael@0: #include "ucnv_ext.h" michael@0: #include "filestrm.h" michael@0: #include michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: /* constants for UCMapping.moveFlag */ michael@0: enum { michael@0: UCM_MOVE_TO_EXT=1, michael@0: UCM_REMOVE_MAPPING=2 michael@0: }; michael@0: michael@0: /* michael@0: * Per-mapping data structure michael@0: * michael@0: * u if uLen==1: Unicode code point michael@0: * else index to uLen code points michael@0: * b if bLen<=4: up to 4 bytes michael@0: * else index to bLen bytes michael@0: * uLen number of code points michael@0: * bLen number of words containing left-justified bytes michael@0: * bIsMultipleChars indicates that the bytes contain more than one sequence michael@0: * according to the state table michael@0: * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) michael@0: * or "good one-way" mapping (4). michael@0: * Same values as in the source file after | michael@0: */ michael@0: typedef struct UCMapping { michael@0: UChar32 u; michael@0: union { michael@0: uint32_t idx; michael@0: uint8_t bytes[4]; michael@0: } b; michael@0: int8_t uLen, bLen, f, moveFlag; michael@0: } UCMapping; michael@0: michael@0: /* constants for UCMTable.flagsType */ michael@0: enum { michael@0: UCM_FLAGS_INITIAL, /* no mappings parsed yet */ michael@0: UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ michael@0: UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ michael@0: UCM_FLAGS_MIXED /* both implicit and explicit */ michael@0: }; michael@0: michael@0: typedef struct UCMTable { michael@0: UCMapping *mappings; michael@0: int32_t mappingsCapacity, mappingsLength; michael@0: michael@0: UChar32 *codePoints; michael@0: int32_t codePointsCapacity, codePointsLength; michael@0: michael@0: uint8_t *bytes; michael@0: int32_t bytesCapacity, bytesLength; michael@0: michael@0: /* index map for mapping by bytes first */ michael@0: int32_t *reverseMap; michael@0: michael@0: uint8_t unicodeMask; michael@0: int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ michael@0: UBool isSorted; michael@0: } UCMTable; michael@0: michael@0: enum { michael@0: MBCS_STATE_FLAG_DIRECT=1, michael@0: MBCS_STATE_FLAG_SURROGATES, michael@0: michael@0: MBCS_STATE_FLAG_READY=16 michael@0: }; michael@0: michael@0: typedef struct UCMStates { michael@0: int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; michael@0: uint32_t stateFlags[MBCS_MAX_STATE_COUNT], michael@0: stateOffsetSum[MBCS_MAX_STATE_COUNT]; michael@0: michael@0: int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; michael@0: int8_t conversionType, outputType; michael@0: } UCMStates; michael@0: michael@0: typedef struct UCMFile { michael@0: UCMTable *base, *ext; michael@0: UCMStates states; michael@0: michael@0: char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; michael@0: } UCMFile; michael@0: michael@0: /* simple accesses ---------------------------------------------------------- */ michael@0: michael@0: #define UCM_GET_CODE_POINTS(t, m) \ michael@0: (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) michael@0: michael@0: #define UCM_GET_BYTES(t, m) \ michael@0: (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) michael@0: michael@0: /* APIs --------------------------------------------------------------------- */ michael@0: michael@0: U_CAPI UCMFile * U_EXPORT2 michael@0: ucm_open(void); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_close(UCMFile *ucm); michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_parseHeaderLine(UCMFile *ucm, michael@0: char *line, char **pKey, char **pValue); michael@0: michael@0: /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_mappingType(UCMStates *baseStates, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]); michael@0: michael@0: /* add a mapping to the base or extension table as appropriate */ michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]); michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); michael@0: michael@0: michael@0: U_CAPI UCMTable * U_EXPORT2 michael@0: ucm_openTable(void); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_closeTable(UCMTable *table); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_resetTable(UCMTable *table); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_sortTable(UCMTable *t); michael@0: michael@0: /* michael@0: * Remove mappings with their move flag set from the base table michael@0: * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_moveMappings(UCMTable *base, UCMTable *ext); michael@0: michael@0: /** michael@0: * Read a table from a .ucm file, from after the CHARMAP line to michael@0: * including the END CHARMAP line. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_readTable(UCMFile *ucm, FileStream* convFile, michael@0: UBool forBase, UCMStates *baseStates, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Check the validity of mappings against a base table's states; michael@0: * necessary for extension-only tables that were read before their base tables. michael@0: */ michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); michael@0: michael@0: /** michael@0: * Check a base table against an extension table. michael@0: * Set the moveTarget!=NULL if it is possible to move mappings from the base. michael@0: * This is the case where base and extension tables are parsed from a single file michael@0: * (moveTarget==ext) michael@0: * or when delta file mappings are subtracted from a base table. michael@0: * michael@0: * When a base table cannot be modified because a delta file is parsed in makeconv, michael@0: * then set moveTarget=NULL. michael@0: * michael@0: * if(intersectBase) then mappings that exist in the base table but not in michael@0: * the extension table are moved to moveTarget instead of showing an error. michael@0: * michael@0: * Special mode: michael@0: * If intersectBase==2 for a DBCS extension table, then SBCS mappings are michael@0: * not moved out of the base unless their Unicode input requires it. michael@0: * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. michael@0: * michael@0: * For both tables in the same file, the extension table is automatically michael@0: * built. michael@0: * For separate files, the extension file can use a complete mapping table (.ucm file), michael@0: * so that common mappings need not be stripped out manually. michael@0: * michael@0: * michael@0: * Sort both tables, and then for each mapping direction: michael@0: * michael@0: * If intersectBase is TRUE and the base table contains a mapping michael@0: * that does not exist in the extension table, then this mapping is moved michael@0: * to moveTarget. michael@0: * michael@0: * - otherwise - michael@0: * michael@0: * If the base table contains a mapping for which the input sequence is michael@0: * the same as the extension input, then michael@0: * - if the output is the same: remove the extension mapping michael@0: * - else: error michael@0: * michael@0: * If the base table contains a mapping for which the input sequence is michael@0: * a prefix of the extension input, then michael@0: * - if moveTarget!=NULL: move the base mapping to the moveTarget table michael@0: * - else: error michael@0: * michael@0: * @return FALSE in case of an irreparable error michael@0: */ michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, michael@0: UCMTable *moveTarget, UBool intersectBase); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); michael@0: michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_addState(UCMStates *states, const char *s); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_countChars(UCMStates *states, michael@0: const uint8_t *bytes, int32_t length); michael@0: michael@0: michael@0: U_CAPI int8_t U_EXPORT2 michael@0: ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_parseMappingLine(UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES], michael@0: const char *line); michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_addMapping(UCMTable *table, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]); michael@0: michael@0: /* very makeconv-specific functions ----------------------------------------- */ michael@0: michael@0: /* finalize and optimize states after the toUnicode mappings are processed */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_optimizeStates(UCMStates *states, michael@0: uint16_t **pUnicodeCodeUnits, michael@0: _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, michael@0: UBool verbose); michael@0: michael@0: /* moved here because it is used inside ucmstate.c */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, michael@0: uint32_t offset); michael@0: michael@0: /* very rptp2ucm-specific functions ----------------------------------------- */ michael@0: michael@0: /* michael@0: * Input: Separate tables with mappings from/to Unicode, michael@0: * subchar and subchar1 (0 if none). michael@0: * All mappings must have flag 0. michael@0: * michael@0: * Output: fromUTable will contain the union of mappings with the correct michael@0: * precision flags, and be sorted. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, michael@0: const uint8_t *subchar, int32_t subcharLength, michael@0: uint8_t subchar1); michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_separateMappings(UCMFile *ucm, UBool isSISO); michael@0: michael@0: U_CDECL_END michael@0: michael@0: #endif michael@0: michael@0: #endif michael@0: