Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 2003-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | * file name: ucm.h |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2003jun20 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | * |
michael@0 | 14 | * Definitions for the .ucm file parser and handler module ucm.c. |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #ifndef __UCM_H__ |
michael@0 | 18 | #define __UCM_H__ |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/utypes.h" |
michael@0 | 21 | #include "ucnvmbcs.h" |
michael@0 | 22 | #include "ucnv_ext.h" |
michael@0 | 23 | #include "filestrm.h" |
michael@0 | 24 | #include <stdio.h> |
michael@0 | 25 | |
michael@0 | 26 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 27 | |
michael@0 | 28 | U_CDECL_BEGIN |
michael@0 | 29 | |
/*
 * Constants for UCMapping.moveFlag.
 *
 * ucm_moveMappings() removes mappings whose move flag is set from the base
 * table; those marked UCM_MOVE_TO_EXT are moved to the extension table.
 */
enum {
    UCM_MOVE_TO_EXT=1,
    UCM_REMOVE_MAPPING=2
};
michael@0 | 35 | |
michael@0 | 36 | /* |
michael@0 | 37 | * Per-mapping data structure |
michael@0 | 38 | * |
michael@0 | 39 | * u if uLen==1: Unicode code point |
michael@0 | 40 | * else index to uLen code points |
michael@0 | 41 | * b if bLen<=4: up to 4 bytes |
michael@0 | 42 | * else index to bLen bytes |
michael@0 | 43 | * uLen number of code points |
michael@0 | 44 | * bLen number of words containing left-justified bytes |
michael@0 | 45 | * bIsMultipleChars indicates that the bytes contain more than one sequence |
michael@0 | 46 | * according to the state table |
michael@0 | 47 | * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) |
michael@0 | 48 | * or "good one-way" mapping (4). |
michael@0 | 49 | * Same values as in the source file after | |
michael@0 | 50 | */ |
michael@0 | 51 | typedef struct UCMapping { |
michael@0 | 52 | UChar32 u; |
michael@0 | 53 | union { |
michael@0 | 54 | uint32_t idx; |
michael@0 | 55 | uint8_t bytes[4]; |
michael@0 | 56 | } b; |
michael@0 | 57 | int8_t uLen, bLen, f, moveFlag; |
michael@0 | 58 | } UCMapping; |
michael@0 | 59 | |
/*
 * Constants for UCMTable.flagsType.
 * They record whether the mappings parsed so far carried explicit
 * |n fallback indicators.
 */
enum {
    UCM_FLAGS_INITIAL,  /* no mappings parsed yet */
    UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
    UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
    UCM_FLAGS_MIXED     /* both implicit and explicit */
};
michael@0 | 67 | |
michael@0 | 68 | typedef struct UCMTable { |
michael@0 | 69 | UCMapping *mappings; |
michael@0 | 70 | int32_t mappingsCapacity, mappingsLength; |
michael@0 | 71 | |
michael@0 | 72 | UChar32 *codePoints; |
michael@0 | 73 | int32_t codePointsCapacity, codePointsLength; |
michael@0 | 74 | |
michael@0 | 75 | uint8_t *bytes; |
michael@0 | 76 | int32_t bytesCapacity, bytesLength; |
michael@0 | 77 | |
michael@0 | 78 | /* index map for mapping by bytes first */ |
michael@0 | 79 | int32_t *reverseMap; |
michael@0 | 80 | |
michael@0 | 81 | uint8_t unicodeMask; |
michael@0 | 82 | int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ |
michael@0 | 83 | UBool isSorted; |
michael@0 | 84 | } UCMTable; |
michael@0 | 85 | |
/*
 * State flag constants.
 * NOTE(review): presumably stored in UCMStates.stateFlags - confirm in ucmstate.c.
 * MBCS_STATE_FLAG_SURROGATES has no explicit initializer and therefore
 * takes the value 2.
 */
enum {
    MBCS_STATE_FLAG_DIRECT=1,
    MBCS_STATE_FLAG_SURROGATES,

    MBCS_STATE_FLAG_READY=16
};
michael@0 | 92 | |
michael@0 | 93 | typedef struct UCMStates { |
michael@0 | 94 | int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; |
michael@0 | 95 | uint32_t stateFlags[MBCS_MAX_STATE_COUNT], |
michael@0 | 96 | stateOffsetSum[MBCS_MAX_STATE_COUNT]; |
michael@0 | 97 | |
michael@0 | 98 | int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; |
michael@0 | 99 | int8_t conversionType, outputType; |
michael@0 | 100 | } UCMStates; |
michael@0 | 101 | |
michael@0 | 102 | typedef struct UCMFile { |
michael@0 | 103 | UCMTable *base, *ext; |
michael@0 | 104 | UCMStates states; |
michael@0 | 105 | |
michael@0 | 106 | char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; |
michael@0 | 107 | } UCMFile; |
michael@0 | 108 | |
michael@0 | 109 | /* simple accesses ---------------------------------------------------------- */ |
michael@0 | 110 | |
/*
 * Get a pointer to the code point(s) of mapping m in table t:
 * the address of m->u itself if the mapping has exactly one code point,
 * otherwise the address of entry m->u in t->codePoints.
 * Both arguments are pointers; m is evaluated more than once,
 * so it must not have side effects.
 */
#define UCM_GET_CODE_POINTS(t, m) \
    (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
michael@0 | 113 | |
/*
 * Get a pointer to the bytes of mapping m in table t:
 * the inline array m->b.bytes if the mapping has at most 4 bytes,
 * otherwise the address of entry m->b.idx in t->bytes.
 * Both arguments are pointers; m is evaluated more than once,
 * so it must not have side effects.
 */
#define UCM_GET_BYTES(t, m) \
    (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx)
michael@0 | 116 | |
michael@0 | 117 | /* APIs --------------------------------------------------------------------- */ |
michael@0 | 118 | |
michael@0 | 119 | U_CAPI UCMFile * U_EXPORT2 |
michael@0 | 120 | ucm_open(void); |
michael@0 | 121 | |
michael@0 | 122 | U_CAPI void U_EXPORT2 |
michael@0 | 123 | ucm_close(UCMFile *ucm); |
michael@0 | 124 | |
michael@0 | 125 | U_CAPI UBool U_EXPORT2 |
michael@0 | 126 | ucm_parseHeaderLine(UCMFile *ucm, |
michael@0 | 127 | char *line, char **pKey, char **pValue); |
michael@0 | 128 | |
michael@0 | 129 | /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ |
michael@0 | 130 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 131 | ucm_mappingType(UCMStates *baseStates, |
michael@0 | 132 | UCMapping *m, |
michael@0 | 133 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 134 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); |
michael@0 | 135 | |
michael@0 | 136 | /* add a mapping to the base or extension table as appropriate */ |
michael@0 | 137 | U_CAPI UBool U_EXPORT2 |
michael@0 | 138 | ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, |
michael@0 | 139 | UCMapping *m, |
michael@0 | 140 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 141 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); |
michael@0 | 142 | |
michael@0 | 143 | U_CAPI UBool U_EXPORT2 |
michael@0 | 144 | ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); |
michael@0 | 145 | |
michael@0 | 146 | |
michael@0 | 147 | U_CAPI UCMTable * U_EXPORT2 |
michael@0 | 148 | ucm_openTable(void); |
michael@0 | 149 | |
michael@0 | 150 | U_CAPI void U_EXPORT2 |
michael@0 | 151 | ucm_closeTable(UCMTable *table); |
michael@0 | 152 | |
michael@0 | 153 | U_CAPI void U_EXPORT2 |
michael@0 | 154 | ucm_resetTable(UCMTable *table); |
michael@0 | 155 | |
michael@0 | 156 | U_CAPI void U_EXPORT2 |
michael@0 | 157 | ucm_sortTable(UCMTable *t); |
michael@0 | 158 | |
michael@0 | 159 | /* |
michael@0 | 160 | * Remove mappings with their move flag set from the base table |
michael@0 | 161 | * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. |
michael@0 | 162 | */ |
michael@0 | 163 | U_CAPI void U_EXPORT2 |
michael@0 | 164 | ucm_moveMappings(UCMTable *base, UCMTable *ext); |
michael@0 | 165 | |
michael@0 | 166 | /** |
michael@0 | 167 | * Read a table from a .ucm file, from after the CHARMAP line to |
michael@0 | 168 | * including the END CHARMAP line. |
michael@0 | 169 | */ |
michael@0 | 170 | U_CAPI void U_EXPORT2 |
michael@0 | 171 | ucm_readTable(UCMFile *ucm, FileStream* convFile, |
michael@0 | 172 | UBool forBase, UCMStates *baseStates, |
michael@0 | 173 | UErrorCode *pErrorCode); |
michael@0 | 174 | |
michael@0 | 175 | /** |
michael@0 | 176 | * Check the validity of mappings against a base table's states; |
michael@0 | 177 | * necessary for extension-only tables that were read before their base tables. |
michael@0 | 178 | */ |
michael@0 | 179 | U_CAPI UBool U_EXPORT2 |
michael@0 | 180 | ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); |
michael@0 | 181 | |
michael@0 | 182 | /** |
michael@0 | 183 | * Check a base table against an extension table. |
michael@0 | 184 | * Set the moveTarget!=NULL if it is possible to move mappings from the base. |
michael@0 | 185 | * This is the case where base and extension tables are parsed from a single file |
michael@0 | 186 | * (moveTarget==ext) |
michael@0 | 187 | * or when delta file mappings are subtracted from a base table. |
michael@0 | 188 | * |
michael@0 | 189 | * When a base table cannot be modified because a delta file is parsed in makeconv, |
michael@0 | 190 | * then set moveTarget=NULL. |
michael@0 | 191 | * |
michael@0 | 192 | * if(intersectBase) then mappings that exist in the base table but not in |
michael@0 | 193 | * the extension table are moved to moveTarget instead of showing an error. |
michael@0 | 194 | * |
michael@0 | 195 | * Special mode: |
michael@0 | 196 | * If intersectBase==2 for a DBCS extension table, then SBCS mappings are |
michael@0 | 197 | * not moved out of the base unless their Unicode input requires it. |
michael@0 | 198 | * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. |
michael@0 | 199 | * |
michael@0 | 200 | * For both tables in the same file, the extension table is automatically |
michael@0 | 201 | * built. |
michael@0 | 202 | * For separate files, the extension file can use a complete mapping table (.ucm file), |
michael@0 | 203 | * so that common mappings need not be stripped out manually. |
michael@0 | 204 | * |
michael@0 | 205 | * |
michael@0 | 206 | * Sort both tables, and then for each mapping direction: |
michael@0 | 207 | * |
michael@0 | 208 | * If intersectBase is TRUE and the base table contains a mapping |
michael@0 | 209 | * that does not exist in the extension table, then this mapping is moved |
michael@0 | 210 | * to moveTarget. |
michael@0 | 211 | * |
michael@0 | 212 | * - otherwise - |
michael@0 | 213 | * |
michael@0 | 214 | * If the base table contains a mapping for which the input sequence is |
michael@0 | 215 | * the same as the extension input, then |
michael@0 | 216 | * - if the output is the same: remove the extension mapping |
michael@0 | 217 | * - else: error |
michael@0 | 218 | * |
michael@0 | 219 | * If the base table contains a mapping for which the input sequence is |
michael@0 | 220 | * a prefix of the extension input, then |
michael@0 | 221 | * - if moveTarget!=NULL: move the base mapping to the moveTarget table |
michael@0 | 222 | * - else: error |
michael@0 | 223 | * |
michael@0 | 224 | * @return FALSE in case of an irreparable error |
michael@0 | 225 | */ |
michael@0 | 226 | U_CAPI UBool U_EXPORT2 |
michael@0 | 227 | ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
michael@0 | 228 | UCMTable *moveTarget, UBool intersectBase); |
michael@0 | 229 | |
michael@0 | 230 | U_CAPI void U_EXPORT2 |
michael@0 | 231 | ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); |
michael@0 | 232 | |
michael@0 | 233 | U_CAPI void U_EXPORT2 |
michael@0 | 234 | ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); |
michael@0 | 235 | |
michael@0 | 236 | |
michael@0 | 237 | U_CAPI void U_EXPORT2 |
michael@0 | 238 | ucm_addState(UCMStates *states, const char *s); |
michael@0 | 239 | |
michael@0 | 240 | U_CAPI void U_EXPORT2 |
michael@0 | 241 | ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); |
michael@0 | 242 | |
michael@0 | 243 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 244 | ucm_countChars(UCMStates *states, |
michael@0 | 245 | const uint8_t *bytes, int32_t length); |
michael@0 | 246 | |
michael@0 | 247 | |
michael@0 | 248 | U_CAPI int8_t U_EXPORT2 |
michael@0 | 249 | ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); |
michael@0 | 250 | |
michael@0 | 251 | U_CAPI UBool U_EXPORT2 |
michael@0 | 252 | ucm_parseMappingLine(UCMapping *m, |
michael@0 | 253 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 254 | uint8_t bytes[UCNV_EXT_MAX_BYTES], |
michael@0 | 255 | const char *line); |
michael@0 | 256 | |
michael@0 | 257 | U_CAPI void U_EXPORT2 |
michael@0 | 258 | ucm_addMapping(UCMTable *table, |
michael@0 | 259 | UCMapping *m, |
michael@0 | 260 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 261 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); |
michael@0 | 262 | |
michael@0 | 263 | /* very makeconv-specific functions ----------------------------------------- */ |
michael@0 | 264 | |
michael@0 | 265 | /* finalize and optimize states after the toUnicode mappings are processed */ |
michael@0 | 266 | U_CAPI void U_EXPORT2 |
michael@0 | 267 | ucm_optimizeStates(UCMStates *states, |
michael@0 | 268 | uint16_t **pUnicodeCodeUnits, |
michael@0 | 269 | _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
michael@0 | 270 | UBool verbose); |
michael@0 | 271 | |
michael@0 | 272 | /* moved here because it is used inside ucmstate.c */ |
michael@0 | 273 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 274 | ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, |
michael@0 | 275 | uint32_t offset); |
michael@0 | 276 | |
michael@0 | 277 | /* very rptp2ucm-specific functions ----------------------------------------- */ |
michael@0 | 278 | |
michael@0 | 279 | /* |
michael@0 | 280 | * Input: Separate tables with mappings from/to Unicode, |
michael@0 | 281 | * subchar and subchar1 (0 if none). |
michael@0 | 282 | * All mappings must have flag 0. |
michael@0 | 283 | * |
michael@0 | 284 | * Output: fromUTable will contain the union of mappings with the correct |
michael@0 | 285 | * precision flags, and be sorted. |
michael@0 | 286 | */ |
michael@0 | 287 | U_CAPI void U_EXPORT2 |
michael@0 | 288 | ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, |
michael@0 | 289 | const uint8_t *subchar, int32_t subcharLength, |
michael@0 | 290 | uint8_t subchar1); |
michael@0 | 291 | |
michael@0 | 292 | U_CAPI UBool U_EXPORT2 |
michael@0 | 293 | ucm_separateMappings(UCMFile *ucm, UBool isSISO); |
michael@0 | 294 | |
michael@0 | 295 | U_CDECL_END |
michael@0 | 296 | |
michael@0 | 297 | #endif |
michael@0 | 298 | |
michael@0 | 299 | #endif |
michael@0 | 300 |