1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/makeconv/gencnvex.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1079 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2003-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: gencnvex.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2003oct12 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include <stdio.h> 1.21 +#include "unicode/utypes.h" 1.22 +#include "unicode/ustring.h" 1.23 +#include "cstring.h" 1.24 +#include "cmemory.h" 1.25 +#include "ucnv_cnv.h" 1.26 +#include "ucnvmbcs.h" 1.27 +#include "toolutil.h" 1.28 +#include "unewdata.h" 1.29 +#include "ucm.h" 1.30 +#include "makeconv.h" 1.31 +#include "genmbcs.h" 1.32 + 1.33 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.34 + 1.35 + 1.36 +static void 1.37 +CnvExtClose(NewConverter *cnvData); 1.38 + 1.39 +static UBool 1.40 +CnvExtIsValid(NewConverter *cnvData, 1.41 + const uint8_t *bytes, int32_t length); 1.42 + 1.43 +static UBool 1.44 +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); 1.45 + 1.46 +static uint32_t 1.47 +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, 1.48 + UNewDataMemory *pData, int32_t tableType); 1.49 + 1.50 +typedef struct CnvExtData { 1.51 + NewConverter newConverter; 1.52 + 1.53 + UCMFile *ucm; 1.54 + 1.55 + /* toUnicode (state table in ucm->states) */ 1.56 + UToolMemory *toUTable, *toUUChars; 1.57 + 1.58 + /* fromUnicode */ 1.59 + UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes; 1.60 + 1.61 + uint16_t stage1[MBCS_STAGE_1_SIZE]; 1.62 + uint16_t stage2[MBCS_STAGE_2_SIZE]; 1.63 + uint16_t 
stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */ 1.64 + uint32_t stage3b[0x10000]; 1.65 + 1.66 + int32_t stage1Top, stage2Top, stage3Top, stage3bTop; 1.67 + 1.68 + /* for stage3 compaction of <subchar1> |2 mappings */ 1.69 + uint16_t stage3Sub1Block; 1.70 + 1.71 + /* statistics */ 1.72 + int32_t 1.73 + maxInBytes, maxOutBytes, maxBytesPerUChar, 1.74 + maxInUChars, maxOutUChars, maxUCharsPerByte; 1.75 +} CnvExtData; 1.76 + 1.77 +NewConverter * 1.78 +CnvExtOpen(UCMFile *ucm) { 1.79 + CnvExtData *extData; 1.80 + 1.81 + extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData)); 1.82 + if(extData==NULL) { 1.83 + printf("out of memory\n"); 1.84 + exit(U_MEMORY_ALLOCATION_ERROR); 1.85 + } 1.86 + uprv_memset(extData, 0, sizeof(CnvExtData)); 1.87 + 1.88 + extData->ucm=ucm; /* aliased, not owned */ 1.89 + 1.90 + extData->newConverter.close=CnvExtClose; 1.91 + extData->newConverter.isValid=CnvExtIsValid; 1.92 + extData->newConverter.addTable=CnvExtAddTable; 1.93 + extData->newConverter.write=CnvExtWrite; 1.94 + return &extData->newConverter; 1.95 +} 1.96 + 1.97 +static void 1.98 +CnvExtClose(NewConverter *cnvData) { 1.99 + CnvExtData *extData=(CnvExtData *)cnvData; 1.100 + if(extData!=NULL) { 1.101 + utm_close(extData->toUTable); 1.102 + utm_close(extData->toUUChars); 1.103 + utm_close(extData->fromUTableUChars); 1.104 + utm_close(extData->fromUTableValues); 1.105 + utm_close(extData->fromUBytes); 1.106 + uprv_free(extData); 1.107 + } 1.108 +} 1.109 + 1.110 +/* we do not expect this to be called */ 1.111 +static UBool 1.112 +CnvExtIsValid(NewConverter *cnvData, 1.113 + const uint8_t *bytes, int32_t length) { 1.114 + return FALSE; 1.115 +} 1.116 + 1.117 +static uint32_t 1.118 +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, 1.119 + UNewDataMemory *pData, int32_t tableType) { 1.120 + CnvExtData *extData=(CnvExtData *)cnvData; 1.121 + int32_t length, top, headerSize; 1.122 + 1.123 + int32_t 
indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 }; 1.124 + 1.125 + if(tableType&TABLE_BASE) { 1.126 + headerSize=0; 1.127 + } else { 1.128 + _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 }; 1.129 + 1.130 + /* write the header and base table name for an extension-only table */ 1.131 + length=(int32_t)uprv_strlen(extData->ucm->baseName)+1; 1.132 + while(length&3) { 1.133 + /* add padding */ 1.134 + extData->ucm->baseName[length++]=0; 1.135 + } 1.136 + 1.137 + headerSize=MBCS_HEADER_V4_LENGTH*4+length; 1.138 + 1.139 + /* fill the header */ 1.140 + header.version[0]=4; 1.141 + header.version[1]=2; 1.142 + header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); 1.143 + 1.144 + /* write the header and the base table name */ 1.145 + udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4); 1.146 + udata_writeBlock(pData, extData->ucm->baseName, length); 1.147 + } 1.148 + 1.149 + /* fill indexes[] - offsets/indexes are in units of the target array */ 1.150 + top=0; 1.151 + 1.152 + indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH; 1.153 + top+=length*4; 1.154 + 1.155 + indexes[UCNV_EXT_TO_U_INDEX]=top; 1.156 + indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable); 1.157 + top+=length*4; 1.158 + 1.159 + indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top; 1.160 + indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars); 1.161 + top+=length*2; 1.162 + 1.163 + indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top; 1.164 + length=utm_countItems(extData->fromUTableUChars); 1.165 + top+=length*2; 1.166 + 1.167 + if(top&3) { 1.168 + /* add padding */ 1.169 + *((UChar *)utm_alloc(extData->fromUTableUChars))=0; 1.170 + *((uint32_t *)utm_alloc(extData->fromUTableValues))=0; 1.171 + ++length; 1.172 + top+=2; 1.173 + } 1.174 + indexes[UCNV_EXT_FROM_U_LENGTH]=length; 1.175 + 1.176 + indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top; 1.177 + top+=length*4; 1.178 + 1.179 + indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top; 1.180 + 
length=utm_countItems(extData->fromUBytes); 1.181 + top+=length; 1.182 + 1.183 + if(top&1) { 1.184 + /* add padding */ 1.185 + *((uint8_t *)utm_alloc(extData->fromUBytes))=0; 1.186 + ++length; 1.187 + ++top; 1.188 + } 1.189 + indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length; 1.190 + 1.191 + indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top; 1.192 + indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top; 1.193 + indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top; 1.194 + top+=length*2; 1.195 + 1.196 + indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top; 1.197 + length=extData->stage3Top; 1.198 + top+=length*2; 1.199 + 1.200 + if(top&3) { 1.201 + /* add padding */ 1.202 + extData->stage3[extData->stage3Top++]=0; 1.203 + ++length; 1.204 + top+=2; 1.205 + } 1.206 + indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length; 1.207 + 1.208 + indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top; 1.209 + indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop; 1.210 + top+=length*4; 1.211 + 1.212 + indexes[UCNV_EXT_SIZE]=top; 1.213 + 1.214 + /* statistics */ 1.215 + indexes[UCNV_EXT_COUNT_BYTES]= 1.216 + (extData->maxInBytes<<16)| 1.217 + (extData->maxOutBytes<<8)| 1.218 + extData->maxBytesPerUChar; 1.219 + indexes[UCNV_EXT_COUNT_UCHARS]= 1.220 + (extData->maxInUChars<<16)| 1.221 + (extData->maxOutUChars<<8)| 1.222 + extData->maxUCharsPerByte; 1.223 + 1.224 + indexes[UCNV_EXT_FLAGS]=extData->ucm->ext->unicodeMask; 1.225 + 1.226 + /* write the extension data */ 1.227 + udata_writeBlock(pData, indexes, sizeof(indexes)); 1.228 + udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4); 1.229 + udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2); 1.230 + 1.231 + udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2); 1.232 + udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4); 1.233 + udata_writeBlock(pData, 
utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]); 1.234 + 1.235 + udata_writeBlock(pData, extData->stage1, extData->stage1Top*2); 1.236 + udata_writeBlock(pData, extData->stage2, extData->stage2Top*2); 1.237 + udata_writeBlock(pData, extData->stage3, extData->stage3Top*2); 1.238 + udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4); 1.239 + 1.240 +#if 0 1.241 + { 1.242 + int32_t i, j; 1.243 + 1.244 + length=extData->stage1Top; 1.245 + printf("\nstage1[%x]:\n", length); 1.246 + 1.247 + for(i=0; i<length; ++i) { 1.248 + if(extData->stage1[i]!=length) { 1.249 + printf("stage1[%04x]=%04x\n", i, extData->stage1[i]); 1.250 + } 1.251 + } 1.252 + 1.253 + j=length; 1.254 + length=extData->stage2Top; 1.255 + printf("\nstage2[%x]:\n", length); 1.256 + 1.257 + for(i=0; i<length; ++j, ++i) { 1.258 + if(extData->stage2[i]!=0) { 1.259 + printf("stage12[%04x]=%04x\n", j, extData->stage2[i]); 1.260 + } 1.261 + } 1.262 + 1.263 + length=extData->stage3Top; 1.264 + printf("\nstage3[%x]:\n", length); 1.265 + 1.266 + for(i=0; i<length; ++i) { 1.267 + if(extData->stage3[i]!=0) { 1.268 + printf("stage3[%04x]=%04x\n", i, extData->stage3[i]); 1.269 + } 1.270 + } 1.271 + 1.272 + length=extData->stage3bTop; 1.273 + printf("\nstage3b[%x]:\n", length); 1.274 + 1.275 + for(i=0; i<length; ++i) { 1.276 + if(extData->stage3b[i]!=0) { 1.277 + printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]); 1.278 + } 1.279 + } 1.280 + } 1.281 +#endif 1.282 + 1.283 + if(VERBOSE) { 1.284 + printf("size of extension data: %ld\n", (long)top); 1.285 + } 1.286 + 1.287 + /* return the number of bytes that should have been written */ 1.288 + return (uint32_t)(headerSize+top); 1.289 +} 1.290 + 1.291 +/* to Unicode --------------------------------------------------------------- */ 1.292 + 1.293 +/* 1.294 + * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for 1.295 + * the toUnicode table. 
1.296 + * This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable 1.297 + * for the base toUnicode table but not for the base fromUnicode table. 1.298 + * The table must be sorted. 1.299 + * Modifies previous data in the reverseMap. 1.300 + */ 1.301 +static int32_t 1.302 +reduceToUMappings(UCMTable *table) { 1.303 + UCMapping *mappings; 1.304 + int32_t *map; 1.305 + int32_t i, j, count; 1.306 + int8_t flag; 1.307 + 1.308 + mappings=table->mappings; 1.309 + map=table->reverseMap; 1.310 + count=table->mappingsLength; 1.311 + 1.312 + /* leave the map alone for the initial mappings with desired flags */ 1.313 + for(i=j=0; i<count; ++i) { 1.314 + flag=mappings[map[i]].f; 1.315 + if(flag!=0 && flag!=3) { 1.316 + break; 1.317 + } 1.318 + } 1.319 + 1.320 + /* reduce from here to the rest */ 1.321 + for(j=i; i<count; ++i) { 1.322 + flag=mappings[map[i]].f; 1.323 + if(flag==0 || flag==3) { 1.324 + map[j++]=map[i]; 1.325 + } 1.326 + } 1.327 + 1.328 + return j; 1.329 +} 1.330 + 1.331 +static uint32_t 1.332 +getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { 1.333 + UChar32 *u32; 1.334 + UChar *u; 1.335 + uint32_t value; 1.336 + int32_t u16Length, ratio; 1.337 + UErrorCode errorCode; 1.338 + 1.339 + /* write the Unicode result code point or string index */ 1.340 + if(m->uLen==1) { 1.341 + u16Length=U16_LENGTH(m->u); 1.342 + value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u); 1.343 + } else { 1.344 + /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */ 1.345 + 1.346 + /* get the result code point string and its 16-bit string length */ 1.347 + u32=UCM_GET_CODE_POINTS(table, m); 1.348 + errorCode=U_ZERO_ERROR; 1.349 + u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode); 1.350 + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { 1.351 + exit(errorCode); 1.352 + } 1.353 + 1.354 + /* allocate it and put its length and index into the value */ 1.355 + value= 1.356 + 
(((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)| 1.357 + ((uint32_t)utm_countItems(extData->toUUChars)); 1.358 + u=utm_allocN(extData->toUUChars, u16Length); 1.359 + 1.360 + /* write the result 16-bit string */ 1.361 + errorCode=U_ZERO_ERROR; 1.362 + u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode); 1.363 + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { 1.364 + exit(errorCode); 1.365 + } 1.366 + } 1.367 + if(m->f==0) { 1.368 + value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG; 1.369 + } 1.370 + 1.371 + /* update statistics */ 1.372 + if(m->bLen>extData->maxInBytes) { 1.373 + extData->maxInBytes=m->bLen; 1.374 + } 1.375 + if(u16Length>extData->maxOutUChars) { 1.376 + extData->maxOutUChars=u16Length; 1.377 + } 1.378 + 1.379 + ratio=(u16Length+(m->bLen-1))/m->bLen; 1.380 + if(ratio>extData->maxUCharsPerByte) { 1.381 + extData->maxUCharsPerByte=ratio; 1.382 + } 1.383 + 1.384 + return value; 1.385 +} 1.386 + 1.387 +/* 1.388 + * Recursive toUTable generator core function. 1.389 + * Preconditions: 1.390 + * - start<limit (There is at least one mapping.) 1.391 + * - The mappings are sorted lexically. (Access is through the reverseMap.) 1.392 + * - All mappings between start and limit have input sequences that share 1.393 + * the same prefix of unitIndex length, and therefore all of these sequences 1.394 + * are at least unitIndex+1 long. 1.395 + * - There are only relevant mappings available through the reverseMap, 1.396 + * see reduceToUMappings(). 1.397 + * 1.398 + * One function invocation generates one section table. 1.399 + * 1.400 + * Steps: 1.401 + * 1. Count the number of unique unit values and get the low/high unit values 1.402 + * that occur at unitIndex. 1.403 + * 2. Allocate the section table with possible optimization for linear access. 1.404 + * 3. Write temporary version of the section table with start indexes of 1.405 + * subsections, each corresponding to one unit value at unitIndex. 1.406 + * 4. 
Iterate through the table once more, and depending on the subsection length: 1.407 + * 0: write 0 as a result value (unused byte in linear-access section table) 1.408 + * >0: if there is one mapping with an input unit sequence of unitIndex+1 1.409 + * then defaultValue=compute the mapping result for this whole sequence 1.410 + * else defaultValue=0 1.411 + * 1.412 + * recurse into the subsection 1.413 + */ 1.414 +static UBool 1.415 +generateToUTable(CnvExtData *extData, UCMTable *table, 1.416 + int32_t start, int32_t limit, int32_t unitIndex, 1.417 + uint32_t defaultValue) { 1.418 + UCMapping *mappings, *m; 1.419 + int32_t *map; 1.420 + int32_t i, j, uniqueCount, count, subStart, subLimit; 1.421 + 1.422 + uint8_t *bytes; 1.423 + int32_t low, high, prev; 1.424 + 1.425 + uint32_t *section; 1.426 + 1.427 + mappings=table->mappings; 1.428 + map=table->reverseMap; 1.429 + 1.430 + /* step 1: examine the input units; set low, high, uniqueCount */ 1.431 + m=mappings+map[start]; 1.432 + bytes=UCM_GET_BYTES(table, m); 1.433 + low=bytes[unitIndex]; 1.434 + uniqueCount=1; 1.435 + 1.436 + prev=high=low; 1.437 + for(i=start+1; i<limit; ++i) { 1.438 + m=mappings+map[i]; 1.439 + bytes=UCM_GET_BYTES(table, m); 1.440 + high=bytes[unitIndex]; 1.441 + 1.442 + if(high!=prev) { 1.443 + prev=high; 1.444 + ++uniqueCount; 1.445 + } 1.446 + } 1.447 + 1.448 + /* step 2: allocate the section; set count, section */ 1.449 + count=(high-low)+1; 1.450 + if(count<0x100 && (unitIndex==0 || uniqueCount>=(3*count)/4)) { 1.451 + /* 1.452 + * for the root table and for fairly full tables: 1.453 + * allocate for direct, linear array access 1.454 + * by keeping count, to write an entry for each unit value 1.455 + * from low to high 1.456 + * exception: use a compact table if count==0x100 because 1.457 + * that cannot be encoded in the length byte 1.458 + */ 1.459 + } else { 1.460 + count=uniqueCount; 1.461 + } 1.462 + 1.463 + if(count>=0x100) { 1.464 + fprintf(stderr, "error: toUnicode extension table 
section overflow: %ld section entries\n", (long)count); 1.465 + return FALSE; 1.466 + } 1.467 + 1.468 + /* allocate the section: 1 entry for the header + count for the items */ 1.469 + section=(uint32_t *)utm_allocN(extData->toUTable, 1+count); 1.470 + 1.471 + /* write the section header */ 1.472 + *section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue; 1.473 + 1.474 + /* step 3: write temporary section table with subsection starts */ 1.475 + prev=low-1; /* just before low to prevent empty subsections before low */ 1.476 + j=0; /* section table index */ 1.477 + for(i=start; i<limit; ++i) { 1.478 + m=mappings+map[i]; 1.479 + bytes=UCM_GET_BYTES(table, m); 1.480 + high=bytes[unitIndex]; 1.481 + 1.482 + if(high!=prev) { 1.483 + /* start of a new subsection for unit high */ 1.484 + if(count>uniqueCount) { 1.485 + /* write empty subsections for unused units in a linear table */ 1.486 + while(++prev<high) { 1.487 + section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i; 1.488 + } 1.489 + } else { 1.490 + prev=high; 1.491 + } 1.492 + 1.493 + /* write the entry with the subsection start */ 1.494 + section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i; 1.495 + } 1.496 + } 1.497 + /* assert(j==count) */ 1.498 + 1.499 + /* step 4: recurse and write results */ 1.500 + subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]); 1.501 + for(j=0; j<count; ++j) { 1.502 + subStart=subLimit; 1.503 + subLimit= (j+1)<count ? 
UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit; 1.504 + 1.505 + /* remove the subStart temporary value */ 1.506 + section[j]&=~UCNV_EXT_TO_U_VALUE_MASK; 1.507 + 1.508 + if(subStart==subLimit) { 1.509 + /* leave the value zero: empty subsection for unused unit in a linear table */ 1.510 + continue; 1.511 + } 1.512 + 1.513 + /* see if there is exactly one input unit sequence of length unitIndex+1 */ 1.514 + defaultValue=0; 1.515 + m=mappings+map[subStart]; 1.516 + if(m->bLen==unitIndex+1) { 1.517 + /* do not include this in generateToUTable() */ 1.518 + ++subStart; 1.519 + 1.520 + if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) { 1.521 + /* print error for multiple same-input-sequence mappings */ 1.522 + fprintf(stderr, "error: multiple mappings from same bytes\n"); 1.523 + ucm_printMapping(table, m, stderr); 1.524 + ucm_printMapping(table, mappings+map[subStart], stderr); 1.525 + return FALSE; 1.526 + } 1.527 + 1.528 + defaultValue=getToUnicodeValue(extData, table, m); 1.529 + } 1.530 + 1.531 + if(subStart==subLimit) { 1.532 + /* write the result for the input sequence ending here */ 1.533 + section[j]|=defaultValue; 1.534 + } else { 1.535 + /* write the index to the subsection table */ 1.536 + section[j]|=(uint32_t)utm_countItems(extData->toUTable); 1.537 + 1.538 + /* recurse */ 1.539 + if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { 1.540 + return FALSE; 1.541 + } 1.542 + } 1.543 + } 1.544 + return TRUE; 1.545 +} 1.546 + 1.547 +/* 1.548 + * Generate the toUTable and toUUChars from the input table. 1.549 + * The input table must be sorted, and all precision flags must be 0..3. 1.550 + * This function will modify the table's reverseMap. 
1.551 + */ 1.552 +static UBool 1.553 +makeToUTable(CnvExtData *extData, UCMTable *table) { 1.554 + int32_t toUCount; 1.555 + 1.556 + toUCount=reduceToUMappings(table); 1.557 + 1.558 + extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4); 1.559 + extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2); 1.560 + 1.561 + return generateToUTable(extData, table, 0, toUCount, 0, 0); 1.562 +} 1.563 + 1.564 +/* from Unicode ------------------------------------------------------------- */ 1.565 + 1.566 +/* 1.567 + * preprocessing: 1.568 + * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode 1.569 + * change each Unicode string to encode all but the first code point in 16-bit form 1.570 + * 1.571 + * generation: 1.572 + * for each unique code point 1.573 + * write an entry in the 3-stage trie 1.574 + * check that there is only one single-code point sequence 1.575 + * start recursion for following 16-bit input units 1.576 + */ 1.577 + 1.578 +/* 1.579 + * Remove toUnicode fallbacks and non-<subchar1> SUB mappings 1.580 + * which are irrelevant for the fromUnicode extension table. 1.581 + * Remove MBCS_FROM_U_EXT_FLAG bits. 1.582 + * Overwrite the reverseMap with an index array to the relevant mappings. 1.583 + * Modify the code point sequences to a generator-friendly format where 1.584 + * the first code point remains unchanged but the following are recoded 1.585 + * into 16-bit Unicode string form. 1.586 + * The table must be sorted. 1.587 + * Destroys previous data in the reverseMap.
1.588 + */ 1.589 +static int32_t 1.590 +prepareFromUMappings(UCMTable *table) { 1.591 + UCMapping *mappings, *m; 1.592 + int32_t *map; 1.593 + int32_t i, j, count; 1.594 + int8_t flag; 1.595 + 1.596 + mappings=table->mappings; 1.597 + map=table->reverseMap; 1.598 + count=table->mappingsLength; 1.599 + 1.600 + /* 1.601 + * we do not go through the map on input because the mappings are 1.602 + * sorted lexically 1.603 + */ 1.604 + m=mappings; 1.605 + 1.606 + for(i=j=0; i<count; ++m, ++i) { 1.607 + flag=m->f; 1.608 + if(flag>=0) { 1.609 + flag&=MBCS_FROM_U_EXT_MASK; 1.610 + m->f=flag; 1.611 + } 1.612 + if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) { 1.613 + map[j++]=i; 1.614 + 1.615 + if(m->uLen>1) { 1.616 + /* recode all but the first code point to 16-bit Unicode */ 1.617 + UChar32 *u32; 1.618 + UChar *u; 1.619 + UChar32 c; 1.620 + int32_t q, r; 1.621 + 1.622 + u32=UCM_GET_CODE_POINTS(table, m); 1.623 + u=(UChar *)u32; /* destructive in-place recoding */ 1.624 + for(r=2, q=1; q<m->uLen; ++q) { 1.625 + c=u32[q]; 1.626 + U16_APPEND_UNSAFE(u, r, c); 1.627 + } 1.628 + 1.629 + /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */ 1.630 + m->uLen=(int8_t)r; 1.631 + } 1.632 + } 1.633 + } 1.634 + 1.635 + return j; 1.636 +} 1.637 + 1.638 +static uint32_t 1.639 +getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { 1.640 + uint8_t *bytes, *resultBytes; 1.641 + uint32_t value; 1.642 + int32_t u16Length, ratio; 1.643 + 1.644 + if(m->f==2) { 1.645 + /* 1.646 + * no mapping, <subchar1> preferred 1.647 + * 1.648 + * no need to count in statistics because the subchars are already 1.649 + * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData, 1.650 + * and this non-mapping does not count for maxInUChars which are always 1.651 + * trivially at least two if counting unmappable supplementary code points 1.652 + */ 1.653 + return UCNV_EXT_FROM_U_SUBCHAR1; 1.654 + } 1.655 + 1.656 + 
bytes=UCM_GET_BYTES(table, m); 1.657 + value=0; 1.658 + switch(m->bLen) { 1.659 + /* 1..3: store the bytes in the value word */ 1.660 + case 3: 1.661 + value=((uint32_t)*bytes++)<<16; /* fall through: the remaining two bytes are added below */ 1.662 + case 2: 1.663 + value|=((uint32_t)*bytes++)<<8; /* fall through: the last byte is added below */ 1.664 + case 1: 1.665 + value|=*bytes; 1.666 + break; 1.667 + default: 1.668 + /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */ 1.669 + /* store the bytes in fromUBytes[] and the index in the value word */ 1.670 + value=(uint32_t)utm_countItems(extData->fromUBytes); 1.671 + resultBytes=utm_allocN(extData->fromUBytes, m->bLen); 1.672 + uprv_memcpy(resultBytes, bytes, m->bLen); 1.673 + break; 1.674 + } 1.675 + value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT; 1.676 + if(m->f==0) { 1.677 + value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; 1.678 + } else if(m->f==4) { 1.679 + value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG; 1.680 + } 1.681 + 1.682 + /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */ 1.683 + if(m->uLen==1) { 1.684 + u16Length=U16_LENGTH(m->u); 1.685 + } else { 1.686 + u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2); 1.687 + } 1.688 + 1.689 + /* update statistics */ 1.690 + if(u16Length>extData->maxInUChars) { 1.691 + extData->maxInUChars=u16Length; 1.692 + } 1.693 + if(m->bLen>extData->maxOutBytes) { 1.694 + extData->maxOutBytes=m->bLen; 1.695 + } 1.696 + 1.697 + ratio=(m->bLen+(u16Length-1))/u16Length; /* output bytes per input UChar, rounded up */ 1.698 + if(ratio>extData->maxBytesPerUChar) { 1.699 + extData->maxBytesPerUChar=ratio; 1.700 + } 1.701 + 1.702 + return value; 1.703 +} 1.704 + 1.705 +/* 1.706 + * works like generateToUTable(), except that the 1.707 + * output section consists of two arrays, one for input UChars and one 1.708 + * for result values 1.709 + * 1.710 + * also, fromUTable sections are always stored in a compact form for 1.711 + * access via binary search 1.712 + */ 1.713 +static UBool 1.714 +generateFromUTable(CnvExtData *extData, UCMTable *table, 1.715 + int32_t start, int32_t limit, int32_t
unitIndex, 1.716 + uint32_t defaultValue) { 1.717 + UCMapping *mappings, *m; 1.718 + int32_t *map; 1.719 + int32_t i, j, uniqueCount, count, subStart, subLimit; 1.720 + 1.721 + UChar *uchars; 1.722 + UChar32 low, high, prev; 1.723 + 1.724 + UChar *sectionUChars; 1.725 + uint32_t *sectionValues; 1.726 + 1.727 + mappings=table->mappings; 1.728 + map=table->reverseMap; 1.729 + 1.730 + /* step 1: examine the input units; set low, high, uniqueCount */ 1.731 + m=mappings+map[start]; 1.732 + uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); 1.733 + low=uchars[unitIndex]; 1.734 + uniqueCount=1; 1.735 + 1.736 + prev=high=low; 1.737 + for(i=start+1; i<limit; ++i) { 1.738 + m=mappings+map[i]; 1.739 + uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); 1.740 + high=uchars[unitIndex]; 1.741 + 1.742 + if(high!=prev) { 1.743 + prev=high; 1.744 + ++uniqueCount; 1.745 + } 1.746 + } 1.747 + 1.748 + /* step 2: allocate the section; set count, section */ 1.749 + /* the fromUTable always stores for access via binary search */ 1.750 + count=uniqueCount; 1.751 + 1.752 + /* allocate the section: 1 entry for the header + count for the items */ 1.753 + sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count); 1.754 + sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count); 1.755 + 1.756 + /* write the section header */ 1.757 + *sectionUChars++=(UChar)count; 1.758 + *sectionValues++=defaultValue; 1.759 + 1.760 + /* step 3: write temporary section table with subsection starts */ 1.761 + prev=low-1; /* just before low to prevent empty subsections before low */ 1.762 + j=0; /* section table index */ 1.763 + for(i=start; i<limit; ++i) { 1.764 + m=mappings+map[i]; 1.765 + uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); 1.766 + high=uchars[unitIndex]; 1.767 + 1.768 + if(high!=prev) { 1.769 + /* start of a new subsection for unit high */ 1.770 + prev=high; 1.771 + 1.772 + /* write the entry with the subsection start */ 1.773 + sectionUChars[j]=(UChar)high; 1.774 + 
sectionValues[j]=(uint32_t)i; 1.775 + ++j; 1.776 + } 1.777 + } 1.778 + /* assert(j==count) */ 1.779 + 1.780 + /* step 4: recurse and write results */ 1.781 + subLimit=(int32_t)(sectionValues[0]); /* sectionValues[] still holds the temporary subsection start indexes written in step 3 */ 1.782 + for(j=0; j<count; ++j) { 1.783 + subStart=subLimit; 1.784 + subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit; 1.785 + 1.786 + /* see if there is exactly one input unit sequence of length unitIndex+1 */ 1.787 + defaultValue=0; 1.788 + m=mappings+map[subStart]; 1.789 + if(m->uLen==unitIndex+1) { 1.790 + /* do not include this in the recursive generateFromUTable() call */ 1.791 + ++subStart; 1.792 + 1.793 + if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) { 1.794 + /* print error for multiple same-input-sequence mappings */ 1.795 + fprintf(stderr, "error: multiple mappings from same Unicode code points\n"); 1.796 + ucm_printMapping(table, m, stderr); 1.797 + ucm_printMapping(table, mappings+map[subStart], stderr); 1.798 + return FALSE; 1.799 + } 1.800 + 1.801 + defaultValue=getFromUBytesValue(extData, table, m); 1.802 + } 1.803 + 1.804 + if(subStart==subLimit) { 1.805 + /* write the result for the input sequence ending here */ 1.806 + sectionValues[j]=defaultValue; 1.807 + } else { 1.808 + /* write the index to the subsection table */ 1.809 + sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues); 1.810 + 1.811 + /* recurse */ 1.812 + if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { 1.813 + return FALSE; 1.814 + } 1.815 + } 1.816 + } 1.817 + return TRUE; 1.818 +} 1.819 + 1.820 +/* 1.821 + * add entries to the fromUnicode trie, 1.822 + * assume to be called with code points in ascending order 1.823 + * and use that to build the trie in precompacted form 1.824 + */ 1.825 +static void 1.826 +addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) { 1.827 + int32_t i1, i2, i3, i3b, nextOffset, min, newBlock; 1.828 + 1.829 + if(value==0) { 1.830 + return; 1.831 + } 1.832 + 1.833 + /* 1.834 + *
compute the index for each stage, 1.835 + * allocate a stage block if necessary, 1.836 + * and write the stage value 1.837 + */ 1.838 + i1=c>>10; 1.839 + if(i1>=extData->stage1Top) { 1.840 + extData->stage1Top=i1+1; 1.841 + } 1.842 + 1.843 + nextOffset=(c>>4)&0x3f; 1.844 + 1.845 + if(extData->stage1[i1]==0) { 1.846 + /* allocate another block in stage 2; overlap with the previous block */ 1.847 + newBlock=extData->stage2Top; 1.848 + min=newBlock-nextOffset; /* minimum block start with overlap */ 1.849 + while(min<newBlock && extData->stage2[newBlock-1]==0) { 1.850 + --newBlock; 1.851 + } 1.852 + 1.853 + extData->stage1[i1]=(uint16_t)newBlock; 1.854 + extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE; 1.855 + if(extData->stage2Top>LENGTHOF(extData->stage2)) { 1.856 + fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", (int)c); 1.857 + exit(U_MEMORY_ALLOCATION_ERROR); 1.858 + } 1.859 + } 1.860 + 1.861 + i2=extData->stage1[i1]+nextOffset; 1.862 + nextOffset=c&0xf; 1.863 + 1.864 + if(extData->stage2[i2]==0) { 1.865 + /* allocate another block in stage 3; overlap with the previous block */ 1.866 + newBlock=extData->stage3Top; 1.867 + min=newBlock-nextOffset; /* minimum block start with overlap */ 1.868 + while(min<newBlock && extData->stage3[newBlock-1]==0) { 1.869 + --newBlock; 1.870 + } 1.871 + 1.872 + /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */ 1.873 + newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1); 1.874 + extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT); 1.875 + 1.876 + extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE; 1.877 + if(extData->stage3Top>LENGTHOF(extData->stage3)) { 1.878 + fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", (int)c); 1.879 + exit(U_MEMORY_ALLOCATION_ERROR); 1.880 + } 1.881 + } 1.882 + 1.883 + i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset; 1.884 + /* 1.885 + * assume extData->stage3[i3]==0 
because we get 1.886 + * code points in strictly ascending order 1.887 + */ 1.888 + 1.889 + if(value==UCNV_EXT_FROM_U_SUBCHAR1) { 1.890 + /* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */ 1.891 + extData->stage3[i3]=1; 1.892 + 1.893 + /* 1.894 + * precompaction is not optimal for <subchar1> |2 mappings because 1.895 + * stage3 values for them are all the same, unlike for other mappings 1.896 + * which all have unique values; 1.897 + * use a simple compaction of reusing a whole block filled with these 1.898 + * mappings 1.899 + */ 1.900 + 1.901 + /* is the entire block filled with <subchar1> |2 mappings? */ 1.902 + if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) { 1.903 + for(min=i3-nextOffset; 1.904 + min<i3 && extData->stage3[min]==1; 1.905 + ++min) {} 1.906 + 1.907 + if(min==i3) { 1.908 + /* the entire block is filled with these mappings */ 1.909 + if(extData->stage3Sub1Block!=0) { 1.910 + /* point to the previous such block and remove this block from stage3 */ 1.911 + extData->stage2[i2]=extData->stage3Sub1Block; 1.912 + extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE; 1.913 + uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2); 1.914 + } else { 1.915 + /* remember this block's stage2 entry */ 1.916 + extData->stage3Sub1Block=extData->stage2[i2]; 1.917 + } 1.918 + } 1.919 + } 1.920 + } else { 1.921 + if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) { 1.922 + fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", (int)c); 1.923 + exit(U_MEMORY_ALLOCATION_ERROR); 1.924 + } 1.925 + 1.926 + /* roundtrip or fallback mapping */ 1.927 + extData->stage3[i3]=(uint16_t)i3b; 1.928 + extData->stage3b[i3b]=value; 1.929 + } 1.930 +} 1.931 + 1.932 +static UBool 1.933 +generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) { 1.934 + UCMapping *mappings, *m; 1.935 + int32_t *map; 1.936 + uint32_t value; 1.937 + int32_t subStart, subLimit; 1.938 + 1.939 + UChar32 *codePoints; 1.940 + UChar32 
c, next; 1.941 + 1.942 + if(mapLength==0) { 1.943 + return TRUE; 1.944 + } 1.945 + 1.946 + mappings=table->mappings; 1.947 + map=table->reverseMap; 1.948 + 1.949 + /* 1.950 + * iterate over same-initial-code point mappings, 1.951 + * enter the initial code point into the trie, 1.952 + * and start a recursion on the corresponding mappings section 1.953 + * with generateFromUTable() 1.954 + */ 1.955 + m=mappings+map[0]; 1.956 + codePoints=UCM_GET_CODE_POINTS(table, m); 1.957 + next=codePoints[0]; 1.958 + subLimit=0; 1.959 + while(subLimit<mapLength) { 1.960 + /* get a new subsection of mappings starting with the same code point */ 1.961 + subStart=subLimit; 1.962 + c=next; 1.963 + while(next==c && ++subLimit<mapLength) { 1.964 + m=mappings+map[subLimit]; 1.965 + codePoints=UCM_GET_CODE_POINTS(table, m); 1.966 + next=codePoints[0]; 1.967 + } 1.968 + 1.969 + /* 1.970 + * compute the value for this code point; 1.971 + * if there is a mapping for this code point alone, it is at subStart 1.972 + * because the table is sorted lexically 1.973 + */ 1.974 + value=0; 1.975 + m=mappings+map[subStart]; 1.976 + codePoints=UCM_GET_CODE_POINTS(table, m); 1.977 + if(m->uLen==1) { 1.978 + /* do not include this in generateFromUTable() */ 1.979 + ++subStart; 1.980 + 1.981 + if(subStart<subLimit && mappings[map[subStart]].uLen==1) { 1.982 + /* print error for multiple same-input-sequence mappings */ 1.983 + fprintf(stderr, "error: multiple mappings from same Unicode code points\n"); 1.984 + ucm_printMapping(table, m, stderr); 1.985 + ucm_printMapping(table, mappings+map[subStart], stderr); 1.986 + return FALSE; 1.987 + } 1.988 + 1.989 + value=getFromUBytesValue(extData, table, m); 1.990 + } 1.991 + 1.992 + if(subStart==subLimit) { 1.993 + /* write the result for this one code point */ 1.994 + addFromUTrieEntry(extData, c, value); 1.995 + } else { 1.996 + /* write the index to the subsection table */ 1.997 + addFromUTrieEntry(extData, c, 
(uint32_t)utm_countItems(extData->fromUTableValues)); 1.998 + 1.999 + /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */ 1.1000 + if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) { 1.1001 + return FALSE; 1.1002 + } 1.1003 + } 1.1004 + } 1.1005 + return TRUE; 1.1006 +} 1.1007 + 1.1008 +/* 1.1009 + * Generate the fromU data structures from the input table. 1.1010 + * The input table must be sorted, and all precision flags must be 0..3. 1.1011 + * This function will modify the table's reverseMap. 1.1012 + */ 1.1013 +static UBool 1.1014 +makeFromUTable(CnvExtData *extData, UCMTable *table) { 1.1015 + uint16_t *stage1; 1.1016 + int32_t i, stage1Top, fromUCount; 1.1017 + 1.1018 + fromUCount=prepareFromUMappings(table); 1.1019 + 1.1020 + extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2); 1.1021 + extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4); 1.1022 + extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1); 1.1023 + 1.1024 + /* allocate all-unassigned stage blocks */ 1.1025 + extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; 1.1026 + extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED; 1.1027 + 1.1028 + /* 1.1029 + * stage 3b stores only unique values, and in 1.1030 + * index 0: 0 for "no mapping" 1.1031 + * index 1: "no mapping" with preference for <subchar1> rather than <subchar> 1.1032 + */ 1.1033 + extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1; 1.1034 + extData->stage3bTop=2; 1.1035 + 1.1036 + /* allocate the first entry in the fromUTable because index 0 means "no result" */ 1.1037 + utm_alloc(extData->fromUTableUChars); 1.1038 + utm_alloc(extData->fromUTableValues); 1.1039 + 1.1040 + if(!generateFromUTrie(extData, table, fromUCount)) { 1.1041 + return FALSE; 1.1042 + } 1.1043 + 1.1044 + /* 1.1045 + * offset the stage 1 trie entries by stage1Top because they will 
1.1046 + * be stored in a single array 1.1047 + */ 1.1048 + stage1=extData->stage1; 1.1049 + stage1Top=extData->stage1Top; 1.1050 + for(i=0; i<stage1Top; ++i) { 1.1051 + stage1[i]=(uint16_t)(stage1[i]+stage1Top); 1.1052 + } 1.1053 + 1.1054 + return TRUE; 1.1055 +} 1.1056 + 1.1057 +/* -------------------------------------------------------------------------- */ 1.1058 + 1.1059 +static UBool 1.1060 +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { 1.1061 + CnvExtData *extData; 1.1062 + 1.1063 + if(table->unicodeMask&UCNV_HAS_SURROGATES) { 1.1064 + fprintf(stderr, "error: contains mappings for surrogate code points\n"); 1.1065 + return FALSE; 1.1066 + } 1.1067 + 1.1068 + staticData->conversionType=UCNV_MBCS; 1.1069 + 1.1070 + extData=(CnvExtData *)cnvData; 1.1071 + 1.1072 + /* 1.1073 + * assume that the table is sorted 1.1074 + * 1.1075 + * call the functions in this order because 1.1076 + * makeToUTable() modifies the original reverseMap, 1.1077 + * makeFromUTable() writes a whole new mapping into reverseMap 1.1078 + */ 1.1079 + return 1.1080 + makeToUTable(extData, table) && 1.1081 + makeFromUTable(extData, table); 1.1082 +}