1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucmndata.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,382 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 1999-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************/ 1.11 + 1.12 + 1.13 +/*------------------------------------------------------------------------------ 1.14 + * 1.15 + * UCommonData An abstract interface for dealing with ICU Common Data Files. 1.16 + * ICU Common Data Files are a grouping of a number of individual 1.17 + * data items (resources, converters, tables, anything) into a 1.18 + * single file or dll. The combined format includes a table of 1.19 + * contents for locating the individual items by name. 1.20 + * 1.21 + * Two formats for the table of contents are supported, which is 1.22 + * why there is an abstract inteface involved. 1.23 + * 1.24 + */ 1.25 + 1.26 +#include "unicode/utypes.h" 1.27 +#include "unicode/udata.h" 1.28 +#include "cstring.h" 1.29 +#include "ucmndata.h" 1.30 +#include "udatamem.h" 1.31 + 1.32 +#if defined(UDATA_DEBUG) || defined(UDATA_DEBUG_DUMP) 1.33 +# include <stdio.h> 1.34 +#endif 1.35 + 1.36 +U_CFUNC uint16_t 1.37 +udata_getHeaderSize(const DataHeader *udh) { 1.38 + if(udh==NULL) { 1.39 + return 0; 1.40 + } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) { 1.41 + /* same endianness */ 1.42 + return udh->dataHeader.headerSize; 1.43 + } else { 1.44 + /* opposite endianness */ 1.45 + uint16_t x=udh->dataHeader.headerSize; 1.46 + return (uint16_t)((x<<8)|(x>>8)); 1.47 + } 1.48 +} 1.49 + 1.50 +U_CFUNC uint16_t 1.51 +udata_getInfoSize(const UDataInfo *info) { 1.52 + if(info==NULL) { 1.53 + return 0; 1.54 + } else if(info->isBigEndian==U_IS_BIG_ENDIAN) { 1.55 + /* same endianness */ 1.56 + return info->size; 1.57 + } else { 1.58 + /* opposite endianness */ 1.59 + uint16_t x=info->size; 1.60 + return (uint16_t)((x<<8)|(x>>8)); 1.61 + } 1.62 +} 1.63 + 1.64 +/*-----------------------------------------------------------------------------* 1.65 + * * 1.66 + * Pointer TOCs. TODO: This form of table-of-contents should be removed * 1.67 + * because DLLs must be relocated on loading to correct the * 1.68 + * pointer values and this operation makes shared memory * 1.69 + * mapping of the data much less likely to work. * 1.70 + * * 1.71 + *-----------------------------------------------------------------------------*/ 1.72 +typedef struct { 1.73 + const char *entryName; 1.74 + const DataHeader *pHeader; 1.75 +} PointerTOCEntry; 1.76 + 1.77 + 1.78 +typedef struct { 1.79 + uint32_t count; 1.80 + uint32_t reserved; 1.81 + PointerTOCEntry entry[2]; /* Actual size is from count. */ 1.82 +} PointerTOC; 1.83 + 1.84 + 1.85 +/* definition of OffsetTOC struct types moved to ucmndata.h */ 1.86 + 1.87 +/*-----------------------------------------------------------------------------* 1.88 + * * 1.89 + * entry point lookup implementations * 1.90 + * * 1.91 + *-----------------------------------------------------------------------------*/ 1.92 + 1.93 +#ifndef MIN 1.94 +#define MIN(a,b) (((a)<(b)) ? (a) : (b)) 1.95 +#endif 1.96 + 1.97 +/** 1.98 + * Compare strings where we know the shared prefix length, 1.99 + * and advance the prefix length as we find that the strings share even more characters. 1.100 + */ 1.101 +static int32_t 1.102 +strcmpAfterPrefix(const char *s1, const char *s2, int32_t *pPrefixLength) { 1.103 + int32_t pl=*pPrefixLength; 1.104 + int32_t cmp=0; 1.105 + s1+=pl; 1.106 + s2+=pl; 1.107 + for(;;) { 1.108 + int32_t c1=(uint8_t)*s1++; 1.109 + int32_t c2=(uint8_t)*s2++; 1.110 + cmp=c1-c2; 1.111 + if(cmp!=0 || c1==0) { /* different or done */ 1.112 + break; 1.113 + } 1.114 + ++pl; /* increment shared same-prefix length */ 1.115 + } 1.116 + *pPrefixLength=pl; 1.117 + return cmp; 1.118 +} 1.119 + 1.120 +static int32_t 1.121 +offsetTOCPrefixBinarySearch(const char *s, const char *names, 1.122 + const UDataOffsetTOCEntry *toc, int32_t count) { 1.123 + int32_t start=0; 1.124 + int32_t limit=count; 1.125 + /* 1.126 + * Remember the shared prefix between s, start and limit, 1.127 + * and don't compare that shared prefix again. 1.128 + * The shared prefix should get longer as we narrow the [start, limit[ range. 1.129 + */ 1.130 + int32_t startPrefixLength=0; 1.131 + int32_t limitPrefixLength=0; 1.132 + if(count==0) { 1.133 + return -1; 1.134 + } 1.135 + /* 1.136 + * Prime the prefix lengths so that we don't keep prefixLength at 0 until 1.137 + * both the start and limit indexes have moved. 1.138 + * At the same time, we find if s is one of the start and (limit-1) names, 1.139 + * and if not, exclude them from the actual binary search. 1.140 + */ 1.141 + if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, &startPrefixLength)) { 1.142 + return 0; 1.143 + } 1.144 + ++start; 1.145 + --limit; 1.146 + if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, &limitPrefixLength)) { 1.147 + return limit; 1.148 + } 1.149 + while(start<limit) { 1.150 + int32_t i=(start+limit)/2; 1.151 + int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 1.152 + int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, &prefixLength); 1.153 + if(cmp<0) { 1.154 + limit=i; 1.155 + limitPrefixLength=prefixLength; 1.156 + } else if(cmp==0) { 1.157 + return i; 1.158 + } else { 1.159 + start=i+1; 1.160 + startPrefixLength=prefixLength; 1.161 + } 1.162 + } 1.163 + return -1; 1.164 +} 1.165 + 1.166 +static int32_t 1.167 +pointerTOCPrefixBinarySearch(const char *s, const PointerTOCEntry *toc, int32_t count) { 1.168 + int32_t start=0; 1.169 + int32_t limit=count; 1.170 + /* 1.171 + * Remember the shared prefix between s, start and limit, 1.172 + * and don't compare that shared prefix again. 1.173 + * The shared prefix should get longer as we narrow the [start, limit[ range. 1.174 + */ 1.175 + int32_t startPrefixLength=0; 1.176 + int32_t limitPrefixLength=0; 1.177 + if(count==0) { 1.178 + return -1; 1.179 + } 1.180 + /* 1.181 + * Prime the prefix lengths so that we don't keep prefixLength at 0 until 1.182 + * both the start and limit indexes have moved. 1.183 + * At the same time, we find if s is one of the start and (limit-1) names, 1.184 + * and if not, exclude them from the actual binary search. 1.185 + */ 1.186 + if(0==strcmpAfterPrefix(s, toc[0].entryName, &startPrefixLength)) { 1.187 + return 0; 1.188 + } 1.189 + ++start; 1.190 + --limit; 1.191 + if(0==strcmpAfterPrefix(s, toc[limit].entryName, &limitPrefixLength)) { 1.192 + return limit; 1.193 + } 1.194 + while(start<limit) { 1.195 + int32_t i=(start+limit)/2; 1.196 + int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 1.197 + int32_t cmp=strcmpAfterPrefix(s, toc[i].entryName, &prefixLength); 1.198 + if(cmp<0) { 1.199 + limit=i; 1.200 + limitPrefixLength=prefixLength; 1.201 + } else if(cmp==0) { 1.202 + return i; 1.203 + } else { 1.204 + start=i+1; 1.205 + startPrefixLength=prefixLength; 1.206 + } 1.207 + } 1.208 + return -1; 1.209 +} 1.210 + 1.211 +static uint32_t offsetTOCEntryCount(const UDataMemory *pData) { 1.212 + int32_t retVal=0; 1.213 + const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 1.214 + if (toc != NULL) { 1.215 + retVal = toc->count; 1.216 + } 1.217 + return retVal; 1.218 +} 1.219 + 1.220 +static const DataHeader * 1.221 +offsetTOCLookupFn(const UDataMemory *pData, 1.222 + const char *tocEntryName, 1.223 + int32_t *pLength, 1.224 + UErrorCode *pErrorCode) { 1.225 + const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 1.226 + if(toc!=NULL) { 1.227 + const char *base=(const char *)toc; 1.228 + int32_t number, count=(int32_t)toc->count; 1.229 + 1.230 + /* perform a binary search for the data in the common data's table of contents */ 1.231 +#if defined (UDATA_DEBUG_DUMP) 1.232 + /* list the contents of the TOC each time .. not recommended */ 1.233 + for(number=0; number<count; ++number) { 1.234 + fprintf(stderr, "\tx%d: %s\n", number, &base[toc->entry[number].nameOffset]); 1.235 + } 1.236 +#endif 1.237 + number=offsetTOCPrefixBinarySearch(tocEntryName, base, toc->entry, count); 1.238 + if(number>=0) { 1.239 + /* found it */ 1.240 + const UDataOffsetTOCEntry *entry=toc->entry+number; 1.241 +#ifdef UDATA_DEBUG 1.242 + fprintf(stderr, "%s: Found.\n", tocEntryName); 1.243 +#endif 1.244 + if((number+1) < count) { 1.245 + *pLength = (int32_t)(entry[1].dataOffset - entry->dataOffset); 1.246 + } else { 1.247 + *pLength = -1; 1.248 + } 1.249 + return (const DataHeader *)(base+entry->dataOffset); 1.250 + } else { 1.251 +#ifdef UDATA_DEBUG 1.252 + fprintf(stderr, "%s: Not found.\n", tocEntryName); 1.253 +#endif 1.254 + return NULL; 1.255 + } 1.256 + } else { 1.257 +#ifdef UDATA_DEBUG 1.258 + fprintf(stderr, "returning header\n"); 1.259 +#endif 1.260 + 1.261 + return pData->pHeader; 1.262 + } 1.263 +} 1.264 + 1.265 + 1.266 +static uint32_t pointerTOCEntryCount(const UDataMemory *pData) { 1.267 + const PointerTOC *toc = (PointerTOC *)pData->toc; 1.268 + return (uint32_t)((toc != NULL) ? (toc->count) : 0); 1.269 +} 1.270 + 1.271 + 1.272 +static const DataHeader *pointerTOCLookupFn(const UDataMemory *pData, 1.273 + const char *name, 1.274 + int32_t *pLength, 1.275 + UErrorCode *pErrorCode) { 1.276 + if(pData->toc!=NULL) { 1.277 + const PointerTOC *toc = (PointerTOC *)pData->toc; 1.278 + int32_t number, count=(int32_t)toc->count; 1.279 + 1.280 +#if defined (UDATA_DEBUG_DUMP) 1.281 + /* list the contents of the TOC each time .. not recommended */ 1.282 + for(number=0; number<count; ++number) { 1.283 + fprintf(stderr, "\tx%d: %s\n", number, toc->entry[number].entryName); 1.284 + } 1.285 +#endif 1.286 + number=pointerTOCPrefixBinarySearch(name, toc->entry, count); 1.287 + if(number>=0) { 1.288 + /* found it */ 1.289 +#ifdef UDATA_DEBUG 1.290 + fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName); 1.291 +#endif 1.292 + *pLength=-1; 1.293 + return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader); 1.294 + } else { 1.295 +#ifdef UDATA_DEBUG 1.296 + fprintf(stderr, "%s: Not found.\n", name); 1.297 +#endif 1.298 + return NULL; 1.299 + } 1.300 + } else { 1.301 + return pData->pHeader; 1.302 + } 1.303 +} 1.304 + 1.305 +static const commonDataFuncs CmnDFuncs = {offsetTOCLookupFn, offsetTOCEntryCount}; 1.306 +static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCount}; 1.307 + 1.308 + 1.309 + 1.310 +/*----------------------------------------------------------------------* 1.311 + * * 1.312 + * checkCommonData Validate the format of a common data file. * 1.313 + * Fill in the virtual function ptr based on TOC type * 1.314 + * If the data is invalid, close the UDataMemory * 1.315 + * and set the appropriate error code. * 1.316 + * * 1.317 + *----------------------------------------------------------------------*/ 1.318 +U_CFUNC void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) { 1.319 + if (U_FAILURE(*err)) { 1.320 + return; 1.321 + } 1.322 + 1.323 + if(udm==NULL || udm->pHeader==NULL) { 1.324 + *err=U_INVALID_FORMAT_ERROR; 1.325 + } else if(!(udm->pHeader->dataHeader.magic1==0xda && 1.326 + udm->pHeader->dataHeader.magic2==0x27 && 1.327 + udm->pHeader->info.isBigEndian==U_IS_BIG_ENDIAN && 1.328 + udm->pHeader->info.charsetFamily==U_CHARSET_FAMILY) 1.329 + ) { 1.330 + /* header not valid */ 1.331 + *err=U_INVALID_FORMAT_ERROR; 1.332 + } 1.333 + else if (udm->pHeader->info.dataFormat[0]==0x43 && 1.334 + udm->pHeader->info.dataFormat[1]==0x6d && 1.335 + udm->pHeader->info.dataFormat[2]==0x6e && 1.336 + udm->pHeader->info.dataFormat[3]==0x44 && 1.337 + udm->pHeader->info.formatVersion[0]==1 1.338 + ) { 1.339 + /* dataFormat="CmnD" */ 1.340 + udm->vFuncs = &CmnDFuncs; 1.341 + udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 1.342 + } 1.343 + else if(udm->pHeader->info.dataFormat[0]==0x54 && 1.344 + udm->pHeader->info.dataFormat[1]==0x6f && 1.345 + udm->pHeader->info.dataFormat[2]==0x43 && 1.346 + udm->pHeader->info.dataFormat[3]==0x50 && 1.347 + udm->pHeader->info.formatVersion[0]==1 1.348 + ) { 1.349 + /* dataFormat="ToCP" */ 1.350 + udm->vFuncs = &ToCPFuncs; 1.351 + udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 1.352 + } 1.353 + else { 1.354 + /* dataFormat not recognized */ 1.355 + *err=U_INVALID_FORMAT_ERROR; 1.356 + } 1.357 + 1.358 + if (U_FAILURE(*err)) { 1.359 + /* If the data is no good and we memory-mapped it ourselves, 1.360 + * close the memory mapping so it doesn't leak. Note that this has 1.361 + * no effect on non-memory mapped data, other than clearing fields in udm. 1.362 + */ 1.363 + udata_close(udm); 1.364 + } 1.365 +} 1.366 + 1.367 +/* 1.368 + * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package 1.369 + * header but not its sub-items. 1.370 + * This function will be needed for automatic runtime swapping. 1.371 + * Sub-items should not be swapped to limit the swapping to the parts of the 1.372 + * package that are actually used. 1.373 + * 1.374 + * Since lengths of items are implicit in the order and offsets of their 1.375 + * ToC entries, and since offsets are relative to the start of the ToC, 1.376 + * a swapped version may need to generate a different data structure 1.377 + * with pointers to the original data items and with their lengths 1.378 + * (-1 for the last one if it is not known), and maybe even pointers to the 1.379 + * swapped versions of the items. 1.380 + * These pointers to swapped versions would establish a cache; 1.381 + * instead, each open data item could simply own the storage for its swapped 1.382 + * data. This fits better with the current design. 1.383 + * 1.384 + * markus 2003sep18 Jitterbug 2235 1.385 + */