intl/icu/source/common/ucmndata.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucmndata.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,382 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1999-2011, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************/
    1.11 +
    1.12 +
    1.13 +/*------------------------------------------------------------------------------
    1.14 + *
    1.15 + *   UCommonData   An abstract interface for dealing with ICU Common Data Files.
    1.16 + *                 ICU Common Data Files are a grouping of a number of individual
    1.17 + *                 data items (resources, converters, tables, anything) into a
    1.18 + *                 single file or dll.  The combined format includes a table of
    1.19 + *                 contents for locating the individual items by name.
    1.20 + *
    1.21 + *                 Two formats for the table of contents are supported, which is
    1.22 + *                 why there is an abstract interface involved.
    1.23 + *
    1.24 + */
    1.25 +
    1.26 +#include "unicode/utypes.h"
    1.27 +#include "unicode/udata.h"
    1.28 +#include "cstring.h"
    1.29 +#include "ucmndata.h"
    1.30 +#include "udatamem.h"
    1.31 +
    1.32 +#if defined(UDATA_DEBUG) || defined(UDATA_DEBUG_DUMP)
    1.33 +#   include <stdio.h>
    1.34 +#endif
    1.35 +
    1.36 +U_CFUNC uint16_t
    1.37 +udata_getHeaderSize(const DataHeader *udh) {
    1.38 +    if(udh==NULL) {
    1.39 +        return 0;
    1.40 +    } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) {
    1.41 +        /* same endianness */
    1.42 +        return udh->dataHeader.headerSize;
    1.43 +    } else {
    1.44 +        /* opposite endianness */
    1.45 +        uint16_t x=udh->dataHeader.headerSize;
    1.46 +        return (uint16_t)((x<<8)|(x>>8));
    1.47 +    }
    1.48 +}
    1.49 +
    1.50 +U_CFUNC uint16_t
    1.51 +udata_getInfoSize(const UDataInfo *info) {
    1.52 +    if(info==NULL) {
    1.53 +        return 0;
    1.54 +    } else if(info->isBigEndian==U_IS_BIG_ENDIAN) {
    1.55 +        /* same endianness */
    1.56 +        return info->size;
    1.57 +    } else {
    1.58 +        /* opposite endianness */
    1.59 +        uint16_t x=info->size;
    1.60 +        return (uint16_t)((x<<8)|(x>>8));
    1.61 +    }
    1.62 +}
    1.63 +
    1.64 +/*-----------------------------------------------------------------------------*
    1.65 + *                                                                             *
    1.66 + *  Pointer TOCs.   TODO: This form of table-of-contents should be removed     *
    1.67 + *                  because DLLs must be relocated on loading to correct the   *
    1.68 + *                  pointer values and this operation makes shared memory      *
    1.69 + *                  mapping of the data much less likely to work.              *
    1.70 + *                                                                             *
    1.71 + *-----------------------------------------------------------------------------*/
    1.72 +typedef struct {
    1.73 +    const char       *entryName;
    1.74 +    const DataHeader *pHeader;
    1.75 +} PointerTOCEntry;
    1.76 +
    1.77 +
    1.78 +typedef struct  {
    1.79 +    uint32_t          count;
    1.80 +    uint32_t          reserved;
    1.81 +    PointerTOCEntry   entry[2];   /* Actual size is from count. */
    1.82 +}  PointerTOC;
    1.83 +
    1.84 +
    1.85 +/* definition of OffsetTOC struct types moved to ucmndata.h */
    1.86 +
    1.87 +/*-----------------------------------------------------------------------------*
    1.88 + *                                                                             *
    1.89 + *    entry point lookup implementations                                       *
    1.90 + *                                                                             *
    1.91 + *-----------------------------------------------------------------------------*/
    1.92 +
    1.93 +#ifndef MIN
    1.94 +#define MIN(a,b) (((a)<(b)) ? (a) : (b))
    1.95 +#endif
    1.96 +
    1.97 +/**
    1.98 + * Compare strings where we know the shared prefix length,
    1.99 + * and advance the prefix length as we find that the strings share even more characters.
   1.100 + */
   1.101 +static int32_t
   1.102 +strcmpAfterPrefix(const char *s1, const char *s2, int32_t *pPrefixLength) {
   1.103 +    int32_t pl=*pPrefixLength;
   1.104 +    int32_t cmp=0;
   1.105 +    s1+=pl;
   1.106 +    s2+=pl;
   1.107 +    for(;;) {
   1.108 +        int32_t c1=(uint8_t)*s1++;
   1.109 +        int32_t c2=(uint8_t)*s2++;
   1.110 +        cmp=c1-c2;
   1.111 +        if(cmp!=0 || c1==0) {  /* different or done */
   1.112 +            break;
   1.113 +        }
   1.114 +        ++pl;  /* increment shared same-prefix length */
   1.115 +    }
   1.116 +    *pPrefixLength=pl;
   1.117 +    return cmp;
   1.118 +}
   1.119 +
   1.120 +static int32_t
   1.121 +offsetTOCPrefixBinarySearch(const char *s, const char *names,
   1.122 +                            const UDataOffsetTOCEntry *toc, int32_t count) {
   1.123 +    int32_t start=0;
   1.124 +    int32_t limit=count;
   1.125 +    /*
   1.126 +     * Remember the shared prefix between s, start and limit,
   1.127 +     * and don't compare that shared prefix again.
   1.128 +     * The shared prefix should get longer as we narrow the [start, limit[ range.
   1.129 +     */
   1.130 +    int32_t startPrefixLength=0;
   1.131 +    int32_t limitPrefixLength=0;
   1.132 +    if(count==0) {
   1.133 +        return -1;
   1.134 +    }
   1.135 +    /*
   1.136 +     * Prime the prefix lengths so that we don't keep prefixLength at 0 until
   1.137 +     * both the start and limit indexes have moved.
   1.138 +     * At the same time, we find if s is one of the start and (limit-1) names,
   1.139 +     * and if not, exclude them from the actual binary search.
   1.140 +     */
   1.141 +    if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, &startPrefixLength)) {
   1.142 +        return 0;
   1.143 +    }
   1.144 +    ++start;
   1.145 +    --limit;
   1.146 +    if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, &limitPrefixLength)) {
   1.147 +        return limit;
   1.148 +    }
   1.149 +    while(start<limit) {
   1.150 +        int32_t i=(start+limit)/2;
   1.151 +        int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
   1.152 +        int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, &prefixLength);
   1.153 +        if(cmp<0) {
   1.154 +            limit=i;
   1.155 +            limitPrefixLength=prefixLength;
   1.156 +        } else if(cmp==0) {
   1.157 +            return i;
   1.158 +        } else {
   1.159 +            start=i+1;
   1.160 +            startPrefixLength=prefixLength;
   1.161 +        }
   1.162 +    }
   1.163 +    return -1;
   1.164 +}
   1.165 +
   1.166 +static int32_t
   1.167 +pointerTOCPrefixBinarySearch(const char *s, const PointerTOCEntry *toc, int32_t count) {
   1.168 +    int32_t start=0;
   1.169 +    int32_t limit=count;
   1.170 +    /*
   1.171 +     * Remember the shared prefix between s, start and limit,
   1.172 +     * and don't compare that shared prefix again.
   1.173 +     * The shared prefix should get longer as we narrow the [start, limit[ range.
   1.174 +     */
   1.175 +    int32_t startPrefixLength=0;
   1.176 +    int32_t limitPrefixLength=0;
   1.177 +    if(count==0) {
   1.178 +        return -1;
   1.179 +    }
   1.180 +    /*
   1.181 +     * Prime the prefix lengths so that we don't keep prefixLength at 0 until
   1.182 +     * both the start and limit indexes have moved.
   1.183 +     * At the same time, we find if s is one of the start and (limit-1) names,
   1.184 +     * and if not, exclude them from the actual binary search.
   1.185 +     */
   1.186 +    if(0==strcmpAfterPrefix(s, toc[0].entryName, &startPrefixLength)) {
   1.187 +        return 0;
   1.188 +    }
   1.189 +    ++start;
   1.190 +    --limit;
   1.191 +    if(0==strcmpAfterPrefix(s, toc[limit].entryName, &limitPrefixLength)) {
   1.192 +        return limit;
   1.193 +    }
   1.194 +    while(start<limit) {
   1.195 +        int32_t i=(start+limit)/2;
   1.196 +        int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
   1.197 +        int32_t cmp=strcmpAfterPrefix(s, toc[i].entryName, &prefixLength);
   1.198 +        if(cmp<0) {
   1.199 +            limit=i;
   1.200 +            limitPrefixLength=prefixLength;
   1.201 +        } else if(cmp==0) {
   1.202 +            return i;
   1.203 +        } else {
   1.204 +            start=i+1;
   1.205 +            startPrefixLength=prefixLength;
   1.206 +        }
   1.207 +    }
   1.208 +    return -1;
   1.209 +}
   1.210 +
   1.211 +static uint32_t offsetTOCEntryCount(const UDataMemory *pData) {
   1.212 +    int32_t          retVal=0;
   1.213 +    const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc;
   1.214 +    if (toc != NULL) {
   1.215 +        retVal = toc->count;
   1.216 +    }
   1.217 +    return retVal;
   1.218 +}
   1.219 +
   1.220 +static const DataHeader *
   1.221 +offsetTOCLookupFn(const UDataMemory *pData,
   1.222 +                  const char *tocEntryName,
   1.223 +                  int32_t *pLength,
   1.224 +                  UErrorCode *pErrorCode) {
   1.225 +    const UDataOffsetTOC  *toc = (UDataOffsetTOC *)pData->toc;
   1.226 +    if(toc!=NULL) {
   1.227 +        const char *base=(const char *)toc;
   1.228 +        int32_t number, count=(int32_t)toc->count;
   1.229 +
   1.230 +        /* perform a binary search for the data in the common data's table of contents */
   1.231 +#if defined (UDATA_DEBUG_DUMP)
   1.232 +        /* list the contents of the TOC each time .. not recommended */
   1.233 +        for(number=0; number<count; ++number) {
   1.234 +            fprintf(stderr, "\tx%d: %s\n", number, &base[toc->entry[number].nameOffset]);
   1.235 +        }
   1.236 +#endif
   1.237 +        number=offsetTOCPrefixBinarySearch(tocEntryName, base, toc->entry, count);
   1.238 +        if(number>=0) {
   1.239 +            /* found it */
   1.240 +            const UDataOffsetTOCEntry *entry=toc->entry+number;
   1.241 +#ifdef UDATA_DEBUG
   1.242 +            fprintf(stderr, "%s: Found.\n", tocEntryName);
   1.243 +#endif
   1.244 +            if((number+1) < count) {
   1.245 +                *pLength = (int32_t)(entry[1].dataOffset - entry->dataOffset);
   1.246 +            } else {
   1.247 +                *pLength = -1;
   1.248 +            }
   1.249 +            return (const DataHeader *)(base+entry->dataOffset);
   1.250 +        } else {
   1.251 +#ifdef UDATA_DEBUG
   1.252 +            fprintf(stderr, "%s: Not found.\n", tocEntryName);
   1.253 +#endif
   1.254 +            return NULL;
   1.255 +        }
   1.256 +    } else {
   1.257 +#ifdef UDATA_DEBUG
   1.258 +        fprintf(stderr, "returning header\n");
   1.259 +#endif
   1.260 +
   1.261 +        return pData->pHeader;
   1.262 +    }
   1.263 +}
   1.264 +
   1.265 +
   1.266 +static uint32_t pointerTOCEntryCount(const UDataMemory *pData) {
   1.267 +    const PointerTOC *toc = (PointerTOC *)pData->toc;
   1.268 +    return (uint32_t)((toc != NULL) ? (toc->count) : 0);
   1.269 +}
   1.270 +
   1.271 +
   1.272 +static const DataHeader *pointerTOCLookupFn(const UDataMemory *pData,
   1.273 +                   const char *name,
   1.274 +                   int32_t *pLength,
   1.275 +                   UErrorCode *pErrorCode) {
   1.276 +    if(pData->toc!=NULL) {
   1.277 +        const PointerTOC *toc = (PointerTOC *)pData->toc;
   1.278 +        int32_t number, count=(int32_t)toc->count;
   1.279 +
   1.280 +#if defined (UDATA_DEBUG_DUMP)
   1.281 +        /* list the contents of the TOC each time .. not recommended */
   1.282 +        for(number=0; number<count; ++number) {
   1.283 +            fprintf(stderr, "\tx%d: %s\n", number, toc->entry[number].entryName);
   1.284 +        }
   1.285 +#endif
   1.286 +        number=pointerTOCPrefixBinarySearch(name, toc->entry, count);
   1.287 +        if(number>=0) {
   1.288 +            /* found it */
   1.289 +#ifdef UDATA_DEBUG
   1.290 +            fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName);
   1.291 +#endif
   1.292 +            *pLength=-1;
   1.293 +            return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader);
   1.294 +        } else {
   1.295 +#ifdef UDATA_DEBUG
   1.296 +            fprintf(stderr, "%s: Not found.\n", name);
   1.297 +#endif
   1.298 +            return NULL;
   1.299 +        }
   1.300 +    } else {
   1.301 +        return pData->pHeader;
   1.302 +    }
   1.303 +}
   1.304 +
   1.305 +static const commonDataFuncs CmnDFuncs = {offsetTOCLookupFn,  offsetTOCEntryCount};
   1.306 +static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCount};
   1.307 +
   1.308 +
   1.309 +
   1.310 +/*----------------------------------------------------------------------*
   1.311 + *                                                                      *
   1.312 + *  checkCommonData   Validate the format of a common data file.        *
   1.313 + *                    Fill in the virtual function ptr based on TOC type *
   1.314 + *                    If the data is invalid, close the UDataMemory     *
   1.315 + *                    and set the appropriate error code.               *
   1.316 + *                                                                      *
   1.317 + *----------------------------------------------------------------------*/
   1.318 +U_CFUNC void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) {
   1.319 +    if (U_FAILURE(*err)) {
   1.320 +        return;
   1.321 +    }
   1.322 +
   1.323 +    if(udm==NULL || udm->pHeader==NULL) {
   1.324 +      *err=U_INVALID_FORMAT_ERROR;
   1.325 +    } else if(!(udm->pHeader->dataHeader.magic1==0xda &&
   1.326 +        udm->pHeader->dataHeader.magic2==0x27 &&
   1.327 +        udm->pHeader->info.isBigEndian==U_IS_BIG_ENDIAN &&
   1.328 +        udm->pHeader->info.charsetFamily==U_CHARSET_FAMILY)
   1.329 +        ) {
   1.330 +        /* header not valid */
   1.331 +        *err=U_INVALID_FORMAT_ERROR;
   1.332 +    }
   1.333 +    else if (udm->pHeader->info.dataFormat[0]==0x43 &&
   1.334 +        udm->pHeader->info.dataFormat[1]==0x6d &&
   1.335 +        udm->pHeader->info.dataFormat[2]==0x6e &&
   1.336 +        udm->pHeader->info.dataFormat[3]==0x44 &&
   1.337 +        udm->pHeader->info.formatVersion[0]==1
   1.338 +        ) {
   1.339 +        /* dataFormat="CmnD" */
   1.340 +        udm->vFuncs = &CmnDFuncs;
   1.341 +        udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
   1.342 +    }
   1.343 +    else if(udm->pHeader->info.dataFormat[0]==0x54 &&
   1.344 +        udm->pHeader->info.dataFormat[1]==0x6f &&
   1.345 +        udm->pHeader->info.dataFormat[2]==0x43 &&
   1.346 +        udm->pHeader->info.dataFormat[3]==0x50 &&
   1.347 +        udm->pHeader->info.formatVersion[0]==1
   1.348 +        ) {
   1.349 +        /* dataFormat="ToCP" */
   1.350 +        udm->vFuncs = &ToCPFuncs;
   1.351 +        udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
   1.352 +    }
   1.353 +    else {
   1.354 +        /* dataFormat not recognized */
   1.355 +        *err=U_INVALID_FORMAT_ERROR;
   1.356 +    }
   1.357 +
   1.358 +    if (U_FAILURE(*err)) {
   1.359 +        /* If the data is no good and we memory-mapped it ourselves,
   1.360 +         *  close the memory mapping so it doesn't leak.  Note that this has
   1.361 +         *  no effect on non-memory mapped data, other than clearing fields in udm.
   1.362 +         */
   1.363 +        udata_close(udm);
   1.364 +    }
   1.365 +}
   1.366 +
   1.367 +/*
   1.368 + * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package
   1.369 + * header but not its sub-items.
   1.370 + * This function will be needed for automatic runtime swapping.
   1.371 + * Sub-items should not be swapped to limit the swapping to the parts of the
   1.372 + * package that are actually used.
   1.373 + *
   1.374 + * Since lengths of items are implicit in the order and offsets of their
   1.375 + * ToC entries, and since offsets are relative to the start of the ToC,
   1.376 + * a swapped version may need to generate a different data structure
   1.377 + * with pointers to the original data items and with their lengths
   1.378 + * (-1 for the last one if it is not known), and maybe even pointers to the
   1.379 + * swapped versions of the items.
   1.380 + * These pointers to swapped versions would establish a cache;
   1.381 + * instead, each open data item could simply own the storage for its swapped
   1.382 + * data. This fits better with the current design.
   1.383 + *
   1.384 + * markus 2003sep18 Jitterbug 2235
   1.385 + */

mercurial