intl/icu/source/tools/gensprep/store.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/gensprep/store.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,647 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1999-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  store.c
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2003-02-06
    1.17 +*   created by: Ram Viswanadha
    1.18 +*
    1.19 +*/
    1.20 +
    1.21 +#include <stdio.h>
    1.22 +#include <stdlib.h>
    1.23 +#include "unicode/utypes.h"
    1.24 +#include "cmemory.h"
    1.25 +#include "cstring.h"
    1.26 +#include "filestrm.h"
    1.27 +#include "unicode/udata.h"
    1.28 +#include "unicode/utf16.h"
    1.29 +#include "utrie.h"
    1.30 +#include "unewdata.h"
    1.31 +#include "gensprep.h"
    1.32 +#include "uhash.h"
    1.33 +
    1.34 +
    1.35 +#define DO_DEBUG_OUT 0
    1.36 +
    1.37 +
    1.38 +/* 
    1.39 + * StringPrep profile file format ------------------------------------
    1.40 + * 
    1.41 + * The file format prepared and written here contains a 16-bit trie and a mapping table.
    1.42 + * 
    1.43 + * Before the data contents described below, there are the headers required by
    1.44 + * the udata API for loading ICU data. Especially, a UDataInfo structure
    1.45 + * precedes the actual data. It contains platform properties values and the
    1.46 + * file format version.
    1.47 + * 
    1.48 + * The following is a description of format version 2.
    1.49 + * 
    1.50 + * Data contents:
    1.51 + * 
    1.52 + * The contents is a parsed, binary form of RFC3454 and possibly
    1.53 + * NormalizationCorrections.txt depending on the options specified on the profile.
    1.54 + * 
    1.55 + * Any Unicode code point from 0 to 0x10ffff can be looked up to get
    1.56 + * the trie-word, if any, for that code point. This means that the input
    1.57 + * to the lookup are 21-bit unsigned integers, with not all of the
    1.58 + * 21-bit range used.
    1.59 + * 
    1.60 + * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c.
    1.61 + * After that there are the following structures:
    1.62 + *
    1.63 + * int32_t indexes[_SPREP_INDEX_TOP];           -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file
    1.64 + *
    1.65 + * UTrie stringPrepTrie;                        -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE]
    1.66 + * 
    1.67 + * uint16_t mappingTable[];                     -- Contains the sequence of code units that the code point maps to 
    1.68 + *                                                 size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
    1.69 + *
    1.70 + * The indexes array contains the following values:
    1.71 + *  indexes[_SPREP_INDEX_TRIE_SIZE]                  -- The size of the StringPrep trie in bytes
    1.72 + *  indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]          -- The size of the mappingTable in bytes 
    1.73 + *  indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION]  -- The index of Unicode version of last entry in NormalizationCorrections.txt 
    1.74 + *  indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START]    -- The starting index of 1 UChar  mapping index in the mapping table 
    1.75 + *  indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]   -- The starting index of 2 UChars mapping index in the mapping table
    1.76 + *  indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table
    1.77 + *  indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]  -- The starting index of 4 UChars mapping index in the mapping table
    1.78 + *  indexes[_SPREP_OPTIONS]                          -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON
    1.79 + *    
    1.80 + *
    1.81 + * StringPrep Trie :
    1.82 + *
    1.83 + * The StringPrep trie is a 16-bit trie that contains data for the profile. 
    1.84 + * Each code point is associated with a value (trie-word) in the trie.
    1.85 + *
    1.86 + * - structure of data words from the trie
    1.87 + * 
    1.88 + *  i)  A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) 
    1.89 + *      represents the type associated with the code point
    1.90 + *      if(trieWord >= _SPREP_TYPE_THRESHOLD){
    1.91 + *          type = trieWord - 0xFFF0;
    1.92 + *      }
    1.93 + *      The type can be :
    1.94 + *             USPREP_UNASSIGNED                     
    1.95 + *             USPREP_PROHIBITED       
    1.96 + *             USPREP_DELETE     
    1.97 + *     
    1.98 + *  ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and
    1.99 + *      contains distribution described below
   1.100 + *      
   1.101 + *      0       -  ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for codepoint that are both prohibited and mapped.
   1.102 + *      1       -  ON : The value in the next 14 bits is an index into the mapping table
   1.103 + *                 OFF: The value in the next 14 bits is a delta value from the code point
   1.104 + *      2..15   -  Contains data as described by bit 1. If all bits are set 
   1.105 + *                 (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE
   1.106 + *
   1.107 + *  
   1.108 + * Mapping Table:
   1.109 + * The data in mapping table is sorted according to the length of the mapping sequence.
   1.110 + * If the type of the code point is USPREP_MAP and value in trie word is an index, the index
   1.111 + * is compared with start indexes of sequence length start to figure out the length according to
   1.112 + * the following algorithm:
   1.113 + *
   1.114 + *              if(       index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
   1.115 + *                        index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
   1.116 + *                   length = 1;
   1.117 + *               }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
   1.118 + *                        index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
   1.119 + *                   length = 2;
   1.120 + *               }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
   1.121 + *                        index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
   1.122 + *                   length = 3;
   1.123 + *               }else{
   1.124 + *                   // The first position in the mapping table contains the length 
   1.125 + *                   // of the sequence
   1.126 + *                   length = mappingTable[index++];
   1.127 + *        
   1.128 + *               }
   1.129 + *
   1.130 + */
   1.131 +
   1.132 +/* file data ---------------------------------------------------------------- */
   1.133 +/* indexes[] value names */
   1.134 +
   1.135 +#if UCONFIG_NO_IDNA
   1.136 +
/*
 * Placeholder UDataInfo used when IDNA support is compiled out
 * (UCONFIG_NO_IDNA): dataFormat, formatVersion and dataVersion are all zero.
 * generateData() hands this header to udata_create(). cf. udata.h
 */
static UDataInfo dataInfo = {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};
   1.151 +
   1.152 +#else
   1.153 +
/* header values written in front of the trie; slots are named by the _SPREP_* index enum */
static int32_t indexes[_SPREP_INDEX_TOP]={ 0 };

/* mapping table buffer; allocated by storeMappingData(), released by cleanUpData() */
static uint16_t* mappingData= NULL;
/*
 * number of 16-bit units the mapping table needs; storeMapping() adds the
 * UTF-16 length of each queued mapping plus one extra unit for mappings
 * longer than _SPREP_MAX_INDEX_TOP_LENGTH (their length prefix)
 */
static int32_t mappingDataCapacity = 0;
/* next free position in mappingData; advanced by storeMappingData() */
static int16_t currentIndex = 0;
/* longest UTF-16 mapping length seen by storeMapping(); reported in verbose output */
static int32_t maxLength = 0;
   1.160 +
   1.161 +
/*
 * UDataInfo header for the generated StringPrep data, cf. udata.h.
 * generateData() passes it to udata_create(); setUnicodeVersion() overwrites
 * the dataVersion field.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x53, 0x50, 0x52, 0x50 },                 /* dataFormat="SPRP" */
    { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    { 3, 2, 0, 0 }                              /* dataVersion (Unicode version) */
};
   1.176 +void
   1.177 +setUnicodeVersion(const char *v) {
   1.178 +    UVersionInfo version;
   1.179 +    u_versionFromString(version, v);
   1.180 +    uprv_memcpy(dataInfo.dataVersion, version, 4);
   1.181 +}
   1.182 +
   1.183 +void
   1.184 +setUnicodeVersionNC(UVersionInfo version){
   1.185 +    uint32_t univer = version[0] << 24;
   1.186 +    univer += version[1] << 16;
   1.187 +    univer += version[2] << 8;
   1.188 +    univer += version[3];
   1.189 +    indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer;
   1.190 +}
/* build-time trie holding one trie word per code point; allocated in init(), released in cleanUpData() */
static UNewTrie *sprepTrie;

/* value passed to utrie_open() as the maximum data length of the build-time trie */
#define MAX_DATA_LENGTH 11500


/* inclusive bounds checked by storeMapping() before a delta is stored directly in a trie word */
#define SPREP_DELTA_RANGE_POSITIVE_LIMIT              8191 
#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT              -8192
   1.198 +
   1.199 +
   1.200 +extern void
   1.201 +init() {
   1.202 +
   1.203 +    sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie));
   1.204 +
   1.205 +    /* initialize the two tries */
   1.206 +    if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) {
   1.207 +        fprintf(stderr, "error: failed to initialize tries\n");
   1.208 +        exit(U_MEMORY_ALLOCATION_ERROR);
   1.209 +    }
   1.210 +}
   1.211 +
/*
 * Mappings queued by storeMapping(), keyed by code point; created lazily in
 * storeMapping() and closed at the end of generateData().
 */
static UHashtable* hashTable = NULL;


/* One queued mapping; owned by hashTable and released through valueDeleter(). */
typedef struct ValueStruct {
    UChar* mapping;         /* UTF-16 code units the code point maps to (heap allocated) */
    int16_t length;         /* number of UTF-16 code units in mapping */
    UStringPrepType type;   /* type passed to storeMapping() for this code point */
} ValueStruct;
   1.220 +
   1.221 +/* Callback for deleting the value from the hashtable */
   1.222 +static void U_CALLCONV valueDeleter(void* obj){
   1.223 +    ValueStruct* value = (ValueStruct*) obj;
   1.224 +    uprv_free(value->mapping);
   1.225 +    uprv_free(value);
   1.226 +}
   1.227 +
   1.228 +/* Callback for hashing the entry */
   1.229 +static int32_t U_CALLCONV hashEntry(const UHashTok parm) {
   1.230 +    return  parm.integer;
   1.231 +}
   1.232 +
   1.233 +/* Callback for comparing two entries */
   1.234 +static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) {
   1.235 +    return (UBool)(p1.integer != p2.integer);
   1.236 +}
   1.237 +
   1.238 +
   1.239 +static void 
   1.240 +storeMappingData(){
   1.241 +
   1.242 +    int32_t pos = -1;
   1.243 +    const UHashElement* element = NULL;
   1.244 +    ValueStruct* value  = NULL;
   1.245 +    int32_t codepoint = 0;
   1.246 +    int32_t elementCount = 0;
   1.247 +    int32_t writtenElementCount = 0;
   1.248 +    int32_t mappingLength = 1; /* minimum mapping length */
   1.249 +    int32_t oldMappingLength = 0;
   1.250 +    uint16_t trieWord =0;
   1.251 +    int32_t limitIndex = 0;
   1.252 +
   1.253 +    if (hashTable == NULL) {
   1.254 +        return;
   1.255 +    }
   1.256 +    elementCount = uhash_count(hashTable);
   1.257 +
   1.258 +	/*initialize the mapping data */
   1.259 +    mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR);
   1.260 +
   1.261 +    while(writtenElementCount < elementCount){
   1.262 +
   1.263 +        while( (element = uhash_nextElement(hashTable, &pos))!=NULL){
   1.264 +            
   1.265 +            codepoint = element->key.integer;
   1.266 +            value = (ValueStruct*)element->value.pointer;
   1.267 +            
   1.268 +            /* store the start of indexes */
   1.269 +            if(oldMappingLength != mappingLength){
   1.270 +                /* Assume that index[] is used according to the enums defined */
   1.271 +                if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){
   1.272 +                    indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex;
   1.273 +                }
   1.274 +                if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH &&
   1.275 +                   mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){
   1.276 +                   
   1.277 +                    limitIndex = currentIndex;
   1.278 +                     
   1.279 +                }
   1.280 +                oldMappingLength = mappingLength;
   1.281 +            }
   1.282 +
   1.283 +            if(value->length == mappingLength){
   1.284 +                uint32_t savedTrieWord = 0;
   1.285 +                trieWord = currentIndex << 2;
   1.286 +                /* turn on the 2nd bit to signal that the following bits contain an index */
   1.287 +                trieWord += 0x02;
   1.288 +            
   1.289 +                if(trieWord > _SPREP_TYPE_THRESHOLD){
   1.290 +                    fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
   1.291 +                    exit(U_ILLEGAL_CHAR_FOUND);
   1.292 +                }
   1.293 +                /* figure out if the code point has type already stored */
   1.294 +                savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
   1.295 +                if(savedTrieWord!=0){
   1.296 +                    if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
   1.297 +                        /* turn on the first bit in trie word */
   1.298 +                        trieWord += 0x01;
   1.299 +                    }else{
   1.300 +                        /* 
   1.301 +                         * the codepoint has value something other than prohibited
   1.302 +                         * and a mapping .. error! 
   1.303 +                         */
   1.304 +                        fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
   1.305 +                        exit(U_ILLEGAL_ARGUMENT_ERROR); 
   1.306 +                    } 
   1.307 +                } 
   1.308 +                
   1.309 +                /* now set the value in the trie */
   1.310 +                if(!utrie_set32(sprepTrie,codepoint,trieWord)){
   1.311 +                    fprintf(stderr,"Could not set the value for code point.\n");
   1.312 +                    exit(U_ILLEGAL_ARGUMENT_ERROR);   
   1.313 +                }
   1.314 +
   1.315 +                /* written the trie word for the codepoint... increment the count*/
   1.316 +                writtenElementCount++;
   1.317 +
   1.318 +                /* sanity check are we exceeding the max number allowed */
   1.319 +                if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){
   1.320 +                    fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", 
   1.321 +                        currentIndex+value->length, _SPREP_MAX_INDEX_VALUE);
   1.322 +                    exit(U_INDEX_OUTOFBOUNDS_ERROR);
   1.323 +                }
   1.324 +
   1.325 +                /* copy the mapping data */
   1.326 +                /* write the length */
   1.327 +                if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){
   1.328 +                     /* the cast here is safe since we donot expect the length to be > 65535 */
   1.329 +                     mappingData[currentIndex++] = (uint16_t) mappingLength;
   1.330 +                }
   1.331 +                /* copy the contents to mappindData array */
   1.332 +                uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR);
   1.333 +                currentIndex += value->length;
   1.334 +                if (currentIndex > mappingDataCapacity) {
   1.335 +                    /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */
   1.336 +                    fprintf(stderr, "gensprep, fatal error at %s, %d.  Aborting.\n", __FILE__, __LINE__);
   1.337 +                    exit(U_INTERNAL_PROGRAM_ERROR);
   1.338 +                }
   1.339 +            }
   1.340 +        }
   1.341 +        mappingLength++;
   1.342 +        pos = -1;
   1.343 +    }
   1.344 +    /* set the last length for range check */
   1.345 +    if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){
   1.346 +        indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1;
   1.347 +    }else{
   1.348 +        indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex;
   1.349 +    }
   1.350 +    
   1.351 +}
   1.352 +
   1.353 +extern void setOptions(int32_t options){
   1.354 +    indexes[_SPREP_OPTIONS] = options;
   1.355 +}
   1.356 +extern void
   1.357 +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length,
   1.358 +             UStringPrepType type, UErrorCode* status){
   1.359 +    
   1.360 + 
   1.361 +    UChar* map = NULL;
   1.362 +    int16_t adjustedLen=0, i, j;
   1.363 +    uint16_t trieWord = 0;
   1.364 +    ValueStruct *value = NULL;
   1.365 +    uint32_t savedTrieWord = 0;
   1.366 +
   1.367 +    /* initialize the hashtable */
   1.368 +    if(hashTable==NULL){
   1.369 +        hashTable = uhash_open(hashEntry, compareEntries, NULL, status);
   1.370 +        uhash_setValueDeleter(hashTable, valueDeleter);
   1.371 +    }
   1.372 +    
   1.373 +    /* figure out if the code point has type already stored */
   1.374 +    savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
   1.375 +    if(savedTrieWord!=0){
   1.376 +        if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
   1.377 +            /* turn on the first bit in trie word */
   1.378 +            trieWord += 0x01;
   1.379 +        }else{
   1.380 +            /* 
   1.381 +             * the codepoint has value something other than prohibited
   1.382 +             * and a mapping .. error! 
   1.383 +             */
   1.384 +            fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
   1.385 +            exit(U_ILLEGAL_ARGUMENT_ERROR); 
   1.386 +        } 
   1.387 +    }
   1.388 +
   1.389 +    /* figure out the real length */ 
   1.390 +    for(i=0; i<length; i++){
   1.391 +        adjustedLen += U16_LENGTH(mapping[i]);
   1.392 +    }
   1.393 +
   1.394 +    if(adjustedLen == 0){
   1.395 +        trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2);
   1.396 +        /* make sure that the value of trieWord is less than the threshold */
   1.397 +        if(trieWord < _SPREP_TYPE_THRESHOLD){   
   1.398 +            /* now set the value in the trie */
   1.399 +            if(!utrie_set32(sprepTrie,codepoint,trieWord)){
   1.400 +                fprintf(stderr,"Could not set the value for code point.\n");
   1.401 +                exit(U_ILLEGAL_ARGUMENT_ERROR);   
   1.402 +            }
   1.403 +            /* value is set so just return */
   1.404 +            return;
   1.405 +        }else{
   1.406 +            fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
   1.407 +            exit(U_ILLEGAL_CHAR_FOUND);
   1.408 +        }
   1.409 +    }
   1.410 +
   1.411 +    if(adjustedLen == 1){
   1.412 +        /* calculate the delta */
   1.413 +        int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]);
   1.414 +        if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){
   1.415 +
   1.416 +            trieWord = delta << 2;
   1.417 +
   1.418 +
   1.419 +            /* make sure that the second bit is OFF */
   1.420 +            if((trieWord & 0x02) != 0 ){
   1.421 +                fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n");
   1.422 +                exit(U_INTERNAL_PROGRAM_ERROR);
   1.423 +            }
   1.424 +            /* make sure that the value of trieWord is less than the threshold */
   1.425 +            if(trieWord < _SPREP_TYPE_THRESHOLD){   
   1.426 +                /* now set the value in the trie */
   1.427 +                if(!utrie_set32(sprepTrie,codepoint,trieWord)){
   1.428 +                    fprintf(stderr,"Could not set the value for code point.\n");
   1.429 +                    exit(U_ILLEGAL_ARGUMENT_ERROR);   
   1.430 +                }
   1.431 +                /* value is set so just return */
   1.432 +                return;
   1.433 +            }
   1.434 +        }
   1.435 +        /* 
   1.436 +         * if the delta is not in the given range or if the trieWord is larger than the threshold
   1.437 +         * just fall through for storing the mapping in the mapping table
   1.438 +         */
   1.439 +    }
   1.440 +
   1.441 +    map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR);
   1.442 +    
   1.443 +    for (i=0, j=0; i<length; i++) {
   1.444 +        U16_APPEND_UNSAFE(map, j, mapping[i]);
   1.445 +    }
   1.446 +    
   1.447 +    value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct));
   1.448 +    value->mapping = map;
   1.449 +    value->type    = type;
   1.450 +    value->length  = adjustedLen;
   1.451 +    if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){
   1.452 +        mappingDataCapacity++;
   1.453 +    }
   1.454 +    if(maxLength < value->length){
   1.455 +        maxLength = value->length;
   1.456 +    }
   1.457 +    uhash_iput(hashTable,codepoint,value,status);
   1.458 +    mappingDataCapacity += adjustedLen;
   1.459 +
   1.460 +    if(U_FAILURE(*status)){
   1.461 +        fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status));
   1.462 +        exit(*status);
   1.463 +    }
   1.464 +}
   1.465 +
   1.466 +
   1.467 +extern void
   1.468 +storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){
   1.469 +    uint16_t trieWord = 0;
   1.470 +
   1.471 +    if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){
   1.472 +        fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n");
   1.473 +        exit(U_ILLEGAL_CHAR_FOUND);
   1.474 +    }
   1.475 +    trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */
   1.476 +    if(start == end){
   1.477 +        uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL);
   1.478 +        if(savedTrieWord>0){
   1.479 +            if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){
   1.480 +                /* 
   1.481 +                 * A mapping is stored in the trie word 
   1.482 +                 * and the only other possible type that a 
   1.483 +                 * code point can have is USPREP_PROHIBITED
   1.484 +                 *
   1.485 +                 */
   1.486 +
   1.487 +                /* turn on the 0th bit in the savedTrieWord */
   1.488 +                savedTrieWord += 0x01;
   1.489 +
   1.490 +                /* the downcast is safe since we only save 16 bit values */
   1.491 +                trieWord = (uint16_t)savedTrieWord;
   1.492 +
   1.493 +                /* make sure that the value of trieWord is less than the threshold */
   1.494 +                if(trieWord < _SPREP_TYPE_THRESHOLD){   
   1.495 +                    /* now set the value in the trie */
   1.496 +                    if(!utrie_set32(sprepTrie,start,trieWord)){
   1.497 +                        fprintf(stderr,"Could not set the value for code point.\n");
   1.498 +                        exit(U_ILLEGAL_ARGUMENT_ERROR);   
   1.499 +                    }
   1.500 +                    /* value is set so just return */
   1.501 +                    return;
   1.502 +                }else{
   1.503 +                    fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
   1.504 +                    exit(U_ILLEGAL_CHAR_FOUND);
   1.505 +                }
   1.506 + 
   1.507 +            }else if(savedTrieWord != trieWord){
   1.508 +                fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start);
   1.509 +                exit(U_ILLEGAL_ARGUMENT_ERROR);
   1.510 +            }
   1.511 +            /* if savedTrieWord == trieWord .. fall through and set the value */
   1.512 +        }
   1.513 +        if(!utrie_set32(sprepTrie,start,trieWord)){
   1.514 +            fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start);
   1.515 +            exit(U_ILLEGAL_ARGUMENT_ERROR);   
   1.516 +        }
   1.517 +    }else{
   1.518 +        if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){
   1.519 +            fprintf(stderr,"Value for certain codepoint already set.\n");
   1.520 +            exit(U_ILLEGAL_CHAR_FOUND);   
   1.521 +        }
   1.522 +    }
   1.523 +
   1.524 +}
   1.525 +
   1.526 +/* folding value: just store the offset (16 bits) if there is any non-0 entry */
   1.527 +static uint32_t U_CALLCONV
   1.528 +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
   1.529 +    uint32_t value;
   1.530 +    UChar32 limit=0;
   1.531 +    UBool inBlockZero;
   1.532 +
   1.533 +    limit=start+0x400;
   1.534 +    while(start<limit) {
   1.535 +        value=utrie_get32(trie, start, &inBlockZero);
   1.536 +        if(inBlockZero) {
   1.537 +            start+=UTRIE_DATA_BLOCK_LENGTH;
   1.538 +        } else if(value!=0) {
   1.539 +            return (uint32_t)offset;
   1.540 +        } else {
   1.541 +            ++start;
   1.542 +        }
   1.543 +    }
   1.544 +    return 0;
   1.545 +
   1.546 +}
   1.547 +
   1.548 +#endif /* #if !UCONFIG_NO_IDNA */
   1.549 +
   1.550 +extern void
   1.551 +generateData(const char *dataDir, const char* bundleName) {
   1.552 +    static uint8_t sprepTrieBlock[100000];
   1.553 +
   1.554 +    UNewDataMemory *pData;
   1.555 +    UErrorCode errorCode=U_ZERO_ERROR;
   1.556 +    int32_t size, dataLength;
   1.557 +    char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100);
   1.558 +
   1.559 +#if UCONFIG_NO_IDNA
   1.560 +
   1.561 +    size=0;
   1.562 +
   1.563 +#else
   1.564 +
   1.565 +    int32_t sprepTrieSize;
   1.566 +
   1.567 +    /* sort and add mapping data */
   1.568 +    storeMappingData();
   1.569 +    
   1.570 +    sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode);
   1.571 +    if(U_FAILURE(errorCode)) {
   1.572 +        fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode));
   1.573 +        exit(errorCode);
   1.574 +    }
   1.575 +    
   1.576 +    size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes);
   1.577 +    if(beVerbose) {
   1.578 +        printf("size of sprep trie              %5u bytes\n", (int)sprepTrieSize);
   1.579 +        printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size);
   1.580 +        printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR);
   1.581 +        printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex);
   1.582 +        printf("Maximum length of the mapping string is : %i \n", (int)maxLength);
   1.583 +    }
   1.584 +
   1.585 +#endif
   1.586 +
   1.587 +    fileName[0]=0;
   1.588 +    uprv_strcat(fileName,bundleName);
   1.589 +    /* write the data */
   1.590 +    pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo,
   1.591 +                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
   1.592 +    if(U_FAILURE(errorCode)) {
   1.593 +        fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode);
   1.594 +        exit(errorCode);
   1.595 +    }
   1.596 +
   1.597 +#if !UCONFIG_NO_IDNA
   1.598 +
   1.599 +    indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize;
   1.600 +    indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR;
   1.601 +    
   1.602 +    udata_writeBlock(pData, indexes, sizeof(indexes));
   1.603 +    udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize);
   1.604 +    udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]);
   1.605 +    
   1.606 +
   1.607 +#endif
   1.608 +
   1.609 +    /* finish up */
   1.610 +    dataLength=udata_finish(pData, &errorCode);
   1.611 +    if(U_FAILURE(errorCode)) {
   1.612 +        fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode);
   1.613 +        exit(errorCode);
   1.614 +    }
   1.615 +
   1.616 +    if(dataLength!=size) {
   1.617 +        fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n",
   1.618 +            (long)dataLength, (long)size);
   1.619 +        exit(U_INTERNAL_PROGRAM_ERROR);
   1.620 +    }
   1.621 +
   1.622 +#if !UCONFIG_NO_IDNA
   1.623 +    /* done with writing the data .. close the hashtable */
   1.624 +    if (hashTable != NULL) {
   1.625 +        uhash_close(hashTable);
   1.626 +    }
   1.627 +#endif
   1.628 +
   1.629 +    uprv_free(fileName);
   1.630 +}
   1.631 +
   1.632 +#if !UCONFIG_NO_IDNA
   1.633 +
   1.634 +extern void
   1.635 +cleanUpData(void) {
   1.636 +    uprv_free(mappingData);
   1.637 +    utrie_close(sprepTrie);
   1.638 +    uprv_free(sprepTrie);
   1.639 +}
   1.640 +
   1.641 +#endif /* #if !UCONFIG_NO_IDNA */
   1.642 +
   1.643 +/*
   1.644 + * Hey, Emacs, please set the following:
   1.645 + *
   1.646 + * Local Variables:
   1.647 + * indent-tabs-mode: nil
   1.648 + * End:
   1.649 + *
   1.650 + */

mercurial