1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/gensprep/store.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,647 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1999-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: store.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2003-02-06 1.17 +* created by: Ram Viswanadha 1.18 +* 1.19 +*/ 1.20 + 1.21 +#include <stdio.h> 1.22 +#include <stdlib.h> 1.23 +#include "unicode/utypes.h" 1.24 +#include "cmemory.h" 1.25 +#include "cstring.h" 1.26 +#include "filestrm.h" 1.27 +#include "unicode/udata.h" 1.28 +#include "unicode/utf16.h" 1.29 +#include "utrie.h" 1.30 +#include "unewdata.h" 1.31 +#include "gensprep.h" 1.32 +#include "uhash.h" 1.33 + 1.34 + 1.35 +#define DO_DEBUG_OUT 0 1.36 + 1.37 + 1.38 +/* 1.39 + * StringPrep profile file format ------------------------------------ 1.40 + * 1.41 + * The file format prepared and written here contains a 16-bit trie and a mapping table. 1.42 + * 1.43 + * Before the data contents described below, there are the headers required by 1.44 + * the udata API for loading ICU data. Especially, a UDataInfo structure 1.45 + * precedes the actual data. It contains platform properties values and the 1.46 + * file format version. 1.47 + * 1.48 + * The following is a description of format version 2. 1.49 + * 1.50 + * Data contents: 1.51 + * 1.52 + * The contents is a parsed, binary form of RFC3454 and possibly 1.53 + * NormalizationCorrections.txt depending on the options specified on the profile. 1.54 + * 1.55 + * Any Unicode code point from 0 to 0x10ffff can be looked up to get 1.56 + * the trie-word, if any, for that code point. This means that the input 1.57 + * to the lookup are 21-bit unsigned integers, with not all of the 1.58 + * 21-bit range used. 1.59 + * 1.60 + * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. 1.61 + * After that there are the following structures: 1.62 + * 1.63 + * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file 1.64 + * 1.65 + * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] 1.66 + * 1.67 + * uint16_t mappingTable[]; -- Contains the sequecence of code units that the code point maps to 1.68 + * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] 1.69 + * 1.70 + * The indexes array contains the following values: 1.71 + * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes 1.72 + * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes 1.73 + * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt 1.74 + * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table 1.75 + * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table 1.76 + * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table 1.77 + * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table 1.78 + * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON 1.79 + * 1.80 + * 1.81 + * StringPrep Trie : 1.82 + * 1.83 + * The StringPrep tries is a 16-bit trie that contains data for the profile. 1.84 + * Each code point is associated with a value (trie-word) in the trie. 1.85 + * 1.86 + * - structure of data words from the trie 1.87 + * 1.88 + * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) 1.89 + * represents the type associated with the code point 1.90 + * if(trieWord >= _SPREP_TYPE_THRESHOLD){ 1.91 + * type = trieWord - 0xFFF0; 1.92 + * } 1.93 + * The type can be : 1.94 + * USPREP_UNASSIGNED 1.95 + * USPREP_PROHIBITED 1.96 + * USPREP_DELETE 1.97 + * 1.98 + * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and 1.99 + * contains distribution described below 1.100 + * 1.101 + * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for codepoint that are both prohibited and mapped. 1.102 + * 1 - ON : The value in the next 14 bits is an index into the mapping table 1.103 + * OFF: The value in the next 14 bits is an delta value from the code point 1.104 + * 2..15 - Contains data as described by bit 1. If all bits are set 1.105 + * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE 1.106 + * 1.107 + * 1.108 + * Mapping Table: 1.109 + * The data in mapping table is sorted according to the length of the mapping sequence. 1.110 + * If the type of the code point is USPREP_MAP and value in trie word is an index, the index 1.111 + * is compared with start indexes of sequence length start to figure out the length according to 1.112 + * the following algorithm: 1.113 + * 1.114 + * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && 1.115 + * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ 1.116 + * length = 1; 1.117 + * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && 1.118 + * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ 1.119 + * length = 2; 1.120 + * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && 1.121 + * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ 1.122 + * length = 3; 1.123 + * }else{ 1.124 + * // The first position in the mapping table contains the length 1.125 + * // of the sequence 1.126 + * length = mappingTable[index++]; 1.127 + * 1.128 + * } 1.129 + * 1.130 + */ 1.131 + 1.132 +/* file data ---------------------------------------------------------------- */ 1.133 +/* indexes[] value names */ 1.134 + 1.135 +#if UCONFIG_NO_IDNA 1.136 + 1.137 +/* dummy UDataInfo cf. udata.h */ 1.138 +static UDataInfo dataInfo = { 1.139 + sizeof(UDataInfo), 1.140 + 0, 1.141 + 1.142 + U_IS_BIG_ENDIAN, 1.143 + U_CHARSET_FAMILY, 1.144 + U_SIZEOF_UCHAR, 1.145 + 0, 1.146 + 1.147 + { 0, 0, 0, 0 }, /* dummy dataFormat */ 1.148 + { 0, 0, 0, 0 }, /* dummy formatVersion */ 1.149 + { 0, 0, 0, 0 } /* dummy dataVersion */ 1.150 +}; 1.151 + 1.152 +#else 1.153 + 1.154 +static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; 1.155 + 1.156 +static uint16_t* mappingData= NULL; 1.157 +static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ 1.158 +static int16_t currentIndex = 0; /* the current index into the data trie */ 1.159 +static int32_t maxLength = 0; /* maximum length of mapping string */ 1.160 + 1.161 + 1.162 +/* UDataInfo cf. udata.h */ 1.163 +static UDataInfo dataInfo={ 1.164 + sizeof(UDataInfo), 1.165 + 0, 1.166 + 1.167 + U_IS_BIG_ENDIAN, 1.168 + U_CHARSET_FAMILY, 1.169 + U_SIZEOF_UCHAR, 1.170 + 0, 1.171 + 1.172 + { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ 1.173 + { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 1.174 + { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ 1.175 +}; 1.176 +void 1.177 +setUnicodeVersion(const char *v) { 1.178 + UVersionInfo version; 1.179 + u_versionFromString(version, v); 1.180 + uprv_memcpy(dataInfo.dataVersion, version, 4); 1.181 +} 1.182 + 1.183 +void 1.184 +setUnicodeVersionNC(UVersionInfo version){ 1.185 + uint32_t univer = version[0] << 24; 1.186 + univer += version[1] << 16; 1.187 + univer += version[2] << 8; 1.188 + univer += version[3]; 1.189 + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; 1.190 +} 1.191 +static UNewTrie *sprepTrie; 1.192 + 1.193 +#define MAX_DATA_LENGTH 11500 1.194 + 1.195 + 1.196 +#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191 1.197 +#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192 1.198 + 1.199 + 1.200 +extern void 1.201 +init() { 1.202 + 1.203 + sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie)); 1.204 + 1.205 + /* initialize the two tries */ 1.206 + if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { 1.207 + fprintf(stderr, "error: failed to initialize tries\n"); 1.208 + exit(U_MEMORY_ALLOCATION_ERROR); 1.209 + } 1.210 +} 1.211 + 1.212 +static UHashtable* hashTable = NULL; 1.213 + 1.214 + 1.215 +typedef struct ValueStruct { 1.216 + UChar* mapping; 1.217 + int16_t length; 1.218 + UStringPrepType type; 1.219 +} ValueStruct; 1.220 + 1.221 +/* Callback for deleting the value from the hashtable */ 1.222 +static void U_CALLCONV valueDeleter(void* obj){ 1.223 + ValueStruct* value = (ValueStruct*) obj; 1.224 + uprv_free(value->mapping); 1.225 + uprv_free(value); 1.226 +} 1.227 + 1.228 +/* Callback for hashing the entry */ 1.229 +static int32_t U_CALLCONV hashEntry(const UHashTok parm) { 1.230 + return parm.integer; 1.231 +} 1.232 + 1.233 +/* Callback for comparing two entries */ 1.234 +static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { 1.235 + return (UBool)(p1.integer != p2.integer); 1.236 +} 1.237 + 1.238 + 1.239 +static void 1.240 +storeMappingData(){ 1.241 + 1.242 + int32_t pos = -1; 1.243 + const UHashElement* element = NULL; 1.244 + ValueStruct* value = NULL; 1.245 + int32_t codepoint = 0; 1.246 + int32_t elementCount = 0; 1.247 + int32_t writtenElementCount = 0; 1.248 + int32_t mappingLength = 1; /* minimum mapping length */ 1.249 + int32_t oldMappingLength = 0; 1.250 + uint16_t trieWord =0; 1.251 + int32_t limitIndex = 0; 1.252 + 1.253 + if (hashTable == NULL) { 1.254 + return; 1.255 + } 1.256 + elementCount = uhash_count(hashTable); 1.257 + 1.258 + /*initialize the mapping data */ 1.259 + mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR); 1.260 + 1.261 + while(writtenElementCount < elementCount){ 1.262 + 1.263 + while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ 1.264 + 1.265 + codepoint = element->key.integer; 1.266 + value = (ValueStruct*)element->value.pointer; 1.267 + 1.268 + /* store the start of indexes */ 1.269 + if(oldMappingLength != mappingLength){ 1.270 + /* Assume that index[] is used according to the enums defined */ 1.271 + if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ 1.272 + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; 1.273 + } 1.274 + if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && 1.275 + mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ 1.276 + 1.277 + limitIndex = currentIndex; 1.278 + 1.279 + } 1.280 + oldMappingLength = mappingLength; 1.281 + } 1.282 + 1.283 + if(value->length == mappingLength){ 1.284 + uint32_t savedTrieWord = 0; 1.285 + trieWord = currentIndex << 2; 1.286 + /* turn on the 2nd bit to signal that the following bits contain an index */ 1.287 + trieWord += 0x02; 1.288 + 1.289 + if(trieWord > _SPREP_TYPE_THRESHOLD){ 1.290 + fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); 1.291 + exit(U_ILLEGAL_CHAR_FOUND); 1.292 + } 1.293 + /* figure out if the code point has type already stored */ 1.294 + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); 1.295 + if(savedTrieWord!=0){ 1.296 + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ 1.297 + /* turn on the first bit in trie word */ 1.298 + trieWord += 0x01; 1.299 + }else{ 1.300 + /* 1.301 + * the codepoint has value something other than prohibited 1.302 + * and a mapping .. error! 1.303 + */ 1.304 + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); 1.305 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.306 + } 1.307 + } 1.308 + 1.309 + /* now set the value in the trie */ 1.310 + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ 1.311 + fprintf(stderr,"Could not set the value for code point.\n"); 1.312 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.313 + } 1.314 + 1.315 + /* written the trie word for the codepoint... increment the count*/ 1.316 + writtenElementCount++; 1.317 + 1.318 + /* sanity check are we exceeding the max number allowed */ 1.319 + if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ 1.320 + fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", 1.321 + currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); 1.322 + exit(U_INDEX_OUTOFBOUNDS_ERROR); 1.323 + } 1.324 + 1.325 + /* copy the mapping data */ 1.326 + /* write the length */ 1.327 + if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ 1.328 + /* the cast here is safe since we donot expect the length to be > 65535 */ 1.329 + mappingData[currentIndex++] = (uint16_t) mappingLength; 1.330 + } 1.331 + /* copy the contents to mappindData array */ 1.332 + uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); 1.333 + currentIndex += value->length; 1.334 + if (currentIndex > mappingDataCapacity) { 1.335 + /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */ 1.336 + fprintf(stderr, "gensprep, fatal error at %s, %d. Aborting.\n", __FILE__, __LINE__); 1.337 + exit(U_INTERNAL_PROGRAM_ERROR); 1.338 + } 1.339 + } 1.340 + } 1.341 + mappingLength++; 1.342 + pos = -1; 1.343 + } 1.344 + /* set the last length for range check */ 1.345 + if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ 1.346 + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; 1.347 + }else{ 1.348 + indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; 1.349 + } 1.350 + 1.351 +} 1.352 + 1.353 +extern void setOptions(int32_t options){ 1.354 + indexes[_SPREP_OPTIONS] = options; 1.355 +} 1.356 +extern void 1.357 +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, 1.358 + UStringPrepType type, UErrorCode* status){ 1.359 + 1.360 + 1.361 + UChar* map = NULL; 1.362 + int16_t adjustedLen=0, i, j; 1.363 + uint16_t trieWord = 0; 1.364 + ValueStruct *value = NULL; 1.365 + uint32_t savedTrieWord = 0; 1.366 + 1.367 + /* initialize the hashtable */ 1.368 + if(hashTable==NULL){ 1.369 + hashTable = uhash_open(hashEntry, compareEntries, NULL, status); 1.370 + uhash_setValueDeleter(hashTable, valueDeleter); 1.371 + } 1.372 + 1.373 + /* figure out if the code point has type already stored */ 1.374 + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); 1.375 + if(savedTrieWord!=0){ 1.376 + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ 1.377 + /* turn on the first bit in trie word */ 1.378 + trieWord += 0x01; 1.379 + }else{ 1.380 + /* 1.381 + * the codepoint has value something other than prohibited 1.382 + * and a mapping .. error! 1.383 + */ 1.384 + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); 1.385 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.386 + } 1.387 + } 1.388 + 1.389 + /* figure out the real length */ 1.390 + for(i=0; i<length; i++){ 1.391 + adjustedLen += U16_LENGTH(mapping[i]); 1.392 + } 1.393 + 1.394 + if(adjustedLen == 0){ 1.395 + trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); 1.396 + /* make sure that the value of trieWord is less than the threshold */ 1.397 + if(trieWord < _SPREP_TYPE_THRESHOLD){ 1.398 + /* now set the value in the trie */ 1.399 + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ 1.400 + fprintf(stderr,"Could not set the value for code point.\n"); 1.401 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.402 + } 1.403 + /* value is set so just return */ 1.404 + return; 1.405 + }else{ 1.406 + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); 1.407 + exit(U_ILLEGAL_CHAR_FOUND); 1.408 + } 1.409 + } 1.410 + 1.411 + if(adjustedLen == 1){ 1.412 + /* calculate the delta */ 1.413 + int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); 1.414 + if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ 1.415 + 1.416 + trieWord = delta << 2; 1.417 + 1.418 + 1.419 + /* make sure that the second bit is OFF */ 1.420 + if((trieWord & 0x02) != 0 ){ 1.421 + fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); 1.422 + exit(U_INTERNAL_PROGRAM_ERROR); 1.423 + } 1.424 + /* make sure that the value of trieWord is less than the threshold */ 1.425 + if(trieWord < _SPREP_TYPE_THRESHOLD){ 1.426 + /* now set the value in the trie */ 1.427 + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ 1.428 + fprintf(stderr,"Could not set the value for code point.\n"); 1.429 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.430 + } 1.431 + /* value is set so just return */ 1.432 + return; 1.433 + } 1.434 + } 1.435 + /* 1.436 + * if the delta is not in the given range or if the trieWord is larger than the threshold 1.437 + * just fall through for storing the mapping in the mapping table 1.438 + */ 1.439 + } 1.440 + 1.441 + map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); 1.442 + 1.443 + for (i=0, j=0; i<length; i++) { 1.444 + U16_APPEND_UNSAFE(map, j, mapping[i]); 1.445 + } 1.446 + 1.447 + value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); 1.448 + value->mapping = map; 1.449 + value->type = type; 1.450 + value->length = adjustedLen; 1.451 + if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ 1.452 + mappingDataCapacity++; 1.453 + } 1.454 + if(maxLength < value->length){ 1.455 + maxLength = value->length; 1.456 + } 1.457 + uhash_iput(hashTable,codepoint,value,status); 1.458 + mappingDataCapacity += adjustedLen; 1.459 + 1.460 + if(U_FAILURE(*status)){ 1.461 + fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status)); 1.462 + exit(*status); 1.463 + } 1.464 +} 1.465 + 1.466 + 1.467 +extern void 1.468 +storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){ 1.469 + uint16_t trieWord = 0; 1.470 + 1.471 + if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ 1.472 + fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); 1.473 + exit(U_ILLEGAL_CHAR_FOUND); 1.474 + } 1.475 + trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ 1.476 + if(start == end){ 1.477 + uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); 1.478 + if(savedTrieWord>0){ 1.479 + if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ 1.480 + /* 1.481 + * A mapping is stored in the trie word 1.482 + * and the only other possible type that a 1.483 + * code point can have is USPREP_PROHIBITED 1.484 + * 1.485 + */ 1.486 + 1.487 + /* turn on the 0th bit in the savedTrieWord */ 1.488 + savedTrieWord += 0x01; 1.489 + 1.490 + /* the downcast is safe since we only save 16 bit values */ 1.491 + trieWord = (uint16_t)savedTrieWord; 1.492 + 1.493 + /* make sure that the value of trieWord is less than the threshold */ 1.494 + if(trieWord < _SPREP_TYPE_THRESHOLD){ 1.495 + /* now set the value in the trie */ 1.496 + if(!utrie_set32(sprepTrie,start,trieWord)){ 1.497 + fprintf(stderr,"Could not set the value for code point.\n"); 1.498 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.499 + } 1.500 + /* value is set so just return */ 1.501 + return; 1.502 + }else{ 1.503 + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); 1.504 + exit(U_ILLEGAL_CHAR_FOUND); 1.505 + } 1.506 + 1.507 + }else if(savedTrieWord != trieWord){ 1.508 + fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); 1.509 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.510 + } 1.511 + /* if savedTrieWord == trieWord .. fall through and set the value */ 1.512 + } 1.513 + if(!utrie_set32(sprepTrie,start,trieWord)){ 1.514 + fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); 1.515 + exit(U_ILLEGAL_ARGUMENT_ERROR); 1.516 + } 1.517 + }else{ 1.518 + if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ 1.519 + fprintf(stderr,"Value for certain codepoint already set.\n"); 1.520 + exit(U_ILLEGAL_CHAR_FOUND); 1.521 + } 1.522 + } 1.523 + 1.524 +} 1.525 + 1.526 +/* folding value: just store the offset (16 bits) if there is any non-0 entry */ 1.527 +static uint32_t U_CALLCONV 1.528 +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { 1.529 + uint32_t value; 1.530 + UChar32 limit=0; 1.531 + UBool inBlockZero; 1.532 + 1.533 + limit=start+0x400; 1.534 + while(start<limit) { 1.535 + value=utrie_get32(trie, start, &inBlockZero); 1.536 + if(inBlockZero) { 1.537 + start+=UTRIE_DATA_BLOCK_LENGTH; 1.538 + } else if(value!=0) { 1.539 + return (uint32_t)offset; 1.540 + } else { 1.541 + ++start; 1.542 + } 1.543 + } 1.544 + return 0; 1.545 + 1.546 +} 1.547 + 1.548 +#endif /* #if !UCONFIG_NO_IDNA */ 1.549 + 1.550 +extern void 1.551 +generateData(const char *dataDir, const char* bundleName) { 1.552 + static uint8_t sprepTrieBlock[100000]; 1.553 + 1.554 + UNewDataMemory *pData; 1.555 + UErrorCode errorCode=U_ZERO_ERROR; 1.556 + int32_t size, dataLength; 1.557 + char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); 1.558 + 1.559 +#if UCONFIG_NO_IDNA 1.560 + 1.561 + size=0; 1.562 + 1.563 +#else 1.564 + 1.565 + int32_t sprepTrieSize; 1.566 + 1.567 + /* sort and add mapping data */ 1.568 + storeMappingData(); 1.569 + 1.570 + sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode); 1.571 + if(U_FAILURE(errorCode)) { 1.572 + fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); 1.573 + exit(errorCode); 1.574 + } 1.575 + 1.576 + size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); 1.577 + if(beVerbose) { 1.578 + printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); 1.579 + printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); 1.580 + printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); 1.581 + printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); 1.582 + printf("Maximum length of the mapping string is : %i \n", (int)maxLength); 1.583 + } 1.584 + 1.585 +#endif 1.586 + 1.587 + fileName[0]=0; 1.588 + uprv_strcat(fileName,bundleName); 1.589 + /* write the data */ 1.590 + pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, 1.591 + haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 1.592 + if(U_FAILURE(errorCode)) { 1.593 + fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); 1.594 + exit(errorCode); 1.595 + } 1.596 + 1.597 +#if !UCONFIG_NO_IDNA 1.598 + 1.599 + indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; 1.600 + indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; 1.601 + 1.602 + udata_writeBlock(pData, indexes, sizeof(indexes)); 1.603 + udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); 1.604 + udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); 1.605 + 1.606 + 1.607 +#endif 1.608 + 1.609 + /* finish up */ 1.610 + dataLength=udata_finish(pData, &errorCode); 1.611 + if(U_FAILURE(errorCode)) { 1.612 + fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); 1.613 + exit(errorCode); 1.614 + } 1.615 + 1.616 + if(dataLength!=size) { 1.617 + fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", 1.618 + (long)dataLength, (long)size); 1.619 + exit(U_INTERNAL_PROGRAM_ERROR); 1.620 + } 1.621 + 1.622 +#if !UCONFIG_NO_IDNA 1.623 + /* done with writing the data .. close the hashtable */ 1.624 + if (hashTable != NULL) { 1.625 + uhash_close(hashTable); 1.626 + } 1.627 +#endif 1.628 + 1.629 + uprv_free(fileName); 1.630 +} 1.631 + 1.632 +#if !UCONFIG_NO_IDNA 1.633 + 1.634 +extern void 1.635 +cleanUpData(void) { 1.636 + uprv_free(mappingData); 1.637 + utrie_close(sprepTrie); 1.638 + uprv_free(sprepTrie); 1.639 +} 1.640 + 1.641 +#endif /* #if !UCONFIG_NO_IDNA */ 1.642 + 1.643 +/* 1.644 + * Hey, Emacs, please set the following: 1.645 + * 1.646 + * Local Variables: 1.647 + * indent-tabs-mode: nil 1.648 + * End: 1.649 + * 1.650 + */