michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 1999-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: store.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2003-02-06 michael@0: * created by: Ram Viswanadha michael@0: * michael@0: */ michael@0: michael@0: #include michael@0: #include michael@0: #include "unicode/utypes.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "filestrm.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/utf16.h" michael@0: #include "utrie.h" michael@0: #include "unewdata.h" michael@0: #include "gensprep.h" michael@0: #include "uhash.h" michael@0: michael@0: michael@0: #define DO_DEBUG_OUT 0 michael@0: michael@0: michael@0: /* michael@0: * StringPrep profile file format ------------------------------------ michael@0: * michael@0: * The file format prepared and written here contains a 16-bit trie and a mapping table. michael@0: * michael@0: * Before the data contents described below, there are the headers required by michael@0: * the udata API for loading ICU data. Especially, a UDataInfo structure michael@0: * precedes the actual data. It contains platform properties values and the michael@0: * file format version. michael@0: * michael@0: * The following is a description of format version 2. michael@0: * michael@0: * Data contents: michael@0: * michael@0: * The contents is a parsed, binary form of RFC3454 and possibly michael@0: * NormalizationCorrections.txt depending on the options specified on the profile. 
michael@0: * michael@0: * Any Unicode code point from 0 to 0x10ffff can be looked up to get michael@0: * the trie-word, if any, for that code point. This means that the inputs michael@0: * to the lookup are 21-bit unsigned integers, with not all of the michael@0: * 21-bit range used. michael@0: * michael@0: * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. michael@0: * After that there are the following structures: michael@0: * michael@0: * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file michael@0: * michael@0: * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] michael@0: * michael@0: * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to michael@0: * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] michael@0: * michael@0: * The indexes array contains the following values: michael@0: * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes michael@0: * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes michael@0: * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt michael@0: * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table michael@0: * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table michael@0: * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table michael@0: * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table michael@0: * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON michael@0: * michael@0: * michael@0: * StringPrep Trie : michael@0: * michael@0: * The StringPrep 
trie is a 16-bit trie that contains data for the profile. michael@0: * Each code point is associated with a value (trie-word) in the trie. michael@0: * michael@0: * - structure of data words from the trie michael@0: * michael@0: * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) michael@0: * represents the type associated with the code point michael@0: * if(trieWord >= _SPREP_TYPE_THRESHOLD){ michael@0: * type = trieWord - 0xFFF0; michael@0: * } michael@0: * The type can be : michael@0: * USPREP_UNASSIGNED michael@0: * USPREP_PROHIBITED michael@0: * USPREP_DELETE michael@0: * michael@0: * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and michael@0: * contains the bit distribution described below michael@0: * michael@0: * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped. michael@0: * 1 - ON : The value in the next 14 bits is an index into the mapping table michael@0: * OFF: The value in the next 14 bits is a delta value from the code point michael@0: * 2..15 - Contains data as described by bit 1. If all bits are set michael@0: * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE michael@0: * michael@0: * michael@0: * Mapping Table: michael@0: * The data in mapping table is sorted according to the length of the mapping sequence. 
michael@0: * If the type of the code point is USPREP_MAP and value in trie word is an index, the index michael@0: * is compared with start indexes of sequence length start to figure out the length according to michael@0: * the following algorithm: michael@0: * michael@0: * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && michael@0: * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ michael@0: * length = 1; michael@0: * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && michael@0: * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ michael@0: * length = 2; michael@0: * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && michael@0: * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ michael@0: * length = 3; michael@0: * }else{ michael@0: * // The first position in the mapping table contains the length michael@0: * // of the sequence michael@0: * length = mappingTable[index++]; michael@0: * michael@0: * } michael@0: * michael@0: */ michael@0: michael@0: /* file data ---------------------------------------------------------------- */ michael@0: /* indexes[] value names */ michael@0: michael@0: #if UCONFIG_NO_IDNA michael@0: michael@0: /* dummy UDataInfo cf. 
udata.h */ michael@0: static UDataInfo dataInfo = { michael@0: sizeof(UDataInfo), michael@0: 0, michael@0: michael@0: U_IS_BIG_ENDIAN, michael@0: U_CHARSET_FAMILY, michael@0: U_SIZEOF_UCHAR, michael@0: 0, michael@0: michael@0: { 0, 0, 0, 0 }, /* dummy dataFormat */ michael@0: { 0, 0, 0, 0 }, /* dummy formatVersion */ michael@0: { 0, 0, 0, 0 } /* dummy dataVersion */ michael@0: }; michael@0: michael@0: #else michael@0: michael@0: /* index values of the data file; slots are addressed with the _SPREP_* index names (see the format comment above) */ static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; michael@0: michael@0: /* mapping table; allocated and filled in storeMappingData() */ static uint16_t* mappingData= NULL; michael@0: static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data; accumulated by storeMapping() and used as the element count when mappingData is allocated */ michael@0: static int16_t currentIndex = 0; /* next free position in mappingData (used as mappingData[currentIndex++] in storeMappingData()) */ michael@0: static int32_t maxLength = 0; /* maximum length of mapping string */ michael@0: michael@0: michael@0: /* UDataInfo cf. udata.h */ michael@0: static UDataInfo dataInfo={ michael@0: sizeof(UDataInfo), michael@0: 0, michael@0: michael@0: U_IS_BIG_ENDIAN, michael@0: U_CHARSET_FAMILY, michael@0: U_SIZEOF_UCHAR, michael@0: 0, michael@0: michael@0: { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ michael@0: { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ michael@0: { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ michael@0: }; michael@0: /* Parses the version string v with u_versionFromString() and copies 4 bytes of the result into dataInfo.dataVersion, replacing the default set in the initializer above. */ void michael@0: setUnicodeVersion(const char *v) { michael@0: UVersionInfo version; michael@0: u_versionFromString(version, v); michael@0: uprv_memcpy(dataInfo.dataVersion, version, 4); michael@0: } michael@0: michael@0: /* Packs the four fields of version into one 32-bit value, version[0] in the most significant byte and version[3] in the least, and stores it in indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION]. NOTE(review): presumably the elements of UVersionInfo are 8-bit, in which case version[0] << 24 is computed in int after integer promotion and a field of 0x80 or more would shift into the sign bit; confirm the element type and consider casting to uint32_t first. */ void michael@0: setUnicodeVersionNC(UVersionInfo version){ michael@0: uint32_t univer = version[0] << 24; michael@0: univer += version[1] << 16; michael@0: univer += version[2] << 8; michael@0: univer += version[3]; michael@0: indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; michael@0: } michael@0: /* trie being built; allocated and opened in init() */ static UNewTrie *sprepTrie; michael@0: michael@0: /* passed to utrie_open() in init() */ #define MAX_DATA_LENGTH 11500 michael@0: michael@0: michael@0: /* inclusive bounds that storeMapping() tests before storing a delta directly in the trie word */ #define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191 michael@0: #define 
SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192 michael@0: michael@0: michael@0: /* Allocates the global sprepTrie with uprv_calloc() and opens it with utrie_open(). If utrie_open() returns NULL an error is printed to stderr and the program exits with U_MEMORY_ALLOCATION_ERROR. NOTE(review): the uprv_calloc() result is passed to utrie_open() without a NULL check. */ extern void michael@0: init() { michael@0: michael@0: sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie)); michael@0: michael@0: /* initialize the trie (only sprepTrie is opened here) */ michael@0: if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { michael@0: fprintf(stderr, "error: failed to initialize tries\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: } michael@0: michael@0: /* code point -> ValueStruct*; created on first use in storeMapping() */ static UHashtable* hashTable = NULL; michael@0: michael@0: michael@0: /* One pending mapping, stored as a value in hashTable. */ typedef struct ValueStruct { michael@0: UChar* mapping; /* the mapped code units; released by valueDeleter() */ michael@0: int16_t length; /* number of code units in mapping (copied as length*U_SIZEOF_UCHAR bytes) */ michael@0: UStringPrepType type; /* type passed to storeMapping() */ michael@0: } ValueStruct; michael@0: michael@0: /* Callback for deleting the value from the hashtable: frees the mapping buffer and then the ValueStruct itself */ michael@0: static void U_CALLCONV valueDeleter(void* obj){ michael@0: ValueStruct* value = (ValueStruct*) obj; michael@0: uprv_free(value->mapping); michael@0: uprv_free(value); michael@0: } michael@0: michael@0: /* Callback for hashing the entry: the integer key is used as the hash code unchanged */ michael@0: static int32_t U_CALLCONV hashEntry(const UHashTok parm) { michael@0: return parm.integer; michael@0: } michael@0: michael@0: /* Callback for comparing two entries. NOTE(review): this returns non-zero when the two integer keys DIFFER; a key comparator is presumably expected to report equality, so this looks inverted. Confirm against the uhash comparator contract before changing, since it affects how a repeated code point is treated by uhash_iput(). */ michael@0: static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { michael@0: return (UBool)(p1.integer != p2.integer); michael@0: } michael@0: michael@0: michael@0: /* Builds mappingData from the entries collected in hashTable and stores, for every written code point, a trie word that refers to its mapping. The table is walked once per mapping length (starting at 1; the iteration is restarted by resetting pos to -1) and a pass writes only the entries whose length equals the current mappingLength, so the mappings end up grouped by length. The start position of each group is recorded in indexes[]. Returns at once if hashTable is NULL; every error is printed to stderr and followed by exit(). */ static void michael@0: storeMappingData(){ michael@0: michael@0: int32_t pos = -1; michael@0: const UHashElement* element = NULL; michael@0: ValueStruct* value = NULL; michael@0: int32_t codepoint = 0; michael@0: int32_t elementCount = 0; michael@0: int32_t writtenElementCount = 0; michael@0: int32_t mappingLength = 1; /* minimum mapping length */ michael@0: int32_t oldMappingLength = 0; michael@0: uint16_t trieWord =0; michael@0: int32_t limitIndex = 0; michael@0: michael@0: if (hashTable == NULL) { michael@0: return; michael@0: } michael@0: elementCount = uhash_count(hashTable); michael@0: michael@0: 
/* allocate the mapping data: mappingDataCapacity elements of U_SIZEOF_UCHAR bytes. NOTE(review): the result is used below without a NULL check */ michael@0: mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR); michael@0: michael@0: while(writtenElementCount < elementCount){ michael@0: michael@0: while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ michael@0: michael@0: codepoint = element->key.integer; michael@0: value = (ValueStruct*)element->value.pointer; michael@0: michael@0: /* first element seen in a new pass: record where this mapping length starts */ michael@0: if(oldMappingLength != mappingLength){ michael@0: /* Assume that index[] is used according to the enums defined */ michael@0: if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ michael@0: indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; michael@0: } michael@0: if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && michael@0: mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ michael@0: michael@0: limitIndex = currentIndex; michael@0: michael@0: } michael@0: oldMappingLength = mappingLength; michael@0: } michael@0: michael@0: if(value->length == mappingLength){ michael@0: uint32_t savedTrieWord = 0; michael@0: trieWord = currentIndex << 2; michael@0: /* turn on the 2nd bit to signal that the following bits contain an index */ michael@0: trieWord += 0x02; michael@0: michael@0: if(trieWord > _SPREP_TYPE_THRESHOLD){ michael@0: fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); michael@0: exit(U_ILLEGAL_CHAR_FOUND); michael@0: } michael@0: /* figure out if the code point has type already stored */ michael@0: savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); michael@0: if(savedTrieWord!=0){ michael@0: if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ michael@0: /* turn on the first bit in trie word */ michael@0: trieWord += 0x01; michael@0: }else{ michael@0: /* michael@0: * the codepoint has value something other than prohibited michael@0: * and a mapping .. error! 
michael@0: */ michael@0: fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: } michael@0: michael@0: /* now set the value in the trie */ michael@0: if(!utrie_set32(sprepTrie,codepoint,trieWord)){ michael@0: fprintf(stderr,"Could not set the value for code point.\n"); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: michael@0: /* the trie word for the codepoint has been written... increment the count */ michael@0: writtenElementCount++; michael@0: michael@0: /* sanity check: are we exceeding the max number allowed? */ michael@0: if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ michael@0: fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", michael@0: currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); michael@0: exit(U_INDEX_OUTOFBOUNDS_ERROR); michael@0: } michael@0: michael@0: /* copy the mapping data */ michael@0: /* write the length in front of the mapping; only done for lengths above _SPREP_MAX_INDEX_TOP_LENGTH */ michael@0: if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ michael@0: /* the cast here is safe since we do not expect the length to be > 65535 */ michael@0: mappingData[currentIndex++] = (uint16_t) mappingLength; michael@0: } michael@0: /* copy the contents to the mappingData array */ michael@0: uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); michael@0: currentIndex += value->length; michael@0: if (currentIndex > mappingDataCapacity) { michael@0: /* If this happens there is a bug in the computation of the mapping data size in storeMapping(). NOTE(review): this check runs after uprv_memmove() above has already written the data, so it reports an overrun rather than preventing it. */ michael@0: fprintf(stderr, "gensprep, fatal error at %s, %d. 
Aborting.\n", __FILE__, __LINE__); michael@0: exit(U_INTERNAL_PROGRAM_ERROR); michael@0: } michael@0: } michael@0: } michael@0: mappingLength++; michael@0: pos = -1; michael@0: } michael@0: /* set the last length for range check: if the length reached after the loop is still at most _SPREP_MAX_INDEX_TOP_LENGTH, currentIndex+1 is stored in that length's slot; otherwise limitIndex becomes the four-UChar start index */ michael@0: if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ michael@0: indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; michael@0: }else{ michael@0: indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; michael@0: } michael@0: michael@0: } michael@0: michael@0: /* Stores the option bit set in indexes[_SPREP_OPTIONS]. */ extern void setOptions(int32_t options){ michael@0: indexes[_SPREP_OPTIONS] = options; michael@0: } michael@0: /* Records the mapping of codepoint to the code points in mapping[0..length-1]. Creates hashTable on first use. An existing trie value of the PROHIBITED type turns on bit 0 of the local trieWord; any other non-zero existing value is a fatal error. A delta inside the SPREP_DELTA_RANGE_* limits is stored directly in the trie as delta << 2 and the function returns; otherwise the mapping is stored in a ValueStruct in hashTable, keyed by codepoint, and mappingDataCapacity is increased. Errors are printed to stderr and followed by exit(). NOTE(review): part of this function's text is missing from this copy of the file (see the loop headers further down), so this summary covers only the visible statements. */ extern void michael@0: storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, michael@0: UStringPrepType type, UErrorCode* status){ michael@0: michael@0: michael@0: UChar* map = NULL; michael@0: int16_t adjustedLen=0, i, j; michael@0: uint16_t trieWord = 0; michael@0: ValueStruct *value = NULL; michael@0: uint32_t savedTrieWord = 0; michael@0: michael@0: /* initialize the hashtable. NOTE(review): status is not checked between uhash_open() and uhash_setValueDeleter() */ michael@0: if(hashTable==NULL){ michael@0: hashTable = uhash_open(hashEntry, compareEntries, NULL, status); michael@0: uhash_setValueDeleter(hashTable, valueDeleter); michael@0: } michael@0: michael@0: /* figure out if the code point has type already stored */ michael@0: savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); michael@0: if(savedTrieWord!=0){ michael@0: if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ michael@0: /* turn on the first bit in trie word */ michael@0: trieWord += 0x01; michael@0: }else{ michael@0: /* michael@0: * the codepoint has value something other than prohibited michael@0: * and a mapping .. error! 
michael@0: */ michael@0: fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: } michael@0: michael@0: /* figure out the real length. NOTE(review): the loop header below is incomplete in this copy of the file; the text between a '<' and the next '>' appears to have been lost and must be restored from the original source */ michael@0: for(i=0; i= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ michael@0: michael@0: trieWord = delta << 2; /* NOTE(review): plain assignment; it replaces whatever trieWord held before, including bit 0 if the PROHIBITED branch earlier in this function turned it on. Confirm that dropping the flag in the delta case is intended. */ michael@0: michael@0: michael@0: /* make sure that the second bit is OFF */ michael@0: if((trieWord & 0x02) != 0 ){ michael@0: fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); michael@0: exit(U_INTERNAL_PROGRAM_ERROR); michael@0: } michael@0: /* make sure that the value of trieWord is less than the threshold */ michael@0: if(trieWord < _SPREP_TYPE_THRESHOLD){ michael@0: /* now set the value in the trie */ michael@0: if(!utrie_set32(sprepTrie,codepoint,trieWord)){ michael@0: fprintf(stderr,"Could not set the value for code point.\n"); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: /* value is set so just return */ michael@0: return; michael@0: } michael@0: } michael@0: /* michael@0: * if the delta is not in the given range or if the trieWord is larger than the threshold michael@0: * just fall through for storing the mapping in the mapping table michael@0: */ michael@0: } michael@0: michael@0: map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); michael@0: michael@0: /* NOTE(review): the next loop header is also incomplete in this copy (text lost up to the '>' of an arrow operator) */ for (i=0, j=0; imapping = map; michael@0: value->type = type; michael@0: value->length = adjustedLen; michael@0: /* one extra unit: storeMappingData() writes the length in front of mappings longer than _SPREP_MAX_INDEX_TOP_LENGTH */ if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ michael@0: mappingDataCapacity++; michael@0: } michael@0: if(maxLength < value->length){ michael@0: maxLength = value->length; michael@0: } michael@0: uhash_iput(hashTable,codepoint,value,status); michael@0: mappingDataCapacity += adjustedLen; michael@0: michael@0: if(U_FAILURE(*status)){ michael@0: fprintf(stderr, "Failed to put entries into the hastable. 
Error: %s\n", u_errorName(*status)); michael@0: exit(*status); michael@0: } michael@0: } michael@0: michael@0: michael@0: /* Assigns the type word (_SPREP_TYPE_THRESHOLD + type) to the code points start..end. For a single code point (start == end) an existing non-zero trie value is examined first: if it is below the threshold and type is USPREP_PROHIBITED, 1 is added to the saved word and that word is stored instead; any other existing value that differs from the new word is a fatal error. For start != end the range is set with utrie_setRange32() using end+1 as the limit and FALSE as the last argument. Errors are printed to stderr and followed by exit(). status is neither read nor written in this function. */ extern void michael@0: storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){ michael@0: uint16_t trieWord = 0; michael@0: michael@0: if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ michael@0: fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); michael@0: exit(U_ILLEGAL_CHAR_FOUND); michael@0: } michael@0: trieWord = (_SPREP_TYPE_THRESHOLD + type); /* threshold plus type; the format comment at the top of the file decodes this as type = trieWord - 0xFFF0 */ michael@0: if(start == end){ michael@0: uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); michael@0: if(savedTrieWord>0){ michael@0: if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ michael@0: /* michael@0: * A mapping is stored in the trie word michael@0: * and the only other possible type that a michael@0: * code point can have is USPREP_PROHIBITED michael@0: * michael@0: */ michael@0: michael@0: /* turn on the 0th bit in the savedTrieWord */ michael@0: savedTrieWord += 0x01; michael@0: michael@0: /* the downcast is safe since we only save 16 bit values */ michael@0: trieWord = (uint16_t)savedTrieWord; michael@0: michael@0: /* make sure that the value of trieWord is less than the threshold */ michael@0: if(trieWord < _SPREP_TYPE_THRESHOLD){ michael@0: /* now set the value in the trie */ michael@0: if(!utrie_set32(sprepTrie,start,trieWord)){ michael@0: fprintf(stderr,"Could not set the value for code point.\n"); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: /* value is set so just return */ michael@0: return; michael@0: }else{ michael@0: fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); michael@0: exit(U_ILLEGAL_CHAR_FOUND); michael@0: } michael@0: michael@0: }else if(savedTrieWord != trieWord){ michael@0: fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); michael@0: 
exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: /* if savedTrieWord == trieWord .. fall through and set the value */ michael@0: } michael@0: if(!utrie_set32(sprepTrie,start,trieWord)){ michael@0: fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); michael@0: exit(U_ILLEGAL_ARGUMENT_ERROR); michael@0: } michael@0: }else{ michael@0: if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ michael@0: fprintf(stderr,"Value for certain codepoint already set.\n"); michael@0: exit(U_ILLEGAL_CHAR_FOUND); michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: /* folding value: just store the offset (16 bits) if there is any non-0 entry */ michael@0: static uint32_t U_CALLCONV michael@0: getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { michael@0: uint32_t value; michael@0: UChar32 limit=0; michael@0: UBool inBlockZero; michael@0: michael@0: limit=start+0x400; michael@0: while(start