|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1999-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: store.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2003-02-06 |
|
14 * created by: Ram Viswanadha |
|
15 * |
|
16 */ |
|
17 |
|
18 #include <stdio.h> |
|
19 #include <stdlib.h> |
|
20 #include "unicode/utypes.h" |
|
21 #include "cmemory.h" |
|
22 #include "cstring.h" |
|
23 #include "filestrm.h" |
|
24 #include "unicode/udata.h" |
|
25 #include "unicode/utf16.h" |
|
26 #include "utrie.h" |
|
27 #include "unewdata.h" |
|
28 #include "gensprep.h" |
|
29 #include "uhash.h" |
|
30 |
|
31 |
|
32 #define DO_DEBUG_OUT 0 |
|
33 |
|
34 |
|
35 /* |
|
36 * StringPrep profile file format ------------------------------------ |
|
37 * |
|
38 * The file format prepared and written here contains a 16-bit trie and a mapping table. |
|
39 * |
|
40 * Before the data contents described below, there are the headers required by |
|
41 * the udata API for loading ICU data. Especially, a UDataInfo structure |
|
42 * precedes the actual data. It contains platform properties values and the |
|
43 * file format version. |
|
44 * |
|
45 * The following is a description of format version 2. |
|
46 * |
|
47 * Data contents: |
|
48 * |
|
49 * The contents is a parsed, binary form of RFC3454 and possibly |
|
50 * NormalizationCorrections.txt depending on the options specified on the profile. |
|
51 * |
|
52 * Any Unicode code point from 0 to 0x10ffff can be looked up to get |
|
53 * the trie-word, if any, for that code point. This means that the input |
|
54 * to the lookup are 21-bit unsigned integers, with not all of the |
|
55 * 21-bit range used. |
|
56 * |
|
57 * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. |
|
58 * After that there are the following structures: |
|
59 * |
|
60 * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file |
|
61 * |
|
62 * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] |
|
63 * |
|
64 * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to |
|
65 * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] |
|
66 * |
|
67 * The indexes array contains the following values: |
|
68 * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes |
|
69 * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes |
|
70 * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt |
|
71 * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table |
|
72 * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table |
|
73 * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table |
|
74 * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table |
|
75 * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON |
|
76 * |
|
77 * |
|
78 * StringPrep Trie : |
|
79 * |
|
80 * The StringPrep trie is a 16-bit trie that contains data for the profile. |
|
81 * Each code point is associated with a value (trie-word) in the trie. |
|
82 * |
|
83 * - structure of data words from the trie |
|
84 * |
|
85 * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) |
|
86 * represents the type associated with the code point |
|
87 * if(trieWord >= _SPREP_TYPE_THRESHOLD){ |
|
88 * type = trieWord - 0xFFF0; |
|
89 * } |
|
90 * The type can be : |
|
91 * USPREP_UNASSIGNED |
|
92 * USPREP_PROHIBITED |
|
93 * USPREP_DELETE |
|
94 * |
|
95 * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and |
|
96 * contains distribution described below |
|
97 * |
|
98 * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for codepoint that are both prohibited and mapped. |
|
99 * 1 - ON : The value in the next 14 bits is an index into the mapping table |
|
100 * OFF: The value in the next 14 bits is a delta value from the code point |
|
101 * 2..15 - Contains data as described by bit 1. If all bits are set |
|
102 * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE |
|
103 * |
|
104 * |
|
105 * Mapping Table: |
|
106 * The data in mapping table is sorted according to the length of the mapping sequence. |
|
107 * If the type of the code point is USPREP_MAP and value in trie word is an index, the index |
|
108 * is compared with start indexes of sequence length start to figure out the length according to |
|
109 * the following algorithm: |
|
110 * |
|
111 * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && |
|
112 * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ |
|
113 * length = 1; |
|
114 * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && |
|
115 * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ |
|
116 * length = 2; |
|
117 * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && |
|
118 * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ |
|
119 * length = 3; |
|
120 * }else{ |
|
121 * // The first position in the mapping table contains the length |
|
122 * // of the sequence |
|
123 * length = mappingTable[index++]; |
|
124 * |
|
125 * } |
|
126 * |
|
127 */ |
|
128 |
|
129 /* file data ---------------------------------------------------------------- */ |
|
130 /* indexes[] value names */ |
|
131 |
|
132 #if UCONFIG_NO_IDNA |
|
133 |
|
134 /* dummy UDataInfo cf. udata.h */ |
|
135 static UDataInfo dataInfo = { |
|
136 sizeof(UDataInfo), |
|
137 0, |
|
138 |
|
139 U_IS_BIG_ENDIAN, |
|
140 U_CHARSET_FAMILY, |
|
141 U_SIZEOF_UCHAR, |
|
142 0, |
|
143 |
|
144 { 0, 0, 0, 0 }, /* dummy dataFormat */ |
|
145 { 0, 0, 0, 0 }, /* dummy formatVersion */ |
|
146 { 0, 0, 0, 0 } /* dummy dataVersion */ |
|
147 }; |
|
148 |
|
149 #else |
|
150 |
|
151 static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; |
|
152 |
|
153 static uint16_t* mappingData= NULL; |
|
154 static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ |
|
155 static int16_t currentIndex = 0; /* the current index into the data trie */ |
|
156 static int32_t maxLength = 0; /* maximum length of mapping string */ |
|
157 |
|
158 |
|
159 /* UDataInfo cf. udata.h */ |
|
160 static UDataInfo dataInfo={ |
|
161 sizeof(UDataInfo), |
|
162 0, |
|
163 |
|
164 U_IS_BIG_ENDIAN, |
|
165 U_CHARSET_FAMILY, |
|
166 U_SIZEOF_UCHAR, |
|
167 0, |
|
168 |
|
169 { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ |
|
170 { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ |
|
171 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ |
|
172 }; |
|
173 void |
|
174 setUnicodeVersion(const char *v) { |
|
175 UVersionInfo version; |
|
176 u_versionFromString(version, v); |
|
177 uprv_memcpy(dataInfo.dataVersion, version, 4); |
|
178 } |
|
179 |
|
180 void |
|
181 setUnicodeVersionNC(UVersionInfo version){ |
|
182 uint32_t univer = version[0] << 24; |
|
183 univer += version[1] << 16; |
|
184 univer += version[2] << 8; |
|
185 univer += version[3]; |
|
186 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; |
|
187 } |
|
188 static UNewTrie *sprepTrie; |
|
189 |
|
/* Maximum data length passed to utrie_open() in init(). */
#define MAX_DATA_LENGTH 11500

/*
 * Inclusive range of deltas that storeMapping() stores directly in a trie word
 * (the delta occupies 14 bits of the word). Values are parenthesized so the
 * macros expand safely inside any expression.
 */
#define SPREP_DELTA_RANGE_POSITIVE_LIMIT (8191)
#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT (-8192)
|
195 |
|
196 |
|
197 extern void |
|
198 init() { |
|
199 |
|
200 sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie)); |
|
201 |
|
202 /* initialize the two tries */ |
|
203 if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { |
|
204 fprintf(stderr, "error: failed to initialize tries\n"); |
|
205 exit(U_MEMORY_ALLOCATION_ERROR); |
|
206 } |
|
207 } |
|
208 |
|
209 static UHashtable* hashTable = NULL; |
|
210 |
|
211 |
|
212 typedef struct ValueStruct { |
|
213 UChar* mapping; |
|
214 int16_t length; |
|
215 UStringPrepType type; |
|
216 } ValueStruct; |
|
217 |
|
218 /* Callback for deleting the value from the hashtable */ |
|
219 static void U_CALLCONV valueDeleter(void* obj){ |
|
220 ValueStruct* value = (ValueStruct*) obj; |
|
221 uprv_free(value->mapping); |
|
222 uprv_free(value); |
|
223 } |
|
224 |
|
225 /* Callback for hashing the entry */ |
|
226 static int32_t U_CALLCONV hashEntry(const UHashTok parm) { |
|
227 return parm.integer; |
|
228 } |
|
229 |
|
230 /* Callback for comparing two entries */ |
|
231 static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { |
|
232 return (UBool)(p1.integer != p2.integer); |
|
233 } |
|
234 |
|
235 |
|
236 static void |
|
237 storeMappingData(){ |
|
238 |
|
239 int32_t pos = -1; |
|
240 const UHashElement* element = NULL; |
|
241 ValueStruct* value = NULL; |
|
242 int32_t codepoint = 0; |
|
243 int32_t elementCount = 0; |
|
244 int32_t writtenElementCount = 0; |
|
245 int32_t mappingLength = 1; /* minimum mapping length */ |
|
246 int32_t oldMappingLength = 0; |
|
247 uint16_t trieWord =0; |
|
248 int32_t limitIndex = 0; |
|
249 |
|
250 if (hashTable == NULL) { |
|
251 return; |
|
252 } |
|
253 elementCount = uhash_count(hashTable); |
|
254 |
|
255 /*initialize the mapping data */ |
|
256 mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR); |
|
257 |
|
258 while(writtenElementCount < elementCount){ |
|
259 |
|
260 while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ |
|
261 |
|
262 codepoint = element->key.integer; |
|
263 value = (ValueStruct*)element->value.pointer; |
|
264 |
|
265 /* store the start of indexes */ |
|
266 if(oldMappingLength != mappingLength){ |
|
267 /* Assume that index[] is used according to the enums defined */ |
|
268 if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ |
|
269 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; |
|
270 } |
|
271 if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && |
|
272 mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ |
|
273 |
|
274 limitIndex = currentIndex; |
|
275 |
|
276 } |
|
277 oldMappingLength = mappingLength; |
|
278 } |
|
279 |
|
280 if(value->length == mappingLength){ |
|
281 uint32_t savedTrieWord = 0; |
|
282 trieWord = currentIndex << 2; |
|
283 /* turn on the 2nd bit to signal that the following bits contain an index */ |
|
284 trieWord += 0x02; |
|
285 |
|
286 if(trieWord > _SPREP_TYPE_THRESHOLD){ |
|
287 fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); |
|
288 exit(U_ILLEGAL_CHAR_FOUND); |
|
289 } |
|
290 /* figure out if the code point has type already stored */ |
|
291 savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); |
|
292 if(savedTrieWord!=0){ |
|
293 if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ |
|
294 /* turn on the first bit in trie word */ |
|
295 trieWord += 0x01; |
|
296 }else{ |
|
297 /* |
|
298 * the codepoint has value something other than prohibited |
|
299 * and a mapping .. error! |
|
300 */ |
|
301 fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); |
|
302 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
303 } |
|
304 } |
|
305 |
|
306 /* now set the value in the trie */ |
|
307 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
|
308 fprintf(stderr,"Could not set the value for code point.\n"); |
|
309 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
310 } |
|
311 |
|
312 /* written the trie word for the codepoint... increment the count*/ |
|
313 writtenElementCount++; |
|
314 |
|
315 /* sanity check are we exceeding the max number allowed */ |
|
316 if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ |
|
317 fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", |
|
318 currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); |
|
319 exit(U_INDEX_OUTOFBOUNDS_ERROR); |
|
320 } |
|
321 |
|
322 /* copy the mapping data */ |
|
323 /* write the length */ |
|
324 if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ |
|
325 /* the cast here is safe since we donot expect the length to be > 65535 */ |
|
326 mappingData[currentIndex++] = (uint16_t) mappingLength; |
|
327 } |
|
328 /* copy the contents to mappindData array */ |
|
329 uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); |
|
330 currentIndex += value->length; |
|
331 if (currentIndex > mappingDataCapacity) { |
|
332 /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */ |
|
333 fprintf(stderr, "gensprep, fatal error at %s, %d. Aborting.\n", __FILE__, __LINE__); |
|
334 exit(U_INTERNAL_PROGRAM_ERROR); |
|
335 } |
|
336 } |
|
337 } |
|
338 mappingLength++; |
|
339 pos = -1; |
|
340 } |
|
341 /* set the last length for range check */ |
|
342 if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ |
|
343 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; |
|
344 }else{ |
|
345 indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; |
|
346 } |
|
347 |
|
348 } |
|
349 |
|
350 extern void setOptions(int32_t options){ |
|
351 indexes[_SPREP_OPTIONS] = options; |
|
352 } |
|
353 extern void |
|
354 storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, |
|
355 UStringPrepType type, UErrorCode* status){ |
|
356 |
|
357 |
|
358 UChar* map = NULL; |
|
359 int16_t adjustedLen=0, i, j; |
|
360 uint16_t trieWord = 0; |
|
361 ValueStruct *value = NULL; |
|
362 uint32_t savedTrieWord = 0; |
|
363 |
|
364 /* initialize the hashtable */ |
|
365 if(hashTable==NULL){ |
|
366 hashTable = uhash_open(hashEntry, compareEntries, NULL, status); |
|
367 uhash_setValueDeleter(hashTable, valueDeleter); |
|
368 } |
|
369 |
|
370 /* figure out if the code point has type already stored */ |
|
371 savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); |
|
372 if(savedTrieWord!=0){ |
|
373 if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ |
|
374 /* turn on the first bit in trie word */ |
|
375 trieWord += 0x01; |
|
376 }else{ |
|
377 /* |
|
378 * the codepoint has value something other than prohibited |
|
379 * and a mapping .. error! |
|
380 */ |
|
381 fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); |
|
382 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
383 } |
|
384 } |
|
385 |
|
386 /* figure out the real length */ |
|
387 for(i=0; i<length; i++){ |
|
388 adjustedLen += U16_LENGTH(mapping[i]); |
|
389 } |
|
390 |
|
391 if(adjustedLen == 0){ |
|
392 trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); |
|
393 /* make sure that the value of trieWord is less than the threshold */ |
|
394 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
|
395 /* now set the value in the trie */ |
|
396 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
|
397 fprintf(stderr,"Could not set the value for code point.\n"); |
|
398 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
399 } |
|
400 /* value is set so just return */ |
|
401 return; |
|
402 }else{ |
|
403 fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); |
|
404 exit(U_ILLEGAL_CHAR_FOUND); |
|
405 } |
|
406 } |
|
407 |
|
408 if(adjustedLen == 1){ |
|
409 /* calculate the delta */ |
|
410 int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); |
|
411 if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ |
|
412 |
|
413 trieWord = delta << 2; |
|
414 |
|
415 |
|
416 /* make sure that the second bit is OFF */ |
|
417 if((trieWord & 0x02) != 0 ){ |
|
418 fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); |
|
419 exit(U_INTERNAL_PROGRAM_ERROR); |
|
420 } |
|
421 /* make sure that the value of trieWord is less than the threshold */ |
|
422 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
|
423 /* now set the value in the trie */ |
|
424 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
|
425 fprintf(stderr,"Could not set the value for code point.\n"); |
|
426 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
427 } |
|
428 /* value is set so just return */ |
|
429 return; |
|
430 } |
|
431 } |
|
432 /* |
|
433 * if the delta is not in the given range or if the trieWord is larger than the threshold |
|
434 * just fall through for storing the mapping in the mapping table |
|
435 */ |
|
436 } |
|
437 |
|
438 map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); |
|
439 |
|
440 for (i=0, j=0; i<length; i++) { |
|
441 U16_APPEND_UNSAFE(map, j, mapping[i]); |
|
442 } |
|
443 |
|
444 value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); |
|
445 value->mapping = map; |
|
446 value->type = type; |
|
447 value->length = adjustedLen; |
|
448 if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ |
|
449 mappingDataCapacity++; |
|
450 } |
|
451 if(maxLength < value->length){ |
|
452 maxLength = value->length; |
|
453 } |
|
454 uhash_iput(hashTable,codepoint,value,status); |
|
455 mappingDataCapacity += adjustedLen; |
|
456 |
|
457 if(U_FAILURE(*status)){ |
|
458 fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status)); |
|
459 exit(*status); |
|
460 } |
|
461 } |
|
462 |
|
463 |
|
464 extern void |
|
465 storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){ |
|
466 uint16_t trieWord = 0; |
|
467 |
|
468 if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ |
|
469 fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); |
|
470 exit(U_ILLEGAL_CHAR_FOUND); |
|
471 } |
|
472 trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ |
|
473 if(start == end){ |
|
474 uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); |
|
475 if(savedTrieWord>0){ |
|
476 if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ |
|
477 /* |
|
478 * A mapping is stored in the trie word |
|
479 * and the only other possible type that a |
|
480 * code point can have is USPREP_PROHIBITED |
|
481 * |
|
482 */ |
|
483 |
|
484 /* turn on the 0th bit in the savedTrieWord */ |
|
485 savedTrieWord += 0x01; |
|
486 |
|
487 /* the downcast is safe since we only save 16 bit values */ |
|
488 trieWord = (uint16_t)savedTrieWord; |
|
489 |
|
490 /* make sure that the value of trieWord is less than the threshold */ |
|
491 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
|
492 /* now set the value in the trie */ |
|
493 if(!utrie_set32(sprepTrie,start,trieWord)){ |
|
494 fprintf(stderr,"Could not set the value for code point.\n"); |
|
495 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
496 } |
|
497 /* value is set so just return */ |
|
498 return; |
|
499 }else{ |
|
500 fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); |
|
501 exit(U_ILLEGAL_CHAR_FOUND); |
|
502 } |
|
503 |
|
504 }else if(savedTrieWord != trieWord){ |
|
505 fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); |
|
506 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
507 } |
|
508 /* if savedTrieWord == trieWord .. fall through and set the value */ |
|
509 } |
|
510 if(!utrie_set32(sprepTrie,start,trieWord)){ |
|
511 fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); |
|
512 exit(U_ILLEGAL_ARGUMENT_ERROR); |
|
513 } |
|
514 }else{ |
|
515 if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ |
|
516 fprintf(stderr,"Value for certain codepoint already set.\n"); |
|
517 exit(U_ILLEGAL_CHAR_FOUND); |
|
518 } |
|
519 } |
|
520 |
|
521 } |
|
522 |
|
523 /* folding value: just store the offset (16 bits) if there is any non-0 entry */ |
|
524 static uint32_t U_CALLCONV |
|
525 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { |
|
526 uint32_t value; |
|
527 UChar32 limit=0; |
|
528 UBool inBlockZero; |
|
529 |
|
530 limit=start+0x400; |
|
531 while(start<limit) { |
|
532 value=utrie_get32(trie, start, &inBlockZero); |
|
533 if(inBlockZero) { |
|
534 start+=UTRIE_DATA_BLOCK_LENGTH; |
|
535 } else if(value!=0) { |
|
536 return (uint32_t)offset; |
|
537 } else { |
|
538 ++start; |
|
539 } |
|
540 } |
|
541 return 0; |
|
542 |
|
543 } |
|
544 |
|
545 #endif /* #if !UCONFIG_NO_IDNA */ |
|
546 |
|
547 extern void |
|
548 generateData(const char *dataDir, const char* bundleName) { |
|
549 static uint8_t sprepTrieBlock[100000]; |
|
550 |
|
551 UNewDataMemory *pData; |
|
552 UErrorCode errorCode=U_ZERO_ERROR; |
|
553 int32_t size, dataLength; |
|
554 char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); |
|
555 |
|
556 #if UCONFIG_NO_IDNA |
|
557 |
|
558 size=0; |
|
559 |
|
560 #else |
|
561 |
|
562 int32_t sprepTrieSize; |
|
563 |
|
564 /* sort and add mapping data */ |
|
565 storeMappingData(); |
|
566 |
|
567 sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode); |
|
568 if(U_FAILURE(errorCode)) { |
|
569 fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); |
|
570 exit(errorCode); |
|
571 } |
|
572 |
|
573 size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); |
|
574 if(beVerbose) { |
|
575 printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); |
|
576 printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); |
|
577 printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); |
|
578 printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); |
|
579 printf("Maximum length of the mapping string is : %i \n", (int)maxLength); |
|
580 } |
|
581 |
|
582 #endif |
|
583 |
|
584 fileName[0]=0; |
|
585 uprv_strcat(fileName,bundleName); |
|
586 /* write the data */ |
|
587 pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, |
|
588 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); |
|
589 if(U_FAILURE(errorCode)) { |
|
590 fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); |
|
591 exit(errorCode); |
|
592 } |
|
593 |
|
594 #if !UCONFIG_NO_IDNA |
|
595 |
|
596 indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; |
|
597 indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; |
|
598 |
|
599 udata_writeBlock(pData, indexes, sizeof(indexes)); |
|
600 udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); |
|
601 udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); |
|
602 |
|
603 |
|
604 #endif |
|
605 |
|
606 /* finish up */ |
|
607 dataLength=udata_finish(pData, &errorCode); |
|
608 if(U_FAILURE(errorCode)) { |
|
609 fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); |
|
610 exit(errorCode); |
|
611 } |
|
612 |
|
613 if(dataLength!=size) { |
|
614 fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", |
|
615 (long)dataLength, (long)size); |
|
616 exit(U_INTERNAL_PROGRAM_ERROR); |
|
617 } |
|
618 |
|
619 #if !UCONFIG_NO_IDNA |
|
620 /* done with writing the data .. close the hashtable */ |
|
621 if (hashTable != NULL) { |
|
622 uhash_close(hashTable); |
|
623 } |
|
624 #endif |
|
625 |
|
626 uprv_free(fileName); |
|
627 } |
|
628 |
|
629 #if !UCONFIG_NO_IDNA |
|
630 |
|
631 extern void |
|
632 cleanUpData(void) { |
|
633 uprv_free(mappingData); |
|
634 utrie_close(sprepTrie); |
|
635 uprv_free(sprepTrie); |
|
636 } |
|
637 |
|
638 #endif /* #if !UCONFIG_NO_IDNA */ |
|
639 |
|
640 /* |
|
641 * Hey, Emacs, please set the following: |
|
642 * |
|
643 * Local Variables: |
|
644 * indent-tabs-mode: nil |
|
645 * End: |
|
646 * |
|
647 */ |