|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2003-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: gencnvex.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2003oct12 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #include <stdio.h> |
|
18 #include "unicode/utypes.h" |
|
19 #include "unicode/ustring.h" |
|
20 #include "cstring.h" |
|
21 #include "cmemory.h" |
|
22 #include "ucnv_cnv.h" |
|
23 #include "ucnvmbcs.h" |
|
24 #include "toolutil.h" |
|
25 #include "unewdata.h" |
|
26 #include "ucm.h" |
|
27 #include "makeconv.h" |
|
28 #include "genmbcs.h" |
|
29 |
|
30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
|
31 |
|
32 |
|
33 static void |
|
34 CnvExtClose(NewConverter *cnvData); |
|
35 |
|
36 static UBool |
|
37 CnvExtIsValid(NewConverter *cnvData, |
|
38 const uint8_t *bytes, int32_t length); |
|
39 |
|
40 static UBool |
|
41 CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); |
|
42 |
|
43 static uint32_t |
|
44 CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, |
|
45 UNewDataMemory *pData, int32_t tableType); |
|
46 |
|
47 typedef struct CnvExtData { |
|
48 NewConverter newConverter; |
|
49 |
|
50 UCMFile *ucm; |
|
51 |
|
52 /* toUnicode (state table in ucm->states) */ |
|
53 UToolMemory *toUTable, *toUUChars; |
|
54 |
|
55 /* fromUnicode */ |
|
56 UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes; |
|
57 |
|
58 uint16_t stage1[MBCS_STAGE_1_SIZE]; |
|
59 uint16_t stage2[MBCS_STAGE_2_SIZE]; |
|
60 uint16_t stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */ |
|
61 uint32_t stage3b[0x10000]; |
|
62 |
|
63 int32_t stage1Top, stage2Top, stage3Top, stage3bTop; |
|
64 |
|
65 /* for stage3 compaction of <subchar1> |2 mappings */ |
|
66 uint16_t stage3Sub1Block; |
|
67 |
|
68 /* statistics */ |
|
69 int32_t |
|
70 maxInBytes, maxOutBytes, maxBytesPerUChar, |
|
71 maxInUChars, maxOutUChars, maxUCharsPerByte; |
|
72 } CnvExtData; |
|
73 |
|
74 NewConverter * |
|
75 CnvExtOpen(UCMFile *ucm) { |
|
76 CnvExtData *extData; |
|
77 |
|
78 extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData)); |
|
79 if(extData==NULL) { |
|
80 printf("out of memory\n"); |
|
81 exit(U_MEMORY_ALLOCATION_ERROR); |
|
82 } |
|
83 uprv_memset(extData, 0, sizeof(CnvExtData)); |
|
84 |
|
85 extData->ucm=ucm; /* aliased, not owned */ |
|
86 |
|
87 extData->newConverter.close=CnvExtClose; |
|
88 extData->newConverter.isValid=CnvExtIsValid; |
|
89 extData->newConverter.addTable=CnvExtAddTable; |
|
90 extData->newConverter.write=CnvExtWrite; |
|
91 return &extData->newConverter; |
|
92 } |
|
93 |
|
94 static void |
|
95 CnvExtClose(NewConverter *cnvData) { |
|
96 CnvExtData *extData=(CnvExtData *)cnvData; |
|
97 if(extData!=NULL) { |
|
98 utm_close(extData->toUTable); |
|
99 utm_close(extData->toUUChars); |
|
100 utm_close(extData->fromUTableUChars); |
|
101 utm_close(extData->fromUTableValues); |
|
102 utm_close(extData->fromUBytes); |
|
103 uprv_free(extData); |
|
104 } |
|
105 } |
|
106 |
|
107 /* we do not expect this to be called */ |
|
108 static UBool |
|
109 CnvExtIsValid(NewConverter *cnvData, |
|
110 const uint8_t *bytes, int32_t length) { |
|
111 return FALSE; |
|
112 } |
|
113 |
|
114 static uint32_t |
|
115 CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, |
|
116 UNewDataMemory *pData, int32_t tableType) { |
|
117 CnvExtData *extData=(CnvExtData *)cnvData; |
|
118 int32_t length, top, headerSize; |
|
119 |
|
120 int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 }; |
|
121 |
|
122 if(tableType&TABLE_BASE) { |
|
123 headerSize=0; |
|
124 } else { |
|
125 _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 }; |
|
126 |
|
127 /* write the header and base table name for an extension-only table */ |
|
128 length=(int32_t)uprv_strlen(extData->ucm->baseName)+1; |
|
129 while(length&3) { |
|
130 /* add padding */ |
|
131 extData->ucm->baseName[length++]=0; |
|
132 } |
|
133 |
|
134 headerSize=MBCS_HEADER_V4_LENGTH*4+length; |
|
135 |
|
136 /* fill the header */ |
|
137 header.version[0]=4; |
|
138 header.version[1]=2; |
|
139 header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); |
|
140 |
|
141 /* write the header and the base table name */ |
|
142 udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4); |
|
143 udata_writeBlock(pData, extData->ucm->baseName, length); |
|
144 } |
|
145 |
|
146 /* fill indexes[] - offsets/indexes are in units of the target array */ |
|
147 top=0; |
|
148 |
|
149 indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH; |
|
150 top+=length*4; |
|
151 |
|
152 indexes[UCNV_EXT_TO_U_INDEX]=top; |
|
153 indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable); |
|
154 top+=length*4; |
|
155 |
|
156 indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top; |
|
157 indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars); |
|
158 top+=length*2; |
|
159 |
|
160 indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top; |
|
161 length=utm_countItems(extData->fromUTableUChars); |
|
162 top+=length*2; |
|
163 |
|
164 if(top&3) { |
|
165 /* add padding */ |
|
166 *((UChar *)utm_alloc(extData->fromUTableUChars))=0; |
|
167 *((uint32_t *)utm_alloc(extData->fromUTableValues))=0; |
|
168 ++length; |
|
169 top+=2; |
|
170 } |
|
171 indexes[UCNV_EXT_FROM_U_LENGTH]=length; |
|
172 |
|
173 indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top; |
|
174 top+=length*4; |
|
175 |
|
176 indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top; |
|
177 length=utm_countItems(extData->fromUBytes); |
|
178 top+=length; |
|
179 |
|
180 if(top&1) { |
|
181 /* add padding */ |
|
182 *((uint8_t *)utm_alloc(extData->fromUBytes))=0; |
|
183 ++length; |
|
184 ++top; |
|
185 } |
|
186 indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length; |
|
187 |
|
188 indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top; |
|
189 indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top; |
|
190 indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top; |
|
191 top+=length*2; |
|
192 |
|
193 indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top; |
|
194 length=extData->stage3Top; |
|
195 top+=length*2; |
|
196 |
|
197 if(top&3) { |
|
198 /* add padding */ |
|
199 extData->stage3[extData->stage3Top++]=0; |
|
200 ++length; |
|
201 top+=2; |
|
202 } |
|
203 indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length; |
|
204 |
|
205 indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top; |
|
206 indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop; |
|
207 top+=length*4; |
|
208 |
|
209 indexes[UCNV_EXT_SIZE]=top; |
|
210 |
|
211 /* statistics */ |
|
212 indexes[UCNV_EXT_COUNT_BYTES]= |
|
213 (extData->maxInBytes<<16)| |
|
214 (extData->maxOutBytes<<8)| |
|
215 extData->maxBytesPerUChar; |
|
216 indexes[UCNV_EXT_COUNT_UCHARS]= |
|
217 (extData->maxInUChars<<16)| |
|
218 (extData->maxOutUChars<<8)| |
|
219 extData->maxUCharsPerByte; |
|
220 |
|
221 indexes[UCNV_EXT_FLAGS]=extData->ucm->ext->unicodeMask; |
|
222 |
|
223 /* write the extension data */ |
|
224 udata_writeBlock(pData, indexes, sizeof(indexes)); |
|
225 udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4); |
|
226 udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2); |
|
227 |
|
228 udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2); |
|
229 udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4); |
|
230 udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]); |
|
231 |
|
232 udata_writeBlock(pData, extData->stage1, extData->stage1Top*2); |
|
233 udata_writeBlock(pData, extData->stage2, extData->stage2Top*2); |
|
234 udata_writeBlock(pData, extData->stage3, extData->stage3Top*2); |
|
235 udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4); |
|
236 |
|
237 #if 0 |
|
238 { |
|
239 int32_t i, j; |
|
240 |
|
241 length=extData->stage1Top; |
|
242 printf("\nstage1[%x]:\n", length); |
|
243 |
|
244 for(i=0; i<length; ++i) { |
|
245 if(extData->stage1[i]!=length) { |
|
246 printf("stage1[%04x]=%04x\n", i, extData->stage1[i]); |
|
247 } |
|
248 } |
|
249 |
|
250 j=length; |
|
251 length=extData->stage2Top; |
|
252 printf("\nstage2[%x]:\n", length); |
|
253 |
|
254 for(i=0; i<length; ++j, ++i) { |
|
255 if(extData->stage2[i]!=0) { |
|
256 printf("stage12[%04x]=%04x\n", j, extData->stage2[i]); |
|
257 } |
|
258 } |
|
259 |
|
260 length=extData->stage3Top; |
|
261 printf("\nstage3[%x]:\n", length); |
|
262 |
|
263 for(i=0; i<length; ++i) { |
|
264 if(extData->stage3[i]!=0) { |
|
265 printf("stage3[%04x]=%04x\n", i, extData->stage3[i]); |
|
266 } |
|
267 } |
|
268 |
|
269 length=extData->stage3bTop; |
|
270 printf("\nstage3b[%x]:\n", length); |
|
271 |
|
272 for(i=0; i<length; ++i) { |
|
273 if(extData->stage3b[i]!=0) { |
|
274 printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]); |
|
275 } |
|
276 } |
|
277 } |
|
278 #endif |
|
279 |
|
280 if(VERBOSE) { |
|
281 printf("size of extension data: %ld\n", (long)top); |
|
282 } |
|
283 |
|
284 /* return the number of bytes that should have been written */ |
|
285 return (uint32_t)(headerSize+top); |
|
286 } |
|
287 |
|
288 /* to Unicode --------------------------------------------------------------- */ |
|
289 |
|
290 /* |
|
291 * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for |
|
292 * the toUnicode table. |
|
293 * This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable |
|
294 * for the base toUnicode table but not for the base fromUnicode table. |
|
295 * The table must be sorted. |
|
296 * Modifies previous data in the reverseMap. |
|
297 */ |
|
298 static int32_t |
|
299 reduceToUMappings(UCMTable *table) { |
|
300 UCMapping *mappings; |
|
301 int32_t *map; |
|
302 int32_t i, j, count; |
|
303 int8_t flag; |
|
304 |
|
305 mappings=table->mappings; |
|
306 map=table->reverseMap; |
|
307 count=table->mappingsLength; |
|
308 |
|
309 /* leave the map alone for the initial mappings with desired flags */ |
|
310 for(i=j=0; i<count; ++i) { |
|
311 flag=mappings[map[i]].f; |
|
312 if(flag!=0 && flag!=3) { |
|
313 break; |
|
314 } |
|
315 } |
|
316 |
|
317 /* reduce from here to the rest */ |
|
318 for(j=i; i<count; ++i) { |
|
319 flag=mappings[map[i]].f; |
|
320 if(flag==0 || flag==3) { |
|
321 map[j++]=map[i]; |
|
322 } |
|
323 } |
|
324 |
|
325 return j; |
|
326 } |
|
327 |
|
328 static uint32_t |
|
329 getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { |
|
330 UChar32 *u32; |
|
331 UChar *u; |
|
332 uint32_t value; |
|
333 int32_t u16Length, ratio; |
|
334 UErrorCode errorCode; |
|
335 |
|
336 /* write the Unicode result code point or string index */ |
|
337 if(m->uLen==1) { |
|
338 u16Length=U16_LENGTH(m->u); |
|
339 value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u); |
|
340 } else { |
|
341 /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */ |
|
342 |
|
343 /* get the result code point string and its 16-bit string length */ |
|
344 u32=UCM_GET_CODE_POINTS(table, m); |
|
345 errorCode=U_ZERO_ERROR; |
|
346 u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode); |
|
347 if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { |
|
348 exit(errorCode); |
|
349 } |
|
350 |
|
351 /* allocate it and put its length and index into the value */ |
|
352 value= |
|
353 (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)| |
|
354 ((uint32_t)utm_countItems(extData->toUUChars)); |
|
355 u=utm_allocN(extData->toUUChars, u16Length); |
|
356 |
|
357 /* write the result 16-bit string */ |
|
358 errorCode=U_ZERO_ERROR; |
|
359 u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode); |
|
360 if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { |
|
361 exit(errorCode); |
|
362 } |
|
363 } |
|
364 if(m->f==0) { |
|
365 value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG; |
|
366 } |
|
367 |
|
368 /* update statistics */ |
|
369 if(m->bLen>extData->maxInBytes) { |
|
370 extData->maxInBytes=m->bLen; |
|
371 } |
|
372 if(u16Length>extData->maxOutUChars) { |
|
373 extData->maxOutUChars=u16Length; |
|
374 } |
|
375 |
|
376 ratio=(u16Length+(m->bLen-1))/m->bLen; |
|
377 if(ratio>extData->maxUCharsPerByte) { |
|
378 extData->maxUCharsPerByte=ratio; |
|
379 } |
|
380 |
|
381 return value; |
|
382 } |
|
383 |
|
384 /* |
|
385 * Recursive toUTable generator core function. |
|
386 * Preconditions: |
|
387 * - start<limit (There is at least one mapping.) |
|
388 * - The mappings are sorted lexically. (Access is through the reverseMap.) |
|
389 * - All mappings between start and limit have input sequences that share |
|
390 * the same prefix of unitIndex length, and therefore all of these sequences |
|
391 * are at least unitIndex+1 long. |
|
392 * - There are only relevant mappings available through the reverseMap, |
|
393 * see reduceToUMappings(). |
|
394 * |
|
395 * One function invocation generates one section table. |
|
396 * |
|
397 * Steps: |
|
398 * 1. Count the number of unique unit values and get the low/high unit values |
|
399 * that occur at unitIndex. |
|
400 * 2. Allocate the section table with possible optimization for linear access. |
|
401 * 3. Write temporary version of the section table with start indexes of |
|
402 * subsections, each corresponding to one unit value at unitIndex. |
|
403 * 4. Iterate through the table once more, and depending on the subsection length: |
|
404 * 0: write 0 as a result value (unused byte in linear-access section table) |
|
405 * >0: if there is one mapping with an input unit sequence of unitIndex+1 |
|
406 * then defaultValue=compute the mapping result for this whole sequence |
|
407 * else defaultValue=0 |
|
408 * |
|
409 * recurse into the subsection |
|
410 */ |
|
411 static UBool |
|
412 generateToUTable(CnvExtData *extData, UCMTable *table, |
|
413 int32_t start, int32_t limit, int32_t unitIndex, |
|
414 uint32_t defaultValue) { |
|
415 UCMapping *mappings, *m; |
|
416 int32_t *map; |
|
417 int32_t i, j, uniqueCount, count, subStart, subLimit; |
|
418 |
|
419 uint8_t *bytes; |
|
420 int32_t low, high, prev; |
|
421 |
|
422 uint32_t *section; |
|
423 |
|
424 mappings=table->mappings; |
|
425 map=table->reverseMap; |
|
426 |
|
427 /* step 1: examine the input units; set low, high, uniqueCount */ |
|
428 m=mappings+map[start]; |
|
429 bytes=UCM_GET_BYTES(table, m); |
|
430 low=bytes[unitIndex]; |
|
431 uniqueCount=1; |
|
432 |
|
433 prev=high=low; |
|
434 for(i=start+1; i<limit; ++i) { |
|
435 m=mappings+map[i]; |
|
436 bytes=UCM_GET_BYTES(table, m); |
|
437 high=bytes[unitIndex]; |
|
438 |
|
439 if(high!=prev) { |
|
440 prev=high; |
|
441 ++uniqueCount; |
|
442 } |
|
443 } |
|
444 |
|
445 /* step 2: allocate the section; set count, section */ |
|
446 count=(high-low)+1; |
|
447 if(count<0x100 && (unitIndex==0 || uniqueCount>=(3*count)/4)) { |
|
448 /* |
|
449 * for the root table and for fairly full tables: |
|
450 * allocate for direct, linear array access |
|
451 * by keeping count, to write an entry for each unit value |
|
452 * from low to high |
|
453 * exception: use a compact table if count==0x100 because |
|
454 * that cannot be encoded in the length byte |
|
455 */ |
|
456 } else { |
|
457 count=uniqueCount; |
|
458 } |
|
459 |
|
460 if(count>=0x100) { |
|
461 fprintf(stderr, "error: toUnicode extension table section overflow: %ld section entries\n", (long)count); |
|
462 return FALSE; |
|
463 } |
|
464 |
|
465 /* allocate the section: 1 entry for the header + count for the items */ |
|
466 section=(uint32_t *)utm_allocN(extData->toUTable, 1+count); |
|
467 |
|
468 /* write the section header */ |
|
469 *section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue; |
|
470 |
|
471 /* step 3: write temporary section table with subsection starts */ |
|
472 prev=low-1; /* just before low to prevent empty subsections before low */ |
|
473 j=0; /* section table index */ |
|
474 for(i=start; i<limit; ++i) { |
|
475 m=mappings+map[i]; |
|
476 bytes=UCM_GET_BYTES(table, m); |
|
477 high=bytes[unitIndex]; |
|
478 |
|
479 if(high!=prev) { |
|
480 /* start of a new subsection for unit high */ |
|
481 if(count>uniqueCount) { |
|
482 /* write empty subsections for unused units in a linear table */ |
|
483 while(++prev<high) { |
|
484 section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i; |
|
485 } |
|
486 } else { |
|
487 prev=high; |
|
488 } |
|
489 |
|
490 /* write the entry with the subsection start */ |
|
491 section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i; |
|
492 } |
|
493 } |
|
494 /* assert(j==count) */ |
|
495 |
|
496 /* step 4: recurse and write results */ |
|
497 subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]); |
|
498 for(j=0; j<count; ++j) { |
|
499 subStart=subLimit; |
|
500 subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit; |
|
501 |
|
502 /* remove the subStart temporary value */ |
|
503 section[j]&=~UCNV_EXT_TO_U_VALUE_MASK; |
|
504 |
|
505 if(subStart==subLimit) { |
|
506 /* leave the value zero: empty subsection for unused unit in a linear table */ |
|
507 continue; |
|
508 } |
|
509 |
|
510 /* see if there is exactly one input unit sequence of length unitIndex+1 */ |
|
511 defaultValue=0; |
|
512 m=mappings+map[subStart]; |
|
513 if(m->bLen==unitIndex+1) { |
|
514 /* do not include this in generateToUTable() */ |
|
515 ++subStart; |
|
516 |
|
517 if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) { |
|
518 /* print error for multiple same-input-sequence mappings */ |
|
519 fprintf(stderr, "error: multiple mappings from same bytes\n"); |
|
520 ucm_printMapping(table, m, stderr); |
|
521 ucm_printMapping(table, mappings+map[subStart], stderr); |
|
522 return FALSE; |
|
523 } |
|
524 |
|
525 defaultValue=getToUnicodeValue(extData, table, m); |
|
526 } |
|
527 |
|
528 if(subStart==subLimit) { |
|
529 /* write the result for the input sequence ending here */ |
|
530 section[j]|=defaultValue; |
|
531 } else { |
|
532 /* write the index to the subsection table */ |
|
533 section[j]|=(uint32_t)utm_countItems(extData->toUTable); |
|
534 |
|
535 /* recurse */ |
|
536 if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { |
|
537 return FALSE; |
|
538 } |
|
539 } |
|
540 } |
|
541 return TRUE; |
|
542 } |
|
543 |
|
544 /* |
|
545 * Generate the toUTable and toUUChars from the input table. |
|
546 * The input table must be sorted, and all precision flags must be 0..3. |
|
547 * This function will modify the table's reverseMap. |
|
548 */ |
|
549 static UBool |
|
550 makeToUTable(CnvExtData *extData, UCMTable *table) { |
|
551 int32_t toUCount; |
|
552 |
|
553 toUCount=reduceToUMappings(table); |
|
554 |
|
555 extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4); |
|
556 extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2); |
|
557 |
|
558 return generateToUTable(extData, table, 0, toUCount, 0, 0); |
|
559 } |
|
560 |
|
561 /* from Unicode ------------------------------------------------------------- */ |
|
562 |
|
563 /* |
|
564 * preprocessing: |
|
565 * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode |
|
566 * change each Unicode string to encode all but the first code point in 16-bit form |
|
567 * |
|
568 * generation: |
|
569 * for each unique code point |
|
570 * write an entry in the 3-stage trie |
|
571 * check that there is only one single-code point sequence |
|
572 * start recursion for following 16-bit input units |
|
573 */ |
|
574 |
|
575 /* |
|
576 * Remove toUnicode fallbacks and non-<subchar1> SUB mappings |
|
577 * which are irrelevant for the fromUnicode extension table. |
|
578 * Remove MBCS_FROM_U_EXT_FLAG bits. |
|
579 * Overwrite the reverseMap with an index array to the relevant mappings. |
|
580 * Modify the code point sequences to a generator-friendly format where |
|
581 * the first code points remains unchanged but the following are recoded |
|
582 * into 16-bit Unicode string form. |
|
583 * The table must be sorted. |
|
584 * Destroys previous data in the reverseMap. |
|
585 */ |
|
586 static int32_t |
|
587 prepareFromUMappings(UCMTable *table) { |
|
588 UCMapping *mappings, *m; |
|
589 int32_t *map; |
|
590 int32_t i, j, count; |
|
591 int8_t flag; |
|
592 |
|
593 mappings=table->mappings; |
|
594 map=table->reverseMap; |
|
595 count=table->mappingsLength; |
|
596 |
|
597 /* |
|
598 * we do not go through the map on input because the mappings are |
|
599 * sorted lexically |
|
600 */ |
|
601 m=mappings; |
|
602 |
|
603 for(i=j=0; i<count; ++m, ++i) { |
|
604 flag=m->f; |
|
605 if(flag>=0) { |
|
606 flag&=MBCS_FROM_U_EXT_MASK; |
|
607 m->f=flag; |
|
608 } |
|
609 if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) { |
|
610 map[j++]=i; |
|
611 |
|
612 if(m->uLen>1) { |
|
613 /* recode all but the first code point to 16-bit Unicode */ |
|
614 UChar32 *u32; |
|
615 UChar *u; |
|
616 UChar32 c; |
|
617 int32_t q, r; |
|
618 |
|
619 u32=UCM_GET_CODE_POINTS(table, m); |
|
620 u=(UChar *)u32; /* destructive in-place recoding */ |
|
621 for(r=2, q=1; q<m->uLen; ++q) { |
|
622 c=u32[q]; |
|
623 U16_APPEND_UNSAFE(u, r, c); |
|
624 } |
|
625 |
|
626 /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */ |
|
627 m->uLen=(int8_t)r; |
|
628 } |
|
629 } |
|
630 } |
|
631 |
|
632 return j; |
|
633 } |
|
634 |
|
635 static uint32_t |
|
636 getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { |
|
637 uint8_t *bytes, *resultBytes; |
|
638 uint32_t value; |
|
639 int32_t u16Length, ratio; |
|
640 |
|
641 if(m->f==2) { |
|
642 /* |
|
643 * no mapping, <subchar1> preferred |
|
644 * |
|
645 * no need to count in statistics because the subchars are already |
|
646 * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData, |
|
647 * and this non-mapping does not count for maxInUChars which are always |
|
648 * trivially at least two if counting unmappable supplementary code points |
|
649 */ |
|
650 return UCNV_EXT_FROM_U_SUBCHAR1; |
|
651 } |
|
652 |
|
653 bytes=UCM_GET_BYTES(table, m); |
|
654 value=0; |
|
655 switch(m->bLen) { |
|
656 /* 1..3: store the bytes in the value word */ |
|
657 case 3: |
|
658 value=((uint32_t)*bytes++)<<16; |
|
659 case 2: |
|
660 value|=((uint32_t)*bytes++)<<8; |
|
661 case 1: |
|
662 value|=*bytes; |
|
663 break; |
|
664 default: |
|
665 /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */ |
|
666 /* store the bytes in fromUBytes[] and the index in the value word */ |
|
667 value=(uint32_t)utm_countItems(extData->fromUBytes); |
|
668 resultBytes=utm_allocN(extData->fromUBytes, m->bLen); |
|
669 uprv_memcpy(resultBytes, bytes, m->bLen); |
|
670 break; |
|
671 } |
|
672 value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT; |
|
673 if(m->f==0) { |
|
674 value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; |
|
675 } else if(m->f==4) { |
|
676 value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG; |
|
677 } |
|
678 |
|
679 /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */ |
|
680 if(m->uLen==1) { |
|
681 u16Length=U16_LENGTH(m->u); |
|
682 } else { |
|
683 u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2); |
|
684 } |
|
685 |
|
686 /* update statistics */ |
|
687 if(u16Length>extData->maxInUChars) { |
|
688 extData->maxInUChars=u16Length; |
|
689 } |
|
690 if(m->bLen>extData->maxOutBytes) { |
|
691 extData->maxOutBytes=m->bLen; |
|
692 } |
|
693 |
|
694 ratio=(m->bLen+(u16Length-1))/u16Length; |
|
695 if(ratio>extData->maxBytesPerUChar) { |
|
696 extData->maxBytesPerUChar=ratio; |
|
697 } |
|
698 |
|
699 return value; |
|
700 } |
|
701 |
|
702 /* |
|
703 * works like generateToUTable(), except that the |
|
704 * output section consists of two arrays, one for input UChars and one |
|
705 * for result values |
|
706 * |
|
707 * also, fromUTable sections are always stored in a compact form for |
|
708 * access via binary search |
|
709 */ |
|
710 static UBool |
|
711 generateFromUTable(CnvExtData *extData, UCMTable *table, |
|
712 int32_t start, int32_t limit, int32_t unitIndex, |
|
713 uint32_t defaultValue) { |
|
714 UCMapping *mappings, *m; |
|
715 int32_t *map; |
|
716 int32_t i, j, uniqueCount, count, subStart, subLimit; |
|
717 |
|
718 UChar *uchars; |
|
719 UChar32 low, high, prev; |
|
720 |
|
721 UChar *sectionUChars; |
|
722 uint32_t *sectionValues; |
|
723 |
|
724 mappings=table->mappings; |
|
725 map=table->reverseMap; |
|
726 |
|
727 /* step 1: examine the input units; set low, high, uniqueCount */ |
|
728 m=mappings+map[start]; |
|
729 uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); |
|
730 low=uchars[unitIndex]; |
|
731 uniqueCount=1; |
|
732 |
|
733 prev=high=low; |
|
734 for(i=start+1; i<limit; ++i) { |
|
735 m=mappings+map[i]; |
|
736 uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); |
|
737 high=uchars[unitIndex]; |
|
738 |
|
739 if(high!=prev) { |
|
740 prev=high; |
|
741 ++uniqueCount; |
|
742 } |
|
743 } |
|
744 |
|
745 /* step 2: allocate the section; set count, section */ |
|
746 /* the fromUTable always stores for access via binary search */ |
|
747 count=uniqueCount; |
|
748 |
|
749 /* allocate the section: 1 entry for the header + count for the items */ |
|
750 sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count); |
|
751 sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count); |
|
752 |
|
753 /* write the section header */ |
|
754 *sectionUChars++=(UChar)count; |
|
755 *sectionValues++=defaultValue; |
|
756 |
|
757 /* step 3: write temporary section table with subsection starts */ |
|
758 prev=low-1; /* just before low to prevent empty subsections before low */ |
|
759 j=0; /* section table index */ |
|
760 for(i=start; i<limit; ++i) { |
|
761 m=mappings+map[i]; |
|
762 uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); |
|
763 high=uchars[unitIndex]; |
|
764 |
|
765 if(high!=prev) { |
|
766 /* start of a new subsection for unit high */ |
|
767 prev=high; |
|
768 |
|
769 /* write the entry with the subsection start */ |
|
770 sectionUChars[j]=(UChar)high; |
|
771 sectionValues[j]=(uint32_t)i; |
|
772 ++j; |
|
773 } |
|
774 } |
|
775 /* assert(j==count) */ |
|
776 |
|
777 /* step 4: recurse and write results */ |
|
778 subLimit=(int32_t)(sectionValues[0]); |
|
779 for(j=0; j<count; ++j) { |
|
780 subStart=subLimit; |
|
781 subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit; |
|
782 |
|
783 /* see if there is exactly one input unit sequence of length unitIndex+1 */ |
|
784 defaultValue=0; |
|
785 m=mappings+map[subStart]; |
|
786 if(m->uLen==unitIndex+1) { |
|
787 /* do not include this in generateToUTable() */ |
|
788 ++subStart; |
|
789 |
|
790 if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) { |
|
791 /* print error for multiple same-input-sequence mappings */ |
|
792 fprintf(stderr, "error: multiple mappings from same Unicode code points\n"); |
|
793 ucm_printMapping(table, m, stderr); |
|
794 ucm_printMapping(table, mappings+map[subStart], stderr); |
|
795 return FALSE; |
|
796 } |
|
797 |
|
798 defaultValue=getFromUBytesValue(extData, table, m); |
|
799 } |
|
800 |
|
801 if(subStart==subLimit) { |
|
802 /* write the result for the input sequence ending here */ |
|
803 sectionValues[j]=defaultValue; |
|
804 } else { |
|
805 /* write the index to the subsection table */ |
|
806 sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues); |
|
807 |
|
808 /* recurse */ |
|
809 if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { |
|
810 return FALSE; |
|
811 } |
|
812 } |
|
813 } |
|
814 return TRUE; |
|
815 } |
|
816 |
|
817 /* |
|
818 * add entries to the fromUnicode trie, |
|
819 * assume to be called with code points in ascending order |
|
820 * and use that to build the trie in precompacted form |
|
821 */ |
|
822 static void |
|
823 addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) { |
|
824 int32_t i1, i2, i3, i3b, nextOffset, min, newBlock; |
|
825 |
|
826 if(value==0) { |
|
827 return; |
|
828 } |
|
829 |
|
830 /* |
|
831 * compute the index for each stage, |
|
832 * allocate a stage block if necessary, |
|
833 * and write the stage value |
|
834 */ |
|
835 i1=c>>10; |
|
836 if(i1>=extData->stage1Top) { |
|
837 extData->stage1Top=i1+1; |
|
838 } |
|
839 |
|
840 nextOffset=(c>>4)&0x3f; |
|
841 |
|
842 if(extData->stage1[i1]==0) { |
|
843 /* allocate another block in stage 2; overlap with the previous block */ |
|
844 newBlock=extData->stage2Top; |
|
845 min=newBlock-nextOffset; /* minimum block start with overlap */ |
|
846 while(min<newBlock && extData->stage2[newBlock-1]==0) { |
|
847 --newBlock; |
|
848 } |
|
849 |
|
850 extData->stage1[i1]=(uint16_t)newBlock; |
|
851 extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE; |
|
852 if(extData->stage2Top>LENGTHOF(extData->stage2)) { |
|
853 fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", (int)c); |
|
854 exit(U_MEMORY_ALLOCATION_ERROR); |
|
855 } |
|
856 } |
|
857 |
|
858 i2=extData->stage1[i1]+nextOffset; |
|
859 nextOffset=c&0xf; |
|
860 |
|
861 if(extData->stage2[i2]==0) { |
|
862 /* allocate another block in stage 3; overlap with the previous block */ |
|
863 newBlock=extData->stage3Top; |
|
864 min=newBlock-nextOffset; /* minimum block start with overlap */ |
|
865 while(min<newBlock && extData->stage3[newBlock-1]==0) { |
|
866 --newBlock; |
|
867 } |
|
868 |
|
869 /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */ |
|
870 newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1); |
|
871 extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT); |
|
872 |
|
873 extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE; |
|
874 if(extData->stage3Top>LENGTHOF(extData->stage3)) { |
|
875 fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", (int)c); |
|
876 exit(U_MEMORY_ALLOCATION_ERROR); |
|
877 } |
|
878 } |
|
879 |
|
880 i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset; |
|
881 /* |
|
882 * assume extData->stage3[i3]==0 because we get |
|
883 * code points in strictly ascending order |
|
884 */ |
|
885 |
|
886 if(value==UCNV_EXT_FROM_U_SUBCHAR1) { |
|
887 /* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */ |
|
888 extData->stage3[i3]=1; |
|
889 |
|
890 /* |
|
891 * precompaction is not optimal for <subchar1> |2 mappings because |
|
892 * stage3 values for them are all the same, unlike for other mappings |
|
893 * which all have unique values; |
|
894 * use a simple compaction of reusing a whole block filled with these |
|
895 * mappings |
|
896 */ |
|
897 |
|
898 /* is the entire block filled with <subchar1> |2 mappings? */ |
|
899 if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) { |
|
900 for(min=i3-nextOffset; |
|
901 min<i3 && extData->stage3[min]==1; |
|
902 ++min) {} |
|
903 |
|
904 if(min==i3) { |
|
905 /* the entire block is filled with these mappings */ |
|
906 if(extData->stage3Sub1Block!=0) { |
|
907 /* point to the previous such block and remove this block from stage3 */ |
|
908 extData->stage2[i2]=extData->stage3Sub1Block; |
|
909 extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE; |
|
910 uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2); |
|
911 } else { |
|
912 /* remember this block's stage2 entry */ |
|
913 extData->stage3Sub1Block=extData->stage2[i2]; |
|
914 } |
|
915 } |
|
916 } |
|
917 } else { |
|
918 if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) { |
|
919 fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", (int)c); |
|
920 exit(U_MEMORY_ALLOCATION_ERROR); |
|
921 } |
|
922 |
|
923 /* roundtrip or fallback mapping */ |
|
924 extData->stage3[i3]=(uint16_t)i3b; |
|
925 extData->stage3b[i3b]=value; |
|
926 } |
|
927 } |
|
928 |
|
929 static UBool |
|
930 generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) { |
|
931 UCMapping *mappings, *m; |
|
932 int32_t *map; |
|
933 uint32_t value; |
|
934 int32_t subStart, subLimit; |
|
935 |
|
936 UChar32 *codePoints; |
|
937 UChar32 c, next; |
|
938 |
|
939 if(mapLength==0) { |
|
940 return TRUE; |
|
941 } |
|
942 |
|
943 mappings=table->mappings; |
|
944 map=table->reverseMap; |
|
945 |
|
946 /* |
|
947 * iterate over same-initial-code point mappings, |
|
948 * enter the initial code point into the trie, |
|
949 * and start a recursion on the corresponding mappings section |
|
950 * with generateFromUTable() |
|
951 */ |
|
952 m=mappings+map[0]; |
|
953 codePoints=UCM_GET_CODE_POINTS(table, m); |
|
954 next=codePoints[0]; |
|
955 subLimit=0; |
|
956 while(subLimit<mapLength) { |
|
957 /* get a new subsection of mappings starting with the same code point */ |
|
958 subStart=subLimit; |
|
959 c=next; |
|
960 while(next==c && ++subLimit<mapLength) { |
|
961 m=mappings+map[subLimit]; |
|
962 codePoints=UCM_GET_CODE_POINTS(table, m); |
|
963 next=codePoints[0]; |
|
964 } |
|
965 |
|
966 /* |
|
967 * compute the value for this code point; |
|
968 * if there is a mapping for this code point alone, it is at subStart |
|
969 * because the table is sorted lexically |
|
970 */ |
|
971 value=0; |
|
972 m=mappings+map[subStart]; |
|
973 codePoints=UCM_GET_CODE_POINTS(table, m); |
|
974 if(m->uLen==1) { |
|
975 /* do not include this in generateFromUTable() */ |
|
976 ++subStart; |
|
977 |
|
978 if(subStart<subLimit && mappings[map[subStart]].uLen==1) { |
|
979 /* print error for multiple same-input-sequence mappings */ |
|
980 fprintf(stderr, "error: multiple mappings from same Unicode code points\n"); |
|
981 ucm_printMapping(table, m, stderr); |
|
982 ucm_printMapping(table, mappings+map[subStart], stderr); |
|
983 return FALSE; |
|
984 } |
|
985 |
|
986 value=getFromUBytesValue(extData, table, m); |
|
987 } |
|
988 |
|
989 if(subStart==subLimit) { |
|
990 /* write the result for this one code point */ |
|
991 addFromUTrieEntry(extData, c, value); |
|
992 } else { |
|
993 /* write the index to the subsection table */ |
|
994 addFromUTrieEntry(extData, c, (uint32_t)utm_countItems(extData->fromUTableValues)); |
|
995 |
|
996 /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */ |
|
997 if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) { |
|
998 return FALSE; |
|
999 } |
|
1000 } |
|
1001 } |
|
1002 return TRUE; |
|
1003 } |
|
1004 |
|
1005 /* |
|
1006 * Generate the fromU data structures from the input table. |
|
1007 * The input table must be sorted, and all precision flags must be 0..3. |
|
1008 * This function will modify the table's reverseMap. |
|
1009 */ |
|
1010 static UBool |
|
1011 makeFromUTable(CnvExtData *extData, UCMTable *table) { |
|
1012 uint16_t *stage1; |
|
1013 int32_t i, stage1Top, fromUCount; |
|
1014 |
|
1015 fromUCount=prepareFromUMappings(table); |
|
1016 |
|
1017 extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2); |
|
1018 extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4); |
|
1019 extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1); |
|
1020 |
|
1021 /* allocate all-unassigned stage blocks */ |
|
1022 extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; |
|
1023 extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED; |
|
1024 |
|
1025 /* |
|
1026 * stage 3b stores only unique values, and in |
|
1027 * index 0: 0 for "no mapping" |
|
1028 * index 1: "no mapping" with preference for <subchar1> rather than <subchar> |
|
1029 */ |
|
1030 extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1; |
|
1031 extData->stage3bTop=2; |
|
1032 |
|
1033 /* allocate the first entry in the fromUTable because index 0 means "no result" */ |
|
1034 utm_alloc(extData->fromUTableUChars); |
|
1035 utm_alloc(extData->fromUTableValues); |
|
1036 |
|
1037 if(!generateFromUTrie(extData, table, fromUCount)) { |
|
1038 return FALSE; |
|
1039 } |
|
1040 |
|
1041 /* |
|
1042 * offset the stage 1 trie entries by stage1Top because they will |
|
1043 * be stored in a single array |
|
1044 */ |
|
1045 stage1=extData->stage1; |
|
1046 stage1Top=extData->stage1Top; |
|
1047 for(i=0; i<stage1Top; ++i) { |
|
1048 stage1[i]=(uint16_t)(stage1[i]+stage1Top); |
|
1049 } |
|
1050 |
|
1051 return TRUE; |
|
1052 } |
|
1053 |
|
1054 /* -------------------------------------------------------------------------- */ |
|
1055 |
|
1056 static UBool |
|
1057 CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { |
|
1058 CnvExtData *extData; |
|
1059 |
|
1060 if(table->unicodeMask&UCNV_HAS_SURROGATES) { |
|
1061 fprintf(stderr, "error: contains mappings for surrogate code points\n"); |
|
1062 return FALSE; |
|
1063 } |
|
1064 |
|
1065 staticData->conversionType=UCNV_MBCS; |
|
1066 |
|
1067 extData=(CnvExtData *)cnvData; |
|
1068 |
|
1069 /* |
|
1070 * assume that the table is sorted |
|
1071 * |
|
1072 * call the functions in this order because |
|
1073 * makeToUTable() modifies the original reverseMap, |
|
1074 * makeFromUTable() writes a whole new mapping into reverseMap |
|
1075 */ |
|
1076 return |
|
1077 makeToUTable(extData, table) && |
|
1078 makeFromUTable(extData, table); |
|
1079 } |