|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2002-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: propsvec.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2002feb22 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Store bits (Unicode character properties) in bit set vectors. |
|
17 */ |
|
18 |
|
19 #include <stdlib.h> |
|
20 #include "unicode/utypes.h" |
|
21 #include "cmemory.h" |
|
22 #include "utrie.h" |
|
23 #include "utrie2.h" |
|
24 #include "uarrsort.h" |
|
25 #include "propsvec.h" |
|
26 #include "uassert.h" |
|
27 |
|
28 struct UPropsVectors { |
|
29 uint32_t *v; |
|
30 int32_t columns; /* number of columns, plus two for start & limit values */ |
|
31 int32_t maxRows; |
|
32 int32_t rows; |
|
33 int32_t prevRow; /* search optimization: remember last row seen */ |
|
34 UBool isCompacted; |
|
35 }; |
|
36 |
|
37 #define UPVEC_INITIAL_ROWS (1<<12) |
|
38 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) |
|
39 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) |
|
40 |
|
41 U_CAPI UPropsVectors * U_EXPORT2 |
|
42 upvec_open(int32_t columns, UErrorCode *pErrorCode) { |
|
43 UPropsVectors *pv; |
|
44 uint32_t *v, *row; |
|
45 uint32_t cp; |
|
46 |
|
47 if(U_FAILURE(*pErrorCode)) { |
|
48 return NULL; |
|
49 } |
|
50 if(columns<1) { |
|
51 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
52 return NULL; |
|
53 } |
|
54 columns+=2; /* count range start and limit columns */ |
|
55 |
|
56 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); |
|
57 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); |
|
58 if(pv==NULL || v==NULL) { |
|
59 uprv_free(pv); |
|
60 uprv_free(v); |
|
61 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
62 return NULL; |
|
63 } |
|
64 uprv_memset(pv, 0, sizeof(UPropsVectors)); |
|
65 pv->v=v; |
|
66 pv->columns=columns; |
|
67 pv->maxRows=UPVEC_INITIAL_ROWS; |
|
68 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); |
|
69 |
|
70 /* set the all-Unicode row and the special-value rows */ |
|
71 row=pv->v; |
|
72 uprv_memset(row, 0, pv->rows*columns*4); |
|
73 row[0]=0; |
|
74 row[1]=0x110000; |
|
75 row+=columns; |
|
76 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { |
|
77 row[0]=cp; |
|
78 row[1]=cp+1; |
|
79 row+=columns; |
|
80 } |
|
81 return pv; |
|
82 } |
|
83 |
|
84 U_CAPI void U_EXPORT2 |
|
85 upvec_close(UPropsVectors *pv) { |
|
86 if(pv!=NULL) { |
|
87 uprv_free(pv->v); |
|
88 uprv_free(pv); |
|
89 } |
|
90 } |
|
91 |
|
92 static uint32_t * |
|
93 _findRow(UPropsVectors *pv, UChar32 rangeStart) { |
|
94 uint32_t *row; |
|
95 int32_t columns, i, start, limit, prevRow; |
|
96 |
|
97 columns=pv->columns; |
|
98 limit=pv->rows; |
|
99 prevRow=pv->prevRow; |
|
100 |
|
101 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ |
|
102 row=pv->v+prevRow*columns; |
|
103 if(rangeStart>=(UChar32)row[0]) { |
|
104 if(rangeStart<(UChar32)row[1]) { |
|
105 /* same row as last seen */ |
|
106 return row; |
|
107 } else if(rangeStart<(UChar32)(row+=columns)[1]) { |
|
108 /* next row after the last one */ |
|
109 pv->prevRow=prevRow+1; |
|
110 return row; |
|
111 } else if(rangeStart<(UChar32)(row+=columns)[1]) { |
|
112 /* second row after the last one */ |
|
113 pv->prevRow=prevRow+2; |
|
114 return row; |
|
115 } else if((rangeStart-(UChar32)row[1])<10) { |
|
116 /* we are close, continue looping */ |
|
117 prevRow+=2; |
|
118 do { |
|
119 ++prevRow; |
|
120 row+=columns; |
|
121 } while(rangeStart>=(UChar32)row[1]); |
|
122 pv->prevRow=prevRow; |
|
123 return row; |
|
124 } |
|
125 } else if(rangeStart<(UChar32)pv->v[1]) { |
|
126 /* the very first row */ |
|
127 pv->prevRow=0; |
|
128 return pv->v; |
|
129 } |
|
130 |
|
131 /* do a binary search for the start of the range */ |
|
132 start=0; |
|
133 while(start<limit-1) { |
|
134 i=(start+limit)/2; |
|
135 row=pv->v+i*columns; |
|
136 if(rangeStart<(UChar32)row[0]) { |
|
137 limit=i; |
|
138 } else if(rangeStart<(UChar32)row[1]) { |
|
139 pv->prevRow=i; |
|
140 return row; |
|
141 } else { |
|
142 start=i; |
|
143 } |
|
144 } |
|
145 |
|
146 /* must be found because all ranges together always cover all of Unicode */ |
|
147 pv->prevRow=start; |
|
148 return pv->v+start*columns; |
|
149 } |
|
150 |
|
151 U_CAPI void U_EXPORT2 |
|
152 upvec_setValue(UPropsVectors *pv, |
|
153 UChar32 start, UChar32 end, |
|
154 int32_t column, |
|
155 uint32_t value, uint32_t mask, |
|
156 UErrorCode *pErrorCode) { |
|
157 uint32_t *firstRow, *lastRow; |
|
158 int32_t columns; |
|
159 UChar32 limit; |
|
160 UBool splitFirstRow, splitLastRow; |
|
161 |
|
162 /* argument checking */ |
|
163 if(U_FAILURE(*pErrorCode)) { |
|
164 return; |
|
165 } |
|
166 if( pv==NULL || |
|
167 start<0 || start>end || end>UPVEC_MAX_CP || |
|
168 column<0 || column>=(pv->columns-2) |
|
169 ) { |
|
170 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
171 return; |
|
172 } |
|
173 if(pv->isCompacted) { |
|
174 *pErrorCode=U_NO_WRITE_PERMISSION; |
|
175 return; |
|
176 } |
|
177 limit=end+1; |
|
178 |
|
179 /* initialize */ |
|
180 columns=pv->columns; |
|
181 column+=2; /* skip range start and limit columns */ |
|
182 value&=mask; |
|
183 |
|
184 /* find the rows whose ranges overlap with the input range */ |
|
185 |
|
186 /* find the first and last rows, always successful */ |
|
187 firstRow=_findRow(pv, start); |
|
188 lastRow=_findRow(pv, end); |
|
189 |
|
190 /* |
|
191 * Rows need to be split if they partially overlap with the |
|
192 * input range (only possible for the first and last rows) |
|
193 * and if their value differs from the input value. |
|
194 */ |
|
195 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); |
|
196 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); |
|
197 |
|
198 /* split first/last rows if necessary */ |
|
199 if(splitFirstRow || splitLastRow) { |
|
200 int32_t count, rows; |
|
201 |
|
202 rows=pv->rows; |
|
203 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { |
|
204 uint32_t *newVectors; |
|
205 int32_t newMaxRows; |
|
206 |
|
207 if(pv->maxRows<UPVEC_MEDIUM_ROWS) { |
|
208 newMaxRows=UPVEC_MEDIUM_ROWS; |
|
209 } else if(pv->maxRows<UPVEC_MAX_ROWS) { |
|
210 newMaxRows=UPVEC_MAX_ROWS; |
|
211 } else { |
|
212 /* Implementation bug, or UPVEC_MAX_ROWS too low. */ |
|
213 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
|
214 return; |
|
215 } |
|
216 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); |
|
217 if(newVectors==NULL) { |
|
218 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
219 return; |
|
220 } |
|
221 uprv_memcpy(newVectors, pv->v, rows*columns*4); |
|
222 firstRow=newVectors+(firstRow-pv->v); |
|
223 lastRow=newVectors+(lastRow-pv->v); |
|
224 uprv_free(pv->v); |
|
225 pv->v=newVectors; |
|
226 pv->maxRows=newMaxRows; |
|
227 } |
|
228 |
|
229 /* count the number of row cells to move after the last row, and move them */ |
|
230 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); |
|
231 if(count>0) { |
|
232 uprv_memmove( |
|
233 lastRow+(1+splitFirstRow+splitLastRow)*columns, |
|
234 lastRow+columns, |
|
235 count*4); |
|
236 } |
|
237 pv->rows=rows+splitFirstRow+splitLastRow; |
|
238 |
|
239 /* split the first row, and move the firstRow pointer to the second part */ |
|
240 if(splitFirstRow) { |
|
241 /* copy all affected rows up one and move the lastRow pointer */ |
|
242 count = (int32_t)((lastRow-firstRow)+columns); |
|
243 uprv_memmove(firstRow+columns, firstRow, count*4); |
|
244 lastRow+=columns; |
|
245 |
|
246 /* split the range and move the firstRow pointer */ |
|
247 firstRow[1]=firstRow[columns]=(uint32_t)start; |
|
248 firstRow+=columns; |
|
249 } |
|
250 |
|
251 /* split the last row */ |
|
252 if(splitLastRow) { |
|
253 /* copy the last row data */ |
|
254 uprv_memcpy(lastRow+columns, lastRow, columns*4); |
|
255 |
|
256 /* split the range and move the firstRow pointer */ |
|
257 lastRow[1]=lastRow[columns]=(uint32_t)limit; |
|
258 } |
|
259 } |
|
260 |
|
261 /* set the "row last seen" to the last row for the range */ |
|
262 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); |
|
263 |
|
264 /* set the input value in all remaining rows */ |
|
265 firstRow+=column; |
|
266 lastRow+=column; |
|
267 mask=~mask; |
|
268 for(;;) { |
|
269 *firstRow=(*firstRow&mask)|value; |
|
270 if(firstRow==lastRow) { |
|
271 break; |
|
272 } |
|
273 firstRow+=columns; |
|
274 } |
|
275 } |
|
276 |
|
277 U_CAPI uint32_t U_EXPORT2 |
|
278 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { |
|
279 uint32_t *row; |
|
280 UPropsVectors *ncpv; |
|
281 |
|
282 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { |
|
283 return 0; |
|
284 } |
|
285 ncpv=(UPropsVectors *)pv; |
|
286 row=_findRow(ncpv, c); |
|
287 return row[2+column]; |
|
288 } |
|
289 |
|
290 U_CAPI uint32_t * U_EXPORT2 |
|
291 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, |
|
292 UChar32 *pRangeStart, UChar32 *pRangeEnd) { |
|
293 uint32_t *row; |
|
294 int32_t columns; |
|
295 |
|
296 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { |
|
297 return NULL; |
|
298 } |
|
299 |
|
300 columns=pv->columns; |
|
301 row=pv->v+rowIndex*columns; |
|
302 if(pRangeStart!=NULL) { |
|
303 *pRangeStart=(UChar32)row[0]; |
|
304 } |
|
305 if(pRangeEnd!=NULL) { |
|
306 *pRangeEnd=(UChar32)row[1]-1; |
|
307 } |
|
308 return row+2; |
|
309 } |
|
310 |
|
311 static int32_t U_CALLCONV |
|
312 upvec_compareRows(const void *context, const void *l, const void *r) { |
|
313 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; |
|
314 const UPropsVectors *pv=(const UPropsVectors *)context; |
|
315 int32_t i, count, columns; |
|
316 |
|
317 count=columns=pv->columns; /* includes start/limit columns */ |
|
318 |
|
319 /* start comparing after start/limit but wrap around to them */ |
|
320 i=2; |
|
321 do { |
|
322 if(left[i]!=right[i]) { |
|
323 return left[i]<right[i] ? -1 : 1; |
|
324 } |
|
325 if(++i==columns) { |
|
326 i=0; |
|
327 } |
|
328 } while(--count>0); |
|
329 |
|
330 return 0; |
|
331 } |
|
332 |
|
333 U_CAPI void U_EXPORT2 |
|
334 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { |
|
335 uint32_t *row; |
|
336 int32_t i, columns, valueColumns, rows, count; |
|
337 UChar32 start, limit; |
|
338 |
|
339 /* argument checking */ |
|
340 if(U_FAILURE(*pErrorCode)) { |
|
341 return; |
|
342 } |
|
343 if(handler==NULL) { |
|
344 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
345 return; |
|
346 } |
|
347 if(pv->isCompacted) { |
|
348 return; |
|
349 } |
|
350 |
|
351 /* Set the flag now: Sorting and compacting destroys the builder data structure. */ |
|
352 pv->isCompacted=TRUE; |
|
353 |
|
354 rows=pv->rows; |
|
355 columns=pv->columns; |
|
356 U_ASSERT(columns>=3); /* upvec_open asserts this */ |
|
357 valueColumns=columns-2; /* not counting start & limit */ |
|
358 |
|
359 /* sort the properties vectors to find unique vector values */ |
|
360 uprv_sortArray(pv->v, rows, columns*4, |
|
361 upvec_compareRows, pv, FALSE, pErrorCode); |
|
362 if(U_FAILURE(*pErrorCode)) { |
|
363 return; |
|
364 } |
|
365 |
|
366 /* |
|
367 * Find and set the special values. |
|
368 * This has to do almost the same work as the compaction below, |
|
369 * to find the indexes where the special-value rows will move. |
|
370 */ |
|
371 row=pv->v; |
|
372 count=-valueColumns; |
|
373 for(i=0; i<rows; ++i) { |
|
374 start=(UChar32)row[0]; |
|
375 |
|
376 /* count a new values vector if it is different from the current one */ |
|
377 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { |
|
378 count+=valueColumns; |
|
379 } |
|
380 |
|
381 if(start>=UPVEC_FIRST_SPECIAL_CP) { |
|
382 handler(context, start, start, count, row+2, valueColumns, pErrorCode); |
|
383 if(U_FAILURE(*pErrorCode)) { |
|
384 return; |
|
385 } |
|
386 } |
|
387 |
|
388 row+=columns; |
|
389 } |
|
390 |
|
391 /* count is at the beginning of the last vector, add valueColumns to include that last vector */ |
|
392 count+=valueColumns; |
|
393 |
|
394 /* Call the handler once more to signal the start of delivering real values. */ |
|
395 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, |
|
396 count, row-valueColumns, valueColumns, pErrorCode); |
|
397 if(U_FAILURE(*pErrorCode)) { |
|
398 return; |
|
399 } |
|
400 |
|
401 /* |
|
402 * Move vector contents up to a contiguous array with only unique |
|
403 * vector values, and call the handler function for each vector. |
|
404 * |
|
405 * This destroys the Properties Vector structure and replaces it |
|
406 * with an array of just vector values. |
|
407 */ |
|
408 row=pv->v; |
|
409 count=-valueColumns; |
|
410 for(i=0; i<rows; ++i) { |
|
411 /* fetch these first before memmove() may overwrite them */ |
|
412 start=(UChar32)row[0]; |
|
413 limit=(UChar32)row[1]; |
|
414 |
|
415 /* add a new values vector if it is different from the current one */ |
|
416 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { |
|
417 count+=valueColumns; |
|
418 uprv_memmove(pv->v+count, row+2, valueColumns*4); |
|
419 } |
|
420 |
|
421 if(start<UPVEC_FIRST_SPECIAL_CP) { |
|
422 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); |
|
423 if(U_FAILURE(*pErrorCode)) { |
|
424 return; |
|
425 } |
|
426 } |
|
427 |
|
428 row+=columns; |
|
429 } |
|
430 |
|
431 /* count is at the beginning of the last vector, add one to include that last vector */ |
|
432 pv->rows=count/valueColumns+1; |
|
433 } |
|
434 |
|
435 U_CAPI const uint32_t * U_EXPORT2 |
|
436 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { |
|
437 if(!pv->isCompacted) { |
|
438 return NULL; |
|
439 } |
|
440 if(pRows!=NULL) { |
|
441 *pRows=pv->rows; |
|
442 } |
|
443 if(pColumns!=NULL) { |
|
444 *pColumns=pv->columns-2; |
|
445 } |
|
446 return pv->v; |
|
447 } |
|
448 |
|
449 U_CAPI uint32_t * U_EXPORT2 |
|
450 upvec_cloneArray(const UPropsVectors *pv, |
|
451 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { |
|
452 uint32_t *clonedArray; |
|
453 int32_t byteLength; |
|
454 |
|
455 if(U_FAILURE(*pErrorCode)) { |
|
456 return NULL; |
|
457 } |
|
458 if(!pv->isCompacted) { |
|
459 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
460 return NULL; |
|
461 } |
|
462 byteLength=pv->rows*(pv->columns-2)*4; |
|
463 clonedArray=(uint32_t *)uprv_malloc(byteLength); |
|
464 if(clonedArray==NULL) { |
|
465 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
466 return NULL; |
|
467 } |
|
468 uprv_memcpy(clonedArray, pv->v, byteLength); |
|
469 if(pRows!=NULL) { |
|
470 *pRows=pv->rows; |
|
471 } |
|
472 if(pColumns!=NULL) { |
|
473 *pColumns=pv->columns-2; |
|
474 } |
|
475 return clonedArray; |
|
476 } |
|
477 |
|
478 U_CAPI UTrie2 * U_EXPORT2 |
|
479 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { |
|
480 UPVecToUTrie2Context toUTrie2={ NULL }; |
|
481 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); |
|
482 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); |
|
483 if(U_FAILURE(*pErrorCode)) { |
|
484 utrie2_close(toUTrie2.trie); |
|
485 toUTrie2.trie=NULL; |
|
486 } |
|
487 return toUTrie2.trie; |
|
488 } |
|
489 |
|
490 /* |
|
491 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts |
|
492 * some 16-bit field and builds and returns a UTrie2. |
|
493 */ |
|
494 |
|
495 U_CAPI void U_CALLCONV |
|
496 upvec_compactToUTrie2Handler(void *context, |
|
497 UChar32 start, UChar32 end, |
|
498 int32_t rowIndex, uint32_t *row, int32_t columns, |
|
499 UErrorCode *pErrorCode) { |
|
500 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; |
|
501 if(start<UPVEC_FIRST_SPECIAL_CP) { |
|
502 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); |
|
503 } else { |
|
504 switch(start) { |
|
505 case UPVEC_INITIAL_VALUE_CP: |
|
506 toUTrie2->initialValue=rowIndex; |
|
507 break; |
|
508 case UPVEC_ERROR_VALUE_CP: |
|
509 toUTrie2->errorValue=rowIndex; |
|
510 break; |
|
511 case UPVEC_START_REAL_VALUES_CP: |
|
512 toUTrie2->maxValue=rowIndex; |
|
513 if(rowIndex>0xffff) { |
|
514 /* too many rows for a 16-bit trie */ |
|
515 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
516 } else { |
|
517 toUTrie2->trie=utrie2_open(toUTrie2->initialValue, |
|
518 toUTrie2->errorValue, pErrorCode); |
|
519 } |
|
520 break; |
|
521 default: |
|
522 break; |
|
523 } |
|
524 } |
|
525 } |