michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2002-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: propsvec.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2002feb22 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Store bits (Unicode character properties) in bit set vectors. michael@0: */ michael@0: michael@0: #include michael@0: #include "unicode/utypes.h" michael@0: #include "cmemory.h" michael@0: #include "utrie.h" michael@0: #include "utrie2.h" michael@0: #include "uarrsort.h" michael@0: #include "propsvec.h" michael@0: #include "uassert.h" michael@0: michael@0: struct UPropsVectors { michael@0: uint32_t *v; michael@0: int32_t columns; /* number of columns, plus two for start & limit values */ michael@0: int32_t maxRows; michael@0: int32_t rows; michael@0: int32_t prevRow; /* search optimization: remember last row seen */ michael@0: UBool isCompacted; michael@0: }; michael@0: michael@0: #define UPVEC_INITIAL_ROWS (1<<12) michael@0: #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) michael@0: #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) michael@0: michael@0: U_CAPI UPropsVectors * U_EXPORT2 michael@0: upvec_open(int32_t columns, UErrorCode *pErrorCode) { michael@0: UPropsVectors *pv; michael@0: uint32_t *v, *row; michael@0: uint32_t cp; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(columns<1) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: columns+=2; /* count range start and limit columns */ michael@0: michael@0: pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); michael@0: v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); michael@0: if(pv==NULL || v==NULL) { michael@0: uprv_free(pv); michael@0: uprv_free(v); michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(pv, 0, sizeof(UPropsVectors)); michael@0: pv->v=v; michael@0: pv->columns=columns; michael@0: pv->maxRows=UPVEC_INITIAL_ROWS; michael@0: pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); michael@0: michael@0: /* set the all-Unicode row and the special-value rows */ michael@0: row=pv->v; michael@0: uprv_memset(row, 0, pv->rows*columns*4); michael@0: row[0]=0; michael@0: row[1]=0x110000; michael@0: row+=columns; michael@0: for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { michael@0: row[0]=cp; michael@0: row[1]=cp+1; michael@0: row+=columns; michael@0: } michael@0: return pv; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: upvec_close(UPropsVectors *pv) { michael@0: if(pv!=NULL) { michael@0: uprv_free(pv->v); michael@0: uprv_free(pv); michael@0: } michael@0: } michael@0: michael@0: static uint32_t * michael@0: _findRow(UPropsVectors *pv, UChar32 rangeStart) { michael@0: uint32_t *row; michael@0: int32_t columns, i, start, limit, prevRow; michael@0: michael@0: columns=pv->columns; michael@0: limit=pv->rows; michael@0: prevRow=pv->prevRow; michael@0: michael@0: /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ michael@0: row=pv->v+prevRow*columns; michael@0: if(rangeStart>=(UChar32)row[0]) { michael@0: if(rangeStart<(UChar32)row[1]) { michael@0: /* same row as last seen */ michael@0: return row; michael@0: } else if(rangeStart<(UChar32)(row+=columns)[1]) { michael@0: /* next row after the last one */ michael@0: pv->prevRow=prevRow+1; michael@0: return row; michael@0: } else if(rangeStart<(UChar32)(row+=columns)[1]) { michael@0: /* second row after the last one */ michael@0: pv->prevRow=prevRow+2; michael@0: return row; michael@0: } else if((rangeStart-(UChar32)row[1])<10) { michael@0: /* we are close, continue looping */ michael@0: prevRow+=2; michael@0: do { michael@0: ++prevRow; michael@0: row+=columns; michael@0: } while(rangeStart>=(UChar32)row[1]); michael@0: pv->prevRow=prevRow; michael@0: return row; michael@0: } michael@0: } else if(rangeStart<(UChar32)pv->v[1]) { michael@0: /* the very first row */ michael@0: pv->prevRow=0; michael@0: return pv->v; michael@0: } michael@0: michael@0: /* do a binary search for the start of the range */ michael@0: start=0; michael@0: while(startv+i*columns; michael@0: if(rangeStart<(UChar32)row[0]) { michael@0: limit=i; michael@0: } else if(rangeStart<(UChar32)row[1]) { michael@0: pv->prevRow=i; michael@0: return row; michael@0: } else { michael@0: start=i; michael@0: } michael@0: } michael@0: michael@0: /* must be found because all ranges together always cover all of Unicode */ michael@0: pv->prevRow=start; michael@0: return pv->v+start*columns; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: upvec_setValue(UPropsVectors *pv, michael@0: UChar32 start, UChar32 end, michael@0: int32_t column, michael@0: uint32_t value, uint32_t mask, michael@0: UErrorCode *pErrorCode) { michael@0: uint32_t *firstRow, *lastRow; michael@0: int32_t columns; michael@0: UChar32 limit; michael@0: UBool splitFirstRow, splitLastRow; michael@0: michael@0: /* argument checking */ michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: if( pv==NULL || michael@0: start<0 || start>end || end>UPVEC_MAX_CP || michael@0: column<0 || column>=(pv->columns-2) michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: if(pv->isCompacted) { michael@0: *pErrorCode=U_NO_WRITE_PERMISSION; michael@0: return; michael@0: } michael@0: limit=end+1; michael@0: michael@0: /* initialize */ michael@0: columns=pv->columns; michael@0: column+=2; /* skip range start and limit columns */ michael@0: value&=mask; michael@0: michael@0: /* find the rows whose ranges overlap with the input range */ michael@0: michael@0: /* find the first and last rows, always successful */ michael@0: firstRow=_findRow(pv, start); michael@0: lastRow=_findRow(pv, end); michael@0: michael@0: /* michael@0: * Rows need to be split if they partially overlap with the michael@0: * input range (only possible for the first and last rows) michael@0: * and if their value differs from the input value. michael@0: */ michael@0: splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); michael@0: splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); michael@0: michael@0: /* split first/last rows if necessary */ michael@0: if(splitFirstRow || splitLastRow) { michael@0: int32_t count, rows; michael@0: michael@0: rows=pv->rows; michael@0: if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { michael@0: uint32_t *newVectors; michael@0: int32_t newMaxRows; michael@0: michael@0: if(pv->maxRowsmaxRowsv, rows*columns*4); michael@0: firstRow=newVectors+(firstRow-pv->v); michael@0: lastRow=newVectors+(lastRow-pv->v); michael@0: uprv_free(pv->v); michael@0: pv->v=newVectors; michael@0: pv->maxRows=newMaxRows; michael@0: } michael@0: michael@0: /* count the number of row cells to move after the last row, and move them */ michael@0: count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); michael@0: if(count>0) { michael@0: uprv_memmove( michael@0: lastRow+(1+splitFirstRow+splitLastRow)*columns, michael@0: lastRow+columns, michael@0: count*4); michael@0: } michael@0: pv->rows=rows+splitFirstRow+splitLastRow; michael@0: michael@0: /* split the first row, and move the firstRow pointer to the second part */ michael@0: if(splitFirstRow) { michael@0: /* copy all affected rows up one and move the lastRow pointer */ michael@0: count = (int32_t)((lastRow-firstRow)+columns); michael@0: uprv_memmove(firstRow+columns, firstRow, count*4); michael@0: lastRow+=columns; michael@0: michael@0: /* split the range and move the firstRow pointer */ michael@0: firstRow[1]=firstRow[columns]=(uint32_t)start; michael@0: firstRow+=columns; michael@0: } michael@0: michael@0: /* split the last row */ michael@0: if(splitLastRow) { michael@0: /* copy the last row data */ michael@0: uprv_memcpy(lastRow+columns, lastRow, columns*4); michael@0: michael@0: /* split the range and move the firstRow pointer */ michael@0: lastRow[1]=lastRow[columns]=(uint32_t)limit; michael@0: } michael@0: } michael@0: michael@0: /* set the "row last seen" to the last row for the range */ michael@0: pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); michael@0: michael@0: /* set the input value in all remaining rows */ michael@0: firstRow+=column; michael@0: lastRow+=column; michael@0: mask=~mask; michael@0: for(;;) { michael@0: *firstRow=(*firstRow&mask)|value; michael@0: if(firstRow==lastRow) { michael@0: break; michael@0: } michael@0: firstRow+=columns; michael@0: } michael@0: } michael@0: michael@0: U_CAPI uint32_t U_EXPORT2 michael@0: upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { michael@0: uint32_t *row; michael@0: UPropsVectors *ncpv; michael@0: michael@0: if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { michael@0: return 0; michael@0: } michael@0: ncpv=(UPropsVectors *)pv; michael@0: row=_findRow(ncpv, c); michael@0: return row[2+column]; michael@0: } michael@0: michael@0: U_CAPI uint32_t * U_EXPORT2 michael@0: upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, michael@0: UChar32 *pRangeStart, UChar32 *pRangeEnd) { michael@0: uint32_t *row; michael@0: int32_t columns; michael@0: michael@0: if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { michael@0: return NULL; michael@0: } michael@0: michael@0: columns=pv->columns; michael@0: row=pv->v+rowIndex*columns; michael@0: if(pRangeStart!=NULL) { michael@0: *pRangeStart=(UChar32)row[0]; michael@0: } michael@0: if(pRangeEnd!=NULL) { michael@0: *pRangeEnd=(UChar32)row[1]-1; michael@0: } michael@0: return row+2; michael@0: } michael@0: michael@0: static int32_t U_CALLCONV michael@0: upvec_compareRows(const void *context, const void *l, const void *r) { michael@0: const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; michael@0: const UPropsVectors *pv=(const UPropsVectors *)context; michael@0: int32_t i, count, columns; michael@0: michael@0: count=columns=pv->columns; /* includes start/limit columns */ michael@0: michael@0: /* start comparing after start/limit but wrap around to them */ michael@0: i=2; michael@0: do { michael@0: if(left[i]!=right[i]) { michael@0: return left[i]0); michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { michael@0: uint32_t *row; michael@0: int32_t i, columns, valueColumns, rows, count; michael@0: UChar32 start, limit; michael@0: michael@0: /* argument checking */ michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: if(handler==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: if(pv->isCompacted) { michael@0: return; michael@0: } michael@0: michael@0: /* Set the flag now: Sorting and compacting destroys the builder data structure. */ michael@0: pv->isCompacted=TRUE; michael@0: michael@0: rows=pv->rows; michael@0: columns=pv->columns; michael@0: U_ASSERT(columns>=3); /* upvec_open asserts this */ michael@0: valueColumns=columns-2; /* not counting start & limit */ michael@0: michael@0: /* sort the properties vectors to find unique vector values */ michael@0: uprv_sortArray(pv->v, rows, columns*4, michael@0: upvec_compareRows, pv, FALSE, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: /* michael@0: * Find and set the special values. michael@0: * This has to do almost the same work as the compaction below, michael@0: * to find the indexes where the special-value rows will move. michael@0: */ michael@0: row=pv->v; michael@0: count=-valueColumns; michael@0: for(i=0; i=UPVEC_FIRST_SPECIAL_CP) { michael@0: handler(context, start, start, count, row+2, valueColumns, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: row+=columns; michael@0: } michael@0: michael@0: /* count is at the beginning of the last vector, add valueColumns to include that last vector */ michael@0: count+=valueColumns; michael@0: michael@0: /* Call the handler once more to signal the start of delivering real values. */ michael@0: handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, michael@0: count, row-valueColumns, valueColumns, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: /* michael@0: * Move vector contents up to a contiguous array with only unique michael@0: * vector values, and call the handler function for each vector. michael@0: * michael@0: * This destroys the Properties Vector structure and replaces it michael@0: * with an array of just vector values. michael@0: */ michael@0: row=pv->v; michael@0: count=-valueColumns; michael@0: for(i=0; iv+count, valueColumns*4)) { michael@0: count+=valueColumns; michael@0: uprv_memmove(pv->v+count, row+2, valueColumns*4); michael@0: } michael@0: michael@0: if(startv+count, valueColumns, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: row+=columns; michael@0: } michael@0: michael@0: /* count is at the beginning of the last vector, add one to include that last vector */ michael@0: pv->rows=count/valueColumns+1; michael@0: } michael@0: michael@0: U_CAPI const uint32_t * U_EXPORT2 michael@0: upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { michael@0: if(!pv->isCompacted) { michael@0: return NULL; michael@0: } michael@0: if(pRows!=NULL) { michael@0: *pRows=pv->rows; michael@0: } michael@0: if(pColumns!=NULL) { michael@0: *pColumns=pv->columns-2; michael@0: } michael@0: return pv->v; michael@0: } michael@0: michael@0: U_CAPI uint32_t * U_EXPORT2 michael@0: upvec_cloneArray(const UPropsVectors *pv, michael@0: int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { michael@0: uint32_t *clonedArray; michael@0: int32_t byteLength; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(!pv->isCompacted) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: byteLength=pv->rows*(pv->columns-2)*4; michael@0: clonedArray=(uint32_t *)uprv_malloc(byteLength); michael@0: if(clonedArray==NULL) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memcpy(clonedArray, pv->v, byteLength); michael@0: if(pRows!=NULL) { michael@0: *pRows=pv->rows; michael@0: } michael@0: if(pColumns!=NULL) { michael@0: *pColumns=pv->columns-2; michael@0: } michael@0: return clonedArray; michael@0: } michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { michael@0: UPVecToUTrie2Context toUTrie2={ NULL }; michael@0: upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); michael@0: utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: utrie2_close(toUTrie2.trie); michael@0: toUTrie2.trie=NULL; michael@0: } michael@0: return toUTrie2.trie; michael@0: } michael@0: michael@0: /* michael@0: * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts michael@0: * some 16-bit field and builds and returns a UTrie2. michael@0: */ michael@0: michael@0: U_CAPI void U_CALLCONV michael@0: upvec_compactToUTrie2Handler(void *context, michael@0: UChar32 start, UChar32 end, michael@0: int32_t rowIndex, uint32_t *row, int32_t columns, michael@0: UErrorCode *pErrorCode) { michael@0: UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; michael@0: if(starttrie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); michael@0: } else { michael@0: switch(start) { michael@0: case UPVEC_INITIAL_VALUE_CP: michael@0: toUTrie2->initialValue=rowIndex; michael@0: break; michael@0: case UPVEC_ERROR_VALUE_CP: michael@0: toUTrie2->errorValue=rowIndex; michael@0: break; michael@0: case UPVEC_START_REAL_VALUES_CP: michael@0: toUTrie2->maxValue=rowIndex; michael@0: if(rowIndex>0xffff) { michael@0: /* too many rows for a 16-bit trie */ michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: } else { michael@0: toUTrie2->trie=utrie2_open(toUTrie2->initialValue, michael@0: toUTrie2->errorValue, pErrorCode); michael@0: } michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: }