michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2003-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: unorm_it.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2003jan21 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/uiter.h" michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/utf.h" michael@0: #include "unorm_it.h" michael@0: #include "cmemory.h" michael@0: michael@0: /* UNormIterator ------------------------------------------------------------ */ michael@0: michael@0: enum { michael@0: INITIAL_CAPACITY=100 michael@0: }; michael@0: michael@0: struct UNormIterator { michael@0: UCharIterator api; michael@0: UCharIterator *iter; michael@0: michael@0: /* michael@0: * chars and states either use the static buffers michael@0: * or are allocated in the same memory block michael@0: * michael@0: * They are parallel arrays with states[] holding the getState() values michael@0: * from normalization boundaries, and UITER_NO_STATE in between. michael@0: */ michael@0: UChar *chars; michael@0: uint32_t *states; michael@0: michael@0: /* michael@0: * api.start: first valid character & state in the arrays michael@0: * api.index: current position michael@0: * api.limit: one past the last valid character in chars[], but states[limit] is valid michael@0: * capacity: length of allocated arrays michael@0: */ michael@0: int32_t capacity; michael@0: michael@0: /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */ michael@0: uint32_t state; michael@0: michael@0: /* there are UChars available before start or after limit? */ michael@0: UBool hasPrevious, hasNext, isStackAllocated; michael@0: michael@0: UNormalizationMode mode; michael@0: michael@0: UChar charsBuffer[INITIAL_CAPACITY]; michael@0: uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */ michael@0: }; michael@0: michael@0: static void michael@0: initIndexes(UNormIterator *uni, UCharIterator *iter) { michael@0: /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ michael@0: UCharIterator *api=&uni->api; michael@0: michael@0: if(!iter->hasPrevious(iter)) { michael@0: /* set indexes to the beginning of the arrays */ michael@0: api->start=api->index=api->limit=0; michael@0: uni->hasPrevious=FALSE; michael@0: uni->hasNext=iter->hasNext(iter); michael@0: } else if(!iter->hasNext(iter)) { michael@0: /* set indexes to the end of the arrays */ michael@0: api->start=api->index=api->limit=uni->capacity; michael@0: uni->hasNext=FALSE; michael@0: uni->hasPrevious=iter->hasPrevious(iter); michael@0: } else { michael@0: /* set indexes into the middle of the arrays */ michael@0: api->start=api->index=api->limit=uni->capacity/2; michael@0: uni->hasPrevious=uni->hasNext=TRUE; michael@0: } michael@0: } michael@0: michael@0: static UBool michael@0: reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) { michael@0: /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ michael@0: UCharIterator *api=&uni->api; michael@0: michael@0: uint32_t *states; michael@0: UChar *chars; michael@0: int32_t start, limit; michael@0: michael@0: states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2); michael@0: if(states==NULL) { michael@0: return FALSE; michael@0: } michael@0: michael@0: chars=(UChar *)(states+(capacity+1)); michael@0: uni->capacity=capacity; michael@0: michael@0: start=api->start; michael@0: limit=api->limit; michael@0: michael@0: if(addAtStart) { michael@0: /* copy old contents to the end of the new arrays */ michael@0: int32_t delta; michael@0: michael@0: delta=capacity-uni->capacity; michael@0: uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4); michael@0: uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4); michael@0: michael@0: api->start=start+delta; michael@0: api->index+=delta; michael@0: api->limit=limit+delta; michael@0: } else { michael@0: /* copy old contents to the beginning of the new arrays */ michael@0: uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4); michael@0: uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4); michael@0: } michael@0: michael@0: uni->chars=chars; michael@0: uni->states=states; michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static void michael@0: moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { michael@0: /* move array contents up to make room */ michael@0: int32_t srcIndex, destIndex, limit; michael@0: michael@0: limit=api->limit; michael@0: srcIndex=delta; michael@0: if(srcIndex>api->start) { michael@0: /* look for a position in the arrays with a known state */ michael@0: while(srcIndexstart=destIndex=0; michael@0: while(srcIndexlimit=destIndex; michael@0: } michael@0: michael@0: static void michael@0: moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { michael@0: /* move array contents up to make room */ michael@0: int32_t srcIndex, destIndex, start; michael@0: michael@0: start=api->start; michael@0: destIndex=((UNormIterator *)api)->capacity; michael@0: srcIndex=destIndex-delta; michael@0: if(srcIndexlimit) { michael@0: /* look for a position in the arrays with a known state */ michael@0: while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) { michael@0: --srcIndex; michael@0: } michael@0: } michael@0: michael@0: /* now actually move the array contents */ michael@0: api->limit=destIndex; michael@0: michael@0: /* copy states[limit] as well! */ michael@0: states[destIndex]=states[srcIndex]; michael@0: michael@0: while(srcIndex>start) { michael@0: chars[--destIndex]=chars[--srcIndex]; michael@0: states[destIndex]=states[srcIndex]; michael@0: } michael@0: michael@0: api->start=destIndex; michael@0: } michael@0: michael@0: /* normalize forward from the limit, assume hasNext is true */ michael@0: static UBool michael@0: readNext(UNormIterator *uni, UCharIterator *iter) { michael@0: /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ michael@0: UCharIterator *api=&uni->api; michael@0: michael@0: /* make capacity/4 room at the end of the arrays */ michael@0: int32_t limit, capacity, room; michael@0: UErrorCode errorCode; michael@0: michael@0: limit=api->limit; michael@0: capacity=uni->capacity; michael@0: room=capacity/4; michael@0: if(room>(capacity-limit)) { michael@0: /* move array contents to make room */ michael@0: moveContentsTowardStart(api, uni->chars, uni->states, room); michael@0: api->index=limit=api->limit; michael@0: uni->hasPrevious=TRUE; michael@0: } michael@0: michael@0: /* normalize starting from the limit position */ michael@0: errorCode=U_ZERO_ERROR; michael@0: if(uni->state!=uni->states[limit]) { michael@0: uiter_setState(iter, uni->states[limit], &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasNext=FALSE; michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); michael@0: if(errorCode==U_BUFFER_OVERFLOW_ERROR) { michael@0: if(room<=capacity) { michael@0: /* empty and re-use the arrays */ michael@0: uni->states[0]=uni->states[limit]; michael@0: api->start=api->index=api->limit=limit=0; michael@0: uni->hasPrevious=TRUE; michael@0: } else { michael@0: capacity+=room+100; michael@0: if(!reallocArrays(uni, capacity, FALSE)) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasNext=FALSE; michael@0: return FALSE; michael@0: } michael@0: limit=api->limit; michael@0: } michael@0: michael@0: errorCode=U_ZERO_ERROR; michael@0: uiter_setState(iter, uni->states[limit], &errorCode); michael@0: room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); michael@0: } michael@0: if(U_FAILURE(errorCode) || room==0) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasNext=FALSE; michael@0: return FALSE; michael@0: } michael@0: michael@0: /* room>0 */ michael@0: ++limit; /* leave the known states[limit] alone */ michael@0: for(--room; room>0; --room) { michael@0: /* set unknown states for all but the normalization boundaries */ michael@0: uni->states[limit++]=UITER_NO_STATE; michael@0: } michael@0: uni->states[limit]=uni->state=uiter_getState(iter); michael@0: uni->hasNext=iter->hasNext(iter); michael@0: api->limit=limit; michael@0: return TRUE; michael@0: } michael@0: michael@0: /* normalize backward from the start, assume hasPrevious is true */ michael@0: static UBool michael@0: readPrevious(UNormIterator *uni, UCharIterator *iter) { michael@0: /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ michael@0: UCharIterator *api=&uni->api; michael@0: michael@0: /* make capacity/4 room at the start of the arrays */ michael@0: int32_t start, capacity, room; michael@0: UErrorCode errorCode; michael@0: michael@0: start=api->start; michael@0: capacity=uni->capacity; michael@0: room=capacity/4; michael@0: if(room>start) { michael@0: /* move array contents to make room */ michael@0: moveContentsTowardEnd(api, uni->chars, uni->states, room); michael@0: api->index=start=api->start; michael@0: uni->hasNext=TRUE; michael@0: } michael@0: michael@0: /* normalize ending at the start position */ michael@0: errorCode=U_ZERO_ERROR; michael@0: if(uni->state!=uni->states[start]) { michael@0: uiter_setState(iter, uni->states[start], &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasPrevious=FALSE; michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); michael@0: if(errorCode==U_BUFFER_OVERFLOW_ERROR) { michael@0: if(room<=capacity) { michael@0: /* empty and re-use the arrays */ michael@0: uni->states[capacity]=uni->states[start]; michael@0: api->start=api->index=api->limit=start=capacity; michael@0: uni->hasNext=TRUE; michael@0: } else { michael@0: capacity+=room+100; michael@0: if(!reallocArrays(uni, capacity, TRUE)) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasPrevious=FALSE; michael@0: return FALSE; michael@0: } michael@0: start=api->start; michael@0: } michael@0: michael@0: errorCode=U_ZERO_ERROR; michael@0: uiter_setState(iter, uni->states[start], &errorCode); michael@0: room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); michael@0: } michael@0: if(U_FAILURE(errorCode) || room==0) { michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasPrevious=FALSE; michael@0: return FALSE; michael@0: } michael@0: michael@0: /* room>0 */ michael@0: do { michael@0: /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */ michael@0: uni->chars[--start]=uni->chars[--room]; michael@0: /* set unknown states for all but the normalization boundaries */ michael@0: uni->states[start]=UITER_NO_STATE; michael@0: } while(room>0); michael@0: uni->states[start]=uni->state=uiter_getState(iter); michael@0: uni->hasPrevious=iter->hasPrevious(iter); michael@0: api->start=start; michael@0: return TRUE; michael@0: } michael@0: michael@0: /* Iterator runtime API functions ------------------------------------------- */ michael@0: michael@0: static int32_t U_CALLCONV michael@0: unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) { michael@0: switch(origin) { michael@0: case UITER_ZERO: michael@0: case UITER_START: michael@0: return 0; michael@0: case UITER_CURRENT: michael@0: case UITER_LIMIT: michael@0: case UITER_LENGTH: michael@0: return UITER_UNKNOWN_INDEX; michael@0: default: michael@0: /* not a valid origin */ michael@0: /* Should never get here! */ michael@0: return -1; michael@0: } michael@0: } michael@0: michael@0: static int32_t U_CALLCONV michael@0: unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) { michael@0: UNormIterator *uni=(UNormIterator *)api; michael@0: UCharIterator *iter=uni->iter; michael@0: int32_t pos; michael@0: michael@0: switch(origin) { michael@0: case UITER_ZERO: michael@0: case UITER_START: michael@0: /* restart from the beginning */ michael@0: if(uni->hasPrevious) { michael@0: iter->move(iter, 0, UITER_START); michael@0: api->start=api->index=api->limit=0; michael@0: uni->states[api->limit]=uni->state=uiter_getState(iter); michael@0: uni->hasPrevious=FALSE; michael@0: uni->hasNext=iter->hasNext(iter); michael@0: } else { michael@0: /* we already have the beginning of the normalized text */ michael@0: api->index=api->start; michael@0: } michael@0: break; michael@0: case UITER_CURRENT: michael@0: break; michael@0: case UITER_LIMIT: michael@0: case UITER_LENGTH: michael@0: /* restart from the end */ michael@0: if(uni->hasNext) { michael@0: iter->move(iter, 0, UITER_LIMIT); michael@0: api->start=api->index=api->limit=uni->capacity; michael@0: uni->states[api->limit]=uni->state=uiter_getState(iter); michael@0: uni->hasPrevious=iter->hasPrevious(iter); michael@0: uni->hasNext=FALSE; michael@0: } else { michael@0: /* we already have the end of the normalized text */ michael@0: api->index=api->limit; michael@0: } michael@0: break; michael@0: default: michael@0: return -1; /* Error */ michael@0: } michael@0: michael@0: /* move relative to the current position by delta normalized UChars */ michael@0: if(delta==0) { michael@0: /* nothing to do */ michael@0: } else if(delta>0) { michael@0: /* go forward until the requested position is in the buffer */ michael@0: for(;;) { michael@0: pos=api->index+delta; /* requested position */ michael@0: delta=pos-api->limit; /* remainder beyond buffered text */ michael@0: if(delta<=0) { michael@0: api->index=pos; /* position reached */ michael@0: break; michael@0: } michael@0: michael@0: /* go to end of buffer and normalize further */ michael@0: api->index=api->limit; michael@0: if(!uni->hasNext || !readNext(uni, iter)) { michael@0: break; /* reached end of text */ michael@0: } michael@0: } michael@0: } else /* delta<0 */ { michael@0: /* go backward until the requested position is in the buffer */ michael@0: for(;;) { michael@0: pos=api->index+delta; /* requested position */ michael@0: delta=pos-api->start; /* remainder beyond buffered text */ michael@0: if(delta>=0) { michael@0: api->index=pos; /* position reached */ michael@0: break; michael@0: } michael@0: michael@0: /* go to start of buffer and normalize further */ michael@0: api->index=api->start; michael@0: if(!uni->hasPrevious || !readPrevious(uni, iter)) { michael@0: break; /* reached start of text */ michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(api->index==api->start && !uni->hasPrevious) { michael@0: return 0; michael@0: } else { michael@0: return UITER_UNKNOWN_INDEX; michael@0: } michael@0: } michael@0: michael@0: static UBool U_CALLCONV michael@0: unormIteratorHasNext(UCharIterator *api) { michael@0: return api->indexlimit || ((UNormIterator *)api)->hasNext; michael@0: } michael@0: michael@0: static UBool U_CALLCONV michael@0: unormIteratorHasPrevious(UCharIterator *api) { michael@0: return api->index>api->start || ((UNormIterator *)api)->hasPrevious; michael@0: } michael@0: michael@0: static UChar32 U_CALLCONV michael@0: unormIteratorCurrent(UCharIterator *api) { michael@0: UNormIterator *uni=(UNormIterator *)api; michael@0: michael@0: if( api->indexlimit || michael@0: (uni->hasNext && readNext(uni, uni->iter)) michael@0: ) { michael@0: return uni->chars[api->index]; michael@0: } else { michael@0: return U_SENTINEL; michael@0: } michael@0: } michael@0: michael@0: static UChar32 U_CALLCONV michael@0: unormIteratorNext(UCharIterator *api) { michael@0: UNormIterator *uni=(UNormIterator *)api; michael@0: michael@0: if( api->indexlimit || michael@0: (uni->hasNext && readNext(uni, uni->iter)) michael@0: ) { michael@0: return uni->chars[api->index++]; michael@0: } else { michael@0: return U_SENTINEL; michael@0: } michael@0: } michael@0: michael@0: static UChar32 U_CALLCONV michael@0: unormIteratorPrevious(UCharIterator *api) { michael@0: UNormIterator *uni=(UNormIterator *)api; michael@0: michael@0: if( api->index>api->start || michael@0: (uni->hasPrevious && readPrevious(uni, uni->iter)) michael@0: ) { michael@0: return uni->chars[--api->index]; michael@0: } else { michael@0: return U_SENTINEL; michael@0: } michael@0: } michael@0: michael@0: static uint32_t U_CALLCONV michael@0: unormIteratorGetState(const UCharIterator *api) { michael@0: /* not uni->state because that may not be at api->index */ michael@0: return ((UNormIterator *)api)->states[api->index]; michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) { michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: /* do nothing */ michael@0: } else if(api==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } else if(state==UITER_NO_STATE) { michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: } else { michael@0: UNormIterator *uni=(UNormIterator *)api; michael@0: UCharIterator *iter=((UNormIterator *)api)->iter; michael@0: if(state!=uni->state) { michael@0: uni->state=state; michael@0: uiter_setState(iter, state, pErrorCode); michael@0: } michael@0: michael@0: /* michael@0: * Try shortcuts: If the requested state is in the array contents michael@0: * then just set the index there. michael@0: * michael@0: * We assume that the state is unique per position! michael@0: */ michael@0: if(state==uni->states[api->index]) { michael@0: return; michael@0: } else if(state==uni->states[api->limit]) { michael@0: api->index=api->limit; michael@0: return; michael@0: } else { michael@0: /* search for the index with this state */ michael@0: int32_t i; michael@0: michael@0: for(i=api->start; ilimit; ++i) { michael@0: if(state==uni->states[i]) { michael@0: api->index=i; michael@0: return; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* there is no array index for this state, reset for fresh contents */ michael@0: initIndexes((UNormIterator *)api, iter); michael@0: uni->states[api->limit]=state; michael@0: } michael@0: } michael@0: michael@0: static const UCharIterator unormIterator={ michael@0: NULL, 0, 0, 0, 0, 0, michael@0: unormIteratorGetIndex, michael@0: unormIteratorMove, michael@0: unormIteratorHasNext, michael@0: unormIteratorHasPrevious, michael@0: unormIteratorCurrent, michael@0: unormIteratorNext, michael@0: unormIteratorPrevious, michael@0: NULL, michael@0: unormIteratorGetState, michael@0: unormIteratorSetState michael@0: }; michael@0: michael@0: /* Setup functions ---------------------------------------------------------- */ michael@0: michael@0: U_CAPI UNormIterator * U_EXPORT2 michael@0: unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) { michael@0: UNormIterator *uni; michael@0: michael@0: /* argument checking */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: michael@0: /* allocate */ michael@0: uni=NULL; michael@0: if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) { michael@0: if(U_ALIGNMENT_OFFSET(stackMem)==0) { michael@0: /* already aligned */ michael@0: uni=(UNormIterator *)stackMem; michael@0: } else { michael@0: int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem); michael@0: if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) { michael@0: /* needs alignment */ michael@0: uni=(UNormIterator *)((char *)stackMem+align); michael@0: } michael@0: } michael@0: /* else does not fit */ michael@0: } michael@0: michael@0: if(uni!=NULL) { michael@0: uni->isStackAllocated=TRUE; michael@0: } else { michael@0: uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator)); michael@0: if(uni==NULL) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uni->isStackAllocated=FALSE; michael@0: } michael@0: michael@0: /* michael@0: * initialize michael@0: * do not memset because that would unnecessarily initialize the arrays michael@0: */ michael@0: uni->iter=NULL; michael@0: uni->chars=uni->charsBuffer; michael@0: uni->states=uni->statesBuffer; michael@0: uni->capacity=INITIAL_CAPACITY; michael@0: uni->state=UITER_NO_STATE; michael@0: uni->hasPrevious=uni->hasNext=FALSE; michael@0: uni->mode=UNORM_NONE; michael@0: michael@0: /* set a no-op iterator into the api */ michael@0: uiter_setString(&uni->api, NULL, 0); michael@0: return uni; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: unorm_closeIter(UNormIterator *uni) { michael@0: if(uni!=NULL) { michael@0: if(uni->states!=uni->statesBuffer) { michael@0: /* chars and states are allocated in the same memory block */ michael@0: uprv_free(uni->states); michael@0: } michael@0: if(!uni->isStackAllocated) { michael@0: uprv_free(uni); michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI UCharIterator * U_EXPORT2 michael@0: unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) { michael@0: /* argument checking */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(uni==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: if( iter==NULL || iter->getState==NULL || iter->setState==NULL || michael@0: modeapi, NULL, 0); michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: /* set the iterator and initialize */ michael@0: uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator)); michael@0: michael@0: uni->iter=iter; michael@0: uni->mode=mode; michael@0: michael@0: initIndexes(uni, iter); michael@0: uni->states[uni->api.limit]=uni->state=uiter_getState(iter); michael@0: michael@0: return &uni->api; michael@0: } michael@0: michael@0: #endif /* uconfig.h switches */