michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2003-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: ucm.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2003jun20 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This file reads a .ucm file, stores its mappings and sorts them. michael@0: * It implements handling of Unicode conversion mappings from .ucm files michael@0: * for makeconv, canonucm, rptp2ucm, etc. michael@0: * michael@0: * Unicode code point sequences with a length of more than 1, michael@0: * as well as byte sequences with more than 4 bytes or more than one complete michael@0: * character sequence are handled to support m:n mappings. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/ustring.h" michael@0: #include "cstring.h" michael@0: #include "cmemory.h" michael@0: #include "filestrm.h" michael@0: #include "uarrsort.h" michael@0: #include "ucnvmbcs.h" michael@0: #include "ucnv_bld.h" michael@0: #include "ucnv_ext.h" michael@0: #include "uparse.h" michael@0: #include "ucm.h" michael@0: #include michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: /* -------------------------------------------------------------------------- */ michael@0: michael@0: static void michael@0: printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { michael@0: int32_t j; michael@0: michael@0: for(j=0; juLen; ++j) { michael@0: fprintf(f, "", (long)codePoints[j]); michael@0: } michael@0: michael@0: fputc(' ', f); michael@0: michael@0: for(j=0; jbLen; ++j) { michael@0: fprintf(f, "\\x%02X", bytes[j]); michael@0: } michael@0: michael@0: if(m->f>=0) { michael@0: fprintf(f, " |%u\n", m->f); michael@0: } else { michael@0: fputs("\n", f); michael@0: } michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { michael@0: printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { michael@0: UCMapping *m; michael@0: int32_t i, length; michael@0: michael@0: m=table->mappings; michael@0: length=table->mappingsLength; michael@0: if(byUnicode) { michael@0: for(i=0; ireverseMap; michael@0: for(i=0; iuLen==1 && r->uLen==1) { michael@0: /* compare two single code points */ michael@0: return l->u-r->u; michael@0: } michael@0: michael@0: /* get pointers to the code point sequences */ michael@0: lu=UCM_GET_CODE_POINTS(lTable, l); michael@0: ru=UCM_GET_CODE_POINTS(rTable, r); michael@0: michael@0: /* get the minimum length */ michael@0: if(l->uLen<=r->uLen) { michael@0: length=l->uLen; michael@0: } else { michael@0: length=r->uLen; michael@0: } michael@0: michael@0: /* compare the code points */ michael@0: for(i=0; iuLen-r->uLen; michael@0: } michael@0: michael@0: static int32_t michael@0: compareBytes(UCMTable *lTable, const UCMapping *l, michael@0: UCMTable *rTable, const UCMapping *r, michael@0: UBool lexical) { michael@0: const uint8_t *lb, *rb; michael@0: int32_t result, i, length; michael@0: michael@0: /* michael@0: * A lexical comparison is used for sorting in the builder, to allow michael@0: * an efficient search for a byte sequence that could be a prefix michael@0: * of a previously entered byte sequence. michael@0: * michael@0: * Comparing by lengths first is for compatibility with old .ucm tools michael@0: * like canonucm and rptp2ucm. michael@0: */ michael@0: if(lexical) { michael@0: /* get the minimum length and continue */ michael@0: if(l->bLen<=r->bLen) { michael@0: length=l->bLen; michael@0: } else { michael@0: length=r->bLen; michael@0: } michael@0: } else { michael@0: /* compare lengths first */ michael@0: result=l->bLen-r->bLen; michael@0: if(result!=0) { michael@0: return result; michael@0: } else { michael@0: length=l->bLen; michael@0: } michael@0: } michael@0: michael@0: /* get pointers to the byte sequences */ michael@0: lb=UCM_GET_BYTES(lTable, l); michael@0: rb=UCM_GET_BYTES(rTable, r); michael@0: michael@0: /* compare the bytes */ michael@0: for(i=0; ibLen-r->bLen; michael@0: } michael@0: michael@0: /* compare UCMappings for sorting */ michael@0: static int32_t michael@0: compareMappings(UCMTable *lTable, const UCMapping *l, michael@0: UCMTable *rTable, const UCMapping *r, michael@0: UBool uFirst) { michael@0: int32_t result; michael@0: michael@0: /* choose which side to compare first */ michael@0: if(uFirst) { michael@0: /* Unicode then bytes */ michael@0: result=compareUnicode(lTable, l, rTable, r); michael@0: if(result==0) { michael@0: result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */ michael@0: } michael@0: } else { michael@0: /* bytes then Unicode */ michael@0: result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */ michael@0: if(result==0) { michael@0: result=compareUnicode(lTable, l, rTable, r); michael@0: } michael@0: } michael@0: michael@0: if(result!=0) { michael@0: return result; michael@0: } michael@0: michael@0: /* compare the flags */ michael@0: return l->f-r->f; michael@0: } michael@0: michael@0: /* sorting by Unicode first sorts mappings directly */ michael@0: static int32_t michael@0: compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { michael@0: return compareMappings( michael@0: (UCMTable *)context, (const UCMapping *)left, michael@0: (UCMTable *)context, (const UCMapping *)right, TRUE); michael@0: } michael@0: michael@0: /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ michael@0: static int32_t michael@0: compareMappingsBytesFirst(const void *context, const void *left, const void *right) { michael@0: UCMTable *table=(UCMTable *)context; michael@0: int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; michael@0: return compareMappings( michael@0: table, table->mappings+l, michael@0: table, table->mappings+r, FALSE); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_sortTable(UCMTable *t) { michael@0: UErrorCode errorCode; michael@0: int32_t i; michael@0: michael@0: if(t->isSorted) { michael@0: return; michael@0: } michael@0: michael@0: errorCode=U_ZERO_ERROR; michael@0: michael@0: /* 1. sort by Unicode first */ michael@0: uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), michael@0: compareMappingsUnicodeFirst, t, michael@0: FALSE, &errorCode); michael@0: michael@0: /* build the reverseMap */ michael@0: if(t->reverseMap==NULL) { michael@0: /* michael@0: * allocate mappingsCapacity instead of mappingsLength so that michael@0: * if mappings are added, the reverseMap need not be michael@0: * reallocated each time michael@0: * (see ucm_moveMappings() and ucm_addMapping()) michael@0: */ michael@0: t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); michael@0: if(t->reverseMap==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: } michael@0: for(i=0; imappingsLength; ++i) { michael@0: t->reverseMap[i]=i; michael@0: } michael@0: michael@0: /* 2. sort reverseMap by mappings bytes first */ michael@0: uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), michael@0: compareMappingsBytesFirst, t, michael@0: FALSE, &errorCode); michael@0: michael@0: if(U_FAILURE(errorCode)) { michael@0: fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", michael@0: u_errorName(errorCode)); michael@0: exit(errorCode); michael@0: } michael@0: michael@0: t->isSorted=TRUE; michael@0: } michael@0: michael@0: /* michael@0: * remove mappings with their move flag set from the base table michael@0: * and move some of them (with UCM_MOVE_TO_EXT) to the extension table michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_moveMappings(UCMTable *base, UCMTable *ext) { michael@0: UCMapping *mb, *mbLimit; michael@0: int8_t flag; michael@0: michael@0: mb=base->mappings; michael@0: mbLimit=mb+base->mappingsLength; michael@0: michael@0: while(mbmoveFlag; michael@0: if(flag!=0) { michael@0: /* reset the move flag */ michael@0: mb->moveFlag=0; michael@0: michael@0: if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { michael@0: /* add the mapping to the extension table */ michael@0: ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); michael@0: } michael@0: michael@0: /* remove this mapping: move the last base mapping down and overwrite the current one */ michael@0: if(mb<(mbLimit-1)) { michael@0: uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); michael@0: } michael@0: --mbLimit; michael@0: --base->mappingsLength; michael@0: base->isSorted=FALSE; michael@0: } else { michael@0: ++mb; michael@0: } michael@0: } michael@0: } michael@0: michael@0: enum { michael@0: NEEDS_MOVE=1, michael@0: HAS_ERRORS=2 michael@0: }; michael@0: michael@0: static uint8_t michael@0: checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, michael@0: UBool moveToExt, UBool intersectBase) { michael@0: UCMapping *mb, *me, *mbLimit, *meLimit; michael@0: int32_t cmp; michael@0: uint8_t result; michael@0: michael@0: mb=base->mappings; michael@0: mbLimit=mb+base->mappingsLength; michael@0: michael@0: me=ext->mappings; michael@0: meLimit=me+ext->mappingsLength; michael@0: michael@0: result=0; michael@0: michael@0: for(;;) { michael@0: /* skip irrelevant mappings on both sides */ michael@0: for(;;) { michael@0: if(mb==mbLimit) { michael@0: return result; michael@0: } michael@0: michael@0: if((0<=mb->f && mb->f<=2) || mb->f==4) { michael@0: break; michael@0: } michael@0: michael@0: ++mb; michael@0: } michael@0: michael@0: for(;;) { michael@0: if(me==meLimit) { michael@0: return result; michael@0: } michael@0: michael@0: if((0<=me->f && me->f<=2) || me->f==4) { michael@0: break; michael@0: } michael@0: michael@0: ++me; michael@0: } michael@0: michael@0: /* compare the base and extension mappings */ michael@0: cmp=compareUnicode(base, mb, ext, me); michael@0: if(cmp<0) { michael@0: if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { michael@0: /* michael@0: * mapping in base but not in ext, move it michael@0: * michael@0: * if ext is DBCS, move DBCS mappings here michael@0: * and check SBCS ones for Unicode prefix below michael@0: */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: michael@0: /* does mb map from an input sequence that is a prefix of me's? */ michael@0: } else if( mb->uLenuLen && michael@0: 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) michael@0: ) { michael@0: if(moveToExt) { michael@0: /* mark this mapping to be moved to the extension table */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: } else { michael@0: fprintf(stderr, michael@0: "ucm error: the base table contains a mapping whose input sequence\n" michael@0: " is a prefix of the input sequence of an extension mapping\n"); michael@0: ucm_printMapping(base, mb, stderr); michael@0: ucm_printMapping(ext, me, stderr); michael@0: result|=HAS_ERRORS; michael@0: } michael@0: } michael@0: michael@0: ++mb; michael@0: } else if(cmp==0) { michael@0: /* michael@0: * same output: remove the extension mapping, michael@0: * otherwise treat as an error michael@0: */ michael@0: if( mb->f==me->f && mb->bLen==me->bLen && michael@0: 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) michael@0: ) { michael@0: me->moveFlag|=UCM_REMOVE_MAPPING; michael@0: result|=NEEDS_MOVE; michael@0: } else if(intersectBase) { michael@0: /* mapping in base but not in ext, move it */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: } else { michael@0: fprintf(stderr, michael@0: "ucm error: the base table contains a mapping whose input sequence\n" michael@0: " is the same as the input sequence of an extension mapping\n" michael@0: " but it maps differently\n"); michael@0: ucm_printMapping(base, mb, stderr); michael@0: ucm_printMapping(ext, me, stderr); michael@0: result|=HAS_ERRORS; michael@0: } michael@0: michael@0: ++mb; michael@0: } else /* cmp>0 */ { michael@0: ++me; michael@0: } michael@0: } michael@0: } michael@0: michael@0: static uint8_t michael@0: checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, michael@0: UBool moveToExt, UBool intersectBase) { michael@0: UCMapping *mb, *me; michael@0: int32_t *baseMap, *extMap; michael@0: int32_t b, e, bLimit, eLimit, cmp; michael@0: uint8_t result; michael@0: UBool isSISO; michael@0: michael@0: baseMap=base->reverseMap; michael@0: extMap=ext->reverseMap; michael@0: michael@0: b=e=0; michael@0: bLimit=base->mappingsLength; michael@0: eLimit=ext->mappingsLength; michael@0: michael@0: result=0; michael@0: michael@0: isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); michael@0: michael@0: for(;;) { michael@0: /* skip irrelevant mappings on both sides */ michael@0: for(;; ++b) { michael@0: if(b==bLimit) { michael@0: return result; michael@0: } michael@0: mb=base->mappings+baseMap[b]; michael@0: michael@0: if(intersectBase==2 && mb->bLen==1) { michael@0: /* michael@0: * comparing a base against a DBCS extension: michael@0: * leave SBCS base mappings alone michael@0: */ michael@0: continue; michael@0: } michael@0: michael@0: if(mb->f==0 || mb->f==3) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: for(;;) { michael@0: if(e==eLimit) { michael@0: return result; michael@0: } michael@0: me=ext->mappings+extMap[e]; michael@0: michael@0: if(me->f==0 || me->f==3) { michael@0: break; michael@0: } michael@0: michael@0: ++e; michael@0: } michael@0: michael@0: /* compare the base and extension mappings */ michael@0: cmp=compareBytes(base, mb, ext, me, TRUE); michael@0: if(cmp<0) { michael@0: if(intersectBase) { michael@0: /* mapping in base but not in ext, move it */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: michael@0: /* michael@0: * does mb map from an input sequence that is a prefix of me's? michael@0: * for SI/SO tables, a single byte is never a prefix because it michael@0: * occurs in a separate single-byte state michael@0: */ michael@0: } else if( mb->bLenbLen && michael@0: (!isSISO || mb->bLen>1) && michael@0: 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) michael@0: ) { michael@0: if(moveToExt) { michael@0: /* mark this mapping to be moved to the extension table */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: } else { michael@0: fprintf(stderr, michael@0: "ucm error: the base table contains a mapping whose input sequence\n" michael@0: " is a prefix of the input sequence of an extension mapping\n"); michael@0: ucm_printMapping(base, mb, stderr); michael@0: ucm_printMapping(ext, me, stderr); michael@0: result|=HAS_ERRORS; michael@0: } michael@0: } michael@0: michael@0: ++b; michael@0: } else if(cmp==0) { michael@0: /* michael@0: * same output: remove the extension mapping, michael@0: * otherwise treat as an error michael@0: */ michael@0: if( mb->f==me->f && mb->uLen==me->uLen && michael@0: 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) michael@0: ) { michael@0: me->moveFlag|=UCM_REMOVE_MAPPING; michael@0: result|=NEEDS_MOVE; michael@0: } else if(intersectBase) { michael@0: /* mapping in base but not in ext, move it */ michael@0: mb->moveFlag|=UCM_MOVE_TO_EXT; michael@0: result|=NEEDS_MOVE; michael@0: } else { michael@0: fprintf(stderr, michael@0: "ucm error: the base table contains a mapping whose input sequence\n" michael@0: " is the same as the input sequence of an extension mapping\n" michael@0: " but it maps differently\n"); michael@0: ucm_printMapping(base, mb, stderr); michael@0: ucm_printMapping(ext, me, stderr); michael@0: result|=HAS_ERRORS; michael@0: } michael@0: michael@0: ++b; michael@0: } else /* cmp>0 */ { michael@0: ++e; michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { michael@0: UCMapping *m, *mLimit; michael@0: int32_t count; michael@0: UBool isOK; michael@0: michael@0: m=table->mappings; michael@0: mLimit=m+table->mappingsLength; michael@0: isOK=TRUE; michael@0: michael@0: while(mbLen); michael@0: if(count<1) { michael@0: ucm_printMapping(table, m, stderr); michael@0: isOK=FALSE; michael@0: } michael@0: ++m; michael@0: } michael@0: michael@0: return isOK; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_checkBaseExt(UCMStates *baseStates, michael@0: UCMTable *base, UCMTable *ext, UCMTable *moveTarget, michael@0: UBool intersectBase) { michael@0: uint8_t result; michael@0: michael@0: /* if we have an extension table, we must always use precision flags */ michael@0: if(base->flagsType&UCM_FLAGS_IMPLICIT) { michael@0: fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); michael@0: return FALSE; michael@0: } michael@0: if(ext->flagsType&UCM_FLAGS_IMPLICIT) { michael@0: fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); michael@0: return FALSE; michael@0: } michael@0: michael@0: /* checking requires both tables to be sorted */ michael@0: ucm_sortTable(base); michael@0: ucm_sortTable(ext); michael@0: michael@0: /* check */ michael@0: result= michael@0: checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| michael@0: checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); michael@0: michael@0: if(result&HAS_ERRORS) { michael@0: return FALSE; michael@0: } michael@0: michael@0: if(result&NEEDS_MOVE) { michael@0: ucm_moveMappings(ext, NULL); michael@0: ucm_moveMappings(base, moveTarget); michael@0: ucm_sortTable(base); michael@0: ucm_sortTable(ext); michael@0: if(moveTarget!=NULL) { michael@0: ucm_sortTable(moveTarget); michael@0: } michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: /* merge tables for rptp2ucm ------------------------------------------------ */ michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, michael@0: const uint8_t *subchar, int32_t subcharLength, michael@0: uint8_t subchar1) { michael@0: UCMapping *fromUMapping, *toUMapping; michael@0: int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; michael@0: michael@0: ucm_sortTable(fromUTable); michael@0: ucm_sortTable(toUTable); michael@0: michael@0: fromUMapping=fromUTable->mappings; michael@0: toUMapping=toUTable->mappings; michael@0: michael@0: fromUTop=fromUTable->mappingsLength; michael@0: toUTop=toUTable->mappingsLength; michael@0: michael@0: fromUIndex=toUIndex=0; michael@0: michael@0: while(fromUIndexcodepage michael@0: */ michael@0: if( (fromUMapping->bLen==subcharLength && michael@0: 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || michael@0: (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) michael@0: ) { michael@0: fromUMapping->f=2; /* SUB mapping */ michael@0: } else { michael@0: fromUMapping->f=1; /* normal fallback */ michael@0: } michael@0: michael@0: ++fromUMapping; michael@0: ++fromUIndex; michael@0: } else { michael@0: /* michael@0: * the toU mapping does not have a fromU counterpart: michael@0: * (reverse) fallback codepage->Unicode, copy it to the fromU table michael@0: */ michael@0: michael@0: /* ignore reverse fallbacks to Unicode SUB */ michael@0: if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { michael@0: toUMapping->f=3; /* reverse fallback */ michael@0: ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); michael@0: michael@0: /* the table may have been reallocated */ michael@0: fromUMapping=fromUTable->mappings+fromUIndex; michael@0: } michael@0: michael@0: ++toUMapping; michael@0: ++toUIndex; michael@0: } michael@0: } michael@0: michael@0: /* either one or both tables are exhausted */ michael@0: while(fromUIndexbLen==subcharLength && michael@0: 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || michael@0: (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) michael@0: ) { michael@0: fromUMapping->f=2; /* SUB mapping */ michael@0: } else { michael@0: fromUMapping->f=1; /* normal fallback */ michael@0: } michael@0: michael@0: ++fromUMapping; michael@0: ++fromUIndex; michael@0: } michael@0: michael@0: while(toUIndexuLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { michael@0: toUMapping->f=3; /* reverse fallback */ michael@0: ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); michael@0: } michael@0: michael@0: ++toUMapping; michael@0: ++toUIndex; michael@0: } michael@0: michael@0: fromUTable->isSorted=FALSE; michael@0: } michael@0: michael@0: /* separate extension mappings out of base table for rptp2ucm --------------- */ michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_separateMappings(UCMFile *ucm, UBool isSISO) { michael@0: UCMTable *table; michael@0: UCMapping *m, *mLimit; michael@0: int32_t type; michael@0: UBool needsMove, isOK; michael@0: michael@0: table=ucm->base; michael@0: m=table->mappings; michael@0: mLimit=m+table->mappingsLength; michael@0: michael@0: needsMove=FALSE; michael@0: isOK=TRUE; michael@0: michael@0: for(; mbLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { michael@0: fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); michael@0: ucm_printMapping(table, m, stderr); michael@0: m->moveFlag|=UCM_REMOVE_MAPPING; michael@0: needsMove=TRUE; michael@0: continue; michael@0: } michael@0: michael@0: type=ucm_mappingType( michael@0: &ucm->states, m, michael@0: UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); michael@0: if(type<0) { michael@0: /* illegal byte sequence */ michael@0: printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); michael@0: isOK=FALSE; michael@0: } else if(type>0) { michael@0: m->moveFlag|=UCM_MOVE_TO_EXT; michael@0: needsMove=TRUE; michael@0: } michael@0: } michael@0: michael@0: if(!isOK) { michael@0: return FALSE; michael@0: } michael@0: if(needsMove) { michael@0: ucm_moveMappings(ucm->base, ucm->ext); michael@0: return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); michael@0: } else { michael@0: ucm_sortTable(ucm->base); michael@0: return TRUE; michael@0: } michael@0: } michael@0: michael@0: /* ucm parser --------------------------------------------------------------- */ michael@0: michael@0: U_CAPI int8_t U_EXPORT2 michael@0: ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { michael@0: const char *s=*ps; michael@0: char *end; michael@0: uint8_t byte; michael@0: int8_t bLen; michael@0: michael@0: bLen=0; michael@0: for(;;) { michael@0: /* skip an optional plus sign */ michael@0: if(bLen>0 && *s=='+') { michael@0: ++s; michael@0: } michael@0: if(*s!='\\') { michael@0: break; michael@0: } michael@0: michael@0: if( s[1]!='x' || michael@0: (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 michael@0: ) { michael@0: fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); michael@0: return -1; michael@0: } michael@0: michael@0: if(bLen==UCNV_EXT_MAX_BYTES) { michael@0: fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); michael@0: return -1; michael@0: } michael@0: bytes[bLen++]=byte; michael@0: s=end; michael@0: } michael@0: michael@0: *ps=s; michael@0: return bLen; michael@0: } michael@0: michael@0: /* parse a mapping line; must not be empty */ michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_parseMappingLine(UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES], michael@0: const char *line) { michael@0: const char *s; michael@0: char *end; michael@0: UChar32 cp; michael@0: int32_t u16Length; michael@0: int8_t uLen, bLen, f; michael@0: michael@0: s=line; michael@0: uLen=bLen=0; michael@0: michael@0: /* parse code points */ michael@0: for(;;) { michael@0: /* skip an optional plus sign */ michael@0: if(uLen>0 && *s=='+') { michael@0: ++s; michael@0: } michael@0: if(*s!='<') { michael@0: break; michael@0: } michael@0: michael@0: if( s[1]!='U' || michael@0: (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || michael@0: *end!='>' michael@0: ) { michael@0: fprintf(stderr, "ucm error: Unicode code point must be formatted as (1..6 hex digits) - \"%s\"\n", line); michael@0: return FALSE; michael@0: } michael@0: if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { michael@0: fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); michael@0: return FALSE; michael@0: } michael@0: michael@0: if(uLen==UCNV_EXT_MAX_UCHARS) { michael@0: fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); michael@0: return FALSE; michael@0: } michael@0: codePoints[uLen++]=cp; michael@0: s=end+1; michael@0: } michael@0: michael@0: if(uLen==0) { michael@0: fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); michael@0: return FALSE; michael@0: } else if(uLen==1) { michael@0: m->u=codePoints[0]; michael@0: } else { michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); michael@0: if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || michael@0: u16Length>UCNV_EXT_MAX_UCHARS michael@0: ) { michael@0: fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: s=u_skipWhitespace(s); michael@0: michael@0: /* parse bytes */ michael@0: bLen=ucm_parseBytes(bytes, line, &s); michael@0: michael@0: if(bLen<0) { michael@0: return FALSE; michael@0: } else if(bLen==0) { michael@0: fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); michael@0: return FALSE; michael@0: } else if(bLen<=4) { michael@0: uprv_memcpy(m->b.bytes, bytes, bLen); michael@0: } michael@0: michael@0: /* skip everything until the fallback indicator, even the start of a comment */ michael@0: for(;;) { michael@0: if(*s==0) { michael@0: f=-1; /* no fallback indicator */ michael@0: break; michael@0: } else if(*s=='|') { michael@0: f=(int8_t)(s[1]-'0'); michael@0: if((uint8_t)f>4) { michael@0: fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); michael@0: return FALSE; michael@0: } michael@0: break; michael@0: } michael@0: ++s; michael@0: } michael@0: michael@0: m->uLen=uLen; michael@0: m->bLen=bLen; michael@0: m->f=f; michael@0: return TRUE; michael@0: } michael@0: michael@0: /* general APIs ------------------------------------------------------------- */ michael@0: michael@0: U_CAPI UCMTable * U_EXPORT2 michael@0: ucm_openTable() { michael@0: UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); michael@0: if(table==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: michael@0: memset(table, 0, sizeof(UCMTable)); michael@0: return table; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_closeTable(UCMTable *table) { michael@0: if(table!=NULL) { michael@0: uprv_free(table->mappings); michael@0: uprv_free(table->codePoints); michael@0: uprv_free(table->bytes); michael@0: uprv_free(table->reverseMap); michael@0: uprv_free(table); michael@0: } michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_resetTable(UCMTable *table) { michael@0: if(table!=NULL) { michael@0: table->mappingsLength=0; michael@0: table->flagsType=0; michael@0: table->unicodeMask=0; michael@0: table->bytesLength=table->codePointsLength=0; michael@0: table->isSorted=FALSE; michael@0: } michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_addMapping(UCMTable *table, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]) { michael@0: UCMapping *tm; michael@0: UChar32 c; michael@0: int32_t idx; michael@0: michael@0: if(table->mappingsLength>=table->mappingsCapacity) { michael@0: /* make the mappings array larger */ michael@0: if(table->mappingsCapacity==0) { michael@0: table->mappingsCapacity=1000; michael@0: } else { michael@0: table->mappingsCapacity*=10; michael@0: } michael@0: table->mappings=(UCMapping *)uprv_realloc(table->mappings, michael@0: table->mappingsCapacity*sizeof(UCMapping)); michael@0: if(table->mappings==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", michael@0: (int)table->mappingsCapacity); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: michael@0: if(table->reverseMap!=NULL) { michael@0: /* the reverseMap must be reallocated in a new sort */ michael@0: uprv_free(table->reverseMap); michael@0: table->reverseMap=NULL; michael@0: } michael@0: } michael@0: michael@0: if(m->uLen>1 && table->codePointsCapacity==0) { michael@0: table->codePointsCapacity=10000; michael@0: table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); michael@0: if(table->codePoints==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", michael@0: (int)table->codePointsCapacity); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: } michael@0: michael@0: if(m->bLen>4 && table->bytesCapacity==0) { michael@0: table->bytesCapacity=10000; michael@0: table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); michael@0: if(table->bytes==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate %d bytes\n", michael@0: (int)table->bytesCapacity); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: } michael@0: michael@0: if(m->uLen>1) { michael@0: idx=table->codePointsLength; michael@0: table->codePointsLength+=m->uLen; michael@0: if(table->codePointsLength>table->codePointsCapacity) { michael@0: fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: michael@0: uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4); michael@0: m->u=idx; michael@0: } michael@0: michael@0: if(m->bLen>4) { michael@0: idx=table->bytesLength; michael@0: table->bytesLength+=m->bLen; michael@0: if(table->bytesLength>table->bytesCapacity) { michael@0: fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: michael@0: uprv_memcpy(table->bytes+idx, bytes, m->bLen); michael@0: m->b.idx=idx; michael@0: } michael@0: michael@0: /* set unicodeMask */ michael@0: for(idx=0; idxuLen; ++idx) { michael@0: c=codePoints[idx]; michael@0: if(c>=0x10000) { michael@0: table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ michael@0: } else if(U_IS_SURROGATE(c)) { michael@0: table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ michael@0: } michael@0: } michael@0: michael@0: /* set flagsType */ michael@0: if(m->f<0) { michael@0: table->flagsType|=UCM_FLAGS_IMPLICIT; michael@0: } else { michael@0: table->flagsType|=UCM_FLAGS_EXPLICIT; michael@0: } michael@0: michael@0: tm=table->mappings+table->mappingsLength++; michael@0: uprv_memcpy(tm, m, sizeof(UCMapping)); michael@0: michael@0: table->isSorted=FALSE; michael@0: } michael@0: michael@0: U_CAPI UCMFile * U_EXPORT2 michael@0: ucm_open() { michael@0: UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); michael@0: if(ucm==NULL) { michael@0: fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); michael@0: exit(U_MEMORY_ALLOCATION_ERROR); michael@0: } michael@0: michael@0: memset(ucm, 0, sizeof(UCMFile)); michael@0: michael@0: ucm->base=ucm_openTable(); michael@0: ucm->ext=ucm_openTable(); michael@0: michael@0: ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; michael@0: ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; michael@0: ucm->states.outputType=-1; michael@0: ucm->states.minCharLength=ucm->states.maxCharLength=1; michael@0: michael@0: return ucm; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_close(UCMFile *ucm) { michael@0: if(ucm!=NULL) { michael@0: ucm_closeTable(ucm->base); michael@0: ucm_closeTable(ucm->ext); michael@0: uprv_free(ucm); michael@0: } michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_mappingType(UCMStates *baseStates, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]) { michael@0: /* check validity of the bytes and count the characters in them */ michael@0: int32_t count=ucm_countChars(baseStates, bytes, m->bLen); michael@0: if(count<1) { michael@0: /* illegal byte sequence */ michael@0: return -1; michael@0: } michael@0: michael@0: /* michael@0: * Suitable for an ICU conversion base table means: michael@0: * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) michael@0: * - precision flag 0..3 michael@0: * - SBCS: any 1:1 mapping michael@0: * (the table stores additional bits to distinguish mapping types) michael@0: * - MBCS: not a |2 SUB mapping for michael@0: * - MBCS: not a |1 fallback to 0x00 michael@0: * - MBCS: not a multi-byte mapping with leading 0x00 bytes michael@0: * michael@0: * Further restrictions for fromUnicode tables michael@0: * are enforced in makeconv (MBCSOkForBaseFromUnicode()). michael@0: * michael@0: * All of the MBCS fromUnicode specific tests could be removed from here, michael@0: * but the ones above are for unusual mappings, and removing the tests michael@0: * from here would change canonucm output which seems gratuitous. michael@0: * (Markus Scherer 2006-nov-28) michael@0: * michael@0: * Exception: All implicit mappings (f<0) that need to be moved michael@0: * because of fromUnicode restrictions _must_ be moved here because michael@0: * makeconv uses a hack for moving mappings only for the fromUnicode table michael@0: * that only works with non-negative values of f. michael@0: */ michael@0: if( m->uLen==1 && count==1 && m->f<=3 && michael@0: (baseStates->maxCharLength==1 || michael@0: !((m->f==2 && m->bLen==1) || michael@0: (m->f==1 && bytes[0]==0) || michael@0: (m->f<=1 && m->bLen>1 && bytes[0]==0))) michael@0: ) { michael@0: return 0; /* suitable for a base table */ michael@0: } else { michael@0: return 1; /* needs to go into an extension table */ michael@0: } michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, michael@0: UCMapping *m, michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS], michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]) { michael@0: int32_t type; michael@0: michael@0: if(m->f==2 && m->uLen>1) { michael@0: fprintf(stderr, "ucm error: illegal |2 mapping from multiple code points\n"); michael@0: printMapping(m, codePoints, bytes, stderr); michael@0: return FALSE; michael@0: } michael@0: michael@0: if(baseStates!=NULL) { michael@0: /* check validity of the bytes and count the characters in them */ michael@0: type=ucm_mappingType(baseStates, m, codePoints, bytes); michael@0: if(type<0) { michael@0: /* illegal byte sequence */ michael@0: printMapping(m, codePoints, bytes, stderr); michael@0: return FALSE; michael@0: } michael@0: } else { michael@0: /* not used - adding a mapping for an extension-only table before its base table is read */ michael@0: type=1; michael@0: } michael@0: michael@0: /* michael@0: * Add the mapping to the base table if this is requested and suitable. michael@0: * Otherwise, add it to the extension table. michael@0: */ michael@0: if(forBase && type==0) { michael@0: ucm_addMapping(ucm->base, m, codePoints, bytes); michael@0: } else { michael@0: ucm_addMapping(ucm->ext, m, codePoints, bytes); michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { michael@0: UCMapping m={ 0 }; michael@0: UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; michael@0: uint8_t bytes[UCNV_EXT_MAX_BYTES]; michael@0: michael@0: const char *s; michael@0: michael@0: /* ignore empty and comment lines */ michael@0: if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { michael@0: return TRUE; michael@0: } michael@0: michael@0: return michael@0: ucm_parseMappingLine(&m, codePoints, bytes, line) && michael@0: ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_readTable(UCMFile *ucm, FileStream* convFile, michael@0: UBool forBase, UCMStates *baseStates, michael@0: UErrorCode *pErrorCode) { michael@0: char line[500]; michael@0: char *end; michael@0: UBool isOK; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: isOK=TRUE; michael@0: michael@0: for(;;) { michael@0: /* read the next line */ michael@0: if(!T_FileStream_readLine(convFile, line, sizeof(line))) { michael@0: fprintf(stderr, "incomplete charmap section\n"); michael@0: isOK=FALSE; michael@0: break; michael@0: } michael@0: michael@0: /* remove CR LF */ michael@0: end=uprv_strchr(line, 0); michael@0: while(line