michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2000-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: ucnvscsu.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2000nov18 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This is an implementation of the Standard Compression Scheme for Unicode michael@0: * as defined in http://www.unicode.org/unicode/reports/tr6/ . michael@0: * Reserved commands and window settings are treated as illegal sequences and michael@0: * will result in callback calls. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "unicode/ucnv.h" michael@0: #include "unicode/ucnv_cb.h" michael@0: #include "unicode/utf16.h" michael@0: #include "ucnv_bld.h" michael@0: #include "ucnv_cnv.h" michael@0: #include "cmemory.h" michael@0: michael@0: /* SCSU definitions --------------------------------------------------------- */ michael@0: michael@0: /* SCSU command byte values */ michael@0: enum { michael@0: SQ0=0x01, /* Quote from window pair 0 */ michael@0: SQ7=0x08, /* Quote from window pair 7 */ michael@0: SDX=0x0B, /* Define a window as extended */ michael@0: Srs=0x0C, /* reserved */ michael@0: SQU=0x0E, /* Quote a single Unicode character */ michael@0: SCU=0x0F, /* Change to Unicode mode */ michael@0: SC0=0x10, /* Select window 0 */ michael@0: SC7=0x17, /* Select window 7 */ michael@0: SD0=0x18, /* Define and select window 0 */ michael@0: SD7=0x1F, /* Define and select window 7 */ michael@0: michael@0: UC0=0xE0, /* Select window 0 */ michael@0: UC7=0xE7, /* Select window 7 */ michael@0: UD0=0xE8, /* Define and select window 0 */ michael@0: UD7=0xEF, /* Define and select window 7 */ michael@0: UQU=0xF0, /* Quote a single Unicode character */ michael@0: UDX=0xF1, /* Define a Window as extended */ michael@0: Urs=0xF2 /* reserved */ michael@0: }; michael@0: michael@0: enum { michael@0: /* michael@0: * Unicode code points from 3400 to E000 are not adressible by michael@0: * dynamic window, since in these areas no short run alphabets are michael@0: * found. Therefore add gapOffset to all values from gapThreshold. michael@0: */ michael@0: gapThreshold=0x68, michael@0: gapOffset=0xAC00, michael@0: michael@0: /* values between reservedStart and fixedThreshold are reserved */ michael@0: reservedStart=0xA8, michael@0: michael@0: /* use table of predefined fixed offsets for values from fixedThreshold */ michael@0: fixedThreshold=0xF9 michael@0: }; michael@0: michael@0: /* constant offsets for the 8 static windows */ michael@0: static const uint32_t staticOffsets[8]={ michael@0: 0x0000, /* ASCII for quoted tags */ michael@0: 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ michael@0: 0x0100, /* Latin Extended-A */ michael@0: 0x0300, /* Combining Diacritical Marks */ michael@0: 0x2000, /* General Punctuation */ michael@0: 0x2080, /* Currency Symbols */ michael@0: 0x2100, /* Letterlike Symbols and Number Forms */ michael@0: 0x3000 /* CJK Symbols and punctuation */ michael@0: }; michael@0: michael@0: /* initial offsets for the 8 dynamic (sliding) windows */ michael@0: static const uint32_t initialDynamicOffsets[8]={ michael@0: 0x0080, /* Latin-1 */ michael@0: 0x00C0, /* Latin Extended A */ michael@0: 0x0400, /* Cyrillic */ michael@0: 0x0600, /* Arabic */ michael@0: 0x0900, /* Devanagari */ michael@0: 0x3040, /* Hiragana */ michael@0: 0x30A0, /* Katakana */ michael@0: 0xFF00 /* Fullwidth ASCII */ michael@0: }; michael@0: michael@0: /* Table of fixed predefined Offsets */ michael@0: static const uint32_t fixedOffsets[]={ michael@0: /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ michael@0: /* 0xFA */ 0x0250, /* IPA extensions */ michael@0: /* 0xFB */ 0x0370, /* Greek */ michael@0: /* 0xFC */ 0x0530, /* Armenian */ michael@0: /* 0xFD */ 0x3040, /* Hiragana */ michael@0: /* 0xFE */ 0x30A0, /* Katakana */ michael@0: /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ michael@0: }; michael@0: michael@0: /* state values */ michael@0: enum { michael@0: readCommand, michael@0: quotePairOne, michael@0: quotePairTwo, michael@0: quoteOne, michael@0: definePairOne, michael@0: definePairTwo, michael@0: defineOne michael@0: }; michael@0: michael@0: typedef struct SCSUData { michael@0: /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ michael@0: uint32_t toUDynamicOffsets[8]; michael@0: uint32_t fromUDynamicOffsets[8]; michael@0: michael@0: /* state machine state - toUnicode */ michael@0: UBool toUIsSingleByteMode; michael@0: uint8_t toUState; michael@0: int8_t toUQuoteWindow, toUDynamicWindow; michael@0: uint8_t toUByteOne; michael@0: uint8_t toUPadding[3]; michael@0: michael@0: /* state machine state - fromUnicode */ michael@0: UBool fromUIsSingleByteMode; michael@0: int8_t fromUDynamicWindow; michael@0: michael@0: /* michael@0: * windowUse[] keeps track of the use of the dynamic windows: michael@0: * At nextWindowUseIndex there is the least recently used window, michael@0: * and the following windows (in a wrapping manner) are more and more michael@0: * recently used. michael@0: * At nextWindowUseIndex-1 there is the most recently used window. michael@0: */ michael@0: uint8_t locale; michael@0: int8_t nextWindowUseIndex; michael@0: int8_t windowUse[8]; michael@0: } SCSUData; michael@0: michael@0: static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; michael@0: static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; michael@0: michael@0: enum { michael@0: lGeneric, l_ja michael@0: }; michael@0: michael@0: /* SCSU setup functions ----------------------------------------------------- */ michael@0: michael@0: static void michael@0: _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { michael@0: SCSUData *scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: /* reset toUnicode */ michael@0: uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); michael@0: michael@0: scsu->toUIsSingleByteMode=TRUE; michael@0: scsu->toUState=readCommand; michael@0: scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; michael@0: scsu->toUByteOne=0; michael@0: michael@0: cnv->toULength=0; michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE) { michael@0: /* reset fromUnicode */ michael@0: uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); michael@0: michael@0: scsu->fromUIsSingleByteMode=TRUE; michael@0: scsu->fromUDynamicWindow=0; michael@0: michael@0: scsu->nextWindowUseIndex=0; michael@0: switch(scsu->locale) { michael@0: case l_ja: michael@0: uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); michael@0: break; michael@0: default: michael@0: uprv_memcpy(scsu->windowUse, initialWindowUse, 8); michael@0: break; michael@0: } michael@0: michael@0: cnv->fromUChar32=0; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: _SCSUOpen(UConverter *cnv, michael@0: UConverterLoadArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: const char *locale=pArgs->locale; michael@0: if(pArgs->onlyTestIsLoadable) { michael@0: return; michael@0: } michael@0: cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); michael@0: if(cnv->extraInfo!=NULL) { michael@0: if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { michael@0: ((SCSUData *)cnv->extraInfo)->locale=l_ja; michael@0: } else { michael@0: ((SCSUData *)cnv->extraInfo)->locale=lGeneric; michael@0: } michael@0: _SCSUReset(cnv, UCNV_RESET_BOTH); michael@0: } else { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: michael@0: /* Set the substitution character U+fffd as a Unicode string. */ michael@0: cnv->subUChars[0]=0xfffd; michael@0: cnv->subCharLen=-1; michael@0: } michael@0: michael@0: static void michael@0: _SCSUClose(UConverter *cnv) { michael@0: if(cnv->extraInfo!=NULL) { michael@0: if(!cnv->isExtraLocal) { michael@0: uprv_free(cnv->extraInfo); michael@0: } michael@0: cnv->extraInfo=NULL; michael@0: } michael@0: } michael@0: michael@0: /* SCSU-to-Unicode conversion functions ------------------------------------- */ michael@0: michael@0: static void michael@0: _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: SCSUData *scsu; michael@0: const uint8_t *source, *sourceLimit; michael@0: UChar *target; michael@0: const UChar *targetLimit; michael@0: int32_t *offsets; michael@0: UBool isSingleByteMode; michael@0: uint8_t state, byteOne; michael@0: int8_t quoteWindow, dynamicWindow; michael@0: michael@0: int32_t sourceIndex, nextSourceIndex; michael@0: michael@0: uint8_t b; michael@0: michael@0: /* set up the local pointers */ michael@0: cnv=pArgs->converter; michael@0: scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: source=(const uint8_t *)pArgs->source; michael@0: sourceLimit=(const uint8_t *)pArgs->sourceLimit; michael@0: target=pArgs->target; michael@0: targetLimit=pArgs->targetLimit; michael@0: offsets=pArgs->offsets; michael@0: michael@0: /* get the state machine state */ michael@0: isSingleByteMode=scsu->toUIsSingleByteMode; michael@0: state=scsu->toUState; michael@0: quoteWindow=scsu->toUQuoteWindow; michael@0: dynamicWindow=scsu->toUDynamicWindow; michael@0: byteOne=scsu->toUByteOne; michael@0: michael@0: /* sourceIndex=-1 if the current character began in the previous buffer */ michael@0: sourceIndex=state==readCommand ? 0 : -1; michael@0: nextSourceIndex=0; michael@0: michael@0: /* michael@0: * conversion "loop" michael@0: * michael@0: * For performance, this is not a normal C loop. michael@0: * Instead, there are two code blocks for the two SCSU modes. michael@0: * The function branches to either one, and a change of the mode is done with a goto to michael@0: * the other branch. michael@0: * michael@0: * Each branch has two conventional loops: michael@0: * - a fast-path loop for the most common codes in the mode michael@0: * - a loop for all other codes in the mode michael@0: * When the fast-path runs into a code that it cannot handle, its loop ends and it michael@0: * runs into the following loop to handle the other codes. michael@0: * The end of the input or output buffer is also handled by the slower loop. michael@0: * The slow loop jumps (goto) to the fast-path loop again as soon as possible. michael@0: * michael@0: * The callback handling is done by returning with an error code. michael@0: * The conversion framework actually calls the callback function. michael@0: */ michael@0: if(isSingleByteMode) { michael@0: /* fast path for single-byte mode */ michael@0: if(state==readCommand) { michael@0: fastSingle: michael@0: while(source=0x20) { michael@0: ++source; michael@0: ++nextSourceIndex; michael@0: if(b<=0x7f) { michael@0: /* write US-ASCII graphic character or DEL */ michael@0: *target++=(UChar)b; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: } else { michael@0: /* write from dynamic window */ michael@0: uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); michael@0: if(c<=0xffff) { michael@0: *target++=(UChar)c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: } else { michael@0: /* output surrogate pair */ michael@0: *target++=(UChar)(0xd7c0+(c>>10)); michael@0: if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: goto endloop; michael@0: } michael@0: } michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: } michael@0: } michael@0: michael@0: /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ michael@0: singleByteMode: michael@0: while(source=targetLimit) { michael@0: /* target is full */ michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: b=*source++; michael@0: ++nextSourceIndex; michael@0: switch(state) { michael@0: case readCommand: michael@0: /* redundant conditions are commented out */ michael@0: /* here: b<0x20 because otherwise we would be in fastSingle */ michael@0: if((1UL<toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: goto endloop; michael@0: } michael@0: michael@0: /* store the first byte of a multibyte sequence in toUBytes[] */ michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: break; michael@0: case quotePairOne: michael@0: byteOne=b; michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=quotePairTwo; michael@0: break; michael@0: case quotePairTwo: michael@0: *target++=(UChar)((byteOne<<8)|b); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case quoteOne: michael@0: if(b<0x80) { michael@0: /* all static offsets are in the BMP */ michael@0: *target++=(UChar)(staticOffsets[quoteWindow]+b); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: } else { michael@0: /* write from dynamic window */ michael@0: uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); michael@0: if(c<=0xffff) { michael@0: *target++=(UChar)c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: } else { michael@0: /* output surrogate pair */ michael@0: *target++=(UChar)(0xd7c0+(c>>10)); michael@0: if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: goto endloop; michael@0: } michael@0: } michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case definePairOne: michael@0: dynamicWindow=(int8_t)((b>>5)&7); michael@0: byteOne=(uint8_t)(b&0x1f); michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=definePairTwo; michael@0: break; michael@0: case definePairTwo: michael@0: scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); michael@0: sourceIndex=nextSourceIndex; michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case defineOne: michael@0: if(b==0) { michael@0: /* callback(illegal): Reserved window offset value 0 */ michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: goto endloop; michael@0: } else if(btoUDynamicOffsets[dynamicWindow]=b<<7UL; michael@0: } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { michael@0: scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; michael@0: } else if(b>=fixedThreshold) { michael@0: scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; michael@0: } else { michael@0: /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: goto endloop; michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: } michael@0: } michael@0: } else { michael@0: /* fast path for Unicode mode */ michael@0: if(state==readCommand) { michael@0: fastUnicode: michael@0: while(source+1(Urs-UC0)) { michael@0: *target++=(UChar)((b<<8)|source[1]); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: nextSourceIndex+=2; michael@0: source+=2; michael@0: } michael@0: } michael@0: michael@0: /* normal state machine for Unicode mode */ michael@0: /* unicodeByteMode: */ michael@0: while(source=targetLimit) { michael@0: /* target is full */ michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: b=*source++; michael@0: ++nextSourceIndex; michael@0: switch(state) { michael@0: case readCommand: michael@0: if((uint8_t)(b-UC0)>(Urs-UC0)) { michael@0: byteOne=b; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=quotePairTwo; michael@0: } else if(/* UC0<=b && */ b<=UC7) { michael@0: dynamicWindow=(int8_t)(b-UC0); michael@0: sourceIndex=nextSourceIndex; michael@0: isSingleByteMode=TRUE; michael@0: goto fastSingle; michael@0: } else if(/* UD0<=b && */ b<=UD7) { michael@0: dynamicWindow=(int8_t)(b-UD0); michael@0: isSingleByteMode=TRUE; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=defineOne; michael@0: goto singleByteMode; michael@0: } else if(b==UDX) { michael@0: isSingleByteMode=TRUE; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=definePairOne; michael@0: goto singleByteMode; michael@0: } else if(b==UQU) { michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=quotePairOne; michael@0: } else /* Urs */ { michael@0: /* callback(illegal) */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: goto endloop; michael@0: } michael@0: break; michael@0: case quotePairOne: michael@0: byteOne=b; michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=quotePairTwo; michael@0: break; michael@0: case quotePairTwo: michael@0: *target++=(UChar)((byteOne<<8)|b); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: sourceIndex=nextSourceIndex; michael@0: state=readCommand; michael@0: goto fastUnicode; michael@0: } michael@0: } michael@0: } michael@0: endloop: michael@0: michael@0: /* set the converter state back into UConverter */ michael@0: if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { michael@0: /* reset to deal with the next character */ michael@0: state=readCommand; michael@0: } else if(state==readCommand) { michael@0: /* not in a multi-byte sequence, reset toULength */ michael@0: cnv->toULength=0; michael@0: } michael@0: scsu->toUIsSingleByteMode=isSingleByteMode; michael@0: scsu->toUState=state; michael@0: scsu->toUQuoteWindow=quoteWindow; michael@0: scsu->toUDynamicWindow=dynamicWindow; michael@0: scsu->toUByteOne=byteOne; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: return; michael@0: } michael@0: michael@0: /* michael@0: * Identical to _SCSUToUnicodeWithOffsets but without offset handling. michael@0: * If a change is made in the original function, then either michael@0: * change this function the same way or michael@0: * re-copy the original function and remove the variables michael@0: * offsets, sourceIndex, and nextSourceIndex. michael@0: */ michael@0: static void michael@0: _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: SCSUData *scsu; michael@0: const uint8_t *source, *sourceLimit; michael@0: UChar *target; michael@0: const UChar *targetLimit; michael@0: UBool isSingleByteMode; michael@0: uint8_t state, byteOne; michael@0: int8_t quoteWindow, dynamicWindow; michael@0: michael@0: uint8_t b; michael@0: michael@0: /* set up the local pointers */ michael@0: cnv=pArgs->converter; michael@0: scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: source=(const uint8_t *)pArgs->source; michael@0: sourceLimit=(const uint8_t *)pArgs->sourceLimit; michael@0: target=pArgs->target; michael@0: targetLimit=pArgs->targetLimit; michael@0: michael@0: /* get the state machine state */ michael@0: isSingleByteMode=scsu->toUIsSingleByteMode; michael@0: state=scsu->toUState; michael@0: quoteWindow=scsu->toUQuoteWindow; michael@0: dynamicWindow=scsu->toUDynamicWindow; michael@0: byteOne=scsu->toUByteOne; michael@0: michael@0: /* michael@0: * conversion "loop" michael@0: * michael@0: * For performance, this is not a normal C loop. michael@0: * Instead, there are two code blocks for the two SCSU modes. michael@0: * The function branches to either one, and a change of the mode is done with a goto to michael@0: * the other branch. michael@0: * michael@0: * Each branch has two conventional loops: michael@0: * - a fast-path loop for the most common codes in the mode michael@0: * - a loop for all other codes in the mode michael@0: * When the fast-path runs into a code that it cannot handle, its loop ends and it michael@0: * runs into the following loop to handle the other codes. michael@0: * The end of the input or output buffer is also handled by the slower loop. michael@0: * The slow loop jumps (goto) to the fast-path loop again as soon as possible. michael@0: * michael@0: * The callback handling is done by returning with an error code. michael@0: * The conversion framework actually calls the callback function. michael@0: */ michael@0: if(isSingleByteMode) { michael@0: /* fast path for single-byte mode */ michael@0: if(state==readCommand) { michael@0: fastSingle: michael@0: while(source=0x20) { michael@0: ++source; michael@0: if(b<=0x7f) { michael@0: /* write US-ASCII graphic character or DEL */ michael@0: *target++=(UChar)b; michael@0: } else { michael@0: /* write from dynamic window */ michael@0: uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); michael@0: if(c<=0xffff) { michael@0: *target++=(UChar)c; michael@0: } else { michael@0: /* output surrogate pair */ michael@0: *target++=(UChar)(0xd7c0+(c>>10)); michael@0: if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: goto endloop; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ michael@0: singleByteMode: michael@0: while(source=targetLimit) { michael@0: /* target is full */ michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: b=*source++; michael@0: switch(state) { michael@0: case readCommand: michael@0: /* redundant conditions are commented out */ michael@0: /* here: b<0x20 because otherwise we would be in fastSingle */ michael@0: if((1UL<toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: goto endloop; michael@0: } michael@0: michael@0: /* store the first byte of a multibyte sequence in toUBytes[] */ michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: break; michael@0: case quotePairOne: michael@0: byteOne=b; michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=quotePairTwo; michael@0: break; michael@0: case quotePairTwo: michael@0: *target++=(UChar)((byteOne<<8)|b); michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case quoteOne: michael@0: if(b<0x80) { michael@0: /* all static offsets are in the BMP */ michael@0: *target++=(UChar)(staticOffsets[quoteWindow]+b); michael@0: } else { michael@0: /* write from dynamic window */ michael@0: uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); michael@0: if(c<=0xffff) { michael@0: *target++=(UChar)c; michael@0: } else { michael@0: /* output surrogate pair */ michael@0: *target++=(UChar)(0xd7c0+(c>>10)); michael@0: if(targetUCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: goto endloop; michael@0: } michael@0: } michael@0: } michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case definePairOne: michael@0: dynamicWindow=(int8_t)((b>>5)&7); michael@0: byteOne=(uint8_t)(b&0x1f); michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=definePairTwo; michael@0: break; michael@0: case definePairTwo: michael@0: scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: case defineOne: michael@0: if(b==0) { michael@0: /* callback(illegal): Reserved window offset value 0 */ michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: goto endloop; michael@0: } else if(btoUDynamicOffsets[dynamicWindow]=b<<7UL; michael@0: } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { michael@0: scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; michael@0: } else if(b>=fixedThreshold) { michael@0: scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; michael@0: } else { michael@0: /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: goto endloop; michael@0: } michael@0: state=readCommand; michael@0: goto fastSingle; michael@0: } michael@0: } michael@0: } else { michael@0: /* fast path for Unicode mode */ michael@0: if(state==readCommand) { michael@0: fastUnicode: michael@0: while(source+1(Urs-UC0)) { michael@0: *target++=(UChar)((b<<8)|source[1]); michael@0: source+=2; michael@0: } michael@0: } michael@0: michael@0: /* normal state machine for Unicode mode */ michael@0: /* unicodeByteMode: */ michael@0: while(source=targetLimit) { michael@0: /* target is full */ michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: b=*source++; michael@0: switch(state) { michael@0: case readCommand: michael@0: if((uint8_t)(b-UC0)>(Urs-UC0)) { michael@0: byteOne=b; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=quotePairTwo; michael@0: } else if(/* UC0<=b && */ b<=UC7) { michael@0: dynamicWindow=(int8_t)(b-UC0); michael@0: isSingleByteMode=TRUE; michael@0: goto fastSingle; michael@0: } else if(/* UD0<=b && */ b<=UD7) { michael@0: dynamicWindow=(int8_t)(b-UD0); michael@0: isSingleByteMode=TRUE; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=defineOne; michael@0: goto singleByteMode; michael@0: } else if(b==UDX) { michael@0: isSingleByteMode=TRUE; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=definePairOne; michael@0: goto singleByteMode; michael@0: } else if(b==UQU) { michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: state=quotePairOne; michael@0: } else /* Urs */ { michael@0: /* callback(illegal) */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: cnv->toUBytes[0]=b; michael@0: cnv->toULength=1; michael@0: goto endloop; michael@0: } michael@0: break; michael@0: case quotePairOne: michael@0: byteOne=b; michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: state=quotePairTwo; michael@0: break; michael@0: case quotePairTwo: michael@0: *target++=(UChar)((byteOne<<8)|b); michael@0: state=readCommand; michael@0: goto fastUnicode; michael@0: } michael@0: } michael@0: } michael@0: endloop: michael@0: michael@0: /* set the converter state back into UConverter */ michael@0: if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { michael@0: /* reset to deal with the next character */ michael@0: state=readCommand; michael@0: } else if(state==readCommand) { michael@0: /* not in a multi-byte sequence, reset toULength */ michael@0: cnv->toULength=0; michael@0: } michael@0: scsu->toUIsSingleByteMode=isSingleByteMode; michael@0: scsu->toUState=state; michael@0: scsu->toUQuoteWindow=quoteWindow; michael@0: scsu->toUDynamicWindow=dynamicWindow; michael@0: scsu->toUByteOne=byteOne; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: return; michael@0: } michael@0: michael@0: /* SCSU-from-Unicode conversion functions ----------------------------------- */ michael@0: michael@0: /* michael@0: * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve michael@0: * reasonable results. The lookahead is minimal. michael@0: * Many cases are simple: michael@0: * A character fits directly into the current mode, a dynamic or static window, michael@0: * or is not compressible. These cases are tested first. michael@0: * Real compression heuristics are applied to the rest, in code branches for michael@0: * single/Unicode mode and BMP/supplementary code points. michael@0: * The heuristics used here are extremely simple. michael@0: */ michael@0: michael@0: /* get the number of the window that this character is in, or -1 */ michael@0: static int8_t michael@0: getWindow(const uint32_t offsets[8], uint32_t c) { michael@0: int i; michael@0: for(i=0; i<8; ++i) { michael@0: if((uint32_t)(c-offsets[i])<=0x7f) { michael@0: return (int8_t)(i); michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ michael@0: static UBool michael@0: isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { michael@0: return (UBool)(c<=offset+0x7f && michael@0: (c>=offset || (c<=0x7f && michael@0: (c>=0x20 || (1UL<windowUse[scsu->nextWindowUseIndex]; michael@0: if(++scsu->nextWindowUseIndex==8) { michael@0: scsu->nextWindowUseIndex=0; michael@0: } michael@0: return window; michael@0: } michael@0: michael@0: /* michael@0: * useDynamicWindow() adjusts michael@0: * windowUse[] and nextWindowUseIndex for the algorithm to choose michael@0: * the next dynamic window to be defined; michael@0: * a subclass may override it and provide its own algorithm. michael@0: */ michael@0: static void michael@0: useDynamicWindow(SCSUData *scsu, int8_t window) { michael@0: /* michael@0: * move the existing window, which just became the most recently used one, michael@0: * up in windowUse[] to nextWindowUseIndex-1 michael@0: */ michael@0: michael@0: /* first, find the index of the window - backwards to favor the more recently used windows */ michael@0: int i, j; michael@0: michael@0: i=scsu->nextWindowUseIndex; michael@0: do { michael@0: if(--i<0) { michael@0: i=7; michael@0: } michael@0: } while(scsu->windowUse[i]!=window); michael@0: michael@0: /* now copy each windowUse[i+1] to [i] */ michael@0: j=i+1; michael@0: if(j==8) { michael@0: j=0; michael@0: } michael@0: while(j!=scsu->nextWindowUseIndex) { michael@0: scsu->windowUse[i]=scsu->windowUse[j]; michael@0: i=j; michael@0: if(++j==8) { j=0; } michael@0: } michael@0: michael@0: /* finally, set the window into the most recently used index */ michael@0: scsu->windowUse[i]=window; michael@0: } michael@0: michael@0: /* michael@0: * calculate the offset and the code for a dynamic window that contains the character michael@0: * takes fixed offsets into account michael@0: * the offset of the window is stored in the offset variable, michael@0: * the code is returned michael@0: * michael@0: * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code michael@0: */ michael@0: static int michael@0: getDynamicOffset(uint32_t c, uint32_t *pOffset) { michael@0: int i; michael@0: michael@0: for(i=0; i<7; ++i) { michael@0: if((uint32_t)(c-fixedOffsets[i])<=0x7f) { michael@0: *pOffset=fixedOffsets[i]; michael@0: return 0xf9+i; michael@0: } michael@0: } michael@0: michael@0: if(c<0x80) { michael@0: /* No dynamic window for US-ASCII. */ michael@0: return -1; michael@0: } else if(c<0x3400 || michael@0: (uint32_t)(c-0x10000)<(0x14000-0x10000) || michael@0: (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) michael@0: ) { michael@0: /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ michael@0: *pOffset=c&0x7fffff80; michael@0: return (int)(c>>7); michael@0: } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { michael@0: /* For these characters we need to take the gapOffset into account. */ michael@0: *pOffset=c&0x7fffff80; michael@0: return (int)((c-gapOffset)>>7); michael@0: } else { michael@0: return -1; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Idea for compression: michael@0: * - save SCSUData and other state before really starting work michael@0: * - at endloop, see if compression could be better with just unicode mode michael@0: * - don't do this if a callback has been called michael@0: * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning michael@0: * - different buffer handling! michael@0: * michael@0: * Drawback or need for corrective handling: michael@0: * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and michael@0: * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible michael@0: * not only for compression but also for HTML/XML documents with following charset/encoding announcers. michael@0: * michael@0: * How to achieve both? michael@0: * - Only replace the result after an SDX or SCU? michael@0: */ michael@0: michael@0: static void michael@0: _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: SCSUData *scsu; michael@0: const UChar *source, *sourceLimit; michael@0: uint8_t *target; michael@0: int32_t targetCapacity; michael@0: int32_t *offsets; michael@0: michael@0: UBool isSingleByteMode; michael@0: uint8_t dynamicWindow; michael@0: uint32_t currentOffset; michael@0: michael@0: uint32_t c, delta; michael@0: michael@0: int32_t sourceIndex, nextSourceIndex; michael@0: michael@0: int32_t length; michael@0: michael@0: /* variables for compression heuristics */ michael@0: uint32_t offset; michael@0: UChar lead, trail; michael@0: int code; michael@0: int8_t window; michael@0: michael@0: /* set up the local pointers */ michael@0: cnv=pArgs->converter; michael@0: scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: /* set up the local pointers */ michael@0: source=pArgs->source; michael@0: sourceLimit=pArgs->sourceLimit; michael@0: target=(uint8_t *)pArgs->target; michael@0: targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); michael@0: offsets=pArgs->offsets; michael@0: michael@0: /* get the state machine state */ michael@0: isSingleByteMode=scsu->fromUIsSingleByteMode; michael@0: dynamicWindow=scsu->fromUDynamicWindow; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: michael@0: c=cnv->fromUChar32; michael@0: michael@0: /* sourceIndex=-1 if the current character began in the previous buffer */ michael@0: sourceIndex= c==0 ? 0 : -1; michael@0: nextSourceIndex=0; michael@0: michael@0: /* similar conversion "loop" as in toUnicode */ michael@0: loop: michael@0: if(isSingleByteMode) { michael@0: if(c!=0 && targetCapacity>0) { michael@0: goto getTrailSingle; michael@0: } michael@0: michael@0: /* state machine for single-byte mode */ michael@0: /* singleByteMode: */ michael@0: while(sourcefromUDynamicOffsets, c))>=0) { michael@0: /* there is a dynamic window that contains this character, change to it */ michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* might check if there are more characters in this window to come */ michael@0: /* define an extended window with this character */ michael@0: code-=0x200; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=4; michael@0: goto outputBytes; michael@0: } else { michael@0: /* change to Unicode mode and output this (lead, trail) pair */ michael@0: isSingleByteMode=FALSE; michael@0: *target++=(uint8_t)SCU; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: --targetCapacity; michael@0: c=((uint32_t)lead<<16)|trail; michael@0: length=4; michael@0: goto outputBytes; michael@0: } michael@0: } else if(c<0xa0) { michael@0: /* quote C1 control character */ michael@0: c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(c==0xfeff || c>=0xfff0) { michael@0: /* quote signature character=byte order mark and specials */ michael@0: c|=SQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else { michael@0: /* compress all other BMP characters */ michael@0: if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { michael@0: /* there is a window defined that contains this character - switch to it or quote from it? */ michael@0: if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { michael@0: /* change to dynamic window */ michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else { michael@0: /* quote from dynamic window */ michael@0: c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } michael@0: } else if((window=getWindow(staticOffsets, c))>=0) { michael@0: /* quote from static window */ michael@0: c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* define a dynamic window with this character */ michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && michael@0: (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) michael@0: ) { michael@0: /* michael@0: * this character is not compressible (a BMP ideograph or similar); michael@0: * switch to Unicode mode if this is the last character in the block michael@0: * or there is at least one more ideograph following immediately michael@0: */ michael@0: isSingleByteMode=FALSE; michael@0: c|=SCU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else { michael@0: /* quote Unicode */ michael@0: c|=SQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: } michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: sourceIndex=nextSourceIndex; michael@0: } michael@0: } else { michael@0: if(c!=0 && targetCapacity>0) { michael@0: goto getTrailUnicode; michael@0: } michael@0: michael@0: /* state machine for Unicode mode */ michael@0: /* unicodeByteMode: */ michael@0: while(source=2) { michael@0: *target++=(uint8_t)(c>>8); michael@0: *target++=(uint8_t)c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: } michael@0: targetCapacity-=2; michael@0: } else { michael@0: length=2; michael@0: goto outputBytes; michael@0: } michael@0: } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { michael@0: /* compress BMP character if the following one is not an uncompressible ideograph */ michael@0: if(!(sourcefromUDynamicOffsets, c))>=0) { michael@0: /* there is a dynamic window that contains this character, change to it */ michael@0: isSingleByteMode=TRUE; michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* define a dynamic window with this character */ michael@0: isSingleByteMode=TRUE; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: } michael@0: michael@0: /* don't know how to compress this character, just write it directly */ michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(c<0xe000) { michael@0: /* c is a surrogate */ michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: getTrailUnicode: michael@0: lead=(UChar)c; michael@0: if(sourcefromUDynamicOffsets, c))>=0 && michael@0: !(sourcefromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(source=0 michael@0: ) { michael@0: /* two supplementary characters in (probably) the same window - define an extended one */ michael@0: isSingleByteMode=TRUE; michael@0: code-=0x200; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=4; michael@0: goto outputBytes; michael@0: } else { michael@0: /* don't know how to compress this character, just write it directly */ michael@0: c=((uint32_t)lead<<16)|trail; michael@0: length=4; michael@0: goto outputBytes; michael@0: } michael@0: } else /* 0xe000<=c<0xf300 */ { michael@0: /* quote to avoid SCSU tags */ michael@0: c|=UQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: sourceIndex=nextSourceIndex; michael@0: } michael@0: } michael@0: endloop: michael@0: michael@0: /* set the converter state back into UConverter */ michael@0: scsu->fromUIsSingleByteMode=isSingleByteMode; michael@0: scsu->fromUDynamicWindow=dynamicWindow; michael@0: michael@0: cnv->fromUChar32=c; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=source; michael@0: pArgs->target=(char *)target; michael@0: pArgs->offsets=offsets; michael@0: return; michael@0: michael@0: outputBytes: michael@0: /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ michael@0: /* from the first if in the loop we know that targetCapacity>0 */ michael@0: if(length<=targetCapacity) { michael@0: if(offsets==NULL) { michael@0: switch(length) { michael@0: /* each branch falls through to the next one */ michael@0: case 4: michael@0: *target++=(uint8_t)(c>>24); michael@0: case 3: /*fall through*/ michael@0: *target++=(uint8_t)(c>>16); michael@0: case 2: /*fall through*/ michael@0: *target++=(uint8_t)(c>>8); michael@0: case 1: /*fall through*/ michael@0: *target++=(uint8_t)c; michael@0: default: michael@0: /* will never occur */ michael@0: break; michael@0: } michael@0: } else { michael@0: switch(length) { michael@0: /* each branch falls through to the next one */ michael@0: case 4: michael@0: *target++=(uint8_t)(c>>24); michael@0: *offsets++=sourceIndex; michael@0: case 3: /*fall through*/ michael@0: *target++=(uint8_t)(c>>16); michael@0: *offsets++=sourceIndex; michael@0: case 2: /*fall through*/ michael@0: *target++=(uint8_t)(c>>8); michael@0: *offsets++=sourceIndex; michael@0: case 1: /*fall through*/ michael@0: *target++=(uint8_t)c; michael@0: *offsets++=sourceIndex; michael@0: default: michael@0: /* will never occur */ michael@0: break; michael@0: } michael@0: } michael@0: targetCapacity-=length; michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: sourceIndex=nextSourceIndex; michael@0: goto loop; michael@0: } else { michael@0: uint8_t *p; michael@0: michael@0: /* michael@0: * We actually do this backwards here: michael@0: * In order to save an intermediate variable, we output michael@0: * first to the overflow buffer what does not fit into the michael@0: * regular target. michael@0: */ michael@0: /* we know that 0<=targetCapacitycharErrorBuffer; michael@0: switch(length) { michael@0: /* each branch falls through to the next one */ michael@0: case 4: michael@0: *p++=(uint8_t)(c>>24); michael@0: case 3: /*fall through*/ michael@0: *p++=(uint8_t)(c>>16); michael@0: case 2: /*fall through*/ michael@0: *p++=(uint8_t)(c>>8); michael@0: case 1: /*fall through*/ michael@0: *p=(uint8_t)c; michael@0: default: michael@0: /* will never occur */ michael@0: break; michael@0: } michael@0: cnv->charErrorBufferLength=(int8_t)length; michael@0: michael@0: /* now output what fits into the regular target */ michael@0: c>>=8*length; /* length was reduced by targetCapacity */ michael@0: switch(targetCapacity) { michael@0: /* each branch falls through to the next one */ michael@0: case 3: michael@0: *target++=(uint8_t)(c>>16); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: case 2: /*fall through*/ michael@0: *target++=(uint8_t)(c>>8); michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: case 1: /*fall through*/ michael@0: *target++=(uint8_t)c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: /* target overflow */ michael@0: targetCapacity=0; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: c=0; michael@0: goto endloop; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. michael@0: * If a change is made in the original function, then either michael@0: * change this function the same way or michael@0: * re-copy the original function and remove the variables michael@0: * offsets, sourceIndex, and nextSourceIndex. michael@0: */ michael@0: static void michael@0: _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: SCSUData *scsu; michael@0: const UChar *source, *sourceLimit; michael@0: uint8_t *target; michael@0: int32_t targetCapacity; michael@0: michael@0: UBool isSingleByteMode; michael@0: uint8_t dynamicWindow; michael@0: uint32_t currentOffset; michael@0: michael@0: uint32_t c, delta; michael@0: michael@0: int32_t length; michael@0: michael@0: /* variables for compression heuristics */ michael@0: uint32_t offset; michael@0: UChar lead, trail; michael@0: int code; michael@0: int8_t window; michael@0: michael@0: /* set up the local pointers */ michael@0: cnv=pArgs->converter; michael@0: scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: /* set up the local pointers */ michael@0: source=pArgs->source; michael@0: sourceLimit=pArgs->sourceLimit; michael@0: target=(uint8_t *)pArgs->target; michael@0: targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); michael@0: michael@0: /* get the state machine state */ michael@0: isSingleByteMode=scsu->fromUIsSingleByteMode; michael@0: dynamicWindow=scsu->fromUDynamicWindow; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: michael@0: c=cnv->fromUChar32; michael@0: michael@0: /* similar conversion "loop" as in toUnicode */ michael@0: loop: michael@0: if(isSingleByteMode) { michael@0: if(c!=0 && targetCapacity>0) { michael@0: goto getTrailSingle; michael@0: } michael@0: michael@0: /* state machine for single-byte mode */ michael@0: /* singleByteMode: */ michael@0: while(sourcefromUDynamicOffsets, c))>=0) { michael@0: /* there is a dynamic window that contains this character, change to it */ michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* might check if there are more characters in this window to come */ michael@0: /* define an extended window with this character */ michael@0: code-=0x200; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=4; michael@0: goto outputBytes; michael@0: } else { michael@0: /* change to Unicode mode and output this (lead, trail) pair */ michael@0: isSingleByteMode=FALSE; michael@0: *target++=(uint8_t)SCU; michael@0: --targetCapacity; michael@0: c=((uint32_t)lead<<16)|trail; michael@0: length=4; michael@0: goto outputBytes; michael@0: } michael@0: } else if(c<0xa0) { michael@0: /* quote C1 control character */ michael@0: c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(c==0xfeff || c>=0xfff0) { michael@0: /* quote signature character=byte order mark and specials */ michael@0: c|=SQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else { michael@0: /* compress all other BMP characters */ michael@0: if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { michael@0: /* there is a window defined that contains this character - switch to it or quote from it? */ michael@0: if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { michael@0: /* change to dynamic window */ michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else { michael@0: /* quote from dynamic window */ michael@0: c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } michael@0: } else if((window=getWindow(staticOffsets, c))>=0) { michael@0: /* quote from static window */ michael@0: c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* define a dynamic window with this character */ michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && michael@0: (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) michael@0: ) { michael@0: /* michael@0: * this character is not compressible (a BMP ideograph or similar); michael@0: * switch to Unicode mode if this is the last character in the block michael@0: * or there is at least one more ideograph following immediately michael@0: */ michael@0: isSingleByteMode=FALSE; michael@0: c|=SCU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } else { michael@0: /* quote Unicode */ michael@0: c|=SQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: } michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: } michael@0: } else { michael@0: if(c!=0 && targetCapacity>0) { michael@0: goto getTrailUnicode; michael@0: } michael@0: michael@0: /* state machine for Unicode mode */ michael@0: /* unicodeByteMode: */ michael@0: while(source=2) { michael@0: *target++=(uint8_t)(c>>8); michael@0: *target++=(uint8_t)c; michael@0: targetCapacity-=2; michael@0: } else { michael@0: length=2; michael@0: goto outputBytes; michael@0: } michael@0: } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { michael@0: /* compress BMP character if the following one is not an uncompressible ideograph */ michael@0: if(!(sourcefromUDynamicOffsets, c))>=0) { michael@0: /* there is a dynamic window that contains this character, change to it */ michael@0: isSingleByteMode=TRUE; michael@0: dynamicWindow=window; michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if((code=getDynamicOffset(c, &offset))>=0) { michael@0: /* define a dynamic window with this character */ michael@0: isSingleByteMode=TRUE; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: } michael@0: michael@0: /* don't know how to compress this character, just write it directly */ michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(c<0xe000) { michael@0: /* c is a surrogate */ michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: getTrailUnicode: michael@0: lead=(UChar)c; michael@0: if(sourcefromUDynamicOffsets, c))>=0 && michael@0: !(sourcefromUDynamicOffsets[dynamicWindow]; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; michael@0: length=2; michael@0: goto outputBytes; michael@0: } else if(source=0 michael@0: ) { michael@0: /* two supplementary characters in (probably) the same window - define an extended one */ michael@0: isSingleByteMode=TRUE; michael@0: code-=0x200; michael@0: dynamicWindow=getNextDynamicWindow(scsu); michael@0: currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; michael@0: useDynamicWindow(scsu, dynamicWindow); michael@0: c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; michael@0: length=4; michael@0: goto outputBytes; michael@0: } else { michael@0: /* don't know how to compress this character, just write it directly */ michael@0: c=((uint32_t)lead<<16)|trail; michael@0: length=4; michael@0: goto outputBytes; michael@0: } michael@0: } else /* 0xe000<=c<0xf300 */ { michael@0: /* quote to avoid SCSU tags */ michael@0: c|=UQU<<16; michael@0: length=3; michael@0: goto outputBytes; michael@0: } michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: } michael@0: } michael@0: endloop: michael@0: michael@0: /* set the converter state back into UConverter */ michael@0: scsu->fromUIsSingleByteMode=isSingleByteMode; michael@0: scsu->fromUDynamicWindow=dynamicWindow; michael@0: michael@0: cnv->fromUChar32=c; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=source; michael@0: pArgs->target=(char *)target; michael@0: return; michael@0: michael@0: outputBytes: michael@0: /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ michael@0: /* from the first if in the loop we know that targetCapacity>0 */ michael@0: if(length<=targetCapacity) { michael@0: switch(length) { michael@0: /* each branch falls through to the next one */ michael@0: case 4: michael@0: *target++=(uint8_t)(c>>24); michael@0: case 3: /*fall through*/ michael@0: *target++=(uint8_t)(c>>16); michael@0: case 2: /*fall through*/ michael@0: *target++=(uint8_t)(c>>8); michael@0: case 1: /*fall through*/ michael@0: *target++=(uint8_t)c; michael@0: default: michael@0: /* will never occur */ michael@0: break; michael@0: } michael@0: targetCapacity-=length; michael@0: michael@0: /* normal end of conversion: prepare for a new character */ michael@0: c=0; michael@0: goto loop; michael@0: } else { michael@0: uint8_t *p; michael@0: michael@0: /* michael@0: * We actually do this backwards here: michael@0: * In order to save an intermediate variable, we output michael@0: * first to the overflow buffer what does not fit into the michael@0: * regular target. michael@0: */ michael@0: /* we know that 0<=targetCapacitycharErrorBuffer; michael@0: switch(length) { michael@0: /* each branch falls through to the next one */ michael@0: case 4: michael@0: *p++=(uint8_t)(c>>24); michael@0: case 3: /*fall through*/ michael@0: *p++=(uint8_t)(c>>16); michael@0: case 2: /*fall through*/ michael@0: *p++=(uint8_t)(c>>8); michael@0: case 1: /*fall through*/ michael@0: *p=(uint8_t)c; michael@0: default: michael@0: /* will never occur */ michael@0: break; michael@0: } michael@0: cnv->charErrorBufferLength=(int8_t)length; michael@0: michael@0: /* now output what fits into the regular target */ michael@0: c>>=8*length; /* length was reduced by targetCapacity */ michael@0: switch(targetCapacity) { michael@0: /* each branch falls through to the next one */ michael@0: case 3: michael@0: *target++=(uint8_t)(c>>16); michael@0: case 2: /*fall through*/ michael@0: *target++=(uint8_t)(c>>8); michael@0: case 1: /*fall through*/ michael@0: *target++=(uint8_t)c; michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: /* target overflow */ michael@0: targetCapacity=0; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: c=0; michael@0: goto endloop; michael@0: } michael@0: } michael@0: michael@0: /* miscellaneous ------------------------------------------------------------ */ michael@0: michael@0: static const char * michael@0: _SCSUGetName(const UConverter *cnv) { michael@0: SCSUData *scsu=(SCSUData *)cnv->extraInfo; michael@0: michael@0: switch(scsu->locale) { michael@0: case l_ja: michael@0: return "SCSU,locale=ja"; michael@0: default: michael@0: return "SCSU"; michael@0: } michael@0: } michael@0: michael@0: /* structure for SafeClone calculations */ michael@0: struct cloneSCSUStruct michael@0: { michael@0: UConverter cnv; michael@0: SCSUData mydata; michael@0: }; michael@0: michael@0: static UConverter * michael@0: _SCSUSafeClone(const UConverter *cnv, michael@0: void *stackBuffer, michael@0: int32_t *pBufferSize, michael@0: UErrorCode *status) michael@0: { michael@0: struct cloneSCSUStruct * localClone; michael@0: int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); michael@0: michael@0: if (U_FAILURE(*status)){ michael@0: return 0; michael@0: } michael@0: michael@0: if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ michael@0: *pBufferSize = bufferSizeNeeded; michael@0: return 0; michael@0: } michael@0: michael@0: localClone = (struct cloneSCSUStruct *)stackBuffer; michael@0: /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ michael@0: michael@0: uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); michael@0: localClone->cnv.extraInfo = &localClone->mydata; michael@0: localClone->cnv.isExtraLocal = TRUE; michael@0: michael@0: return &localClone->cnv; michael@0: } michael@0: michael@0: michael@0: static const UConverterImpl _SCSUImpl={ michael@0: UCNV_SCSU, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _SCSUOpen, michael@0: _SCSUClose, michael@0: _SCSUReset, michael@0: michael@0: _SCSUToUnicode, michael@0: _SCSUToUnicodeWithOffsets, michael@0: _SCSUFromUnicode, michael@0: _SCSUFromUnicodeWithOffsets, michael@0: NULL, michael@0: michael@0: NULL, michael@0: _SCSUGetName, michael@0: NULL, michael@0: _SCSUSafeClone, michael@0: ucnv_getCompleteUnicodeSet michael@0: }; michael@0: michael@0: static const UConverterStaticData _SCSUStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "SCSU", michael@0: 1212, /* CCSID for SCSU */ michael@0: UCNV_IBM, UCNV_SCSU, michael@0: 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ michael@0: /* michael@0: * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode michael@0: * substitution string. michael@0: */ michael@0: { 0x0e, 0xff, 0xfd, 0 }, 3, michael@0: FALSE, FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: const UConverterSharedData _SCSUData={ michael@0: sizeof(UConverterSharedData), ~((uint32_t)0), michael@0: NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, michael@0: 0 michael@0: }; michael@0: michael@0: #endif