1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnvscsu.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2018 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2000-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: ucnvscsu.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2000nov18 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This is an implementation of the Standard Compression Scheme for Unicode 1.20 +* as defined in http://www.unicode.org/unicode/reports/tr6/ . 1.21 +* Reserved commands and window settings are treated as illegal sequences and 1.22 +* will result in callback calls. 1.23 +*/ 1.24 + 1.25 +#include "unicode/utypes.h" 1.26 + 1.27 +#if !UCONFIG_NO_CONVERSION 1.28 + 1.29 +#include "unicode/ucnv.h" 1.30 +#include "unicode/ucnv_cb.h" 1.31 +#include "unicode/utf16.h" 1.32 +#include "ucnv_bld.h" 1.33 +#include "ucnv_cnv.h" 1.34 +#include "cmemory.h" 1.35 + 1.36 +/* SCSU definitions --------------------------------------------------------- */ 1.37 + 1.38 +/* SCSU command byte values */ 1.39 +enum { 1.40 + SQ0=0x01, /* Quote from window pair 0 */ 1.41 + SQ7=0x08, /* Quote from window pair 7 */ 1.42 + SDX=0x0B, /* Define a window as extended */ 1.43 + Srs=0x0C, /* reserved */ 1.44 + SQU=0x0E, /* Quote a single Unicode character */ 1.45 + SCU=0x0F, /* Change to Unicode mode */ 1.46 + SC0=0x10, /* Select window 0 */ 1.47 + SC7=0x17, /* Select window 7 */ 1.48 + SD0=0x18, /* Define and select window 0 */ 1.49 + SD7=0x1F, /* Define and select window 7 */ 1.50 + 1.51 + UC0=0xE0, /* Select window 0 */ 1.52 + UC7=0xE7, /* Select window 7 */ 1.53 + UD0=0xE8, /* Define and select window 0 */ 1.54 + UD7=0xEF, /* Define and select window 7 */ 1.55 + UQU=0xF0, /* Quote a single Unicode character */ 1.56 + UDX=0xF1, /* Define a Window as extended */ 1.57 + Urs=0xF2 /* reserved */ 1.58 +}; 1.59 + 1.60 +enum { 1.61 + /* 1.62 + * Unicode code points from 3400 to E000 are not adressible by 1.63 + * dynamic window, since in these areas no short run alphabets are 1.64 + * found. Therefore add gapOffset to all values from gapThreshold. 1.65 + */ 1.66 + gapThreshold=0x68, 1.67 + gapOffset=0xAC00, 1.68 + 1.69 + /* values between reservedStart and fixedThreshold are reserved */ 1.70 + reservedStart=0xA8, 1.71 + 1.72 + /* use table of predefined fixed offsets for values from fixedThreshold */ 1.73 + fixedThreshold=0xF9 1.74 +}; 1.75 + 1.76 +/* constant offsets for the 8 static windows */ 1.77 +static const uint32_t staticOffsets[8]={ 1.78 + 0x0000, /* ASCII for quoted tags */ 1.79 + 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 1.80 + 0x0100, /* Latin Extended-A */ 1.81 + 0x0300, /* Combining Diacritical Marks */ 1.82 + 0x2000, /* General Punctuation */ 1.83 + 0x2080, /* Currency Symbols */ 1.84 + 0x2100, /* Letterlike Symbols and Number Forms */ 1.85 + 0x3000 /* CJK Symbols and punctuation */ 1.86 +}; 1.87 + 1.88 +/* initial offsets for the 8 dynamic (sliding) windows */ 1.89 +static const uint32_t initialDynamicOffsets[8]={ 1.90 + 0x0080, /* Latin-1 */ 1.91 + 0x00C0, /* Latin Extended A */ 1.92 + 0x0400, /* Cyrillic */ 1.93 + 0x0600, /* Arabic */ 1.94 + 0x0900, /* Devanagari */ 1.95 + 0x3040, /* Hiragana */ 1.96 + 0x30A0, /* Katakana */ 1.97 + 0xFF00 /* Fullwidth ASCII */ 1.98 +}; 1.99 + 1.100 +/* Table of fixed predefined Offsets */ 1.101 +static const uint32_t fixedOffsets[]={ 1.102 + /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 1.103 + /* 0xFA */ 0x0250, /* IPA extensions */ 1.104 + /* 0xFB */ 0x0370, /* Greek */ 1.105 + /* 0xFC */ 0x0530, /* Armenian */ 1.106 + /* 0xFD */ 0x3040, /* Hiragana */ 1.107 + /* 0xFE */ 0x30A0, /* Katakana */ 1.108 + /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 1.109 +}; 1.110 + 1.111 +/* state values */ 1.112 +enum { 1.113 + readCommand, 1.114 + quotePairOne, 1.115 + quotePairTwo, 1.116 + quoteOne, 1.117 + definePairOne, 1.118 + definePairTwo, 1.119 + defineOne 1.120 +}; 1.121 + 1.122 +typedef struct SCSUData { 1.123 + /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 1.124 + uint32_t toUDynamicOffsets[8]; 1.125 + uint32_t fromUDynamicOffsets[8]; 1.126 + 1.127 + /* state machine state - toUnicode */ 1.128 + UBool toUIsSingleByteMode; 1.129 + uint8_t toUState; 1.130 + int8_t toUQuoteWindow, toUDynamicWindow; 1.131 + uint8_t toUByteOne; 1.132 + uint8_t toUPadding[3]; 1.133 + 1.134 + /* state machine state - fromUnicode */ 1.135 + UBool fromUIsSingleByteMode; 1.136 + int8_t fromUDynamicWindow; 1.137 + 1.138 + /* 1.139 + * windowUse[] keeps track of the use of the dynamic windows: 1.140 + * At nextWindowUseIndex there is the least recently used window, 1.141 + * and the following windows (in a wrapping manner) are more and more 1.142 + * recently used. 1.143 + * At nextWindowUseIndex-1 there is the most recently used window. 1.144 + */ 1.145 + uint8_t locale; 1.146 + int8_t nextWindowUseIndex; 1.147 + int8_t windowUse[8]; 1.148 +} SCSUData; 1.149 + 1.150 +static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 1.151 +static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 1.152 + 1.153 +enum { 1.154 + lGeneric, l_ja 1.155 +}; 1.156 + 1.157 +/* SCSU setup functions ----------------------------------------------------- */ 1.158 + 1.159 +static void 1.160 +_SCSUReset(UConverter *cnv, UConverterResetChoice choice) { 1.161 + SCSUData *scsu=(SCSUData *)cnv->extraInfo; 1.162 + 1.163 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.164 + /* reset toUnicode */ 1.165 + uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); 1.166 + 1.167 + scsu->toUIsSingleByteMode=TRUE; 1.168 + scsu->toUState=readCommand; 1.169 + scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; 1.170 + scsu->toUByteOne=0; 1.171 + 1.172 + cnv->toULength=0; 1.173 + } 1.174 + if(choice!=UCNV_RESET_TO_UNICODE) { 1.175 + /* reset fromUnicode */ 1.176 + uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); 1.177 + 1.178 + scsu->fromUIsSingleByteMode=TRUE; 1.179 + scsu->fromUDynamicWindow=0; 1.180 + 1.181 + scsu->nextWindowUseIndex=0; 1.182 + switch(scsu->locale) { 1.183 + case l_ja: 1.184 + uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); 1.185 + break; 1.186 + default: 1.187 + uprv_memcpy(scsu->windowUse, initialWindowUse, 8); 1.188 + break; 1.189 + } 1.190 + 1.191 + cnv->fromUChar32=0; 1.192 + } 1.193 +} 1.194 + 1.195 +static void 1.196 +_SCSUOpen(UConverter *cnv, 1.197 + UConverterLoadArgs *pArgs, 1.198 + UErrorCode *pErrorCode) { 1.199 + const char *locale=pArgs->locale; 1.200 + if(pArgs->onlyTestIsLoadable) { 1.201 + return; 1.202 + } 1.203 + cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); 1.204 + if(cnv->extraInfo!=NULL) { 1.205 + if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { 1.206 + ((SCSUData *)cnv->extraInfo)->locale=l_ja; 1.207 + } else { 1.208 + ((SCSUData *)cnv->extraInfo)->locale=lGeneric; 1.209 + } 1.210 + _SCSUReset(cnv, UCNV_RESET_BOTH); 1.211 + } else { 1.212 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.213 + } 1.214 + 1.215 + /* Set the substitution character U+fffd as a Unicode string. */ 1.216 + cnv->subUChars[0]=0xfffd; 1.217 + cnv->subCharLen=-1; 1.218 +} 1.219 + 1.220 +static void 1.221 +_SCSUClose(UConverter *cnv) { 1.222 + if(cnv->extraInfo!=NULL) { 1.223 + if(!cnv->isExtraLocal) { 1.224 + uprv_free(cnv->extraInfo); 1.225 + } 1.226 + cnv->extraInfo=NULL; 1.227 + } 1.228 +} 1.229 + 1.230 +/* SCSU-to-Unicode conversion functions ------------------------------------- */ 1.231 + 1.232 +static void 1.233 +_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.234 + UErrorCode *pErrorCode) { 1.235 + UConverter *cnv; 1.236 + SCSUData *scsu; 1.237 + const uint8_t *source, *sourceLimit; 1.238 + UChar *target; 1.239 + const UChar *targetLimit; 1.240 + int32_t *offsets; 1.241 + UBool isSingleByteMode; 1.242 + uint8_t state, byteOne; 1.243 + int8_t quoteWindow, dynamicWindow; 1.244 + 1.245 + int32_t sourceIndex, nextSourceIndex; 1.246 + 1.247 + uint8_t b; 1.248 + 1.249 + /* set up the local pointers */ 1.250 + cnv=pArgs->converter; 1.251 + scsu=(SCSUData *)cnv->extraInfo; 1.252 + 1.253 + source=(const uint8_t *)pArgs->source; 1.254 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.255 + target=pArgs->target; 1.256 + targetLimit=pArgs->targetLimit; 1.257 + offsets=pArgs->offsets; 1.258 + 1.259 + /* get the state machine state */ 1.260 + isSingleByteMode=scsu->toUIsSingleByteMode; 1.261 + state=scsu->toUState; 1.262 + quoteWindow=scsu->toUQuoteWindow; 1.263 + dynamicWindow=scsu->toUDynamicWindow; 1.264 + byteOne=scsu->toUByteOne; 1.265 + 1.266 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.267 + sourceIndex=state==readCommand ? 0 : -1; 1.268 + nextSourceIndex=0; 1.269 + 1.270 + /* 1.271 + * conversion "loop" 1.272 + * 1.273 + * For performance, this is not a normal C loop. 1.274 + * Instead, there are two code blocks for the two SCSU modes. 1.275 + * The function branches to either one, and a change of the mode is done with a goto to 1.276 + * the other branch. 1.277 + * 1.278 + * Each branch has two conventional loops: 1.279 + * - a fast-path loop for the most common codes in the mode 1.280 + * - a loop for all other codes in the mode 1.281 + * When the fast-path runs into a code that it cannot handle, its loop ends and it 1.282 + * runs into the following loop to handle the other codes. 1.283 + * The end of the input or output buffer is also handled by the slower loop. 1.284 + * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 1.285 + * 1.286 + * The callback handling is done by returning with an error code. 1.287 + * The conversion framework actually calls the callback function. 1.288 + */ 1.289 + if(isSingleByteMode) { 1.290 + /* fast path for single-byte mode */ 1.291 + if(state==readCommand) { 1.292 +fastSingle: 1.293 + while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 1.294 + ++source; 1.295 + ++nextSourceIndex; 1.296 + if(b<=0x7f) { 1.297 + /* write US-ASCII graphic character or DEL */ 1.298 + *target++=(UChar)b; 1.299 + if(offsets!=NULL) { 1.300 + *offsets++=sourceIndex; 1.301 + } 1.302 + } else { 1.303 + /* write from dynamic window */ 1.304 + uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 1.305 + if(c<=0xffff) { 1.306 + *target++=(UChar)c; 1.307 + if(offsets!=NULL) { 1.308 + *offsets++=sourceIndex; 1.309 + } 1.310 + } else { 1.311 + /* output surrogate pair */ 1.312 + *target++=(UChar)(0xd7c0+(c>>10)); 1.313 + if(target<targetLimit) { 1.314 + *target++=(UChar)(0xdc00|(c&0x3ff)); 1.315 + if(offsets!=NULL) { 1.316 + *offsets++=sourceIndex; 1.317 + *offsets++=sourceIndex; 1.318 + } 1.319 + } else { 1.320 + /* target overflow */ 1.321 + if(offsets!=NULL) { 1.322 + *offsets++=sourceIndex; 1.323 + } 1.324 + cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 1.325 + cnv->UCharErrorBufferLength=1; 1.326 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.327 + goto endloop; 1.328 + } 1.329 + } 1.330 + } 1.331 + sourceIndex=nextSourceIndex; 1.332 + } 1.333 + } 1.334 + 1.335 + /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 1.336 +singleByteMode: 1.337 + while(source<sourceLimit) { 1.338 + if(target>=targetLimit) { 1.339 + /* target is full */ 1.340 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.341 + break; 1.342 + } 1.343 + b=*source++; 1.344 + ++nextSourceIndex; 1.345 + switch(state) { 1.346 + case readCommand: 1.347 + /* redundant conditions are commented out */ 1.348 + /* here: b<0x20 because otherwise we would be in fastSingle */ 1.349 + if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1.350 + /* CR/LF/TAB/NUL */ 1.351 + *target++=(UChar)b; 1.352 + if(offsets!=NULL) { 1.353 + *offsets++=sourceIndex; 1.354 + } 1.355 + sourceIndex=nextSourceIndex; 1.356 + goto fastSingle; 1.357 + } else if(SC0<=b) { 1.358 + if(b<=SC7) { 1.359 + dynamicWindow=(int8_t)(b-SC0); 1.360 + sourceIndex=nextSourceIndex; 1.361 + goto fastSingle; 1.362 + } else /* if(SD0<=b && b<=SD7) */ { 1.363 + dynamicWindow=(int8_t)(b-SD0); 1.364 + state=defineOne; 1.365 + } 1.366 + } else if(/* SQ0<=b && */ b<=SQ7) { 1.367 + quoteWindow=(int8_t)(b-SQ0); 1.368 + state=quoteOne; 1.369 + } else if(b==SDX) { 1.370 + state=definePairOne; 1.371 + } else if(b==SQU) { 1.372 + state=quotePairOne; 1.373 + } else if(b==SCU) { 1.374 + sourceIndex=nextSourceIndex; 1.375 + isSingleByteMode=FALSE; 1.376 + goto fastUnicode; 1.377 + } else /* Srs */ { 1.378 + /* callback(illegal) */ 1.379 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.380 + cnv->toUBytes[0]=b; 1.381 + cnv->toULength=1; 1.382 + goto endloop; 1.383 + } 1.384 + 1.385 + /* store the first byte of a multibyte sequence in toUBytes[] */ 1.386 + cnv->toUBytes[0]=b; 1.387 + cnv->toULength=1; 1.388 + break; 1.389 + case quotePairOne: 1.390 + byteOne=b; 1.391 + cnv->toUBytes[1]=b; 1.392 + cnv->toULength=2; 1.393 + state=quotePairTwo; 1.394 + break; 1.395 + case quotePairTwo: 1.396 + *target++=(UChar)((byteOne<<8)|b); 1.397 + if(offsets!=NULL) { 1.398 + *offsets++=sourceIndex; 1.399 + } 1.400 + sourceIndex=nextSourceIndex; 1.401 + state=readCommand; 1.402 + goto fastSingle; 1.403 + case quoteOne: 1.404 + if(b<0x80) { 1.405 + /* all static offsets are in the BMP */ 1.406 + *target++=(UChar)(staticOffsets[quoteWindow]+b); 1.407 + if(offsets!=NULL) { 1.408 + *offsets++=sourceIndex; 1.409 + } 1.410 + } else { 1.411 + /* write from dynamic window */ 1.412 + uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 1.413 + if(c<=0xffff) { 1.414 + *target++=(UChar)c; 1.415 + if(offsets!=NULL) { 1.416 + *offsets++=sourceIndex; 1.417 + } 1.418 + } else { 1.419 + /* output surrogate pair */ 1.420 + *target++=(UChar)(0xd7c0+(c>>10)); 1.421 + if(target<targetLimit) { 1.422 + *target++=(UChar)(0xdc00|(c&0x3ff)); 1.423 + if(offsets!=NULL) { 1.424 + *offsets++=sourceIndex; 1.425 + *offsets++=sourceIndex; 1.426 + } 1.427 + } else { 1.428 + /* target overflow */ 1.429 + if(offsets!=NULL) { 1.430 + *offsets++=sourceIndex; 1.431 + } 1.432 + cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 1.433 + cnv->UCharErrorBufferLength=1; 1.434 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.435 + goto endloop; 1.436 + } 1.437 + } 1.438 + } 1.439 + sourceIndex=nextSourceIndex; 1.440 + state=readCommand; 1.441 + goto fastSingle; 1.442 + case definePairOne: 1.443 + dynamicWindow=(int8_t)((b>>5)&7); 1.444 + byteOne=(uint8_t)(b&0x1f); 1.445 + cnv->toUBytes[1]=b; 1.446 + cnv->toULength=2; 1.447 + state=definePairTwo; 1.448 + break; 1.449 + case definePairTwo: 1.450 + scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 1.451 + sourceIndex=nextSourceIndex; 1.452 + state=readCommand; 1.453 + goto fastSingle; 1.454 + case defineOne: 1.455 + if(b==0) { 1.456 + /* callback(illegal): Reserved window offset value 0 */ 1.457 + cnv->toUBytes[1]=b; 1.458 + cnv->toULength=2; 1.459 + goto endloop; 1.460 + } else if(b<gapThreshold) { 1.461 + scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 1.462 + } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 1.463 + scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 1.464 + } else if(b>=fixedThreshold) { 1.465 + scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 1.466 + } else { 1.467 + /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 1.468 + cnv->toUBytes[1]=b; 1.469 + cnv->toULength=2; 1.470 + goto endloop; 1.471 + } 1.472 + sourceIndex=nextSourceIndex; 1.473 + state=readCommand; 1.474 + goto fastSingle; 1.475 + } 1.476 + } 1.477 + } else { 1.478 + /* fast path for Unicode mode */ 1.479 + if(state==readCommand) { 1.480 +fastUnicode: 1.481 + while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 1.482 + *target++=(UChar)((b<<8)|source[1]); 1.483 + if(offsets!=NULL) { 1.484 + *offsets++=sourceIndex; 1.485 + } 1.486 + sourceIndex=nextSourceIndex; 1.487 + nextSourceIndex+=2; 1.488 + source+=2; 1.489 + } 1.490 + } 1.491 + 1.492 + /* normal state machine for Unicode mode */ 1.493 +/* unicodeByteMode: */ 1.494 + while(source<sourceLimit) { 1.495 + if(target>=targetLimit) { 1.496 + /* target is full */ 1.497 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.498 + break; 1.499 + } 1.500 + b=*source++; 1.501 + ++nextSourceIndex; 1.502 + switch(state) { 1.503 + case readCommand: 1.504 + if((uint8_t)(b-UC0)>(Urs-UC0)) { 1.505 + byteOne=b; 1.506 + cnv->toUBytes[0]=b; 1.507 + cnv->toULength=1; 1.508 + state=quotePairTwo; 1.509 + } else if(/* UC0<=b && */ b<=UC7) { 1.510 + dynamicWindow=(int8_t)(b-UC0); 1.511 + sourceIndex=nextSourceIndex; 1.512 + isSingleByteMode=TRUE; 1.513 + goto fastSingle; 1.514 + } else if(/* UD0<=b && */ b<=UD7) { 1.515 + dynamicWindow=(int8_t)(b-UD0); 1.516 + isSingleByteMode=TRUE; 1.517 + cnv->toUBytes[0]=b; 1.518 + cnv->toULength=1; 1.519 + state=defineOne; 1.520 + goto singleByteMode; 1.521 + } else if(b==UDX) { 1.522 + isSingleByteMode=TRUE; 1.523 + cnv->toUBytes[0]=b; 1.524 + cnv->toULength=1; 1.525 + state=definePairOne; 1.526 + goto singleByteMode; 1.527 + } else if(b==UQU) { 1.528 + cnv->toUBytes[0]=b; 1.529 + cnv->toULength=1; 1.530 + state=quotePairOne; 1.531 + } else /* Urs */ { 1.532 + /* callback(illegal) */ 1.533 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.534 + cnv->toUBytes[0]=b; 1.535 + cnv->toULength=1; 1.536 + goto endloop; 1.537 + } 1.538 + break; 1.539 + case quotePairOne: 1.540 + byteOne=b; 1.541 + cnv->toUBytes[1]=b; 1.542 + cnv->toULength=2; 1.543 + state=quotePairTwo; 1.544 + break; 1.545 + case quotePairTwo: 1.546 + *target++=(UChar)((byteOne<<8)|b); 1.547 + if(offsets!=NULL) { 1.548 + *offsets++=sourceIndex; 1.549 + } 1.550 + sourceIndex=nextSourceIndex; 1.551 + state=readCommand; 1.552 + goto fastUnicode; 1.553 + } 1.554 + } 1.555 + } 1.556 +endloop: 1.557 + 1.558 + /* set the converter state back into UConverter */ 1.559 + if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 1.560 + /* reset to deal with the next character */ 1.561 + state=readCommand; 1.562 + } else if(state==readCommand) { 1.563 + /* not in a multi-byte sequence, reset toULength */ 1.564 + cnv->toULength=0; 1.565 + } 1.566 + scsu->toUIsSingleByteMode=isSingleByteMode; 1.567 + scsu->toUState=state; 1.568 + scsu->toUQuoteWindow=quoteWindow; 1.569 + scsu->toUDynamicWindow=dynamicWindow; 1.570 + scsu->toUByteOne=byteOne; 1.571 + 1.572 + /* write back the updated pointers */ 1.573 + pArgs->source=(const char *)source; 1.574 + pArgs->target=target; 1.575 + pArgs->offsets=offsets; 1.576 + return; 1.577 +} 1.578 + 1.579 +/* 1.580 + * Identical to _SCSUToUnicodeWithOffsets but without offset handling. 1.581 + * If a change is made in the original function, then either 1.582 + * change this function the same way or 1.583 + * re-copy the original function and remove the variables 1.584 + * offsets, sourceIndex, and nextSourceIndex. 1.585 + */ 1.586 +static void 1.587 +_SCSUToUnicode(UConverterToUnicodeArgs *pArgs, 1.588 + UErrorCode *pErrorCode) { 1.589 + UConverter *cnv; 1.590 + SCSUData *scsu; 1.591 + const uint8_t *source, *sourceLimit; 1.592 + UChar *target; 1.593 + const UChar *targetLimit; 1.594 + UBool isSingleByteMode; 1.595 + uint8_t state, byteOne; 1.596 + int8_t quoteWindow, dynamicWindow; 1.597 + 1.598 + uint8_t b; 1.599 + 1.600 + /* set up the local pointers */ 1.601 + cnv=pArgs->converter; 1.602 + scsu=(SCSUData *)cnv->extraInfo; 1.603 + 1.604 + source=(const uint8_t *)pArgs->source; 1.605 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.606 + target=pArgs->target; 1.607 + targetLimit=pArgs->targetLimit; 1.608 + 1.609 + /* get the state machine state */ 1.610 + isSingleByteMode=scsu->toUIsSingleByteMode; 1.611 + state=scsu->toUState; 1.612 + quoteWindow=scsu->toUQuoteWindow; 1.613 + dynamicWindow=scsu->toUDynamicWindow; 1.614 + byteOne=scsu->toUByteOne; 1.615 + 1.616 + /* 1.617 + * conversion "loop" 1.618 + * 1.619 + * For performance, this is not a normal C loop. 1.620 + * Instead, there are two code blocks for the two SCSU modes. 1.621 + * The function branches to either one, and a change of the mode is done with a goto to 1.622 + * the other branch. 1.623 + * 1.624 + * Each branch has two conventional loops: 1.625 + * - a fast-path loop for the most common codes in the mode 1.626 + * - a loop for all other codes in the mode 1.627 + * When the fast-path runs into a code that it cannot handle, its loop ends and it 1.628 + * runs into the following loop to handle the other codes. 1.629 + * The end of the input or output buffer is also handled by the slower loop. 1.630 + * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 1.631 + * 1.632 + * The callback handling is done by returning with an error code. 1.633 + * The conversion framework actually calls the callback function. 1.634 + */ 1.635 + if(isSingleByteMode) { 1.636 + /* fast path for single-byte mode */ 1.637 + if(state==readCommand) { 1.638 +fastSingle: 1.639 + while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 1.640 + ++source; 1.641 + if(b<=0x7f) { 1.642 + /* write US-ASCII graphic character or DEL */ 1.643 + *target++=(UChar)b; 1.644 + } else { 1.645 + /* write from dynamic window */ 1.646 + uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 1.647 + if(c<=0xffff) { 1.648 + *target++=(UChar)c; 1.649 + } else { 1.650 + /* output surrogate pair */ 1.651 + *target++=(UChar)(0xd7c0+(c>>10)); 1.652 + if(target<targetLimit) { 1.653 + *target++=(UChar)(0xdc00|(c&0x3ff)); 1.654 + } else { 1.655 + /* target overflow */ 1.656 + cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 1.657 + cnv->UCharErrorBufferLength=1; 1.658 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.659 + goto endloop; 1.660 + } 1.661 + } 1.662 + } 1.663 + } 1.664 + } 1.665 + 1.666 + /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 1.667 +singleByteMode: 1.668 + while(source<sourceLimit) { 1.669 + if(target>=targetLimit) { 1.670 + /* target is full */ 1.671 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.672 + break; 1.673 + } 1.674 + b=*source++; 1.675 + switch(state) { 1.676 + case readCommand: 1.677 + /* redundant conditions are commented out */ 1.678 + /* here: b<0x20 because otherwise we would be in fastSingle */ 1.679 + if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1.680 + /* CR/LF/TAB/NUL */ 1.681 + *target++=(UChar)b; 1.682 + goto fastSingle; 1.683 + } else if(SC0<=b) { 1.684 + if(b<=SC7) { 1.685 + dynamicWindow=(int8_t)(b-SC0); 1.686 + goto fastSingle; 1.687 + } else /* if(SD0<=b && b<=SD7) */ { 1.688 + dynamicWindow=(int8_t)(b-SD0); 1.689 + state=defineOne; 1.690 + } 1.691 + } else if(/* SQ0<=b && */ b<=SQ7) { 1.692 + quoteWindow=(int8_t)(b-SQ0); 1.693 + state=quoteOne; 1.694 + } else if(b==SDX) { 1.695 + state=definePairOne; 1.696 + } else if(b==SQU) { 1.697 + state=quotePairOne; 1.698 + } else if(b==SCU) { 1.699 + isSingleByteMode=FALSE; 1.700 + goto fastUnicode; 1.701 + } else /* Srs */ { 1.702 + /* callback(illegal) */ 1.703 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.704 + cnv->toUBytes[0]=b; 1.705 + cnv->toULength=1; 1.706 + goto endloop; 1.707 + } 1.708 + 1.709 + /* store the first byte of a multibyte sequence in toUBytes[] */ 1.710 + cnv->toUBytes[0]=b; 1.711 + cnv->toULength=1; 1.712 + break; 1.713 + case quotePairOne: 1.714 + byteOne=b; 1.715 + cnv->toUBytes[1]=b; 1.716 + cnv->toULength=2; 1.717 + state=quotePairTwo; 1.718 + break; 1.719 + case quotePairTwo: 1.720 + *target++=(UChar)((byteOne<<8)|b); 1.721 + state=readCommand; 1.722 + goto fastSingle; 1.723 + case quoteOne: 1.724 + if(b<0x80) { 1.725 + /* all static offsets are in the BMP */ 1.726 + *target++=(UChar)(staticOffsets[quoteWindow]+b); 1.727 + } else { 1.728 + /* write from dynamic window */ 1.729 + uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 1.730 + if(c<=0xffff) { 1.731 + *target++=(UChar)c; 1.732 + } else { 1.733 + /* output surrogate pair */ 1.734 + *target++=(UChar)(0xd7c0+(c>>10)); 1.735 + if(target<targetLimit) { 1.736 + *target++=(UChar)(0xdc00|(c&0x3ff)); 1.737 + } else { 1.738 + /* target overflow */ 1.739 + cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 1.740 + cnv->UCharErrorBufferLength=1; 1.741 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.742 + goto endloop; 1.743 + } 1.744 + } 1.745 + } 1.746 + state=readCommand; 1.747 + goto fastSingle; 1.748 + case definePairOne: 1.749 + dynamicWindow=(int8_t)((b>>5)&7); 1.750 + byteOne=(uint8_t)(b&0x1f); 1.751 + cnv->toUBytes[1]=b; 1.752 + cnv->toULength=2; 1.753 + state=definePairTwo; 1.754 + break; 1.755 + case definePairTwo: 1.756 + scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 1.757 + state=readCommand; 1.758 + goto fastSingle; 1.759 + case defineOne: 1.760 + if(b==0) { 1.761 + /* callback(illegal): Reserved window offset value 0 */ 1.762 + cnv->toUBytes[1]=b; 1.763 + cnv->toULength=2; 1.764 + goto endloop; 1.765 + } else if(b<gapThreshold) { 1.766 + scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 1.767 + } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 1.768 + scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 1.769 + } else if(b>=fixedThreshold) { 1.770 + scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 1.771 + } else { 1.772 + /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 1.773 + cnv->toUBytes[1]=b; 1.774 + cnv->toULength=2; 1.775 + goto endloop; 1.776 + } 1.777 + state=readCommand; 1.778 + goto fastSingle; 1.779 + } 1.780 + } 1.781 + } else { 1.782 + /* fast path for Unicode mode */ 1.783 + if(state==readCommand) { 1.784 +fastUnicode: 1.785 + while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 1.786 + *target++=(UChar)((b<<8)|source[1]); 1.787 + source+=2; 1.788 + } 1.789 + } 1.790 + 1.791 + /* normal state machine for Unicode mode */ 1.792 +/* unicodeByteMode: */ 1.793 + while(source<sourceLimit) { 1.794 + if(target>=targetLimit) { 1.795 + /* target is full */ 1.796 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.797 + break; 1.798 + } 1.799 + b=*source++; 1.800 + switch(state) { 1.801 + case readCommand: 1.802 + if((uint8_t)(b-UC0)>(Urs-UC0)) { 1.803 + byteOne=b; 1.804 + cnv->toUBytes[0]=b; 1.805 + cnv->toULength=1; 1.806 + state=quotePairTwo; 1.807 + } else if(/* UC0<=b && */ b<=UC7) { 1.808 + dynamicWindow=(int8_t)(b-UC0); 1.809 + isSingleByteMode=TRUE; 1.810 + goto fastSingle; 1.811 + } else if(/* UD0<=b && */ b<=UD7) { 1.812 + dynamicWindow=(int8_t)(b-UD0); 1.813 + isSingleByteMode=TRUE; 1.814 + cnv->toUBytes[0]=b; 1.815 + cnv->toULength=1; 1.816 + state=defineOne; 1.817 + goto singleByteMode; 1.818 + } else if(b==UDX) { 1.819 + isSingleByteMode=TRUE; 1.820 + cnv->toUBytes[0]=b; 1.821 + cnv->toULength=1; 1.822 + state=definePairOne; 1.823 + goto singleByteMode; 1.824 + } else if(b==UQU) { 1.825 + cnv->toUBytes[0]=b; 1.826 + cnv->toULength=1; 1.827 + state=quotePairOne; 1.828 + } else /* Urs */ { 1.829 + /* callback(illegal) */ 1.830 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.831 + cnv->toUBytes[0]=b; 1.832 + cnv->toULength=1; 1.833 + goto endloop; 1.834 + } 1.835 + break; 1.836 + case quotePairOne: 1.837 + byteOne=b; 1.838 + cnv->toUBytes[1]=b; 1.839 + cnv->toULength=2; 1.840 + state=quotePairTwo; 1.841 + break; 1.842 + case quotePairTwo: 1.843 + *target++=(UChar)((byteOne<<8)|b); 1.844 + state=readCommand; 1.845 + goto fastUnicode; 1.846 + } 1.847 + } 1.848 + } 1.849 +endloop: 1.850 + 1.851 + /* set the converter state back into UConverter */ 1.852 + if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 1.853 + /* reset to deal with the next character */ 1.854 + state=readCommand; 1.855 + } else if(state==readCommand) { 1.856 + /* not in a multi-byte sequence, reset toULength */ 1.857 + cnv->toULength=0; 1.858 + } 1.859 + scsu->toUIsSingleByteMode=isSingleByteMode; 1.860 + scsu->toUState=state; 1.861 + scsu->toUQuoteWindow=quoteWindow; 1.862 + scsu->toUDynamicWindow=dynamicWindow; 1.863 + scsu->toUByteOne=byteOne; 1.864 + 1.865 + /* write back the updated pointers */ 1.866 + pArgs->source=(const char *)source; 1.867 + pArgs->target=target; 1.868 + return; 1.869 +} 1.870 + 1.871 +/* SCSU-from-Unicode conversion functions ----------------------------------- */ 1.872 + 1.873 +/* 1.874 + * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve 1.875 + * reasonable results. The lookahead is minimal. 1.876 + * Many cases are simple: 1.877 + * A character fits directly into the current mode, a dynamic or static window, 1.878 + * or is not compressible. These cases are tested first. 1.879 + * Real compression heuristics are applied to the rest, in code branches for 1.880 + * single/Unicode mode and BMP/supplementary code points. 1.881 + * The heuristics used here are extremely simple. 1.882 + */ 1.883 + 1.884 +/* get the number of the window that this character is in, or -1 */ 1.885 +static int8_t 1.886 +getWindow(const uint32_t offsets[8], uint32_t c) { 1.887 + int i; 1.888 + for(i=0; i<8; ++i) { 1.889 + if((uint32_t)(c-offsets[i])<=0x7f) { 1.890 + return (int8_t)(i); 1.891 + } 1.892 + } 1.893 + return -1; 1.894 +} 1.895 + 1.896 +/* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ 1.897 +static UBool 1.898 +isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { 1.899 + return (UBool)(c<=offset+0x7f && 1.900 + (c>=offset || (c<=0x7f && 1.901 + (c>=0x20 || (1UL<<c)&0x2601)))); 1.902 + /* binary 0010 0110 0000 0001, 1.903 + check for b==0xd || b==0xa || b==9 || b==0 */ 1.904 +} 1.905 + 1.906 +/* 1.907 + * getNextDynamicWindow returns the next dynamic window to be redefined 1.908 + */ 1.909 +static int8_t 1.910 +getNextDynamicWindow(SCSUData *scsu) { 1.911 + int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; 1.912 + if(++scsu->nextWindowUseIndex==8) { 1.913 + scsu->nextWindowUseIndex=0; 1.914 + } 1.915 + return window; 1.916 +} 1.917 + 1.918 +/* 1.919 + * useDynamicWindow() adjusts 1.920 + * windowUse[] and nextWindowUseIndex for the algorithm to choose 1.921 + * the next dynamic window to be defined; 1.922 + * a subclass may override it and provide its own algorithm. 1.923 + */ 1.924 +static void 1.925 +useDynamicWindow(SCSUData *scsu, int8_t window) { 1.926 + /* 1.927 + * move the existing window, which just became the most recently used one, 1.928 + * up in windowUse[] to nextWindowUseIndex-1 1.929 + */ 1.930 + 1.931 + /* first, find the index of the window - backwards to favor the more recently used windows */ 1.932 + int i, j; 1.933 + 1.934 + i=scsu->nextWindowUseIndex; 1.935 + do { 1.936 + if(--i<0) { 1.937 + i=7; 1.938 + } 1.939 + } while(scsu->windowUse[i]!=window); 1.940 + 1.941 + /* now copy each windowUse[i+1] to [i] */ 1.942 + j=i+1; 1.943 + if(j==8) { 1.944 + j=0; 1.945 + } 1.946 + while(j!=scsu->nextWindowUseIndex) { 1.947 + scsu->windowUse[i]=scsu->windowUse[j]; 1.948 + i=j; 1.949 + if(++j==8) { j=0; } 1.950 + } 1.951 + 1.952 + /* finally, set the window into the most recently used index */ 1.953 + scsu->windowUse[i]=window; 1.954 +} 1.955 + 1.956 +/* 1.957 + * calculate the offset and the code for a dynamic window that contains the character 1.958 + * takes fixed offsets into account 1.959 + * the offset of the window is stored in the offset variable, 1.960 + * the code is returned 1.961 + * 1.962 + * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code 1.963 + */ 1.964 +static int 1.965 +getDynamicOffset(uint32_t c, uint32_t *pOffset) { 1.966 + int i; 1.967 + 1.968 + for(i=0; i<7; ++i) { 1.969 + if((uint32_t)(c-fixedOffsets[i])<=0x7f) { 1.970 + *pOffset=fixedOffsets[i]; 1.971 + return 0xf9+i; 1.972 + } 1.973 + } 1.974 + 1.975 + if(c<0x80) { 1.976 + /* No dynamic window for US-ASCII. */ 1.977 + return -1; 1.978 + } else if(c<0x3400 || 1.979 + (uint32_t)(c-0x10000)<(0x14000-0x10000) || 1.980 + (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) 1.981 + ) { 1.982 + /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ 1.983 + *pOffset=c&0x7fffff80; 1.984 + return (int)(c>>7); 1.985 + } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { 1.986 + /* For these characters we need to take the gapOffset into account. */ 1.987 + *pOffset=c&0x7fffff80; 1.988 + return (int)((c-gapOffset)>>7); 1.989 + } else { 1.990 + return -1; 1.991 + } 1.992 +} 1.993 + 1.994 +/* 1.995 + * Idea for compression: 1.996 + * - save SCSUData and other state before really starting work 1.997 + * - at endloop, see if compression could be better with just unicode mode 1.998 + * - don't do this if a callback has been called 1.999 + * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning 1.1000 + * - different buffer handling! 1.1001 + * 1.1002 + * Drawback or need for corrective handling: 1.1003 + * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and 1.1004 + * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible 1.1005 + * not only for compression but also for HTML/XML documents with following charset/encoding announcers. 1.1006 + * 1.1007 + * How to achieve both? 1.1008 + * - Only replace the result after an SDX or SCU? 1.1009 + */ 1.1010 + 1.1011 +static void 1.1012 +_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.1013 + UErrorCode *pErrorCode) { 1.1014 + UConverter *cnv; 1.1015 + SCSUData *scsu; 1.1016 + const UChar *source, *sourceLimit; 1.1017 + uint8_t *target; 1.1018 + int32_t targetCapacity; 1.1019 + int32_t *offsets; 1.1020 + 1.1021 + UBool isSingleByteMode; 1.1022 + uint8_t dynamicWindow; 1.1023 + uint32_t currentOffset; 1.1024 + 1.1025 + uint32_t c, delta; 1.1026 + 1.1027 + int32_t sourceIndex, nextSourceIndex; 1.1028 + 1.1029 + int32_t length; 1.1030 + 1.1031 + /* variables for compression heuristics */ 1.1032 + uint32_t offset; 1.1033 + UChar lead, trail; 1.1034 + int code; 1.1035 + int8_t window; 1.1036 + 1.1037 + /* set up the local pointers */ 1.1038 + cnv=pArgs->converter; 1.1039 + scsu=(SCSUData *)cnv->extraInfo; 1.1040 + 1.1041 + /* set up the local pointers */ 1.1042 + source=pArgs->source; 1.1043 + sourceLimit=pArgs->sourceLimit; 1.1044 + target=(uint8_t *)pArgs->target; 1.1045 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.1046 + offsets=pArgs->offsets; 1.1047 + 1.1048 + /* get the state machine state */ 1.1049 + isSingleByteMode=scsu->fromUIsSingleByteMode; 1.1050 + dynamicWindow=scsu->fromUDynamicWindow; 1.1051 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1052 + 1.1053 + c=cnv->fromUChar32; 1.1054 + 1.1055 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.1056 + sourceIndex= c==0 ? 0 : -1; 1.1057 + nextSourceIndex=0; 1.1058 + 1.1059 + /* similar conversion "loop" as in toUnicode */ 1.1060 +loop: 1.1061 + if(isSingleByteMode) { 1.1062 + if(c!=0 && targetCapacity>0) { 1.1063 + goto getTrailSingle; 1.1064 + } 1.1065 + 1.1066 + /* state machine for single-byte mode */ 1.1067 +/* singleByteMode: */ 1.1068 + while(source<sourceLimit) { 1.1069 + if(targetCapacity<=0) { 1.1070 + /* target is full */ 1.1071 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1072 + break; 1.1073 + } 1.1074 + c=*source++; 1.1075 + ++nextSourceIndex; 1.1076 + 1.1077 + if((c-0x20)<=0x5f) { 1.1078 + /* pass US-ASCII graphic character through */ 1.1079 + *target++=(uint8_t)c; 1.1080 + if(offsets!=NULL) { 1.1081 + *offsets++=sourceIndex; 1.1082 + } 1.1083 + --targetCapacity; 1.1084 + } else if(c<0x20) { 1.1085 + if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1.1086 + /* CR/LF/TAB/NUL */ 1.1087 + *target++=(uint8_t)c; 1.1088 + if(offsets!=NULL) { 1.1089 + *offsets++=sourceIndex; 1.1090 + } 1.1091 + --targetCapacity; 1.1092 + } else { 1.1093 + /* quote C0 control character */ 1.1094 + c|=SQ0<<8; 1.1095 + length=2; 1.1096 + goto outputBytes; 1.1097 + } 1.1098 + } else if((delta=c-currentOffset)<=0x7f) { 1.1099 + /* use the current dynamic window */ 1.1100 + *target++=(uint8_t)(delta|0x80); 1.1101 + if(offsets!=NULL) { 1.1102 + *offsets++=sourceIndex; 1.1103 + } 1.1104 + --targetCapacity; 1.1105 + } else if(U16_IS_SURROGATE(c)) { 1.1106 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1107 +getTrailSingle: 1.1108 + lead=(UChar)c; 1.1109 + if(source<sourceLimit) { 1.1110 + /* test the following code unit */ 1.1111 + trail=*source; 1.1112 + if(U16_IS_TRAIL(trail)) { 1.1113 + ++source; 1.1114 + ++nextSourceIndex; 1.1115 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.1116 + /* convert this surrogate code point */ 1.1117 + /* exit this condition tree */ 1.1118 + } else { 1.1119 + /* this is an unmatched lead code unit (1st surrogate) */ 1.1120 + /* callback(illegal) */ 1.1121 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1122 + goto endloop; 1.1123 + } 1.1124 + } else { 1.1125 + /* no more input */ 1.1126 + break; 1.1127 + } 1.1128 + } else { 1.1129 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.1130 + /* callback(illegal) */ 1.1131 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1132 + goto endloop; 1.1133 + } 1.1134 + 1.1135 + /* compress supplementary character U+10000..U+10ffff */ 1.1136 + if((delta=c-currentOffset)<=0x7f) { 1.1137 + /* use the current dynamic window */ 1.1138 + *target++=(uint8_t)(delta|0x80); 1.1139 + if(offsets!=NULL) { 1.1140 + *offsets++=sourceIndex; 1.1141 + } 1.1142 + --targetCapacity; 1.1143 + } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1144 + /* there is a dynamic window that contains this character, change to it */ 1.1145 + dynamicWindow=window; 1.1146 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1147 + useDynamicWindow(scsu, dynamicWindow); 1.1148 + c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1149 + length=2; 1.1150 + goto outputBytes; 1.1151 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1152 + /* might check if there are more characters in this window to come */ 1.1153 + /* define an extended window with this character */ 1.1154 + code-=0x200; 1.1155 + dynamicWindow=getNextDynamicWindow(scsu); 1.1156 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1157 + useDynamicWindow(scsu, dynamicWindow); 1.1158 + c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1159 + length=4; 1.1160 + goto outputBytes; 1.1161 + } else { 1.1162 + /* change to Unicode mode and output this (lead, trail) pair */ 1.1163 + isSingleByteMode=FALSE; 1.1164 + *target++=(uint8_t)SCU; 1.1165 + if(offsets!=NULL) { 1.1166 + *offsets++=sourceIndex; 1.1167 + } 1.1168 + --targetCapacity; 1.1169 + c=((uint32_t)lead<<16)|trail; 1.1170 + length=4; 1.1171 + goto outputBytes; 1.1172 + } 1.1173 + } else if(c<0xa0) { 1.1174 + /* quote C1 control character */ 1.1175 + c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1.1176 + length=2; 1.1177 + goto outputBytes; 1.1178 + } else if(c==0xfeff || c>=0xfff0) { 1.1179 + /* quote signature character=byte order mark and specials */ 1.1180 + c|=SQU<<16; 1.1181 + length=3; 1.1182 + goto outputBytes; 1.1183 + } else { 1.1184 + /* compress all other BMP characters */ 1.1185 + if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1186 + /* there is a window defined that contains this character - switch to it or quote from it? */ 1.1187 + if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1.1188 + /* change to dynamic window */ 1.1189 + dynamicWindow=window; 1.1190 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1191 + useDynamicWindow(scsu, dynamicWindow); 1.1192 + c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1193 + length=2; 1.1194 + goto outputBytes; 1.1195 + } else { 1.1196 + /* quote from dynamic window */ 1.1197 + c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1.1198 + length=2; 1.1199 + goto outputBytes; 1.1200 + } 1.1201 + } else if((window=getWindow(staticOffsets, c))>=0) { 1.1202 + /* quote from static window */ 1.1203 + c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1.1204 + length=2; 1.1205 + goto outputBytes; 1.1206 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1207 + /* define a dynamic window with this character */ 1.1208 + dynamicWindow=getNextDynamicWindow(scsu); 1.1209 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1210 + useDynamicWindow(scsu, dynamicWindow); 1.1211 + c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1212 + length=3; 1.1213 + goto outputBytes; 1.1214 + } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1.1215 + (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1.1216 + ) { 1.1217 + /* 1.1218 + * this character is not compressible (a BMP ideograph or similar); 1.1219 + * switch to Unicode mode if this is the last character in the block 1.1220 + * or there is at least one more ideograph following immediately 1.1221 + */ 1.1222 + isSingleByteMode=FALSE; 1.1223 + c|=SCU<<16; 1.1224 + length=3; 1.1225 + goto outputBytes; 1.1226 + } else { 1.1227 + /* quote Unicode */ 1.1228 + c|=SQU<<16; 1.1229 + length=3; 1.1230 + goto outputBytes; 1.1231 + } 1.1232 + } 1.1233 + 1.1234 + /* normal end of conversion: prepare for a new character */ 1.1235 + c=0; 1.1236 + sourceIndex=nextSourceIndex; 1.1237 + } 1.1238 + } else { 1.1239 + if(c!=0 && targetCapacity>0) { 1.1240 + goto getTrailUnicode; 1.1241 + } 1.1242 + 1.1243 + /* state machine for Unicode mode */ 1.1244 +/* unicodeByteMode: */ 1.1245 + while(source<sourceLimit) { 1.1246 + if(targetCapacity<=0) { 1.1247 + /* target is full */ 1.1248 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1249 + break; 1.1250 + } 1.1251 + c=*source++; 1.1252 + ++nextSourceIndex; 1.1253 + 1.1254 + if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1.1255 + /* not compressible, write character directly */ 1.1256 + if(targetCapacity>=2) { 1.1257 + *target++=(uint8_t)(c>>8); 1.1258 + *target++=(uint8_t)c; 1.1259 + if(offsets!=NULL) { 1.1260 + *offsets++=sourceIndex; 1.1261 + *offsets++=sourceIndex; 1.1262 + } 1.1263 + targetCapacity-=2; 1.1264 + } else { 1.1265 + length=2; 1.1266 + goto outputBytes; 1.1267 + } 1.1268 + } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1.1269 + /* compress BMP character if the following one is not an uncompressible ideograph */ 1.1270 + if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1.1271 + if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1.1272 + /* ASCII digit or letter */ 1.1273 + isSingleByteMode=TRUE; 1.1274 + c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1.1275 + length=2; 1.1276 + goto outputBytes; 1.1277 + } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1278 + /* there is a dynamic window that contains this character, change to it */ 1.1279 + isSingleByteMode=TRUE; 1.1280 + dynamicWindow=window; 1.1281 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1282 + useDynamicWindow(scsu, dynamicWindow); 1.1283 + c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1284 + length=2; 1.1285 + goto outputBytes; 1.1286 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1287 + /* define a dynamic window with this character */ 1.1288 + isSingleByteMode=TRUE; 1.1289 + dynamicWindow=getNextDynamicWindow(scsu); 1.1290 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1291 + useDynamicWindow(scsu, dynamicWindow); 1.1292 + c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1293 + length=3; 1.1294 + goto outputBytes; 1.1295 + } 1.1296 + } 1.1297 + 1.1298 + /* don't know how to compress this character, just write it directly */ 1.1299 + length=2; 1.1300 + goto outputBytes; 1.1301 + } else if(c<0xe000) { 1.1302 + /* c is a surrogate */ 1.1303 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1304 +getTrailUnicode: 1.1305 + lead=(UChar)c; 1.1306 + if(source<sourceLimit) { 1.1307 + /* test the following code unit */ 1.1308 + trail=*source; 1.1309 + if(U16_IS_TRAIL(trail)) { 1.1310 + ++source; 1.1311 + ++nextSourceIndex; 1.1312 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.1313 + /* convert this surrogate code point */ 1.1314 + /* exit this condition tree */ 1.1315 + } else { 1.1316 + /* this is an unmatched lead code unit (1st surrogate) */ 1.1317 + /* callback(illegal) */ 1.1318 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1319 + goto endloop; 1.1320 + } 1.1321 + } else { 1.1322 + /* no more input */ 1.1323 + break; 1.1324 + } 1.1325 + } else { 1.1326 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.1327 + /* callback(illegal) */ 1.1328 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1329 + goto endloop; 1.1330 + } 1.1331 + 1.1332 + /* compress supplementary character */ 1.1333 + if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1.1334 + !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1.1335 + ) { 1.1336 + /* 1.1337 + * there is a dynamic window that contains this character and 1.1338 + * the following character is not uncompressible, 1.1339 + * change to the window 1.1340 + */ 1.1341 + isSingleByteMode=TRUE; 1.1342 + dynamicWindow=window; 1.1343 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1344 + useDynamicWindow(scsu, dynamicWindow); 1.1345 + c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1346 + length=2; 1.1347 + goto outputBytes; 1.1348 + } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1.1349 + (code=getDynamicOffset(c, &offset))>=0 1.1350 + ) { 1.1351 + /* two supplementary characters in (probably) the same window - define an extended one */ 1.1352 + isSingleByteMode=TRUE; 1.1353 + code-=0x200; 1.1354 + dynamicWindow=getNextDynamicWindow(scsu); 1.1355 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1356 + useDynamicWindow(scsu, dynamicWindow); 1.1357 + c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1358 + length=4; 1.1359 + goto outputBytes; 1.1360 + } else { 1.1361 + /* don't know how to compress this character, just write it directly */ 1.1362 + c=((uint32_t)lead<<16)|trail; 1.1363 + length=4; 1.1364 + goto outputBytes; 1.1365 + } 1.1366 + } else /* 0xe000<=c<0xf300 */ { 1.1367 + /* quote to avoid SCSU tags */ 1.1368 + c|=UQU<<16; 1.1369 + length=3; 1.1370 + goto outputBytes; 1.1371 + } 1.1372 + 1.1373 + /* normal end of conversion: prepare for a new character */ 1.1374 + c=0; 1.1375 + sourceIndex=nextSourceIndex; 1.1376 + } 1.1377 + } 1.1378 +endloop: 1.1379 + 1.1380 + /* set the converter state back into UConverter */ 1.1381 + scsu->fromUIsSingleByteMode=isSingleByteMode; 1.1382 + scsu->fromUDynamicWindow=dynamicWindow; 1.1383 + 1.1384 + cnv->fromUChar32=c; 1.1385 + 1.1386 + /* write back the updated pointers */ 1.1387 + pArgs->source=source; 1.1388 + pArgs->target=(char *)target; 1.1389 + pArgs->offsets=offsets; 1.1390 + return; 1.1391 + 1.1392 +outputBytes: 1.1393 + /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1.1394 + /* from the first if in the loop we know that targetCapacity>0 */ 1.1395 + if(length<=targetCapacity) { 1.1396 + if(offsets==NULL) { 1.1397 + switch(length) { 1.1398 + /* each branch falls through to the next one */ 1.1399 + case 4: 1.1400 + *target++=(uint8_t)(c>>24); 1.1401 + case 3: /*fall through*/ 1.1402 + *target++=(uint8_t)(c>>16); 1.1403 + case 2: /*fall through*/ 1.1404 + *target++=(uint8_t)(c>>8); 1.1405 + case 1: /*fall through*/ 1.1406 + *target++=(uint8_t)c; 1.1407 + default: 1.1408 + /* will never occur */ 1.1409 + break; 1.1410 + } 1.1411 + } else { 1.1412 + switch(length) { 1.1413 + /* each branch falls through to the next one */ 1.1414 + case 4: 1.1415 + *target++=(uint8_t)(c>>24); 1.1416 + *offsets++=sourceIndex; 1.1417 + case 3: /*fall through*/ 1.1418 + *target++=(uint8_t)(c>>16); 1.1419 + *offsets++=sourceIndex; 1.1420 + case 2: /*fall through*/ 1.1421 + *target++=(uint8_t)(c>>8); 1.1422 + *offsets++=sourceIndex; 1.1423 + case 1: /*fall through*/ 1.1424 + *target++=(uint8_t)c; 1.1425 + *offsets++=sourceIndex; 1.1426 + default: 1.1427 + /* will never occur */ 1.1428 + break; 1.1429 + } 1.1430 + } 1.1431 + targetCapacity-=length; 1.1432 + 1.1433 + /* normal end of conversion: prepare for a new character */ 1.1434 + c=0; 1.1435 + sourceIndex=nextSourceIndex; 1.1436 + goto loop; 1.1437 + } else { 1.1438 + uint8_t *p; 1.1439 + 1.1440 + /* 1.1441 + * We actually do this backwards here: 1.1442 + * In order to save an intermediate variable, we output 1.1443 + * first to the overflow buffer what does not fit into the 1.1444 + * regular target. 1.1445 + */ 1.1446 + /* we know that 0<=targetCapacity<length<=4 */ 1.1447 + /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1.1448 + length-=targetCapacity; 1.1449 + p=(uint8_t *)cnv->charErrorBuffer; 1.1450 + switch(length) { 1.1451 + /* each branch falls through to the next one */ 1.1452 + case 4: 1.1453 + *p++=(uint8_t)(c>>24); 1.1454 + case 3: /*fall through*/ 1.1455 + *p++=(uint8_t)(c>>16); 1.1456 + case 2: /*fall through*/ 1.1457 + *p++=(uint8_t)(c>>8); 1.1458 + case 1: /*fall through*/ 1.1459 + *p=(uint8_t)c; 1.1460 + default: 1.1461 + /* will never occur */ 1.1462 + break; 1.1463 + } 1.1464 + cnv->charErrorBufferLength=(int8_t)length; 1.1465 + 1.1466 + /* now output what fits into the regular target */ 1.1467 + c>>=8*length; /* length was reduced by targetCapacity */ 1.1468 + switch(targetCapacity) { 1.1469 + /* each branch falls through to the next one */ 1.1470 + case 3: 1.1471 + *target++=(uint8_t)(c>>16); 1.1472 + if(offsets!=NULL) { 1.1473 + *offsets++=sourceIndex; 1.1474 + } 1.1475 + case 2: /*fall through*/ 1.1476 + *target++=(uint8_t)(c>>8); 1.1477 + if(offsets!=NULL) { 1.1478 + *offsets++=sourceIndex; 1.1479 + } 1.1480 + case 1: /*fall through*/ 1.1481 + *target++=(uint8_t)c; 1.1482 + if(offsets!=NULL) { 1.1483 + *offsets++=sourceIndex; 1.1484 + } 1.1485 + default: 1.1486 + break; 1.1487 + } 1.1488 + 1.1489 + /* target overflow */ 1.1490 + targetCapacity=0; 1.1491 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1492 + c=0; 1.1493 + goto endloop; 1.1494 + } 1.1495 +} 1.1496 + 1.1497 +/* 1.1498 + * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. 1.1499 + * If a change is made in the original function, then either 1.1500 + * change this function the same way or 1.1501 + * re-copy the original function and remove the variables 1.1502 + * offsets, sourceIndex, and nextSourceIndex. 1.1503 + */ 1.1504 +static void 1.1505 +_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, 1.1506 + UErrorCode *pErrorCode) { 1.1507 + UConverter *cnv; 1.1508 + SCSUData *scsu; 1.1509 + const UChar *source, *sourceLimit; 1.1510 + uint8_t *target; 1.1511 + int32_t targetCapacity; 1.1512 + 1.1513 + UBool isSingleByteMode; 1.1514 + uint8_t dynamicWindow; 1.1515 + uint32_t currentOffset; 1.1516 + 1.1517 + uint32_t c, delta; 1.1518 + 1.1519 + int32_t length; 1.1520 + 1.1521 + /* variables for compression heuristics */ 1.1522 + uint32_t offset; 1.1523 + UChar lead, trail; 1.1524 + int code; 1.1525 + int8_t window; 1.1526 + 1.1527 + /* set up the local pointers */ 1.1528 + cnv=pArgs->converter; 1.1529 + scsu=(SCSUData *)cnv->extraInfo; 1.1530 + 1.1531 + /* set up the local pointers */ 1.1532 + source=pArgs->source; 1.1533 + sourceLimit=pArgs->sourceLimit; 1.1534 + target=(uint8_t *)pArgs->target; 1.1535 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.1536 + 1.1537 + /* get the state machine state */ 1.1538 + isSingleByteMode=scsu->fromUIsSingleByteMode; 1.1539 + dynamicWindow=scsu->fromUDynamicWindow; 1.1540 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1541 + 1.1542 + c=cnv->fromUChar32; 1.1543 + 1.1544 + /* similar conversion "loop" as in toUnicode */ 1.1545 +loop: 1.1546 + if(isSingleByteMode) { 1.1547 + if(c!=0 && targetCapacity>0) { 1.1548 + goto getTrailSingle; 1.1549 + } 1.1550 + 1.1551 + /* state machine for single-byte mode */ 1.1552 +/* singleByteMode: */ 1.1553 + while(source<sourceLimit) { 1.1554 + if(targetCapacity<=0) { 1.1555 + /* target is full */ 1.1556 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1557 + break; 1.1558 + } 1.1559 + c=*source++; 1.1560 + 1.1561 + if((c-0x20)<=0x5f) { 1.1562 + /* pass US-ASCII graphic character through */ 1.1563 + *target++=(uint8_t)c; 1.1564 + --targetCapacity; 1.1565 + } else if(c<0x20) { 1.1566 + if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1.1567 + /* CR/LF/TAB/NUL */ 1.1568 + *target++=(uint8_t)c; 1.1569 + --targetCapacity; 1.1570 + } else { 1.1571 + /* quote C0 control character */ 1.1572 + c|=SQ0<<8; 1.1573 + length=2; 1.1574 + goto outputBytes; 1.1575 + } 1.1576 + } else if((delta=c-currentOffset)<=0x7f) { 1.1577 + /* use the current dynamic window */ 1.1578 + *target++=(uint8_t)(delta|0x80); 1.1579 + --targetCapacity; 1.1580 + } else if(U16_IS_SURROGATE(c)) { 1.1581 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1582 +getTrailSingle: 1.1583 + lead=(UChar)c; 1.1584 + if(source<sourceLimit) { 1.1585 + /* test the following code unit */ 1.1586 + trail=*source; 1.1587 + if(U16_IS_TRAIL(trail)) { 1.1588 + ++source; 1.1589 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.1590 + /* convert this surrogate code point */ 1.1591 + /* exit this condition tree */ 1.1592 + } else { 1.1593 + /* this is an unmatched lead code unit (1st surrogate) */ 1.1594 + /* callback(illegal) */ 1.1595 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1596 + goto endloop; 1.1597 + } 1.1598 + } else { 1.1599 + /* no more input */ 1.1600 + break; 1.1601 + } 1.1602 + } else { 1.1603 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.1604 + /* callback(illegal) */ 1.1605 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1606 + goto endloop; 1.1607 + } 1.1608 + 1.1609 + /* compress supplementary character U+10000..U+10ffff */ 1.1610 + if((delta=c-currentOffset)<=0x7f) { 1.1611 + /* use the current dynamic window */ 1.1612 + *target++=(uint8_t)(delta|0x80); 1.1613 + --targetCapacity; 1.1614 + } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1615 + /* there is a dynamic window that contains this character, change to it */ 1.1616 + dynamicWindow=window; 1.1617 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1618 + useDynamicWindow(scsu, dynamicWindow); 1.1619 + c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1620 + length=2; 1.1621 + goto outputBytes; 1.1622 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1623 + /* might check if there are more characters in this window to come */ 1.1624 + /* define an extended window with this character */ 1.1625 + code-=0x200; 1.1626 + dynamicWindow=getNextDynamicWindow(scsu); 1.1627 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1628 + useDynamicWindow(scsu, dynamicWindow); 1.1629 + c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1630 + length=4; 1.1631 + goto outputBytes; 1.1632 + } else { 1.1633 + /* change to Unicode mode and output this (lead, trail) pair */ 1.1634 + isSingleByteMode=FALSE; 1.1635 + *target++=(uint8_t)SCU; 1.1636 + --targetCapacity; 1.1637 + c=((uint32_t)lead<<16)|trail; 1.1638 + length=4; 1.1639 + goto outputBytes; 1.1640 + } 1.1641 + } else if(c<0xa0) { 1.1642 + /* quote C1 control character */ 1.1643 + c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1.1644 + length=2; 1.1645 + goto outputBytes; 1.1646 + } else if(c==0xfeff || c>=0xfff0) { 1.1647 + /* quote signature character=byte order mark and specials */ 1.1648 + c|=SQU<<16; 1.1649 + length=3; 1.1650 + goto outputBytes; 1.1651 + } else { 1.1652 + /* compress all other BMP characters */ 1.1653 + if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1654 + /* there is a window defined that contains this character - switch to it or quote from it? */ 1.1655 + if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1.1656 + /* change to dynamic window */ 1.1657 + dynamicWindow=window; 1.1658 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1659 + useDynamicWindow(scsu, dynamicWindow); 1.1660 + c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1661 + length=2; 1.1662 + goto outputBytes; 1.1663 + } else { 1.1664 + /* quote from dynamic window */ 1.1665 + c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1.1666 + length=2; 1.1667 + goto outputBytes; 1.1668 + } 1.1669 + } else if((window=getWindow(staticOffsets, c))>=0) { 1.1670 + /* quote from static window */ 1.1671 + c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1.1672 + length=2; 1.1673 + goto outputBytes; 1.1674 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1675 + /* define a dynamic window with this character */ 1.1676 + dynamicWindow=getNextDynamicWindow(scsu); 1.1677 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1678 + useDynamicWindow(scsu, dynamicWindow); 1.1679 + c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1680 + length=3; 1.1681 + goto outputBytes; 1.1682 + } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1.1683 + (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1.1684 + ) { 1.1685 + /* 1.1686 + * this character is not compressible (a BMP ideograph or similar); 1.1687 + * switch to Unicode mode if this is the last character in the block 1.1688 + * or there is at least one more ideograph following immediately 1.1689 + */ 1.1690 + isSingleByteMode=FALSE; 1.1691 + c|=SCU<<16; 1.1692 + length=3; 1.1693 + goto outputBytes; 1.1694 + } else { 1.1695 + /* quote Unicode */ 1.1696 + c|=SQU<<16; 1.1697 + length=3; 1.1698 + goto outputBytes; 1.1699 + } 1.1700 + } 1.1701 + 1.1702 + /* normal end of conversion: prepare for a new character */ 1.1703 + c=0; 1.1704 + } 1.1705 + } else { 1.1706 + if(c!=0 && targetCapacity>0) { 1.1707 + goto getTrailUnicode; 1.1708 + } 1.1709 + 1.1710 + /* state machine for Unicode mode */ 1.1711 +/* unicodeByteMode: */ 1.1712 + while(source<sourceLimit) { 1.1713 + if(targetCapacity<=0) { 1.1714 + /* target is full */ 1.1715 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1716 + break; 1.1717 + } 1.1718 + c=*source++; 1.1719 + 1.1720 + if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1.1721 + /* not compressible, write character directly */ 1.1722 + if(targetCapacity>=2) { 1.1723 + *target++=(uint8_t)(c>>8); 1.1724 + *target++=(uint8_t)c; 1.1725 + targetCapacity-=2; 1.1726 + } else { 1.1727 + length=2; 1.1728 + goto outputBytes; 1.1729 + } 1.1730 + } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1.1731 + /* compress BMP character if the following one is not an uncompressible ideograph */ 1.1732 + if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1.1733 + if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1.1734 + /* ASCII digit or letter */ 1.1735 + isSingleByteMode=TRUE; 1.1736 + c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1.1737 + length=2; 1.1738 + goto outputBytes; 1.1739 + } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1.1740 + /* there is a dynamic window that contains this character, change to it */ 1.1741 + isSingleByteMode=TRUE; 1.1742 + dynamicWindow=window; 1.1743 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1744 + useDynamicWindow(scsu, dynamicWindow); 1.1745 + c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1746 + length=2; 1.1747 + goto outputBytes; 1.1748 + } else if((code=getDynamicOffset(c, &offset))>=0) { 1.1749 + /* define a dynamic window with this character */ 1.1750 + isSingleByteMode=TRUE; 1.1751 + dynamicWindow=getNextDynamicWindow(scsu); 1.1752 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1753 + useDynamicWindow(scsu, dynamicWindow); 1.1754 + c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1755 + length=3; 1.1756 + goto outputBytes; 1.1757 + } 1.1758 + } 1.1759 + 1.1760 + /* don't know how to compress this character, just write it directly */ 1.1761 + length=2; 1.1762 + goto outputBytes; 1.1763 + } else if(c<0xe000) { 1.1764 + /* c is a surrogate */ 1.1765 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1766 +getTrailUnicode: 1.1767 + lead=(UChar)c; 1.1768 + if(source<sourceLimit) { 1.1769 + /* test the following code unit */ 1.1770 + trail=*source; 1.1771 + if(U16_IS_TRAIL(trail)) { 1.1772 + ++source; 1.1773 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.1774 + /* convert this surrogate code point */ 1.1775 + /* exit this condition tree */ 1.1776 + } else { 1.1777 + /* this is an unmatched lead code unit (1st surrogate) */ 1.1778 + /* callback(illegal) */ 1.1779 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1780 + goto endloop; 1.1781 + } 1.1782 + } else { 1.1783 + /* no more input */ 1.1784 + break; 1.1785 + } 1.1786 + } else { 1.1787 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.1788 + /* callback(illegal) */ 1.1789 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1790 + goto endloop; 1.1791 + } 1.1792 + 1.1793 + /* compress supplementary character */ 1.1794 + if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1.1795 + !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1.1796 + ) { 1.1797 + /* 1.1798 + * there is a dynamic window that contains this character and 1.1799 + * the following character is not uncompressible, 1.1800 + * change to the window 1.1801 + */ 1.1802 + isSingleByteMode=TRUE; 1.1803 + dynamicWindow=window; 1.1804 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1.1805 + useDynamicWindow(scsu, dynamicWindow); 1.1806 + c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1.1807 + length=2; 1.1808 + goto outputBytes; 1.1809 + } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1.1810 + (code=getDynamicOffset(c, &offset))>=0 1.1811 + ) { 1.1812 + /* two supplementary characters in (probably) the same window - define an extended one */ 1.1813 + isSingleByteMode=TRUE; 1.1814 + code-=0x200; 1.1815 + dynamicWindow=getNextDynamicWindow(scsu); 1.1816 + currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1.1817 + useDynamicWindow(scsu, dynamicWindow); 1.1818 + c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1.1819 + length=4; 1.1820 + goto outputBytes; 1.1821 + } else { 1.1822 + /* don't know how to compress this character, just write it directly */ 1.1823 + c=((uint32_t)lead<<16)|trail; 1.1824 + length=4; 1.1825 + goto outputBytes; 1.1826 + } 1.1827 + } else /* 0xe000<=c<0xf300 */ { 1.1828 + /* quote to avoid SCSU tags */ 1.1829 + c|=UQU<<16; 1.1830 + length=3; 1.1831 + goto outputBytes; 1.1832 + } 1.1833 + 1.1834 + /* normal end of conversion: prepare for a new character */ 1.1835 + c=0; 1.1836 + } 1.1837 + } 1.1838 +endloop: 1.1839 + 1.1840 + /* set the converter state back into UConverter */ 1.1841 + scsu->fromUIsSingleByteMode=isSingleByteMode; 1.1842 + scsu->fromUDynamicWindow=dynamicWindow; 1.1843 + 1.1844 + cnv->fromUChar32=c; 1.1845 + 1.1846 + /* write back the updated pointers */ 1.1847 + pArgs->source=source; 1.1848 + pArgs->target=(char *)target; 1.1849 + return; 1.1850 + 1.1851 +outputBytes: 1.1852 + /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1.1853 + /* from the first if in the loop we know that targetCapacity>0 */ 1.1854 + if(length<=targetCapacity) { 1.1855 + switch(length) { 1.1856 + /* each branch falls through to the next one */ 1.1857 + case 4: 1.1858 + *target++=(uint8_t)(c>>24); 1.1859 + case 3: /*fall through*/ 1.1860 + *target++=(uint8_t)(c>>16); 1.1861 + case 2: /*fall through*/ 1.1862 + *target++=(uint8_t)(c>>8); 1.1863 + case 1: /*fall through*/ 1.1864 + *target++=(uint8_t)c; 1.1865 + default: 1.1866 + /* will never occur */ 1.1867 + break; 1.1868 + } 1.1869 + targetCapacity-=length; 1.1870 + 1.1871 + /* normal end of conversion: prepare for a new character */ 1.1872 + c=0; 1.1873 + goto loop; 1.1874 + } else { 1.1875 + uint8_t *p; 1.1876 + 1.1877 + /* 1.1878 + * We actually do this backwards here: 1.1879 + * In order to save an intermediate variable, we output 1.1880 + * first to the overflow buffer what does not fit into the 1.1881 + * regular target. 1.1882 + */ 1.1883 + /* we know that 0<=targetCapacity<length<=4 */ 1.1884 + /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1.1885 + length-=targetCapacity; 1.1886 + p=(uint8_t *)cnv->charErrorBuffer; 1.1887 + switch(length) { 1.1888 + /* each branch falls through to the next one */ 1.1889 + case 4: 1.1890 + *p++=(uint8_t)(c>>24); 1.1891 + case 3: /*fall through*/ 1.1892 + *p++=(uint8_t)(c>>16); 1.1893 + case 2: /*fall through*/ 1.1894 + *p++=(uint8_t)(c>>8); 1.1895 + case 1: /*fall through*/ 1.1896 + *p=(uint8_t)c; 1.1897 + default: 1.1898 + /* will never occur */ 1.1899 + break; 1.1900 + } 1.1901 + cnv->charErrorBufferLength=(int8_t)length; 1.1902 + 1.1903 + /* now output what fits into the regular target */ 1.1904 + c>>=8*length; /* length was reduced by targetCapacity */ 1.1905 + switch(targetCapacity) { 1.1906 + /* each branch falls through to the next one */ 1.1907 + case 3: 1.1908 + *target++=(uint8_t)(c>>16); 1.1909 + case 2: /*fall through*/ 1.1910 + *target++=(uint8_t)(c>>8); 1.1911 + case 1: /*fall through*/ 1.1912 + *target++=(uint8_t)c; 1.1913 + default: 1.1914 + break; 1.1915 + } 1.1916 + 1.1917 + /* target overflow */ 1.1918 + targetCapacity=0; 1.1919 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1920 + c=0; 1.1921 + goto endloop; 1.1922 + } 1.1923 +} 1.1924 + 1.1925 +/* miscellaneous ------------------------------------------------------------ */ 1.1926 + 1.1927 +static const char * 1.1928 +_SCSUGetName(const UConverter *cnv) { 1.1929 + SCSUData *scsu=(SCSUData *)cnv->extraInfo; 1.1930 + 1.1931 + switch(scsu->locale) { 1.1932 + case l_ja: 1.1933 + return "SCSU,locale=ja"; 1.1934 + default: 1.1935 + return "SCSU"; 1.1936 + } 1.1937 +} 1.1938 + 1.1939 +/* structure for SafeClone calculations */ 1.1940 +struct cloneSCSUStruct 1.1941 +{ 1.1942 + UConverter cnv; 1.1943 + SCSUData mydata; 1.1944 +}; 1.1945 + 1.1946 +static UConverter * 1.1947 +_SCSUSafeClone(const UConverter *cnv, 1.1948 + void *stackBuffer, 1.1949 + int32_t *pBufferSize, 1.1950 + UErrorCode *status) 1.1951 +{ 1.1952 + struct cloneSCSUStruct * localClone; 1.1953 + int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); 1.1954 + 1.1955 + if (U_FAILURE(*status)){ 1.1956 + return 0; 1.1957 + } 1.1958 + 1.1959 + if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 1.1960 + *pBufferSize = bufferSizeNeeded; 1.1961 + return 0; 1.1962 + } 1.1963 + 1.1964 + localClone = (struct cloneSCSUStruct *)stackBuffer; 1.1965 + /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1.1966 + 1.1967 + uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); 1.1968 + localClone->cnv.extraInfo = &localClone->mydata; 1.1969 + localClone->cnv.isExtraLocal = TRUE; 1.1970 + 1.1971 + return &localClone->cnv; 1.1972 +} 1.1973 + 1.1974 + 1.1975 +static const UConverterImpl _SCSUImpl={ 1.1976 + UCNV_SCSU, 1.1977 + 1.1978 + NULL, 1.1979 + NULL, 1.1980 + 1.1981 + _SCSUOpen, 1.1982 + _SCSUClose, 1.1983 + _SCSUReset, 1.1984 + 1.1985 + _SCSUToUnicode, 1.1986 + _SCSUToUnicodeWithOffsets, 1.1987 + _SCSUFromUnicode, 1.1988 + _SCSUFromUnicodeWithOffsets, 1.1989 + NULL, 1.1990 + 1.1991 + NULL, 1.1992 + _SCSUGetName, 1.1993 + NULL, 1.1994 + _SCSUSafeClone, 1.1995 + ucnv_getCompleteUnicodeSet 1.1996 +}; 1.1997 + 1.1998 +static const UConverterStaticData _SCSUStaticData={ 1.1999 + sizeof(UConverterStaticData), 1.2000 + "SCSU", 1.2001 + 1212, /* CCSID for SCSU */ 1.2002 + UCNV_IBM, UCNV_SCSU, 1.2003 + 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ 1.2004 + /* 1.2005 + * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode 1.2006 + * substitution string. 1.2007 + */ 1.2008 + { 0x0e, 0xff, 0xfd, 0 }, 3, 1.2009 + FALSE, FALSE, 1.2010 + 0, 1.2011 + 0, 1.2012 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.2013 +}; 1.2014 + 1.2015 +const UConverterSharedData _SCSUData={ 1.2016 + sizeof(UConverterSharedData), ~((uint32_t)0), 1.2017 + NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, 1.2018 + 0 1.2019 +}; 1.2020 + 1.2021 +#endif