intl/icu/source/common/ucnvscsu.c

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic depending on private browsing mode or
the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
One reservation remains: how to convince FindCookie callers to test the
condition and pass a nullptr when double-key logic is disabled.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2000-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: ucnvscsu.c
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2000nov18
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * This is an implementation of the Standard Compression Scheme for Unicode
michael@0 17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
michael@0 18 * Reserved commands and window settings are treated as illegal sequences and
michael@0 19 * will result in callback calls.
michael@0 20 */
michael@0 21
michael@0 22 #include "unicode/utypes.h"
michael@0 23
michael@0 24 #if !UCONFIG_NO_CONVERSION
michael@0 25
michael@0 26 #include "unicode/ucnv.h"
michael@0 27 #include "unicode/ucnv_cb.h"
michael@0 28 #include "unicode/utf16.h"
michael@0 29 #include "ucnv_bld.h"
michael@0 30 #include "ucnv_cnv.h"
michael@0 31 #include "cmemory.h"
michael@0 32
michael@0 33 /* SCSU definitions --------------------------------------------------------- */
michael@0 34
/*
 * SCSU command (tag) byte values.
 * The S* tags are recognized by the single-byte-mode state machine, the U* tags
 * by the Unicode-mode state machine. For each contiguous range of tags only the
 * first and the last value are named (e.g. SQ0..SQ7).
 */
enum {
    /* single-byte mode tags */
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    /* Unicode mode tags */
    UC0=0xE0, /* Select window 0 (switches to single-byte mode) */
    UC7=0xE7, /* Select window 7 (switches to single-byte mode) */
    UD0=0xE8, /* Define and select window 0 (switches to single-byte mode) */
    UD7=0xEF, /* Define and select window 7 (switches to single-byte mode) */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a window as extended (switches to single-byte mode) */
    Urs=0xF2  /* reserved */
};
michael@0 56
/*
 * Thresholds used when decoding the one-byte window offset argument of a
 * "define window" command.
 */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since in these areas no short-run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values from reservedStart up to (not including) fixedThreshold are reserved */
    reservedStart=0xA8,

    /* use the table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold=0xF9
};
michael@0 72
/*
 * Constant start offsets of the 8 static windows.
 * Indexed by the quote window number (0..7); all offsets are in the BMP.
 */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};
michael@0 84
/*
 * Initial start offsets of the 8 dynamic (sliding) windows.
 * Copied into the per-converter offset arrays when the converter is reset.
 */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 */
    0x00C0, /* Latin Extended A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};
michael@0 96
/*
 * Table of fixed predefined window offsets.
 * Indexed by (window offset byte - 0xF9), i.e. entry 0 belongs to byte 0xF9
 * and the last entry to byte 0xFF.
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
michael@0 107
/*
 * State values of the toUnicode state machine.
 * The state names what the next input byte is expected to be.
 */
enum {
    readCommand,    /* a command byte or a directly encoded character */
    quotePairOne,   /* first byte of a quoted 16-bit code unit */
    quotePairTwo,   /* second byte of a 16-bit code unit */
    quoteOne,       /* the single byte quoted from a window */
    definePairOne,  /* first argument byte of an extended window definition */
    definePairTwo,  /* second argument byte of an extended window definition */
    defineOne       /* the offset byte of a window definition */
};
michael@0 118
michael@0 119 typedef struct SCSUData {
michael@0 120 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
michael@0 121 uint32_t toUDynamicOffsets[8];
michael@0 122 uint32_t fromUDynamicOffsets[8];
michael@0 123
michael@0 124 /* state machine state - toUnicode */
michael@0 125 UBool toUIsSingleByteMode;
michael@0 126 uint8_t toUState;
michael@0 127 int8_t toUQuoteWindow, toUDynamicWindow;
michael@0 128 uint8_t toUByteOne;
michael@0 129 uint8_t toUPadding[3];
michael@0 130
michael@0 131 /* state machine state - fromUnicode */
michael@0 132 UBool fromUIsSingleByteMode;
michael@0 133 int8_t fromUDynamicWindow;
michael@0 134
michael@0 135 /*
michael@0 136 * windowUse[] keeps track of the use of the dynamic windows:
michael@0 137 * At nextWindowUseIndex there is the least recently used window,
michael@0 138 * and the following windows (in a wrapping manner) are more and more
michael@0 139 * recently used.
michael@0 140 * At nextWindowUseIndex-1 there is the most recently used window.
michael@0 141 */
michael@0 142 uint8_t locale;
michael@0 143 int8_t nextWindowUseIndex;
michael@0 144 int8_t windowUse[8];
michael@0 145 } SCSUData;
michael@0 146
/*
 * Initial contents of SCSUData.windowUse[]: dynamic window numbers ordered
 * from least to most recently used. The _ja variant is used for the l_ja locale.
 */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
michael@0 149
/* locale selector values stored in SCSUData.locale */
enum {
    lGeneric,   /* any locale other than Japanese */
    l_ja        /* locale ID "ja" or starting with "ja_" */
};
michael@0 153
michael@0 154 /* SCSU setup functions ----------------------------------------------------- */
michael@0 155
michael@0 156 static void
michael@0 157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
michael@0 158 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
michael@0 159
michael@0 160 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 161 /* reset toUnicode */
michael@0 162 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
michael@0 163
michael@0 164 scsu->toUIsSingleByteMode=TRUE;
michael@0 165 scsu->toUState=readCommand;
michael@0 166 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
michael@0 167 scsu->toUByteOne=0;
michael@0 168
michael@0 169 cnv->toULength=0;
michael@0 170 }
michael@0 171 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 172 /* reset fromUnicode */
michael@0 173 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
michael@0 174
michael@0 175 scsu->fromUIsSingleByteMode=TRUE;
michael@0 176 scsu->fromUDynamicWindow=0;
michael@0 177
michael@0 178 scsu->nextWindowUseIndex=0;
michael@0 179 switch(scsu->locale) {
michael@0 180 case l_ja:
michael@0 181 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
michael@0 182 break;
michael@0 183 default:
michael@0 184 uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
michael@0 185 break;
michael@0 186 }
michael@0 187
michael@0 188 cnv->fromUChar32=0;
michael@0 189 }
michael@0 190 }
michael@0 191
michael@0 192 static void
michael@0 193 _SCSUOpen(UConverter *cnv,
michael@0 194 UConverterLoadArgs *pArgs,
michael@0 195 UErrorCode *pErrorCode) {
michael@0 196 const char *locale=pArgs->locale;
michael@0 197 if(pArgs->onlyTestIsLoadable) {
michael@0 198 return;
michael@0 199 }
michael@0 200 cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
michael@0 201 if(cnv->extraInfo!=NULL) {
michael@0 202 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
michael@0 203 ((SCSUData *)cnv->extraInfo)->locale=l_ja;
michael@0 204 } else {
michael@0 205 ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
michael@0 206 }
michael@0 207 _SCSUReset(cnv, UCNV_RESET_BOTH);
michael@0 208 } else {
michael@0 209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 210 }
michael@0 211
michael@0 212 /* Set the substitution character U+fffd as a Unicode string. */
michael@0 213 cnv->subUChars[0]=0xfffd;
michael@0 214 cnv->subCharLen=-1;
michael@0 215 }
michael@0 216
michael@0 217 static void
michael@0 218 _SCSUClose(UConverter *cnv) {
michael@0 219 if(cnv->extraInfo!=NULL) {
michael@0 220 if(!cnv->isExtraLocal) {
michael@0 221 uprv_free(cnv->extraInfo);
michael@0 222 }
michael@0 223 cnv->extraInfo=NULL;
michael@0 224 }
michael@0 225 }
michael@0 226
michael@0 227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
michael@0 228
michael@0 229 static void
michael@0 230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 231 UErrorCode *pErrorCode) {
michael@0 232 UConverter *cnv;
michael@0 233 SCSUData *scsu;
michael@0 234 const uint8_t *source, *sourceLimit;
michael@0 235 UChar *target;
michael@0 236 const UChar *targetLimit;
michael@0 237 int32_t *offsets;
michael@0 238 UBool isSingleByteMode;
michael@0 239 uint8_t state, byteOne;
michael@0 240 int8_t quoteWindow, dynamicWindow;
michael@0 241
michael@0 242 int32_t sourceIndex, nextSourceIndex;
michael@0 243
michael@0 244 uint8_t b;
michael@0 245
michael@0 246 /* set up the local pointers */
michael@0 247 cnv=pArgs->converter;
michael@0 248 scsu=(SCSUData *)cnv->extraInfo;
michael@0 249
michael@0 250 source=(const uint8_t *)pArgs->source;
michael@0 251 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 252 target=pArgs->target;
michael@0 253 targetLimit=pArgs->targetLimit;
michael@0 254 offsets=pArgs->offsets;
michael@0 255
michael@0 256 /* get the state machine state */
michael@0 257 isSingleByteMode=scsu->toUIsSingleByteMode;
michael@0 258 state=scsu->toUState;
michael@0 259 quoteWindow=scsu->toUQuoteWindow;
michael@0 260 dynamicWindow=scsu->toUDynamicWindow;
michael@0 261 byteOne=scsu->toUByteOne;
michael@0 262
michael@0 263 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 264 sourceIndex=state==readCommand ? 0 : -1;
michael@0 265 nextSourceIndex=0;
michael@0 266
michael@0 267 /*
michael@0 268 * conversion "loop"
michael@0 269 *
michael@0 270 * For performance, this is not a normal C loop.
michael@0 271 * Instead, there are two code blocks for the two SCSU modes.
michael@0 272 * The function branches to either one, and a change of the mode is done with a goto to
michael@0 273 * the other branch.
michael@0 274 *
michael@0 275 * Each branch has two conventional loops:
michael@0 276 * - a fast-path loop for the most common codes in the mode
michael@0 277 * - a loop for all other codes in the mode
michael@0 278 * When the fast-path runs into a code that it cannot handle, its loop ends and it
michael@0 279 * runs into the following loop to handle the other codes.
michael@0 280 * The end of the input or output buffer is also handled by the slower loop.
michael@0 281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
michael@0 282 *
michael@0 283 * The callback handling is done by returning with an error code.
michael@0 284 * The conversion framework actually calls the callback function.
michael@0 285 */
michael@0 286 if(isSingleByteMode) {
michael@0 287 /* fast path for single-byte mode */
michael@0 288 if(state==readCommand) {
michael@0 289 fastSingle:
michael@0 290 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
michael@0 291 ++source;
michael@0 292 ++nextSourceIndex;
michael@0 293 if(b<=0x7f) {
michael@0 294 /* write US-ASCII graphic character or DEL */
michael@0 295 *target++=(UChar)b;
michael@0 296 if(offsets!=NULL) {
michael@0 297 *offsets++=sourceIndex;
michael@0 298 }
michael@0 299 } else {
michael@0 300 /* write from dynamic window */
michael@0 301 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
michael@0 302 if(c<=0xffff) {
michael@0 303 *target++=(UChar)c;
michael@0 304 if(offsets!=NULL) {
michael@0 305 *offsets++=sourceIndex;
michael@0 306 }
michael@0 307 } else {
michael@0 308 /* output surrogate pair */
michael@0 309 *target++=(UChar)(0xd7c0+(c>>10));
michael@0 310 if(target<targetLimit) {
michael@0 311 *target++=(UChar)(0xdc00|(c&0x3ff));
michael@0 312 if(offsets!=NULL) {
michael@0 313 *offsets++=sourceIndex;
michael@0 314 *offsets++=sourceIndex;
michael@0 315 }
michael@0 316 } else {
michael@0 317 /* target overflow */
michael@0 318 if(offsets!=NULL) {
michael@0 319 *offsets++=sourceIndex;
michael@0 320 }
michael@0 321 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
michael@0 322 cnv->UCharErrorBufferLength=1;
michael@0 323 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 324 goto endloop;
michael@0 325 }
michael@0 326 }
michael@0 327 }
michael@0 328 sourceIndex=nextSourceIndex;
michael@0 329 }
michael@0 330 }
michael@0 331
michael@0 332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
michael@0 333 singleByteMode:
michael@0 334 while(source<sourceLimit) {
michael@0 335 if(target>=targetLimit) {
michael@0 336 /* target is full */
michael@0 337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 338 break;
michael@0 339 }
michael@0 340 b=*source++;
michael@0 341 ++nextSourceIndex;
michael@0 342 switch(state) {
michael@0 343 case readCommand:
michael@0 344 /* redundant conditions are commented out */
michael@0 345 /* here: b<0x20 because otherwise we would be in fastSingle */
michael@0 346 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
michael@0 347 /* CR/LF/TAB/NUL */
michael@0 348 *target++=(UChar)b;
michael@0 349 if(offsets!=NULL) {
michael@0 350 *offsets++=sourceIndex;
michael@0 351 }
michael@0 352 sourceIndex=nextSourceIndex;
michael@0 353 goto fastSingle;
michael@0 354 } else if(SC0<=b) {
michael@0 355 if(b<=SC7) {
michael@0 356 dynamicWindow=(int8_t)(b-SC0);
michael@0 357 sourceIndex=nextSourceIndex;
michael@0 358 goto fastSingle;
michael@0 359 } else /* if(SD0<=b && b<=SD7) */ {
michael@0 360 dynamicWindow=(int8_t)(b-SD0);
michael@0 361 state=defineOne;
michael@0 362 }
michael@0 363 } else if(/* SQ0<=b && */ b<=SQ7) {
michael@0 364 quoteWindow=(int8_t)(b-SQ0);
michael@0 365 state=quoteOne;
michael@0 366 } else if(b==SDX) {
michael@0 367 state=definePairOne;
michael@0 368 } else if(b==SQU) {
michael@0 369 state=quotePairOne;
michael@0 370 } else if(b==SCU) {
michael@0 371 sourceIndex=nextSourceIndex;
michael@0 372 isSingleByteMode=FALSE;
michael@0 373 goto fastUnicode;
michael@0 374 } else /* Srs */ {
michael@0 375 /* callback(illegal) */
michael@0 376 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 377 cnv->toUBytes[0]=b;
michael@0 378 cnv->toULength=1;
michael@0 379 goto endloop;
michael@0 380 }
michael@0 381
michael@0 382 /* store the first byte of a multibyte sequence in toUBytes[] */
michael@0 383 cnv->toUBytes[0]=b;
michael@0 384 cnv->toULength=1;
michael@0 385 break;
michael@0 386 case quotePairOne:
michael@0 387 byteOne=b;
michael@0 388 cnv->toUBytes[1]=b;
michael@0 389 cnv->toULength=2;
michael@0 390 state=quotePairTwo;
michael@0 391 break;
michael@0 392 case quotePairTwo:
michael@0 393 *target++=(UChar)((byteOne<<8)|b);
michael@0 394 if(offsets!=NULL) {
michael@0 395 *offsets++=sourceIndex;
michael@0 396 }
michael@0 397 sourceIndex=nextSourceIndex;
michael@0 398 state=readCommand;
michael@0 399 goto fastSingle;
michael@0 400 case quoteOne:
michael@0 401 if(b<0x80) {
michael@0 402 /* all static offsets are in the BMP */
michael@0 403 *target++=(UChar)(staticOffsets[quoteWindow]+b);
michael@0 404 if(offsets!=NULL) {
michael@0 405 *offsets++=sourceIndex;
michael@0 406 }
michael@0 407 } else {
michael@0 408 /* write from dynamic window */
michael@0 409 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
michael@0 410 if(c<=0xffff) {
michael@0 411 *target++=(UChar)c;
michael@0 412 if(offsets!=NULL) {
michael@0 413 *offsets++=sourceIndex;
michael@0 414 }
michael@0 415 } else {
michael@0 416 /* output surrogate pair */
michael@0 417 *target++=(UChar)(0xd7c0+(c>>10));
michael@0 418 if(target<targetLimit) {
michael@0 419 *target++=(UChar)(0xdc00|(c&0x3ff));
michael@0 420 if(offsets!=NULL) {
michael@0 421 *offsets++=sourceIndex;
michael@0 422 *offsets++=sourceIndex;
michael@0 423 }
michael@0 424 } else {
michael@0 425 /* target overflow */
michael@0 426 if(offsets!=NULL) {
michael@0 427 *offsets++=sourceIndex;
michael@0 428 }
michael@0 429 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
michael@0 430 cnv->UCharErrorBufferLength=1;
michael@0 431 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 432 goto endloop;
michael@0 433 }
michael@0 434 }
michael@0 435 }
michael@0 436 sourceIndex=nextSourceIndex;
michael@0 437 state=readCommand;
michael@0 438 goto fastSingle;
michael@0 439 case definePairOne:
michael@0 440 dynamicWindow=(int8_t)((b>>5)&7);
michael@0 441 byteOne=(uint8_t)(b&0x1f);
michael@0 442 cnv->toUBytes[1]=b;
michael@0 443 cnv->toULength=2;
michael@0 444 state=definePairTwo;
michael@0 445 break;
michael@0 446 case definePairTwo:
michael@0 447 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
michael@0 448 sourceIndex=nextSourceIndex;
michael@0 449 state=readCommand;
michael@0 450 goto fastSingle;
michael@0 451 case defineOne:
michael@0 452 if(b==0) {
michael@0 453 /* callback(illegal): Reserved window offset value 0 */
michael@0 454 cnv->toUBytes[1]=b;
michael@0 455 cnv->toULength=2;
michael@0 456 goto endloop;
michael@0 457 } else if(b<gapThreshold) {
michael@0 458 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
michael@0 459 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
michael@0 460 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
michael@0 461 } else if(b>=fixedThreshold) {
michael@0 462 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
michael@0 463 } else {
michael@0 464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
michael@0 465 cnv->toUBytes[1]=b;
michael@0 466 cnv->toULength=2;
michael@0 467 goto endloop;
michael@0 468 }
michael@0 469 sourceIndex=nextSourceIndex;
michael@0 470 state=readCommand;
michael@0 471 goto fastSingle;
michael@0 472 }
michael@0 473 }
michael@0 474 } else {
michael@0 475 /* fast path for Unicode mode */
michael@0 476 if(state==readCommand) {
michael@0 477 fastUnicode:
michael@0 478 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
michael@0 479 *target++=(UChar)((b<<8)|source[1]);
michael@0 480 if(offsets!=NULL) {
michael@0 481 *offsets++=sourceIndex;
michael@0 482 }
michael@0 483 sourceIndex=nextSourceIndex;
michael@0 484 nextSourceIndex+=2;
michael@0 485 source+=2;
michael@0 486 }
michael@0 487 }
michael@0 488
michael@0 489 /* normal state machine for Unicode mode */
michael@0 490 /* unicodeByteMode: */
michael@0 491 while(source<sourceLimit) {
michael@0 492 if(target>=targetLimit) {
michael@0 493 /* target is full */
michael@0 494 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 495 break;
michael@0 496 }
michael@0 497 b=*source++;
michael@0 498 ++nextSourceIndex;
michael@0 499 switch(state) {
michael@0 500 case readCommand:
michael@0 501 if((uint8_t)(b-UC0)>(Urs-UC0)) {
michael@0 502 byteOne=b;
michael@0 503 cnv->toUBytes[0]=b;
michael@0 504 cnv->toULength=1;
michael@0 505 state=quotePairTwo;
michael@0 506 } else if(/* UC0<=b && */ b<=UC7) {
michael@0 507 dynamicWindow=(int8_t)(b-UC0);
michael@0 508 sourceIndex=nextSourceIndex;
michael@0 509 isSingleByteMode=TRUE;
michael@0 510 goto fastSingle;
michael@0 511 } else if(/* UD0<=b && */ b<=UD7) {
michael@0 512 dynamicWindow=(int8_t)(b-UD0);
michael@0 513 isSingleByteMode=TRUE;
michael@0 514 cnv->toUBytes[0]=b;
michael@0 515 cnv->toULength=1;
michael@0 516 state=defineOne;
michael@0 517 goto singleByteMode;
michael@0 518 } else if(b==UDX) {
michael@0 519 isSingleByteMode=TRUE;
michael@0 520 cnv->toUBytes[0]=b;
michael@0 521 cnv->toULength=1;
michael@0 522 state=definePairOne;
michael@0 523 goto singleByteMode;
michael@0 524 } else if(b==UQU) {
michael@0 525 cnv->toUBytes[0]=b;
michael@0 526 cnv->toULength=1;
michael@0 527 state=quotePairOne;
michael@0 528 } else /* Urs */ {
michael@0 529 /* callback(illegal) */
michael@0 530 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 531 cnv->toUBytes[0]=b;
michael@0 532 cnv->toULength=1;
michael@0 533 goto endloop;
michael@0 534 }
michael@0 535 break;
michael@0 536 case quotePairOne:
michael@0 537 byteOne=b;
michael@0 538 cnv->toUBytes[1]=b;
michael@0 539 cnv->toULength=2;
michael@0 540 state=quotePairTwo;
michael@0 541 break;
michael@0 542 case quotePairTwo:
michael@0 543 *target++=(UChar)((byteOne<<8)|b);
michael@0 544 if(offsets!=NULL) {
michael@0 545 *offsets++=sourceIndex;
michael@0 546 }
michael@0 547 sourceIndex=nextSourceIndex;
michael@0 548 state=readCommand;
michael@0 549 goto fastUnicode;
michael@0 550 }
michael@0 551 }
michael@0 552 }
michael@0 553 endloop:
michael@0 554
michael@0 555 /* set the converter state back into UConverter */
michael@0 556 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
michael@0 557 /* reset to deal with the next character */
michael@0 558 state=readCommand;
michael@0 559 } else if(state==readCommand) {
michael@0 560 /* not in a multi-byte sequence, reset toULength */
michael@0 561 cnv->toULength=0;
michael@0 562 }
michael@0 563 scsu->toUIsSingleByteMode=isSingleByteMode;
michael@0 564 scsu->toUState=state;
michael@0 565 scsu->toUQuoteWindow=quoteWindow;
michael@0 566 scsu->toUDynamicWindow=dynamicWindow;
michael@0 567 scsu->toUByteOne=byteOne;
michael@0 568
michael@0 569 /* write back the updated pointers */
michael@0 570 pArgs->source=(const char *)source;
michael@0 571 pArgs->target=target;
michael@0 572 pArgs->offsets=offsets;
michael@0 573 return;
michael@0 574 }
michael@0 575
michael@0 576 /*
michael@0 577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
michael@0 578 * If a change is made in the original function, then either
michael@0 579 * change this function the same way or
michael@0 580 * re-copy the original function and remove the variables
michael@0 581 * offsets, sourceIndex, and nextSourceIndex.
michael@0 582 */
michael@0 583 static void
michael@0 584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
michael@0 585 UErrorCode *pErrorCode) {
michael@0 586 UConverter *cnv;
michael@0 587 SCSUData *scsu;
michael@0 588 const uint8_t *source, *sourceLimit;
michael@0 589 UChar *target;
michael@0 590 const UChar *targetLimit;
michael@0 591 UBool isSingleByteMode;
michael@0 592 uint8_t state, byteOne;
michael@0 593 int8_t quoteWindow, dynamicWindow;
michael@0 594
michael@0 595 uint8_t b;
michael@0 596
michael@0 597 /* set up the local pointers */
michael@0 598 cnv=pArgs->converter;
michael@0 599 scsu=(SCSUData *)cnv->extraInfo;
michael@0 600
michael@0 601 source=(const uint8_t *)pArgs->source;
michael@0 602 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 603 target=pArgs->target;
michael@0 604 targetLimit=pArgs->targetLimit;
michael@0 605
michael@0 606 /* get the state machine state */
michael@0 607 isSingleByteMode=scsu->toUIsSingleByteMode;
michael@0 608 state=scsu->toUState;
michael@0 609 quoteWindow=scsu->toUQuoteWindow;
michael@0 610 dynamicWindow=scsu->toUDynamicWindow;
michael@0 611 byteOne=scsu->toUByteOne;
michael@0 612
michael@0 613 /*
michael@0 614 * conversion "loop"
michael@0 615 *
michael@0 616 * For performance, this is not a normal C loop.
michael@0 617 * Instead, there are two code blocks for the two SCSU modes.
michael@0 618 * The function branches to either one, and a change of the mode is done with a goto to
michael@0 619 * the other branch.
michael@0 620 *
michael@0 621 * Each branch has two conventional loops:
michael@0 622 * - a fast-path loop for the most common codes in the mode
michael@0 623 * - a loop for all other codes in the mode
michael@0 624 * When the fast-path runs into a code that it cannot handle, its loop ends and it
michael@0 625 * runs into the following loop to handle the other codes.
michael@0 626 * The end of the input or output buffer is also handled by the slower loop.
michael@0 627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
michael@0 628 *
michael@0 629 * The callback handling is done by returning with an error code.
michael@0 630 * The conversion framework actually calls the callback function.
michael@0 631 */
michael@0 632 if(isSingleByteMode) {
michael@0 633 /* fast path for single-byte mode */
michael@0 634 if(state==readCommand) {
michael@0 635 fastSingle:
michael@0 636 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
michael@0 637 ++source;
michael@0 638 if(b<=0x7f) {
michael@0 639 /* write US-ASCII graphic character or DEL */
michael@0 640 *target++=(UChar)b;
michael@0 641 } else {
michael@0 642 /* write from dynamic window */
michael@0 643 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
michael@0 644 if(c<=0xffff) {
michael@0 645 *target++=(UChar)c;
michael@0 646 } else {
michael@0 647 /* output surrogate pair */
michael@0 648 *target++=(UChar)(0xd7c0+(c>>10));
michael@0 649 if(target<targetLimit) {
michael@0 650 *target++=(UChar)(0xdc00|(c&0x3ff));
michael@0 651 } else {
michael@0 652 /* target overflow */
michael@0 653 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
michael@0 654 cnv->UCharErrorBufferLength=1;
michael@0 655 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 656 goto endloop;
michael@0 657 }
michael@0 658 }
michael@0 659 }
michael@0 660 }
michael@0 661 }
michael@0 662
michael@0 663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
michael@0 664 singleByteMode:
michael@0 665 while(source<sourceLimit) {
michael@0 666 if(target>=targetLimit) {
michael@0 667 /* target is full */
michael@0 668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 669 break;
michael@0 670 }
michael@0 671 b=*source++;
michael@0 672 switch(state) {
michael@0 673 case readCommand:
michael@0 674 /* redundant conditions are commented out */
michael@0 675 /* here: b<0x20 because otherwise we would be in fastSingle */
michael@0 676 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
michael@0 677 /* CR/LF/TAB/NUL */
michael@0 678 *target++=(UChar)b;
michael@0 679 goto fastSingle;
michael@0 680 } else if(SC0<=b) {
michael@0 681 if(b<=SC7) {
michael@0 682 dynamicWindow=(int8_t)(b-SC0);
michael@0 683 goto fastSingle;
michael@0 684 } else /* if(SD0<=b && b<=SD7) */ {
michael@0 685 dynamicWindow=(int8_t)(b-SD0);
michael@0 686 state=defineOne;
michael@0 687 }
michael@0 688 } else if(/* SQ0<=b && */ b<=SQ7) {
michael@0 689 quoteWindow=(int8_t)(b-SQ0);
michael@0 690 state=quoteOne;
michael@0 691 } else if(b==SDX) {
michael@0 692 state=definePairOne;
michael@0 693 } else if(b==SQU) {
michael@0 694 state=quotePairOne;
michael@0 695 } else if(b==SCU) {
michael@0 696 isSingleByteMode=FALSE;
michael@0 697 goto fastUnicode;
michael@0 698 } else /* Srs */ {
michael@0 699 /* callback(illegal) */
michael@0 700 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 701 cnv->toUBytes[0]=b;
michael@0 702 cnv->toULength=1;
michael@0 703 goto endloop;
michael@0 704 }
michael@0 705
michael@0 706 /* store the first byte of a multibyte sequence in toUBytes[] */
michael@0 707 cnv->toUBytes[0]=b;
michael@0 708 cnv->toULength=1;
michael@0 709 break;
michael@0 710 case quotePairOne:
michael@0 711 byteOne=b;
michael@0 712 cnv->toUBytes[1]=b;
michael@0 713 cnv->toULength=2;
michael@0 714 state=quotePairTwo;
michael@0 715 break;
michael@0 716 case quotePairTwo:
michael@0 717 *target++=(UChar)((byteOne<<8)|b);
michael@0 718 state=readCommand;
michael@0 719 goto fastSingle;
michael@0 720 case quoteOne:
michael@0 721 if(b<0x80) {
michael@0 722 /* all static offsets are in the BMP */
michael@0 723 *target++=(UChar)(staticOffsets[quoteWindow]+b);
michael@0 724 } else {
michael@0 725 /* write from dynamic window */
michael@0 726 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
michael@0 727 if(c<=0xffff) {
michael@0 728 *target++=(UChar)c;
michael@0 729 } else {
michael@0 730 /* output surrogate pair */
michael@0 731 *target++=(UChar)(0xd7c0+(c>>10));
michael@0 732 if(target<targetLimit) {
michael@0 733 *target++=(UChar)(0xdc00|(c&0x3ff));
michael@0 734 } else {
michael@0 735 /* target overflow */
michael@0 736 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
michael@0 737 cnv->UCharErrorBufferLength=1;
michael@0 738 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 739 goto endloop;
michael@0 740 }
michael@0 741 }
michael@0 742 }
michael@0 743 state=readCommand;
michael@0 744 goto fastSingle;
michael@0 745 case definePairOne:
michael@0 746 dynamicWindow=(int8_t)((b>>5)&7);
michael@0 747 byteOne=(uint8_t)(b&0x1f);
michael@0 748 cnv->toUBytes[1]=b;
michael@0 749 cnv->toULength=2;
michael@0 750 state=definePairTwo;
michael@0 751 break;
michael@0 752 case definePairTwo:
michael@0 753 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
michael@0 754 state=readCommand;
michael@0 755 goto fastSingle;
michael@0 756 case defineOne:
michael@0 757 if(b==0) {
michael@0 758 /* callback(illegal): Reserved window offset value 0 */
michael@0 759 cnv->toUBytes[1]=b;
michael@0 760 cnv->toULength=2;
michael@0 761 goto endloop;
michael@0 762 } else if(b<gapThreshold) {
michael@0 763 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
michael@0 764 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
michael@0 765 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
michael@0 766 } else if(b>=fixedThreshold) {
michael@0 767 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
michael@0 768 } else {
michael@0 769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
michael@0 770 cnv->toUBytes[1]=b;
michael@0 771 cnv->toULength=2;
michael@0 772 goto endloop;
michael@0 773 }
michael@0 774 state=readCommand;
michael@0 775 goto fastSingle;
michael@0 776 }
michael@0 777 }
michael@0 778 } else {
michael@0 779 /* fast path for Unicode mode */
michael@0 780 if(state==readCommand) {
michael@0 781 fastUnicode:
michael@0 782 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
michael@0 783 *target++=(UChar)((b<<8)|source[1]);
michael@0 784 source+=2;
michael@0 785 }
michael@0 786 }
michael@0 787
michael@0 788 /* normal state machine for Unicode mode */
michael@0 789 /* unicodeByteMode: */
michael@0 790 while(source<sourceLimit) {
michael@0 791 if(target>=targetLimit) {
michael@0 792 /* target is full */
michael@0 793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 794 break;
michael@0 795 }
michael@0 796 b=*source++;
michael@0 797 switch(state) {
michael@0 798 case readCommand:
michael@0 799 if((uint8_t)(b-UC0)>(Urs-UC0)) {
michael@0 800 byteOne=b;
michael@0 801 cnv->toUBytes[0]=b;
michael@0 802 cnv->toULength=1;
michael@0 803 state=quotePairTwo;
michael@0 804 } else if(/* UC0<=b && */ b<=UC7) {
michael@0 805 dynamicWindow=(int8_t)(b-UC0);
michael@0 806 isSingleByteMode=TRUE;
michael@0 807 goto fastSingle;
michael@0 808 } else if(/* UD0<=b && */ b<=UD7) {
michael@0 809 dynamicWindow=(int8_t)(b-UD0);
michael@0 810 isSingleByteMode=TRUE;
michael@0 811 cnv->toUBytes[0]=b;
michael@0 812 cnv->toULength=1;
michael@0 813 state=defineOne;
michael@0 814 goto singleByteMode;
michael@0 815 } else if(b==UDX) {
michael@0 816 isSingleByteMode=TRUE;
michael@0 817 cnv->toUBytes[0]=b;
michael@0 818 cnv->toULength=1;
michael@0 819 state=definePairOne;
michael@0 820 goto singleByteMode;
michael@0 821 } else if(b==UQU) {
michael@0 822 cnv->toUBytes[0]=b;
michael@0 823 cnv->toULength=1;
michael@0 824 state=quotePairOne;
michael@0 825 } else /* Urs */ {
michael@0 826 /* callback(illegal) */
michael@0 827 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 828 cnv->toUBytes[0]=b;
michael@0 829 cnv->toULength=1;
michael@0 830 goto endloop;
michael@0 831 }
michael@0 832 break;
michael@0 833 case quotePairOne:
michael@0 834 byteOne=b;
michael@0 835 cnv->toUBytes[1]=b;
michael@0 836 cnv->toULength=2;
michael@0 837 state=quotePairTwo;
michael@0 838 break;
michael@0 839 case quotePairTwo:
michael@0 840 *target++=(UChar)((byteOne<<8)|b);
michael@0 841 state=readCommand;
michael@0 842 goto fastUnicode;
michael@0 843 }
michael@0 844 }
michael@0 845 }
michael@0 846 endloop:
michael@0 847
michael@0 848 /* set the converter state back into UConverter */
michael@0 849 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
michael@0 850 /* reset to deal with the next character */
michael@0 851 state=readCommand;
michael@0 852 } else if(state==readCommand) {
michael@0 853 /* not in a multi-byte sequence, reset toULength */
michael@0 854 cnv->toULength=0;
michael@0 855 }
michael@0 856 scsu->toUIsSingleByteMode=isSingleByteMode;
michael@0 857 scsu->toUState=state;
michael@0 858 scsu->toUQuoteWindow=quoteWindow;
michael@0 859 scsu->toUDynamicWindow=dynamicWindow;
michael@0 860 scsu->toUByteOne=byteOne;
michael@0 861
michael@0 862 /* write back the updated pointers */
michael@0 863 pArgs->source=(const char *)source;
michael@0 864 pArgs->target=target;
michael@0 865 return;
michael@0 866 }
michael@0 867
michael@0 868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
michael@0 869
michael@0 870 /*
michael@0 871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
michael@0 872 * reasonable results. The lookahead is minimal.
michael@0 873 * Many cases are simple:
michael@0 874 * A character fits directly into the current mode, a dynamic or static window,
michael@0 875 * or is not compressible. These cases are tested first.
michael@0 876 * Real compression heuristics are applied to the rest, in code branches for
michael@0 877 * single/Unicode mode and BMP/supplementary code points.
michael@0 878 * The heuristics used here are extremely simple.
michael@0 879 */
michael@0 880
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets: start offsets of 8 windows; window i covers offsets[i]..offsets[i]+0x7f
 * c:       the code point to look up
 *
 * Returns the index (0..7) of the first window that contains c,
 * or -1 if none of the 8 windows contains it.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /*
         * The unsigned difference wraps to a large value when c<offsets[i],
         * so a single comparison checks both window bounds.
         */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
michael@0 892
michael@0 893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
michael@0 894 static UBool
michael@0 895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
michael@0 896 return (UBool)(c<=offset+0x7f &&
michael@0 897 (c>=offset || (c<=0x7f &&
michael@0 898 (c>=0x20 || (1UL<<c)&0x2601))));
michael@0 899 /* binary 0010 0110 0000 0001,
michael@0 900 check for b==0xd || b==0xa || b==9 || b==0 */
michael@0 901 }
michael@0 902
michael@0 903 /*
michael@0 904 * getNextDynamicWindow returns the next dynamic window to be redefined
michael@0 905 */
michael@0 906 static int8_t
michael@0 907 getNextDynamicWindow(SCSUData *scsu) {
michael@0 908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
michael@0 909 if(++scsu->nextWindowUseIndex==8) {
michael@0 910 scsu->nextWindowUseIndex=0;
michael@0 911 }
michael@0 912 return window;
michael@0 913 }
michael@0 914
michael@0 915 /*
michael@0 916 * useDynamicWindow() adjusts
michael@0 917 * windowUse[] and nextWindowUseIndex for the algorithm to choose
michael@0 918 * the next dynamic window to be defined;
michael@0 919 * a subclass may override it and provide its own algorithm.
michael@0 920 */
michael@0 921 static void
michael@0 922 useDynamicWindow(SCSUData *scsu, int8_t window) {
michael@0 923 /*
michael@0 924 * move the existing window, which just became the most recently used one,
michael@0 925 * up in windowUse[] to nextWindowUseIndex-1
michael@0 926 */
michael@0 927
michael@0 928 /* first, find the index of the window - backwards to favor the more recently used windows */
michael@0 929 int i, j;
michael@0 930
michael@0 931 i=scsu->nextWindowUseIndex;
michael@0 932 do {
michael@0 933 if(--i<0) {
michael@0 934 i=7;
michael@0 935 }
michael@0 936 } while(scsu->windowUse[i]!=window);
michael@0 937
michael@0 938 /* now copy each windowUse[i+1] to [i] */
michael@0 939 j=i+1;
michael@0 940 if(j==8) {
michael@0 941 j=0;
michael@0 942 }
michael@0 943 while(j!=scsu->nextWindowUseIndex) {
michael@0 944 scsu->windowUse[i]=scsu->windowUse[j];
michael@0 945 i=j;
michael@0 946 if(++j==8) { j=0; }
michael@0 947 }
michael@0 948
michael@0 949 /* finally, set the window into the most recently used index */
michael@0 950 scsu->windowUse[i]=window;
michael@0 951 }
michael@0 952
michael@0 953 /*
michael@0 954 * calculate the offset and the code for a dynamic window that contains the character
michael@0 955 * takes fixed offsets into account
michael@0 956 * the offset of the window is stored in the offset variable,
michael@0 957 * the code is returned
michael@0 958 *
michael@0 959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
michael@0 960 */
michael@0 961 static int
michael@0 962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
michael@0 963 int i;
michael@0 964
michael@0 965 for(i=0; i<7; ++i) {
michael@0 966 if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
michael@0 967 *pOffset=fixedOffsets[i];
michael@0 968 return 0xf9+i;
michael@0 969 }
michael@0 970 }
michael@0 971
michael@0 972 if(c<0x80) {
michael@0 973 /* No dynamic window for US-ASCII. */
michael@0 974 return -1;
michael@0 975 } else if(c<0x3400 ||
michael@0 976 (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
michael@0 977 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
michael@0 978 ) {
michael@0 979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
michael@0 980 *pOffset=c&0x7fffff80;
michael@0 981 return (int)(c>>7);
michael@0 982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
michael@0 983 /* For these characters we need to take the gapOffset into account. */
michael@0 984 *pOffset=c&0x7fffff80;
michael@0 985 return (int)((c-gapOffset)>>7);
michael@0 986 } else {
michael@0 987 return -1;
michael@0 988 }
michael@0 989 }
michael@0 990
michael@0 991 /*
michael@0 992 * Idea for compression:
michael@0 993 * - save SCSUData and other state before really starting work
michael@0 994 * - at endloop, see if compression could be better with just unicode mode
michael@0 995 * - don't do this if a callback has been called
michael@0 996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
michael@0 997 * - different buffer handling!
michael@0 998 *
michael@0 999 * Drawback or need for corrective handling:
michael@0 1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
michael@0 1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
michael@0 1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
michael@0 1003 *
michael@0 1004 * How to achieve both?
michael@0 1005 * - Only replace the result after an SDX or SCU?
michael@0 1006 */
michael@0 1007
michael@0 1008 static void
michael@0 1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0 1010 UErrorCode *pErrorCode) {
michael@0 1011 UConverter *cnv;
michael@0 1012 SCSUData *scsu;
michael@0 1013 const UChar *source, *sourceLimit;
michael@0 1014 uint8_t *target;
michael@0 1015 int32_t targetCapacity;
michael@0 1016 int32_t *offsets;
michael@0 1017
michael@0 1018 UBool isSingleByteMode;
michael@0 1019 uint8_t dynamicWindow;
michael@0 1020 uint32_t currentOffset;
michael@0 1021
michael@0 1022 uint32_t c, delta;
michael@0 1023
michael@0 1024 int32_t sourceIndex, nextSourceIndex;
michael@0 1025
michael@0 1026 int32_t length;
michael@0 1027
michael@0 1028 /* variables for compression heuristics */
michael@0 1029 uint32_t offset;
michael@0 1030 UChar lead, trail;
michael@0 1031 int code;
michael@0 1032 int8_t window;
michael@0 1033
michael@0 1034 /* set up the local pointers */
michael@0 1035 cnv=pArgs->converter;
michael@0 1036 scsu=(SCSUData *)cnv->extraInfo;
michael@0 1037
michael@0 1038 /* set up the local pointers */
michael@0 1039 source=pArgs->source;
michael@0 1040 sourceLimit=pArgs->sourceLimit;
michael@0 1041 target=(uint8_t *)pArgs->target;
michael@0 1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 1043 offsets=pArgs->offsets;
michael@0 1044
michael@0 1045 /* get the state machine state */
michael@0 1046 isSingleByteMode=scsu->fromUIsSingleByteMode;
michael@0 1047 dynamicWindow=scsu->fromUDynamicWindow;
michael@0 1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1049
michael@0 1050 c=cnv->fromUChar32;
michael@0 1051
michael@0 1052 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 1053 sourceIndex= c==0 ? 0 : -1;
michael@0 1054 nextSourceIndex=0;
michael@0 1055
michael@0 1056 /* similar conversion "loop" as in toUnicode */
michael@0 1057 loop:
michael@0 1058 if(isSingleByteMode) {
michael@0 1059 if(c!=0 && targetCapacity>0) {
michael@0 1060 goto getTrailSingle;
michael@0 1061 }
michael@0 1062
michael@0 1063 /* state machine for single-byte mode */
michael@0 1064 /* singleByteMode: */
michael@0 1065 while(source<sourceLimit) {
michael@0 1066 if(targetCapacity<=0) {
michael@0 1067 /* target is full */
michael@0 1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1069 break;
michael@0 1070 }
michael@0 1071 c=*source++;
michael@0 1072 ++nextSourceIndex;
michael@0 1073
michael@0 1074 if((c-0x20)<=0x5f) {
michael@0 1075 /* pass US-ASCII graphic character through */
michael@0 1076 *target++=(uint8_t)c;
michael@0 1077 if(offsets!=NULL) {
michael@0 1078 *offsets++=sourceIndex;
michael@0 1079 }
michael@0 1080 --targetCapacity;
michael@0 1081 } else if(c<0x20) {
michael@0 1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
michael@0 1083 /* CR/LF/TAB/NUL */
michael@0 1084 *target++=(uint8_t)c;
michael@0 1085 if(offsets!=NULL) {
michael@0 1086 *offsets++=sourceIndex;
michael@0 1087 }
michael@0 1088 --targetCapacity;
michael@0 1089 } else {
michael@0 1090 /* quote C0 control character */
michael@0 1091 c|=SQ0<<8;
michael@0 1092 length=2;
michael@0 1093 goto outputBytes;
michael@0 1094 }
michael@0 1095 } else if((delta=c-currentOffset)<=0x7f) {
michael@0 1096 /* use the current dynamic window */
michael@0 1097 *target++=(uint8_t)(delta|0x80);
michael@0 1098 if(offsets!=NULL) {
michael@0 1099 *offsets++=sourceIndex;
michael@0 1100 }
michael@0 1101 --targetCapacity;
michael@0 1102 } else if(U16_IS_SURROGATE(c)) {
michael@0 1103 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1104 getTrailSingle:
michael@0 1105 lead=(UChar)c;
michael@0 1106 if(source<sourceLimit) {
michael@0 1107 /* test the following code unit */
michael@0 1108 trail=*source;
michael@0 1109 if(U16_IS_TRAIL(trail)) {
michael@0 1110 ++source;
michael@0 1111 ++nextSourceIndex;
michael@0 1112 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1113 /* convert this surrogate code point */
michael@0 1114 /* exit this condition tree */
michael@0 1115 } else {
michael@0 1116 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 1117 /* callback(illegal) */
michael@0 1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1119 goto endloop;
michael@0 1120 }
michael@0 1121 } else {
michael@0 1122 /* no more input */
michael@0 1123 break;
michael@0 1124 }
michael@0 1125 } else {
michael@0 1126 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 1127 /* callback(illegal) */
michael@0 1128 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1129 goto endloop;
michael@0 1130 }
michael@0 1131
michael@0 1132 /* compress supplementary character U+10000..U+10ffff */
michael@0 1133 if((delta=c-currentOffset)<=0x7f) {
michael@0 1134 /* use the current dynamic window */
michael@0 1135 *target++=(uint8_t)(delta|0x80);
michael@0 1136 if(offsets!=NULL) {
michael@0 1137 *offsets++=sourceIndex;
michael@0 1138 }
michael@0 1139 --targetCapacity;
michael@0 1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1141 /* there is a dynamic window that contains this character, change to it */
michael@0 1142 dynamicWindow=window;
michael@0 1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1144 useDynamicWindow(scsu, dynamicWindow);
michael@0 1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1146 length=2;
michael@0 1147 goto outputBytes;
michael@0 1148 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1149 /* might check if there are more characters in this window to come */
michael@0 1150 /* define an extended window with this character */
michael@0 1151 code-=0x200;
michael@0 1152 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1154 useDynamicWindow(scsu, dynamicWindow);
michael@0 1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1156 length=4;
michael@0 1157 goto outputBytes;
michael@0 1158 } else {
michael@0 1159 /* change to Unicode mode and output this (lead, trail) pair */
michael@0 1160 isSingleByteMode=FALSE;
michael@0 1161 *target++=(uint8_t)SCU;
michael@0 1162 if(offsets!=NULL) {
michael@0 1163 *offsets++=sourceIndex;
michael@0 1164 }
michael@0 1165 --targetCapacity;
michael@0 1166 c=((uint32_t)lead<<16)|trail;
michael@0 1167 length=4;
michael@0 1168 goto outputBytes;
michael@0 1169 }
michael@0 1170 } else if(c<0xa0) {
michael@0 1171 /* quote C1 control character */
michael@0 1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
michael@0 1173 length=2;
michael@0 1174 goto outputBytes;
michael@0 1175 } else if(c==0xfeff || c>=0xfff0) {
michael@0 1176 /* quote signature character=byte order mark and specials */
michael@0 1177 c|=SQU<<16;
michael@0 1178 length=3;
michael@0 1179 goto outputBytes;
michael@0 1180 } else {
michael@0 1181 /* compress all other BMP characters */
michael@0 1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1183 /* there is a window defined that contains this character - switch to it or quote from it? */
michael@0 1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
michael@0 1185 /* change to dynamic window */
michael@0 1186 dynamicWindow=window;
michael@0 1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1188 useDynamicWindow(scsu, dynamicWindow);
michael@0 1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1190 length=2;
michael@0 1191 goto outputBytes;
michael@0 1192 } else {
michael@0 1193 /* quote from dynamic window */
michael@0 1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
michael@0 1195 length=2;
michael@0 1196 goto outputBytes;
michael@0 1197 }
michael@0 1198 } else if((window=getWindow(staticOffsets, c))>=0) {
michael@0 1199 /* quote from static window */
michael@0 1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
michael@0 1201 length=2;
michael@0 1202 goto outputBytes;
michael@0 1203 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1204 /* define a dynamic window with this character */
michael@0 1205 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1207 useDynamicWindow(scsu, dynamicWindow);
michael@0 1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1209 length=3;
michael@0 1210 goto outputBytes;
michael@0 1211 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
michael@0 1212 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
michael@0 1213 ) {
michael@0 1214 /*
michael@0 1215 * this character is not compressible (a BMP ideograph or similar);
michael@0 1216 * switch to Unicode mode if this is the last character in the block
michael@0 1217 * or there is at least one more ideograph following immediately
michael@0 1218 */
michael@0 1219 isSingleByteMode=FALSE;
michael@0 1220 c|=SCU<<16;
michael@0 1221 length=3;
michael@0 1222 goto outputBytes;
michael@0 1223 } else {
michael@0 1224 /* quote Unicode */
michael@0 1225 c|=SQU<<16;
michael@0 1226 length=3;
michael@0 1227 goto outputBytes;
michael@0 1228 }
michael@0 1229 }
michael@0 1230
michael@0 1231 /* normal end of conversion: prepare for a new character */
michael@0 1232 c=0;
michael@0 1233 sourceIndex=nextSourceIndex;
michael@0 1234 }
michael@0 1235 } else {
michael@0 1236 if(c!=0 && targetCapacity>0) {
michael@0 1237 goto getTrailUnicode;
michael@0 1238 }
michael@0 1239
michael@0 1240 /* state machine for Unicode mode */
michael@0 1241 /* unicodeByteMode: */
michael@0 1242 while(source<sourceLimit) {
michael@0 1243 if(targetCapacity<=0) {
michael@0 1244 /* target is full */
michael@0 1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1246 break;
michael@0 1247 }
michael@0 1248 c=*source++;
michael@0 1249 ++nextSourceIndex;
michael@0 1250
michael@0 1251 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
michael@0 1252 /* not compressible, write character directly */
michael@0 1253 if(targetCapacity>=2) {
michael@0 1254 *target++=(uint8_t)(c>>8);
michael@0 1255 *target++=(uint8_t)c;
michael@0 1256 if(offsets!=NULL) {
michael@0 1257 *offsets++=sourceIndex;
michael@0 1258 *offsets++=sourceIndex;
michael@0 1259 }
michael@0 1260 targetCapacity-=2;
michael@0 1261 } else {
michael@0 1262 length=2;
michael@0 1263 goto outputBytes;
michael@0 1264 }
michael@0 1265 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
michael@0 1266 /* compress BMP character if the following one is not an uncompressible ideograph */
michael@0 1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
michael@0 1268 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
michael@0 1269 /* ASCII digit or letter */
michael@0 1270 isSingleByteMode=TRUE;
michael@0 1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
michael@0 1272 length=2;
michael@0 1273 goto outputBytes;
michael@0 1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1275 /* there is a dynamic window that contains this character, change to it */
michael@0 1276 isSingleByteMode=TRUE;
michael@0 1277 dynamicWindow=window;
michael@0 1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1279 useDynamicWindow(scsu, dynamicWindow);
michael@0 1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1281 length=2;
michael@0 1282 goto outputBytes;
michael@0 1283 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1284 /* define a dynamic window with this character */
michael@0 1285 isSingleByteMode=TRUE;
michael@0 1286 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1288 useDynamicWindow(scsu, dynamicWindow);
michael@0 1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1290 length=3;
michael@0 1291 goto outputBytes;
michael@0 1292 }
michael@0 1293 }
michael@0 1294
michael@0 1295 /* don't know how to compress this character, just write it directly */
michael@0 1296 length=2;
michael@0 1297 goto outputBytes;
michael@0 1298 } else if(c<0xe000) {
michael@0 1299 /* c is a surrogate */
michael@0 1300 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1301 getTrailUnicode:
michael@0 1302 lead=(UChar)c;
michael@0 1303 if(source<sourceLimit) {
michael@0 1304 /* test the following code unit */
michael@0 1305 trail=*source;
michael@0 1306 if(U16_IS_TRAIL(trail)) {
michael@0 1307 ++source;
michael@0 1308 ++nextSourceIndex;
michael@0 1309 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1310 /* convert this surrogate code point */
michael@0 1311 /* exit this condition tree */
michael@0 1312 } else {
michael@0 1313 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 1314 /* callback(illegal) */
michael@0 1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1316 goto endloop;
michael@0 1317 }
michael@0 1318 } else {
michael@0 1319 /* no more input */
michael@0 1320 break;
michael@0 1321 }
michael@0 1322 } else {
michael@0 1323 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 1324 /* callback(illegal) */
michael@0 1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1326 goto endloop;
michael@0 1327 }
michael@0 1328
michael@0 1329 /* compress supplementary character */
michael@0 1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
michael@0 1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
michael@0 1332 ) {
michael@0 1333 /*
michael@0 1334 * there is a dynamic window that contains this character and
michael@0 1335 * the following character is not uncompressible,
michael@0 1336 * change to the window
michael@0 1337 */
michael@0 1338 isSingleByteMode=TRUE;
michael@0 1339 dynamicWindow=window;
michael@0 1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1341 useDynamicWindow(scsu, dynamicWindow);
michael@0 1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1343 length=2;
michael@0 1344 goto outputBytes;
michael@0 1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
michael@0 1346 (code=getDynamicOffset(c, &offset))>=0
michael@0 1347 ) {
michael@0 1348 /* two supplementary characters in (probably) the same window - define an extended one */
michael@0 1349 isSingleByteMode=TRUE;
michael@0 1350 code-=0x200;
michael@0 1351 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1353 useDynamicWindow(scsu, dynamicWindow);
michael@0 1354 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1355 length=4;
michael@0 1356 goto outputBytes;
michael@0 1357 } else {
michael@0 1358 /* don't know how to compress this character, just write it directly */
michael@0 1359 c=((uint32_t)lead<<16)|trail;
michael@0 1360 length=4;
michael@0 1361 goto outputBytes;
michael@0 1362 }
michael@0 1363 } else /* 0xe000<=c<0xf300 */ {
michael@0 1364 /* quote to avoid SCSU tags */
michael@0 1365 c|=UQU<<16;
michael@0 1366 length=3;
michael@0 1367 goto outputBytes;
michael@0 1368 }
michael@0 1369
michael@0 1370 /* normal end of conversion: prepare for a new character */
michael@0 1371 c=0;
michael@0 1372 sourceIndex=nextSourceIndex;
michael@0 1373 }
michael@0 1374 }
michael@0 1375 endloop:
michael@0 1376
michael@0 1377 /* set the converter state back into UConverter */
michael@0 1378 scsu->fromUIsSingleByteMode=isSingleByteMode;
michael@0 1379 scsu->fromUDynamicWindow=dynamicWindow;
michael@0 1380
michael@0 1381 cnv->fromUChar32=c;
michael@0 1382
michael@0 1383 /* write back the updated pointers */
michael@0 1384 pArgs->source=source;
michael@0 1385 pArgs->target=(char *)target;
michael@0 1386 pArgs->offsets=offsets;
michael@0 1387 return;
michael@0 1388
michael@0 1389 outputBytes:
michael@0 1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
michael@0 1391 /* from the first if in the loop we know that targetCapacity>0 */
michael@0 1392 if(length<=targetCapacity) {
michael@0 1393 if(offsets==NULL) {
michael@0 1394 switch(length) {
michael@0 1395 /* each branch falls through to the next one */
michael@0 1396 case 4:
michael@0 1397 *target++=(uint8_t)(c>>24);
michael@0 1398 case 3: /*fall through*/
michael@0 1399 *target++=(uint8_t)(c>>16);
michael@0 1400 case 2: /*fall through*/
michael@0 1401 *target++=(uint8_t)(c>>8);
michael@0 1402 case 1: /*fall through*/
michael@0 1403 *target++=(uint8_t)c;
michael@0 1404 default:
michael@0 1405 /* will never occur */
michael@0 1406 break;
michael@0 1407 }
michael@0 1408 } else {
michael@0 1409 switch(length) {
michael@0 1410 /* each branch falls through to the next one */
michael@0 1411 case 4:
michael@0 1412 *target++=(uint8_t)(c>>24);
michael@0 1413 *offsets++=sourceIndex;
michael@0 1414 case 3: /*fall through*/
michael@0 1415 *target++=(uint8_t)(c>>16);
michael@0 1416 *offsets++=sourceIndex;
michael@0 1417 case 2: /*fall through*/
michael@0 1418 *target++=(uint8_t)(c>>8);
michael@0 1419 *offsets++=sourceIndex;
michael@0 1420 case 1: /*fall through*/
michael@0 1421 *target++=(uint8_t)c;
michael@0 1422 *offsets++=sourceIndex;
michael@0 1423 default:
michael@0 1424 /* will never occur */
michael@0 1425 break;
michael@0 1426 }
michael@0 1427 }
michael@0 1428 targetCapacity-=length;
michael@0 1429
michael@0 1430 /* normal end of conversion: prepare for a new character */
michael@0 1431 c=0;
michael@0 1432 sourceIndex=nextSourceIndex;
michael@0 1433 goto loop;
michael@0 1434 } else {
michael@0 1435 uint8_t *p;
michael@0 1436
michael@0 1437 /*
michael@0 1438 * We actually do this backwards here:
michael@0 1439 * In order to save an intermediate variable, we output
michael@0 1440 * first to the overflow buffer what does not fit into the
michael@0 1441 * regular target.
michael@0 1442 */
michael@0 1443 /* we know that 0<=targetCapacity<length<=4 */
michael@0 1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
michael@0 1445 length-=targetCapacity;
michael@0 1446 p=(uint8_t *)cnv->charErrorBuffer;
michael@0 1447 switch(length) {
michael@0 1448 /* each branch falls through to the next one */
michael@0 1449 case 4:
michael@0 1450 *p++=(uint8_t)(c>>24);
michael@0 1451 case 3: /*fall through*/
michael@0 1452 *p++=(uint8_t)(c>>16);
michael@0 1453 case 2: /*fall through*/
michael@0 1454 *p++=(uint8_t)(c>>8);
michael@0 1455 case 1: /*fall through*/
michael@0 1456 *p=(uint8_t)c;
michael@0 1457 default:
michael@0 1458 /* will never occur */
michael@0 1459 break;
michael@0 1460 }
michael@0 1461 cnv->charErrorBufferLength=(int8_t)length;
michael@0 1462
michael@0 1463 /* now output what fits into the regular target */
michael@0 1464 c>>=8*length; /* length was reduced by targetCapacity */
michael@0 1465 switch(targetCapacity) {
michael@0 1466 /* each branch falls through to the next one */
michael@0 1467 case 3:
michael@0 1468 *target++=(uint8_t)(c>>16);
michael@0 1469 if(offsets!=NULL) {
michael@0 1470 *offsets++=sourceIndex;
michael@0 1471 }
michael@0 1472 case 2: /*fall through*/
michael@0 1473 *target++=(uint8_t)(c>>8);
michael@0 1474 if(offsets!=NULL) {
michael@0 1475 *offsets++=sourceIndex;
michael@0 1476 }
michael@0 1477 case 1: /*fall through*/
michael@0 1478 *target++=(uint8_t)c;
michael@0 1479 if(offsets!=NULL) {
michael@0 1480 *offsets++=sourceIndex;
michael@0 1481 }
michael@0 1482 default:
michael@0 1483 break;
michael@0 1484 }
michael@0 1485
michael@0 1486 /* target overflow */
michael@0 1487 targetCapacity=0;
michael@0 1488 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1489 c=0;
michael@0 1490 goto endloop;
michael@0 1491 }
michael@0 1492 }
michael@0 1493
michael@0 1494 /*
michael@0 1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
michael@0 1496 * If a change is made in the original function, then either
michael@0 1497 * change this function the same way or
michael@0 1498 * re-copy the original function and remove the variables
michael@0 1499 * offsets, sourceIndex, and nextSourceIndex.
michael@0 1500 */
michael@0 1501 static void
michael@0 1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
michael@0 1503 UErrorCode *pErrorCode) {
michael@0 1504 UConverter *cnv;
michael@0 1505 SCSUData *scsu;
michael@0 1506 const UChar *source, *sourceLimit;
michael@0 1507 uint8_t *target;
michael@0 1508 int32_t targetCapacity;
michael@0 1509
michael@0 1510 UBool isSingleByteMode;
michael@0 1511 uint8_t dynamicWindow;
michael@0 1512 uint32_t currentOffset;
michael@0 1513
michael@0 1514 uint32_t c, delta;
michael@0 1515
michael@0 1516 int32_t length;
michael@0 1517
michael@0 1518 /* variables for compression heuristics */
michael@0 1519 uint32_t offset;
michael@0 1520 UChar lead, trail;
michael@0 1521 int code;
michael@0 1522 int8_t window;
michael@0 1523
michael@0 1524 /* set up the local pointers */
michael@0 1525 cnv=pArgs->converter;
michael@0 1526 scsu=(SCSUData *)cnv->extraInfo;
michael@0 1527
michael@0 1528 /* set up the local pointers */
michael@0 1529 source=pArgs->source;
michael@0 1530 sourceLimit=pArgs->sourceLimit;
michael@0 1531 target=(uint8_t *)pArgs->target;
michael@0 1532 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 1533
michael@0 1534 /* get the state machine state */
michael@0 1535 isSingleByteMode=scsu->fromUIsSingleByteMode;
michael@0 1536 dynamicWindow=scsu->fromUDynamicWindow;
michael@0 1537 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1538
michael@0 1539 c=cnv->fromUChar32;
michael@0 1540
michael@0 1541 /* similar conversion "loop" as in toUnicode */
michael@0 1542 loop:
michael@0 1543 if(isSingleByteMode) {
michael@0 1544 if(c!=0 && targetCapacity>0) {
michael@0 1545 goto getTrailSingle;
michael@0 1546 }
michael@0 1547
michael@0 1548 /* state machine for single-byte mode */
michael@0 1549 /* singleByteMode: */
michael@0 1550 while(source<sourceLimit) {
michael@0 1551 if(targetCapacity<=0) {
michael@0 1552 /* target is full */
michael@0 1553 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1554 break;
michael@0 1555 }
michael@0 1556 c=*source++;
michael@0 1557
michael@0 1558 if((c-0x20)<=0x5f) {
michael@0 1559 /* pass US-ASCII graphic character through */
michael@0 1560 *target++=(uint8_t)c;
michael@0 1561 --targetCapacity;
michael@0 1562 } else if(c<0x20) {
michael@0 1563 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
michael@0 1564 /* CR/LF/TAB/NUL */
michael@0 1565 *target++=(uint8_t)c;
michael@0 1566 --targetCapacity;
michael@0 1567 } else {
michael@0 1568 /* quote C0 control character */
michael@0 1569 c|=SQ0<<8;
michael@0 1570 length=2;
michael@0 1571 goto outputBytes;
michael@0 1572 }
michael@0 1573 } else if((delta=c-currentOffset)<=0x7f) {
michael@0 1574 /* use the current dynamic window */
michael@0 1575 *target++=(uint8_t)(delta|0x80);
michael@0 1576 --targetCapacity;
michael@0 1577 } else if(U16_IS_SURROGATE(c)) {
michael@0 1578 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1579 getTrailSingle:
michael@0 1580 lead=(UChar)c;
michael@0 1581 if(source<sourceLimit) {
michael@0 1582 /* test the following code unit */
michael@0 1583 trail=*source;
michael@0 1584 if(U16_IS_TRAIL(trail)) {
michael@0 1585 ++source;
michael@0 1586 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1587 /* convert this surrogate code point */
michael@0 1588 /* exit this condition tree */
michael@0 1589 } else {
michael@0 1590 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 1591 /* callback(illegal) */
michael@0 1592 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1593 goto endloop;
michael@0 1594 }
michael@0 1595 } else {
michael@0 1596 /* no more input */
michael@0 1597 break;
michael@0 1598 }
michael@0 1599 } else {
michael@0 1600 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 1601 /* callback(illegal) */
michael@0 1602 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1603 goto endloop;
michael@0 1604 }
michael@0 1605
michael@0 1606 /* compress supplementary character U+10000..U+10ffff */
michael@0 1607 if((delta=c-currentOffset)<=0x7f) {
michael@0 1608 /* use the current dynamic window */
michael@0 1609 *target++=(uint8_t)(delta|0x80);
michael@0 1610 --targetCapacity;
michael@0 1611 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1612 /* there is a dynamic window that contains this character, change to it */
michael@0 1613 dynamicWindow=window;
michael@0 1614 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1615 useDynamicWindow(scsu, dynamicWindow);
michael@0 1616 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1617 length=2;
michael@0 1618 goto outputBytes;
michael@0 1619 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1620 /* might check if there are more characters in this window to come */
michael@0 1621 /* define an extended window with this character */
michael@0 1622 code-=0x200;
michael@0 1623 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1624 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1625 useDynamicWindow(scsu, dynamicWindow);
michael@0 1626 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1627 length=4;
michael@0 1628 goto outputBytes;
michael@0 1629 } else {
michael@0 1630 /* change to Unicode mode and output this (lead, trail) pair */
michael@0 1631 isSingleByteMode=FALSE;
michael@0 1632 *target++=(uint8_t)SCU;
michael@0 1633 --targetCapacity;
michael@0 1634 c=((uint32_t)lead<<16)|trail;
michael@0 1635 length=4;
michael@0 1636 goto outputBytes;
michael@0 1637 }
michael@0 1638 } else if(c<0xa0) {
michael@0 1639 /* quote C1 control character */
michael@0 1640 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
michael@0 1641 length=2;
michael@0 1642 goto outputBytes;
michael@0 1643 } else if(c==0xfeff || c>=0xfff0) {
michael@0 1644 /* quote signature character=byte order mark and specials */
michael@0 1645 c|=SQU<<16;
michael@0 1646 length=3;
michael@0 1647 goto outputBytes;
michael@0 1648 } else {
michael@0 1649 /* compress all other BMP characters */
michael@0 1650 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1651 /* there is a window defined that contains this character - switch to it or quote from it? */
michael@0 1652 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
michael@0 1653 /* change to dynamic window */
michael@0 1654 dynamicWindow=window;
michael@0 1655 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1656 useDynamicWindow(scsu, dynamicWindow);
michael@0 1657 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1658 length=2;
michael@0 1659 goto outputBytes;
michael@0 1660 } else {
michael@0 1661 /* quote from dynamic window */
michael@0 1662 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
michael@0 1663 length=2;
michael@0 1664 goto outputBytes;
michael@0 1665 }
michael@0 1666 } else if((window=getWindow(staticOffsets, c))>=0) {
michael@0 1667 /* quote from static window */
michael@0 1668 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
michael@0 1669 length=2;
michael@0 1670 goto outputBytes;
michael@0 1671 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1672 /* define a dynamic window with this character */
michael@0 1673 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1674 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1675 useDynamicWindow(scsu, dynamicWindow);
michael@0 1676 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1677 length=3;
michael@0 1678 goto outputBytes;
michael@0 1679 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
michael@0 1680 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
michael@0 1681 ) {
michael@0 1682 /*
michael@0 1683 * this character is not compressible (a BMP ideograph or similar);
michael@0 1684 * switch to Unicode mode if this is the last character in the block
michael@0 1685 * or there is at least one more ideograph following immediately
michael@0 1686 */
michael@0 1687 isSingleByteMode=FALSE;
michael@0 1688 c|=SCU<<16;
michael@0 1689 length=3;
michael@0 1690 goto outputBytes;
michael@0 1691 } else {
michael@0 1692 /* quote Unicode */
michael@0 1693 c|=SQU<<16;
michael@0 1694 length=3;
michael@0 1695 goto outputBytes;
michael@0 1696 }
michael@0 1697 }
michael@0 1698
michael@0 1699 /* normal end of conversion: prepare for a new character */
michael@0 1700 c=0;
michael@0 1701 }
michael@0 1702 } else {
michael@0 1703 if(c!=0 && targetCapacity>0) {
michael@0 1704 goto getTrailUnicode;
michael@0 1705 }
michael@0 1706
michael@0 1707 /* state machine for Unicode mode */
michael@0 1708 /* unicodeByteMode: */
michael@0 1709 while(source<sourceLimit) {
michael@0 1710 if(targetCapacity<=0) {
michael@0 1711 /* target is full */
michael@0 1712 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1713 break;
michael@0 1714 }
michael@0 1715 c=*source++;
michael@0 1716
michael@0 1717 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
michael@0 1718 /* not compressible, write character directly */
michael@0 1719 if(targetCapacity>=2) {
michael@0 1720 *target++=(uint8_t)(c>>8);
michael@0 1721 *target++=(uint8_t)c;
michael@0 1722 targetCapacity-=2;
michael@0 1723 } else {
michael@0 1724 length=2;
michael@0 1725 goto outputBytes;
michael@0 1726 }
michael@0 1727 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
michael@0 1728 /* compress BMP character if the following one is not an uncompressible ideograph */
michael@0 1729 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
michael@0 1730 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
michael@0 1731 /* ASCII digit or letter */
michael@0 1732 isSingleByteMode=TRUE;
michael@0 1733 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
michael@0 1734 length=2;
michael@0 1735 goto outputBytes;
michael@0 1736 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
michael@0 1737 /* there is a dynamic window that contains this character, change to it */
michael@0 1738 isSingleByteMode=TRUE;
michael@0 1739 dynamicWindow=window;
michael@0 1740 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1741 useDynamicWindow(scsu, dynamicWindow);
michael@0 1742 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1743 length=2;
michael@0 1744 goto outputBytes;
michael@0 1745 } else if((code=getDynamicOffset(c, &offset))>=0) {
michael@0 1746 /* define a dynamic window with this character */
michael@0 1747 isSingleByteMode=TRUE;
michael@0 1748 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1749 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1750 useDynamicWindow(scsu, dynamicWindow);
michael@0 1751 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1752 length=3;
michael@0 1753 goto outputBytes;
michael@0 1754 }
michael@0 1755 }
michael@0 1756
michael@0 1757 /* don't know how to compress this character, just write it directly */
michael@0 1758 length=2;
michael@0 1759 goto outputBytes;
michael@0 1760 } else if(c<0xe000) {
michael@0 1761 /* c is a surrogate */
michael@0 1762 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1763 getTrailUnicode:
michael@0 1764 lead=(UChar)c;
michael@0 1765 if(source<sourceLimit) {
michael@0 1766 /* test the following code unit */
michael@0 1767 trail=*source;
michael@0 1768 if(U16_IS_TRAIL(trail)) {
michael@0 1769 ++source;
michael@0 1770 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1771 /* convert this surrogate code point */
michael@0 1772 /* exit this condition tree */
michael@0 1773 } else {
michael@0 1774 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 1775 /* callback(illegal) */
michael@0 1776 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1777 goto endloop;
michael@0 1778 }
michael@0 1779 } else {
michael@0 1780 /* no more input */
michael@0 1781 break;
michael@0 1782 }
michael@0 1783 } else {
michael@0 1784 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 1785 /* callback(illegal) */
michael@0 1786 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1787 goto endloop;
michael@0 1788 }
michael@0 1789
michael@0 1790 /* compress supplementary character */
michael@0 1791 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
michael@0 1792 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
michael@0 1793 ) {
michael@0 1794 /*
michael@0 1795 * there is a dynamic window that contains this character and
michael@0 1796 * the following character is not uncompressible,
michael@0 1797 * change to the window
michael@0 1798 */
michael@0 1799 isSingleByteMode=TRUE;
michael@0 1800 dynamicWindow=window;
michael@0 1801 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
michael@0 1802 useDynamicWindow(scsu, dynamicWindow);
michael@0 1803 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
michael@0 1804 length=2;
michael@0 1805 goto outputBytes;
michael@0 1806 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
michael@0 1807 (code=getDynamicOffset(c, &offset))>=0
michael@0 1808 ) {
michael@0 1809 /* two supplementary characters in (probably) the same window - define an extended one */
michael@0 1810 isSingleByteMode=TRUE;
michael@0 1811 code-=0x200;
michael@0 1812 dynamicWindow=getNextDynamicWindow(scsu);
michael@0 1813 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
michael@0 1814 useDynamicWindow(scsu, dynamicWindow);
michael@0 1815 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
michael@0 1816 length=4;
michael@0 1817 goto outputBytes;
michael@0 1818 } else {
michael@0 1819 /* don't know how to compress this character, just write it directly */
michael@0 1820 c=((uint32_t)lead<<16)|trail;
michael@0 1821 length=4;
michael@0 1822 goto outputBytes;
michael@0 1823 }
michael@0 1824 } else /* 0xe000<=c<0xf300 */ {
michael@0 1825 /* quote to avoid SCSU tags */
michael@0 1826 c|=UQU<<16;
michael@0 1827 length=3;
michael@0 1828 goto outputBytes;
michael@0 1829 }
michael@0 1830
michael@0 1831 /* normal end of conversion: prepare for a new character */
michael@0 1832 c=0;
michael@0 1833 }
michael@0 1834 }
michael@0 1835 endloop:
michael@0 1836
michael@0 1837 /* set the converter state back into UConverter */
michael@0 1838 scsu->fromUIsSingleByteMode=isSingleByteMode;
michael@0 1839 scsu->fromUDynamicWindow=dynamicWindow;
michael@0 1840
michael@0 1841 cnv->fromUChar32=c;
michael@0 1842
michael@0 1843 /* write back the updated pointers */
michael@0 1844 pArgs->source=source;
michael@0 1845 pArgs->target=(char *)target;
michael@0 1846 return;
michael@0 1847
michael@0 1848 outputBytes:
michael@0 1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
michael@0 1850 /* from the first if in the loop we know that targetCapacity>0 */
michael@0 1851 if(length<=targetCapacity) {
michael@0 1852 switch(length) {
michael@0 1853 /* each branch falls through to the next one */
michael@0 1854 case 4:
michael@0 1855 *target++=(uint8_t)(c>>24);
michael@0 1856 case 3: /*fall through*/
michael@0 1857 *target++=(uint8_t)(c>>16);
michael@0 1858 case 2: /*fall through*/
michael@0 1859 *target++=(uint8_t)(c>>8);
michael@0 1860 case 1: /*fall through*/
michael@0 1861 *target++=(uint8_t)c;
michael@0 1862 default:
michael@0 1863 /* will never occur */
michael@0 1864 break;
michael@0 1865 }
michael@0 1866 targetCapacity-=length;
michael@0 1867
michael@0 1868 /* normal end of conversion: prepare for a new character */
michael@0 1869 c=0;
michael@0 1870 goto loop;
michael@0 1871 } else {
michael@0 1872 uint8_t *p;
michael@0 1873
michael@0 1874 /*
michael@0 1875 * We actually do this backwards here:
michael@0 1876 * In order to save an intermediate variable, we output
michael@0 1877 * first to the overflow buffer what does not fit into the
michael@0 1878 * regular target.
michael@0 1879 */
michael@0 1880 /* we know that 0<=targetCapacity<length<=4 */
michael@0 1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
michael@0 1882 length-=targetCapacity;
michael@0 1883 p=(uint8_t *)cnv->charErrorBuffer;
michael@0 1884 switch(length) {
michael@0 1885 /* each branch falls through to the next one */
michael@0 1886 case 4:
michael@0 1887 *p++=(uint8_t)(c>>24);
michael@0 1888 case 3: /*fall through*/
michael@0 1889 *p++=(uint8_t)(c>>16);
michael@0 1890 case 2: /*fall through*/
michael@0 1891 *p++=(uint8_t)(c>>8);
michael@0 1892 case 1: /*fall through*/
michael@0 1893 *p=(uint8_t)c;
michael@0 1894 default:
michael@0 1895 /* will never occur */
michael@0 1896 break;
michael@0 1897 }
michael@0 1898 cnv->charErrorBufferLength=(int8_t)length;
michael@0 1899
michael@0 1900 /* now output what fits into the regular target */
michael@0 1901 c>>=8*length; /* length was reduced by targetCapacity */
michael@0 1902 switch(targetCapacity) {
michael@0 1903 /* each branch falls through to the next one */
michael@0 1904 case 3:
michael@0 1905 *target++=(uint8_t)(c>>16);
michael@0 1906 case 2: /*fall through*/
michael@0 1907 *target++=(uint8_t)(c>>8);
michael@0 1908 case 1: /*fall through*/
michael@0 1909 *target++=(uint8_t)c;
michael@0 1910 default:
michael@0 1911 break;
michael@0 1912 }
michael@0 1913
michael@0 1914 /* target overflow */
michael@0 1915 targetCapacity=0;
michael@0 1916 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1917 c=0;
michael@0 1918 goto endloop;
michael@0 1919 }
michael@0 1920 }
michael@0 1921
michael@0 1922 /* miscellaneous ------------------------------------------------------------ */
michael@0 1923
michael@0 1924 static const char *
michael@0 1925 _SCSUGetName(const UConverter *cnv) {
michael@0 1926 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
michael@0 1927
michael@0 1928 switch(scsu->locale) {
michael@0 1929 case l_ja:
michael@0 1930 return "SCSU,locale=ja";
michael@0 1931 default:
michael@0 1932 return "SCSU";
michael@0 1933 }
michael@0 1934 }
michael@0 1935
michael@0 1936 /* structure for SafeClone calculations: layout of the buffer that holds an SCSU converter clone; its sizeof is the size that _SCSUSafeClone() reports when preflighting */
michael@0 1937 struct cloneSCSUStruct
michael@0 1938 {
michael@0 1939 UConverter cnv; /* the clone itself; _SCSUSafeClone() casts the buffer to this struct and returns &cnv */
michael@0 1940 SCSUData mydata; /* the clone's own copy of the SCSU state, which cnv.extraInfo is set to point at */
michael@0 1941 };
michael@0 1942
michael@0 1943 static UConverter *
michael@0 1944 _SCSUSafeClone(const UConverter *cnv,
michael@0 1945 void *stackBuffer,
michael@0 1946 int32_t *pBufferSize,
michael@0 1947 UErrorCode *status)
michael@0 1948 {
michael@0 1949 struct cloneSCSUStruct * localClone;
michael@0 1950 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
michael@0 1951
michael@0 1952 if (U_FAILURE(*status)){
michael@0 1953 return 0;
michael@0 1954 }
michael@0 1955
michael@0 1956 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
michael@0 1957 *pBufferSize = bufferSizeNeeded;
michael@0 1958 return 0;
michael@0 1959 }
michael@0 1960
michael@0 1961 localClone = (struct cloneSCSUStruct *)stackBuffer;
michael@0 1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
michael@0 1963
michael@0 1964 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
michael@0 1965 localClone->cnv.extraInfo = &localClone->mydata;
michael@0 1966 localClone->cnv.isExtraLocal = TRUE;
michael@0 1967
michael@0 1968 return &localClone->cnv;
michael@0 1969 }
michael@0 1970
michael@0 1971 /* Implementation table of the SCSU converter: the converter type followed by function pointers; NULL marks a slot for which this file supplies no function. Referenced by _SCSUData. */
michael@0 1972 static const UConverterImpl _SCSUImpl={
michael@0 1973 UCNV_SCSU,
michael@0 1974 /* NOTE(review): the meaning of each slot is fixed by the UConverterImpl declaration, which is not in this file (presumably ucnv_cnv.h) - confirm there */
michael@0 1975 NULL,
michael@0 1976 NULL,
michael@0 1977 /* the open, close and reset functions */
michael@0 1978 _SCSUOpen,
michael@0 1979 _SCSUClose,
michael@0 1980 _SCSUReset,
michael@0 1981 /* the conversion functions, each in a plain and a WithOffsets variant */
michael@0 1982 _SCSUToUnicode,
michael@0 1983 _SCSUToUnicodeWithOffsets,
michael@0 1984 _SCSUFromUnicode,
michael@0 1985 _SCSUFromUnicodeWithOffsets,
michael@0 1986 NULL,
michael@0 1987
michael@0 1988 NULL,
michael@0 1989 _SCSUGetName, /* returns "SCSU" or "SCSU,locale=ja" */
michael@0 1990 NULL,
michael@0 1991 _SCSUSafeClone, /* clones the converter into a caller-supplied buffer */
michael@0 1992 ucnv_getCompleteUnicodeSet
michael@0 1993 };
michael@0 1994 /* Static description of the SCSU converter; referenced by _SCSUData. */
michael@0 1995 static const UConverterStaticData _SCSUStaticData={
michael@0 1996 sizeof(UConverterStaticData), /* size of this structure */
michael@0 1997 "SCSU", /* converter name; the same string that _SCSUGetName() returns by default */
michael@0 1998 1212, /* CCSID for SCSU */
michael@0 1999 UCNV_IBM, UCNV_SCSU, /* NOTE(review): presumably platform and converter type - confirm against the UConverterStaticData declaration */
michael@0 2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
michael@0 2001 /*
michael@0 2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
michael@0 2003 * substitution string.
michael@0 2004 */
michael@0 2005 { 0x0e, 0xff, 0xfd, 0 }, 3, /* subchar bytes and, presumably, their length (3) - TODO confirm */
michael@0 2006 FALSE, FALSE,
michael@0 2007 0,
michael@0 2008 0,
michael@0 2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 2010 };
michael@0 2011 /* Shared data of the SCSU converter: combines _SCSUStaticData and _SCSUImpl. Unlike those two it is not declared static, so it has external linkage. */
michael@0 2012 const UConverterSharedData _SCSUData={
michael@0 2013 sizeof(UConverterSharedData), ~((uint32_t)0), /* structure size, then an all-ones 32-bit value (NOTE(review): presumably a reference counter that must never drop to zero - confirm in ucnv_bld.h) */
michael@0 2014 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
michael@0 2015 0
michael@0 2016 };
michael@0 2017
michael@0 2018 #endif

mercurial