Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2000-2011, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: ucnvscsu.c |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2000nov18 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * This is an implementation of the Standard Compression Scheme for Unicode |
michael@0 | 17 | * as defined in http://www.unicode.org/unicode/reports/tr6/ . |
michael@0 | 18 | * Reserved commands and window settings are treated as illegal sequences and |
michael@0 | 19 | * will result in callback calls. |
michael@0 | 20 | */ |
michael@0 | 21 | |
michael@0 | 22 | #include "unicode/utypes.h" |
michael@0 | 23 | |
michael@0 | 24 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 25 | |
michael@0 | 26 | #include "unicode/ucnv.h" |
michael@0 | 27 | #include "unicode/ucnv_cb.h" |
michael@0 | 28 | #include "unicode/utf16.h" |
michael@0 | 29 | #include "ucnv_bld.h" |
michael@0 | 30 | #include "ucnv_cnv.h" |
michael@0 | 31 | #include "cmemory.h" |
michael@0 | 32 | |
michael@0 | 33 | /* SCSU definitions --------------------------------------------------------- */ |
michael@0 | 34 | |
michael@0 | 35 | /* SCSU command byte values */ |
michael@0 | 36 | enum { /* only the first and last value of each SQn/SCn/SDn/UCn/UDn range is named; the decoder derives n as b-SQ0, b-SC0, b-SD0, b-UC0, b-UD0 */ |
michael@0 | 37 | SQ0=0x01, /* Quote from window pair 0 */ |
michael@0 | 38 | SQ7=0x08, /* Quote from window pair 7 */ |
michael@0 | 39 | SDX=0x0B, /* Define a window as extended */ |
michael@0 | 40 | Srs=0x0C, /* reserved; reported as an illegal byte by the single-byte-mode decoder */ |
michael@0 | 41 | SQU=0x0E, /* Quote a single Unicode character */ |
michael@0 | 42 | SCU=0x0F, /* Change to Unicode mode */ |
michael@0 | 43 | SC0=0x10, /* Select window 0 */ |
michael@0 | 44 | SC7=0x17, /* Select window 7 */ |
michael@0 | 45 | SD0=0x18, /* Define and select window 0 */ |
michael@0 | 46 | SD7=0x1F, /* Define and select window 7 */ |
michael@0 | 47 | /* the values below are the command bytes recognized in Unicode mode */ |
michael@0 | 48 | UC0=0xE0, /* Select window 0 and return to single-byte mode */ |
michael@0 | 49 | UC7=0xE7, /* Select window 7 and return to single-byte mode */ |
michael@0 | 50 | UD0=0xE8, /* Define and select window 0, returning to single-byte mode */ |
michael@0 | 51 | UD7=0xEF, /* Define and select window 7, returning to single-byte mode */ |
michael@0 | 52 | UQU=0xF0, /* Quote a single Unicode character */ |
michael@0 | 53 | UDX=0xF1, /* Define a Window as extended, returning to single-byte mode */ |
michael@0 | 54 | Urs=0xF2 /* reserved; reported as an illegal byte by the Unicode-mode decoder */ |
michael@0 | 55 | }; |
michael@0 | 56 | |
michael@0 | 57 | enum { /* boundaries for interpreting the offset byte that follows an SDn/UDn command */ |
michael@0 | 58 | /* |
michael@0 | 59 | * Unicode code points from 3400 to E000 are not addressable by |
michael@0 | 60 | * dynamic window, since in these areas no short run alphabets are |
michael@0 | 61 | * found. Therefore add gapOffset to all values from gapThreshold. |
michael@0 | 62 | */ |
michael@0 | 63 | gapThreshold=0x68, /* offset bytes 1..gapThreshold-1 define the window offset byte<<7 */ |
michael@0 | 64 | gapOffset=0xAC00, /* offset bytes gapThreshold..reservedStart-1 define the window offset (byte<<7)+gapOffset */ |
michael@0 | 65 | |
michael@0 | 66 | /* values between reservedStart and fixedThreshold are reserved */ |
michael@0 | 67 | reservedStart=0xA8, |
michael@0 | 68 | |
michael@0 | 69 | /* use table of predefined fixed offsets for values from fixedThreshold */ |
michael@0 | 70 | fixedThreshold=0xF9 |
michael@0 | 71 | }; |
michael@0 | 72 | |
michael@0 | 73 | /* constant offsets for the 8 static windows */ |
michael@0 | 74 | static const uint32_t staticOffsets[8]={ /* indexed by the SQn quote window; added to a quoted byte below 0x80 */ |
michael@0 | 75 | 0x0000, /* ASCII for quoted tags */ |
michael@0 | 76 | 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ |
michael@0 | 77 | 0x0100, /* Latin Extended-A */ |
michael@0 | 78 | 0x0300, /* Combining Diacritical Marks */ |
michael@0 | 79 | 0x2000, /* General Punctuation */ |
michael@0 | 80 | 0x2080, /* Currency Symbols */ |
michael@0 | 81 | 0x2100, /* Letterlike Symbols and Number Forms */ |
michael@0 | 82 | 0x3000 /* CJK Symbols and punctuation */ |
michael@0 | 83 | }; |
michael@0 | 84 | |
michael@0 | 85 | /* initial offsets for the 8 dynamic (sliding) windows */ |
michael@0 | 86 | static const uint32_t initialDynamicOffsets[8]={ /* copied by _SCSUReset into toUDynamicOffsets and fromUDynamicOffsets */ |
michael@0 | 87 | 0x0080, /* Latin-1 */ |
michael@0 | 88 | 0x00C0, /* Latin Extended A */ |
michael@0 | 89 | 0x0400, /* Cyrillic */ |
michael@0 | 90 | 0x0600, /* Arabic */ |
michael@0 | 91 | 0x0900, /* Devanagari */ |
michael@0 | 92 | 0x3040, /* Hiragana */ |
michael@0 | 93 | 0x30A0, /* Katakana */ |
michael@0 | 94 | 0xFF00 /* Fullwidth ASCII */ |
michael@0 | 95 | }; |
michael@0 | 96 | |
michael@0 | 97 | /* Table of fixed predefined Offsets */ |
michael@0 | 98 | static const uint32_t fixedOffsets[]={ /* indexed by (offset byte - fixedThreshold), i.e. one entry for each offset byte 0xF9..0xFF */ |
michael@0 | 99 | /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ |
michael@0 | 100 | /* 0xFA */ 0x0250, /* IPA extensions */ |
michael@0 | 101 | /* 0xFB */ 0x0370, /* Greek */ |
michael@0 | 102 | /* 0xFC */ 0x0530, /* Armenian */ |
michael@0 | 103 | /* 0xFD */ 0x3040, /* Hiragana */ |
michael@0 | 104 | /* 0xFE */ 0x30A0, /* Katakana */ |
michael@0 | 105 | /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ |
michael@0 | 106 | }; |
michael@0 | 107 | |
michael@0 | 108 | /* state values */ |
michael@0 | 109 | enum { /* values of the toUnicode state machine; each names the byte that is expected next */ |
michael@0 | 110 | readCommand, /* not inside a multi-byte sequence: next byte is a command or a character byte */ |
michael@0 | 111 | quotePairOne, /* after SQU/UQU: next byte is the high byte of a 16-bit unit */ |
michael@0 | 112 | quotePairTwo, /* next byte is the low byte of a 16-bit unit whose high byte is saved in byteOne */ |
michael@0 | 113 | quoteOne, /* after SQn: next byte is the single quoted byte */ |
michael@0 | 114 | definePairOne, /* after SDX/UDX: next byte holds the window number (top 3 bits) and 5 high offset bits */ |
michael@0 | 115 | definePairTwo, /* next byte holds the low offset bits of an extended window definition */ |
michael@0 | 116 | defineOne /* after SDn/UDn: next byte is the window offset byte */ |
michael@0 | 117 | }; |
michael@0 | 118 | |
michael@0 | 119 | typedef struct SCSUData { /* per-converter SCSU state; allocated in _SCSUOpen and stored in cnv->extraInfo */ |
michael@0 | 120 | /* dynamic window offsets, initialize to default values from initialDynamicOffsets */ |
michael@0 | 121 | uint32_t toUDynamicOffsets[8]; |
michael@0 | 122 | uint32_t fromUDynamicOffsets[8]; |
michael@0 | 123 | |
michael@0 | 124 | /* state machine state - toUnicode */ |
michael@0 | 125 | UBool toUIsSingleByteMode; /* TRUE in single-byte mode, FALSE in Unicode mode */ |
michael@0 | 126 | uint8_t toUState; /* one of the state values readCommand..defineOne */ |
michael@0 | 127 | int8_t toUQuoteWindow, toUDynamicWindow; /* window numbers 0..7: window of a pending SQn quote / currently selected dynamic window */ |
michael@0 | 128 | uint8_t toUByteOne; /* first data byte of a two-byte sequence, kept across buffer boundaries */ |
michael@0 | 129 | uint8_t toUPadding[3]; /* not referenced in the code shown; presumably alignment padding */ |
michael@0 | 130 | |
michael@0 | 131 | /* state machine state - fromUnicode */ |
michael@0 | 132 | UBool fromUIsSingleByteMode; |
michael@0 | 133 | int8_t fromUDynamicWindow; |
michael@0 | 134 | |
michael@0 | 135 | /* |
michael@0 | 136 | * windowUse[] keeps track of the use of the dynamic windows: |
michael@0 | 137 | * At nextWindowUseIndex there is the least recently used window, |
michael@0 | 138 | * and the following windows (in a wrapping manner) are more and more |
michael@0 | 139 | * recently used. |
michael@0 | 140 | * At nextWindowUseIndex-1 there is the most recently used window. |
michael@0 | 141 | */ |
michael@0 | 142 | uint8_t locale; /* lGeneric or l_ja, chosen in _SCSUOpen; selects the initial windowUse[] order on reset */ |
michael@0 | 143 | int8_t nextWindowUseIndex; |
michael@0 | 144 | int8_t windowUse[8]; |
michael@0 | 145 | } SCSUData; |
michael@0 | 146 | |
michael@0 | 147 | static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; /* initial windowUse[] contents for every locale other than l_ja */ |
michael@0 | 148 | static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; /* initial windowUse[] contents for l_ja */ |
michael@0 | 149 | /* values for SCSUData.locale; _SCSUOpen picks l_ja for the locale "ja" or one starting with "ja_" */ |
michael@0 | 150 | enum { |
michael@0 | 151 | lGeneric, l_ja |
michael@0 | 152 | }; |
michael@0 | 153 | |
michael@0 | 154 | /* SCSU setup functions ----------------------------------------------------- */ |
michael@0 | 155 | |
michael@0 | 156 | /* Restore the SCSU state in cnv->extraInfo to its initial values for the direction(s) selected by choice. */ static void |
michael@0 | 157 | _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { |
michael@0 | 158 | SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
michael@0 | 159 | /* presumably UCNV_RESET_BOTH compares below UCNV_RESET_TO_UNICODE so that both branches run for it - the enum values are not visible here, confirm */ |
michael@0 | 160 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 161 | /* reset toUnicode */ |
michael@0 | 162 | uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); /* 32 bytes = 8 uint32_t window offsets */ |
michael@0 | 163 | |
michael@0 | 164 | scsu->toUIsSingleByteMode=TRUE; |
michael@0 | 165 | scsu->toUState=readCommand; |
michael@0 | 166 | scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; |
michael@0 | 167 | scsu->toUByteOne=0; |
michael@0 | 168 | /* no bytes of a partial input sequence are stored in toUBytes[] */ |
michael@0 | 169 | cnv->toULength=0; |
michael@0 | 170 | } |
michael@0 | 171 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 172 | /* reset fromUnicode */ |
michael@0 | 173 | uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); /* 32 bytes = 8 uint32_t window offsets */ |
michael@0 | 174 | |
michael@0 | 175 | scsu->fromUIsSingleByteMode=TRUE; |
michael@0 | 176 | scsu->fromUDynamicWindow=0; |
michael@0 | 177 | /* restart the window-use tracking with the locale-specific initial order */ |
michael@0 | 178 | scsu->nextWindowUseIndex=0; |
michael@0 | 179 | switch(scsu->locale) { |
michael@0 | 180 | case l_ja: |
michael@0 | 181 | uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); /* 8 int8_t entries */ |
michael@0 | 182 | break; |
michael@0 | 183 | default: |
michael@0 | 184 | uprv_memcpy(scsu->windowUse, initialWindowUse, 8); /* 8 int8_t entries */ |
michael@0 | 185 | break; |
michael@0 | 186 | } |
michael@0 | 187 | |
michael@0 | 188 | cnv->fromUChar32=0; |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | /* Allocate and initialize the SCSUData for cnv; sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR if the allocation fails. */ static void |
michael@0 | 193 | _SCSUOpen(UConverter *cnv, |
michael@0 | 194 | UConverterLoadArgs *pArgs, |
michael@0 | 195 | UErrorCode *pErrorCode) { |
michael@0 | 196 | const char *locale=pArgs->locale; |
michael@0 | 197 | if(pArgs->onlyTestIsLoadable) { /* nothing is allocated or modified in this case */ |
michael@0 | 198 | return; |
michael@0 | 199 | } |
michael@0 | 200 | cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); |
michael@0 | 201 | if(cnv->extraInfo!=NULL) { |
michael@0 | 202 | if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { /* exactly "ja" or anything starting with "ja_" */ |
michael@0 | 203 | ((SCSUData *)cnv->extraInfo)->locale=l_ja; |
michael@0 | 204 | } else { |
michael@0 | 205 | ((SCSUData *)cnv->extraInfo)->locale=lGeneric; |
michael@0 | 206 | } |
michael@0 | 207 | _SCSUReset(cnv, UCNV_RESET_BOTH); /* locale must be set first: the reset reads it to choose the initial windowUse[] */ |
michael@0 | 208 | } else { |
michael@0 | 209 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 210 | } |
michael@0 | 211 | /* the following two assignments are also executed when the allocation failed */ |
michael@0 | 212 | /* Set the substitution character U+fffd as a Unicode string. */ |
michael@0 | 213 | cnv->subUChars[0]=0xfffd; |
michael@0 | 214 | cnv->subCharLen=-1; |
michael@0 | 215 | } |
michael@0 | 216 | |
michael@0 | 217 | /* Release the SCSUData attached to cnv and clear the extraInfo pointer. */ static void |
michael@0 | 218 | _SCSUClose(UConverter *cnv) { |
michael@0 | 219 | if(cnv->extraInfo!=NULL) { |
michael@0 | 220 | if(!cnv->isExtraLocal) { /* free only when isExtraLocal is not set; presumably a local extraInfo is not owned by the heap - confirm */ |
michael@0 | 221 | uprv_free(cnv->extraInfo); |
michael@0 | 222 | } |
michael@0 | 223 | cnv->extraInfo=NULL; |
michael@0 | 224 | } |
michael@0 | 225 | } |
michael@0 | 226 | |
michael@0 | 227 | /* SCSU-to-Unicode conversion functions ------------------------------------- */ |
michael@0 | 228 | |
michael@0 | 229 | /* Decode SCSU bytes from pArgs->source into UTF-16 at pArgs->target, recording for each output unit the source index of its sequence when pArgs->offsets is not NULL. Decoder state is loaded from and saved back to the SCSUData so that sequences may span calls. */ static void |
michael@0 | 230 | _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 231 | UErrorCode *pErrorCode) { |
michael@0 | 232 | UConverter *cnv; |
michael@0 | 233 | SCSUData *scsu; |
michael@0 | 234 | const uint8_t *source, *sourceLimit; |
michael@0 | 235 | UChar *target; |
michael@0 | 236 | const UChar *targetLimit; |
michael@0 | 237 | int32_t *offsets; |
michael@0 | 238 | UBool isSingleByteMode; |
michael@0 | 239 | uint8_t state, byteOne; |
michael@0 | 240 | int8_t quoteWindow, dynamicWindow; |
michael@0 | 241 | /* sourceIndex: index of the first byte of the sequence being decoded; nextSourceIndex: index of the next unread byte */ |
michael@0 | 242 | int32_t sourceIndex, nextSourceIndex; |
michael@0 | 243 | |
michael@0 | 244 | uint8_t b; |
michael@0 | 245 | |
michael@0 | 246 | /* set up the local pointers */ |
michael@0 | 247 | cnv=pArgs->converter; |
michael@0 | 248 | scsu=(SCSUData *)cnv->extraInfo; |
michael@0 | 249 | |
michael@0 | 250 | source=(const uint8_t *)pArgs->source; |
michael@0 | 251 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
michael@0 | 252 | target=pArgs->target; |
michael@0 | 253 | targetLimit=pArgs->targetLimit; |
michael@0 | 254 | offsets=pArgs->offsets; |
michael@0 | 255 | |
michael@0 | 256 | /* get the state machine state */ |
michael@0 | 257 | isSingleByteMode=scsu->toUIsSingleByteMode; |
michael@0 | 258 | state=scsu->toUState; |
michael@0 | 259 | quoteWindow=scsu->toUQuoteWindow; |
michael@0 | 260 | dynamicWindow=scsu->toUDynamicWindow; |
michael@0 | 261 | byteOne=scsu->toUByteOne; |
michael@0 | 262 | |
michael@0 | 263 | /* sourceIndex=-1 if the current character began in the previous buffer */ |
michael@0 | 264 | sourceIndex=state==readCommand ? 0 : -1; |
michael@0 | 265 | nextSourceIndex=0; |
michael@0 | 266 | |
michael@0 | 267 | /* |
michael@0 | 268 | * conversion "loop" |
michael@0 | 269 | * |
michael@0 | 270 | * For performance, this is not a normal C loop. |
michael@0 | 271 | * Instead, there are two code blocks for the two SCSU modes. |
michael@0 | 272 | * The function branches to either one, and a change of the mode is done with a goto to |
michael@0 | 273 | * the other branch. |
michael@0 | 274 | * |
michael@0 | 275 | * Each branch has two conventional loops: |
michael@0 | 276 | * - a fast-path loop for the most common codes in the mode |
michael@0 | 277 | * - a loop for all other codes in the mode |
michael@0 | 278 | * When the fast-path runs into a code that it cannot handle, its loop ends and it |
michael@0 | 279 | * runs into the following loop to handle the other codes. |
michael@0 | 280 | * The end of the input or output buffer is also handled by the slower loop. |
michael@0 | 281 | * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
michael@0 | 282 | * |
michael@0 | 283 | * The callback handling is done by returning with an error code. |
michael@0 | 284 | * The conversion framework actually calls the callback function. |
michael@0 | 285 | */ |
michael@0 | 286 | if(isSingleByteMode) { |
michael@0 | 287 | /* fast path for single-byte mode */ |
michael@0 | 288 | if(state==readCommand) { |
michael@0 | 289 | fastSingle: |
michael@0 | 290 | while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { /* bytes below 0x20 are left unconsumed for the state machine below */ |
michael@0 | 291 | ++source; |
michael@0 | 292 | ++nextSourceIndex; |
michael@0 | 293 | if(b<=0x7f) { |
michael@0 | 294 | /* write US-ASCII graphic character or DEL */ |
michael@0 | 295 | *target++=(UChar)b; |
michael@0 | 296 | if(offsets!=NULL) { |
michael@0 | 297 | *offsets++=sourceIndex; |
michael@0 | 298 | } |
michael@0 | 299 | } else { |
michael@0 | 300 | /* write from dynamic window */ |
michael@0 | 301 | uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
michael@0 | 302 | if(c<=0xffff) { |
michael@0 | 303 | *target++=(UChar)c; |
michael@0 | 304 | if(offsets!=NULL) { |
michael@0 | 305 | *offsets++=sourceIndex; |
michael@0 | 306 | } |
michael@0 | 307 | } else { |
michael@0 | 308 | /* output surrogate pair */ |
michael@0 | 309 | *target++=(UChar)(0xd7c0+(c>>10)); /* lead surrogate: 0xd7c0+(c>>10) equals 0xd800+((c-0x10000)>>10) */ |
michael@0 | 310 | if(target<targetLimit) { |
michael@0 | 311 | *target++=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 312 | if(offsets!=NULL) { |
michael@0 | 313 | *offsets++=sourceIndex; |
michael@0 | 314 | *offsets++=sourceIndex; |
michael@0 | 315 | } |
michael@0 | 316 | } else { |
michael@0 | 317 | /* target overflow */ |
michael@0 | 318 | if(offsets!=NULL) { |
michael@0 | 319 | *offsets++=sourceIndex; |
michael@0 | 320 | } |
michael@0 | 321 | cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); /* the trail surrogate that did not fit is stored in the converter's UCharErrorBuffer */ |
michael@0 | 322 | cnv->UCharErrorBufferLength=1; |
michael@0 | 323 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 324 | goto endloop; |
michael@0 | 325 | } |
michael@0 | 326 | } |
michael@0 | 327 | } |
michael@0 | 328 | sourceIndex=nextSourceIndex; |
michael@0 | 329 | } |
michael@0 | 330 | } |
michael@0 | 331 | |
michael@0 | 332 | /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
michael@0 | 333 | singleByteMode: |
michael@0 | 334 | while(source<sourceLimit) { |
michael@0 | 335 | if(target>=targetLimit) { |
michael@0 | 336 | /* target is full */ |
michael@0 | 337 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 338 | break; |
michael@0 | 339 | } |
michael@0 | 340 | b=*source++; |
michael@0 | 341 | ++nextSourceIndex; |
michael@0 | 342 | switch(state) { |
michael@0 | 343 | case readCommand: |
michael@0 | 344 | /* redundant conditions are commented out */ |
michael@0 | 345 | /* here: b<0x20 because otherwise we would be in fastSingle */ |
michael@0 | 346 | if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
michael@0 | 347 | /* CR/LF/TAB/NUL */ |
michael@0 | 348 | *target++=(UChar)b; |
michael@0 | 349 | if(offsets!=NULL) { |
michael@0 | 350 | *offsets++=sourceIndex; |
michael@0 | 351 | } |
michael@0 | 352 | sourceIndex=nextSourceIndex; |
michael@0 | 353 | goto fastSingle; |
michael@0 | 354 | } else if(SC0<=b) { |
michael@0 | 355 | if(b<=SC7) { |
michael@0 | 356 | dynamicWindow=(int8_t)(b-SC0); |
michael@0 | 357 | sourceIndex=nextSourceIndex; |
michael@0 | 358 | goto fastSingle; |
michael@0 | 359 | } else /* if(SD0<=b && b<=SD7) */ { |
michael@0 | 360 | dynamicWindow=(int8_t)(b-SD0); |
michael@0 | 361 | state=defineOne; |
michael@0 | 362 | } |
michael@0 | 363 | } else if(/* SQ0<=b && */ b<=SQ7) { |
michael@0 | 364 | quoteWindow=(int8_t)(b-SQ0); |
michael@0 | 365 | state=quoteOne; |
michael@0 | 366 | } else if(b==SDX) { |
michael@0 | 367 | state=definePairOne; |
michael@0 | 368 | } else if(b==SQU) { |
michael@0 | 369 | state=quotePairOne; |
michael@0 | 370 | } else if(b==SCU) { |
michael@0 | 371 | sourceIndex=nextSourceIndex; |
michael@0 | 372 | isSingleByteMode=FALSE; |
michael@0 | 373 | goto fastUnicode; |
michael@0 | 374 | } else /* Srs */ { |
michael@0 | 375 | /* callback(illegal) */ |
michael@0 | 376 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 377 | cnv->toUBytes[0]=b; |
michael@0 | 378 | cnv->toULength=1; |
michael@0 | 379 | goto endloop; |
michael@0 | 380 | } |
michael@0 | 381 | /* reached only by the branches above that started a multi-byte sequence (SDn, SQn, SDX, SQU) */ |
michael@0 | 382 | /* store the first byte of a multibyte sequence in toUBytes[] */ |
michael@0 | 383 | cnv->toUBytes[0]=b; |
michael@0 | 384 | cnv->toULength=1; |
michael@0 | 385 | break; |
michael@0 | 386 | case quotePairOne: |
michael@0 | 387 | byteOne=b; |
michael@0 | 388 | cnv->toUBytes[1]=b; |
michael@0 | 389 | cnv->toULength=2; |
michael@0 | 390 | state=quotePairTwo; |
michael@0 | 391 | break; |
michael@0 | 392 | case quotePairTwo: |
michael@0 | 393 | *target++=(UChar)((byteOne<<8)|b); /* the two quoted bytes form one 16-bit unit, high byte first */ |
michael@0 | 394 | if(offsets!=NULL) { |
michael@0 | 395 | *offsets++=sourceIndex; |
michael@0 | 396 | } |
michael@0 | 397 | sourceIndex=nextSourceIndex; |
michael@0 | 398 | state=readCommand; |
michael@0 | 399 | goto fastSingle; |
michael@0 | 400 | case quoteOne: |
michael@0 | 401 | if(b<0x80) { |
michael@0 | 402 | /* all static offsets are in the BMP */ |
michael@0 | 403 | *target++=(UChar)(staticOffsets[quoteWindow]+b); |
michael@0 | 404 | if(offsets!=NULL) { |
michael@0 | 405 | *offsets++=sourceIndex; |
michael@0 | 406 | } |
michael@0 | 407 | } else { |
michael@0 | 408 | /* write from dynamic window */ |
michael@0 | 409 | uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
michael@0 | 410 | if(c<=0xffff) { |
michael@0 | 411 | *target++=(UChar)c; |
michael@0 | 412 | if(offsets!=NULL) { |
michael@0 | 413 | *offsets++=sourceIndex; |
michael@0 | 414 | } |
michael@0 | 415 | } else { |
michael@0 | 416 | /* output surrogate pair */ |
michael@0 | 417 | *target++=(UChar)(0xd7c0+(c>>10)); /* lead surrogate: 0xd7c0+(c>>10) equals 0xd800+((c-0x10000)>>10) */ |
michael@0 | 418 | if(target<targetLimit) { |
michael@0 | 419 | *target++=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 420 | if(offsets!=NULL) { |
michael@0 | 421 | *offsets++=sourceIndex; |
michael@0 | 422 | *offsets++=sourceIndex; |
michael@0 | 423 | } |
michael@0 | 424 | } else { |
michael@0 | 425 | /* target overflow */ |
michael@0 | 426 | if(offsets!=NULL) { |
michael@0 | 427 | *offsets++=sourceIndex; |
michael@0 | 428 | } |
michael@0 | 429 | cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 430 | cnv->UCharErrorBufferLength=1; |
michael@0 | 431 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 432 | goto endloop; |
michael@0 | 433 | } |
michael@0 | 434 | } |
michael@0 | 435 | } |
michael@0 | 436 | sourceIndex=nextSourceIndex; |
michael@0 | 437 | state=readCommand; |
michael@0 | 438 | goto fastSingle; |
michael@0 | 439 | case definePairOne: |
michael@0 | 440 | dynamicWindow=(int8_t)((b>>5)&7); /* top 3 bits: number of the window being defined (it also becomes the selected window) */ |
michael@0 | 441 | byteOne=(uint8_t)(b&0x1f); /* low 5 bits: high part of the extended offset */ |
michael@0 | 442 | cnv->toUBytes[1]=b; |
michael@0 | 443 | cnv->toULength=2; |
michael@0 | 444 | state=definePairTwo; |
michael@0 | 445 | break; |
michael@0 | 446 | case definePairTwo: |
michael@0 | 447 | scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); /* extended windows start at or above U+10000 */ |
michael@0 | 448 | sourceIndex=nextSourceIndex; |
michael@0 | 449 | state=readCommand; |
michael@0 | 450 | goto fastSingle; |
michael@0 | 451 | case defineOne: |
michael@0 | 452 | if(b==0) { |
michael@0 | 453 | /* callback(illegal): Reserved window offset value 0 */ /* NOTE(review): unlike the Srs branch, *pErrorCode is not set on this path, so endloop does not reset state and it stays defineOne; confirm whether U_ILLEGAL_CHAR_FOUND should be set here (and in the copy in _SCSUToUnicode) */ |
michael@0 | 454 | cnv->toUBytes[1]=b; |
michael@0 | 455 | cnv->toULength=2; |
michael@0 | 456 | goto endloop; |
michael@0 | 457 | } else if(b<gapThreshold) { |
michael@0 | 458 | scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
michael@0 | 459 | } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
michael@0 | 460 | scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
michael@0 | 461 | } else if(b>=fixedThreshold) { |
michael@0 | 462 | scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
michael@0 | 463 | } else { |
michael@0 | 464 | /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ /* NOTE(review): *pErrorCode is not set on this path either; see the note on the b==0 branch above */ |
michael@0 | 465 | cnv->toUBytes[1]=b; |
michael@0 | 466 | cnv->toULength=2; |
michael@0 | 467 | goto endloop; |
michael@0 | 468 | } |
michael@0 | 469 | sourceIndex=nextSourceIndex; |
michael@0 | 470 | state=readCommand; |
michael@0 | 471 | goto fastSingle; |
michael@0 | 472 | } |
michael@0 | 473 | } |
michael@0 | 474 | } else { |
michael@0 | 475 | /* fast path for Unicode mode */ |
michael@0 | 476 | if(state==readCommand) { |
michael@0 | 477 | fastUnicode: |
michael@0 | 478 | while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { /* needs two available bytes whose first is outside the command range UC0..Urs */ |
michael@0 | 479 | *target++=(UChar)((b<<8)|source[1]); |
michael@0 | 480 | if(offsets!=NULL) { |
michael@0 | 481 | *offsets++=sourceIndex; |
michael@0 | 482 | } |
michael@0 | 483 | sourceIndex=nextSourceIndex; |
michael@0 | 484 | nextSourceIndex+=2; |
michael@0 | 485 | source+=2; |
michael@0 | 486 | } |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | /* normal state machine for Unicode mode */ |
michael@0 | 490 | /* unicodeByteMode: */ |
michael@0 | 491 | while(source<sourceLimit) { |
michael@0 | 492 | if(target>=targetLimit) { |
michael@0 | 493 | /* target is full */ |
michael@0 | 494 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 495 | break; |
michael@0 | 496 | } |
michael@0 | 497 | b=*source++; |
michael@0 | 498 | ++nextSourceIndex; |
michael@0 | 499 | switch(state) { |
michael@0 | 500 | case readCommand: |
michael@0 | 501 | if((uint8_t)(b-UC0)>(Urs-UC0)) { /* not a command byte: b is the high byte of a 16-bit unit */ |
michael@0 | 502 | byteOne=b; |
michael@0 | 503 | cnv->toUBytes[0]=b; |
michael@0 | 504 | cnv->toULength=1; |
michael@0 | 505 | state=quotePairTwo; |
michael@0 | 506 | } else if(/* UC0<=b && */ b<=UC7) { |
michael@0 | 507 | dynamicWindow=(int8_t)(b-UC0); |
michael@0 | 508 | sourceIndex=nextSourceIndex; |
michael@0 | 509 | isSingleByteMode=TRUE; |
michael@0 | 510 | goto fastSingle; |
michael@0 | 511 | } else if(/* UD0<=b && */ b<=UD7) { |
michael@0 | 512 | dynamicWindow=(int8_t)(b-UD0); |
michael@0 | 513 | isSingleByteMode=TRUE; |
michael@0 | 514 | cnv->toUBytes[0]=b; |
michael@0 | 515 | cnv->toULength=1; |
michael@0 | 516 | state=defineOne; |
michael@0 | 517 | goto singleByteMode; /* the offset byte is read by the single-byte state machine */ |
michael@0 | 518 | } else if(b==UDX) { |
michael@0 | 519 | isSingleByteMode=TRUE; |
michael@0 | 520 | cnv->toUBytes[0]=b; |
michael@0 | 521 | cnv->toULength=1; |
michael@0 | 522 | state=definePairOne; |
michael@0 | 523 | goto singleByteMode; /* the two definition bytes are read by the single-byte state machine */ |
michael@0 | 524 | } else if(b==UQU) { |
michael@0 | 525 | cnv->toUBytes[0]=b; |
michael@0 | 526 | cnv->toULength=1; |
michael@0 | 527 | state=quotePairOne; |
michael@0 | 528 | } else /* Urs */ { |
michael@0 | 529 | /* callback(illegal) */ |
michael@0 | 530 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 531 | cnv->toUBytes[0]=b; |
michael@0 | 532 | cnv->toULength=1; |
michael@0 | 533 | goto endloop; |
michael@0 | 534 | } |
michael@0 | 535 | break; |
michael@0 | 536 | case quotePairOne: |
michael@0 | 537 | byteOne=b; |
michael@0 | 538 | cnv->toUBytes[1]=b; |
michael@0 | 539 | cnv->toULength=2; |
michael@0 | 540 | state=quotePairTwo; |
michael@0 | 541 | break; |
michael@0 | 542 | case quotePairTwo: |
michael@0 | 543 | *target++=(UChar)((byteOne<<8)|b); |
michael@0 | 544 | if(offsets!=NULL) { |
michael@0 | 545 | *offsets++=sourceIndex; |
michael@0 | 546 | } |
michael@0 | 547 | sourceIndex=nextSourceIndex; |
michael@0 | 548 | state=readCommand; |
michael@0 | 549 | goto fastUnicode; |
michael@0 | 550 | } |
michael@0 | 551 | } |
michael@0 | 552 | } |
michael@0 | 553 | endloop: |
michael@0 | 554 | |
michael@0 | 555 | /* set the converter state back into UConverter */ |
michael@0 | 556 | if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 557 | /* reset to deal with the next character */ |
michael@0 | 558 | state=readCommand; |
michael@0 | 559 | } else if(state==readCommand) { |
michael@0 | 560 | /* not in a multi-byte sequence, reset toULength */ |
michael@0 | 561 | cnv->toULength=0; |
michael@0 | 562 | } |
michael@0 | 563 | scsu->toUIsSingleByteMode=isSingleByteMode; |
michael@0 | 564 | scsu->toUState=state; |
michael@0 | 565 | scsu->toUQuoteWindow=quoteWindow; |
michael@0 | 566 | scsu->toUDynamicWindow=dynamicWindow; |
michael@0 | 567 | scsu->toUByteOne=byteOne; |
michael@0 | 568 | |
michael@0 | 569 | /* write back the updated pointers */ |
michael@0 | 570 | pArgs->source=(const char *)source; |
michael@0 | 571 | pArgs->target=target; |
michael@0 | 572 | pArgs->offsets=offsets; |
michael@0 | 573 | return; |
michael@0 | 574 | } |
michael@0 | 575 | |
michael@0 | 576 | /* |
michael@0 | 577 | * Identical to _SCSUToUnicodeWithOffsets but without offset handling. |
michael@0 | 578 | * If a change is made in the original function, then either |
michael@0 | 579 | * change this function the same way or |
michael@0 | 580 | * re-copy the original function and remove the variables |
michael@0 | 581 | * offsets, sourceIndex, and nextSourceIndex. |
michael@0 | 582 | */ |
michael@0 | 583 | static void |
michael@0 | 584 | _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, |
michael@0 | 585 | UErrorCode *pErrorCode) { |
michael@0 | 586 | UConverter *cnv; |
michael@0 | 587 | SCSUData *scsu; |
michael@0 | 588 | const uint8_t *source, *sourceLimit; |
michael@0 | 589 | UChar *target; |
michael@0 | 590 | const UChar *targetLimit; |
michael@0 | 591 | UBool isSingleByteMode; |
michael@0 | 592 | uint8_t state, byteOne; |
michael@0 | 593 | int8_t quoteWindow, dynamicWindow; |
michael@0 | 594 | |
michael@0 | 595 | uint8_t b; |
michael@0 | 596 | |
michael@0 | 597 | /* set up the local pointers */ |
michael@0 | 598 | cnv=pArgs->converter; |
michael@0 | 599 | scsu=(SCSUData *)cnv->extraInfo; |
michael@0 | 600 | |
michael@0 | 601 | source=(const uint8_t *)pArgs->source; |
michael@0 | 602 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
michael@0 | 603 | target=pArgs->target; |
michael@0 | 604 | targetLimit=pArgs->targetLimit; |
michael@0 | 605 | |
michael@0 | 606 | /* get the state machine state */ |
michael@0 | 607 | isSingleByteMode=scsu->toUIsSingleByteMode; |
michael@0 | 608 | state=scsu->toUState; |
michael@0 | 609 | quoteWindow=scsu->toUQuoteWindow; |
michael@0 | 610 | dynamicWindow=scsu->toUDynamicWindow; |
michael@0 | 611 | byteOne=scsu->toUByteOne; |
michael@0 | 612 | |
michael@0 | 613 | /* |
michael@0 | 614 | * conversion "loop" |
michael@0 | 615 | * |
michael@0 | 616 | * For performance, this is not a normal C loop. |
michael@0 | 617 | * Instead, there are two code blocks for the two SCSU modes. |
michael@0 | 618 | * The function branches to either one, and a change of the mode is done with a goto to |
michael@0 | 619 | * the other branch. |
michael@0 | 620 | * |
michael@0 | 621 | * Each branch has two conventional loops: |
michael@0 | 622 | * - a fast-path loop for the most common codes in the mode |
michael@0 | 623 | * - a loop for all other codes in the mode |
michael@0 | 624 | * When the fast-path runs into a code that it cannot handle, its loop ends and it |
michael@0 | 625 | * runs into the following loop to handle the other codes. |
michael@0 | 626 | * The end of the input or output buffer is also handled by the slower loop. |
michael@0 | 627 | * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
michael@0 | 628 | * |
michael@0 | 629 | * The callback handling is done by returning with an error code. |
michael@0 | 630 | * The conversion framework actually calls the callback function. |
michael@0 | 631 | */ |
michael@0 | 632 | if(isSingleByteMode) { |
michael@0 | 633 | /* fast path for single-byte mode */ |
michael@0 | 634 | if(state==readCommand) { |
michael@0 | 635 | fastSingle: |
michael@0 | 636 | while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { |
michael@0 | 637 | ++source; |
michael@0 | 638 | if(b<=0x7f) { |
michael@0 | 639 | /* write US-ASCII graphic character or DEL */ |
michael@0 | 640 | *target++=(UChar)b; |
michael@0 | 641 | } else { |
michael@0 | 642 | /* write from dynamic window */ |
michael@0 | 643 | uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
michael@0 | 644 | if(c<=0xffff) { |
michael@0 | 645 | *target++=(UChar)c; |
michael@0 | 646 | } else { |
michael@0 | 647 | /* output surrogate pair */ |
michael@0 | 648 | *target++=(UChar)(0xd7c0+(c>>10)); |
michael@0 | 649 | if(target<targetLimit) { |
michael@0 | 650 | *target++=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 651 | } else { |
michael@0 | 652 | /* target overflow */ |
michael@0 | 653 | cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 654 | cnv->UCharErrorBufferLength=1; |
michael@0 | 655 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 656 | goto endloop; |
michael@0 | 657 | } |
michael@0 | 658 | } |
michael@0 | 659 | } |
michael@0 | 660 | } |
michael@0 | 661 | } |
michael@0 | 662 | |
michael@0 | 663 | /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
michael@0 | 664 | singleByteMode: |
michael@0 | 665 | while(source<sourceLimit) { |
michael@0 | 666 | if(target>=targetLimit) { |
michael@0 | 667 | /* target is full */ |
michael@0 | 668 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 669 | break; |
michael@0 | 670 | } |
michael@0 | 671 | b=*source++; |
michael@0 | 672 | switch(state) { |
michael@0 | 673 | case readCommand: |
michael@0 | 674 | /* redundant conditions are commented out */ |
michael@0 | 675 | /* here: b<0x20 because otherwise we would be in fastSingle */ |
michael@0 | 676 | if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
michael@0 | 677 | /* CR/LF/TAB/NUL */ |
michael@0 | 678 | *target++=(UChar)b; |
michael@0 | 679 | goto fastSingle; |
michael@0 | 680 | } else if(SC0<=b) { |
michael@0 | 681 | if(b<=SC7) { |
michael@0 | 682 | dynamicWindow=(int8_t)(b-SC0); |
michael@0 | 683 | goto fastSingle; |
michael@0 | 684 | } else /* if(SD0<=b && b<=SD7) */ { |
michael@0 | 685 | dynamicWindow=(int8_t)(b-SD0); |
michael@0 | 686 | state=defineOne; |
michael@0 | 687 | } |
michael@0 | 688 | } else if(/* SQ0<=b && */ b<=SQ7) { |
michael@0 | 689 | quoteWindow=(int8_t)(b-SQ0); |
michael@0 | 690 | state=quoteOne; |
michael@0 | 691 | } else if(b==SDX) { |
michael@0 | 692 | state=definePairOne; |
michael@0 | 693 | } else if(b==SQU) { |
michael@0 | 694 | state=quotePairOne; |
michael@0 | 695 | } else if(b==SCU) { |
michael@0 | 696 | isSingleByteMode=FALSE; |
michael@0 | 697 | goto fastUnicode; |
michael@0 | 698 | } else /* Srs */ { |
michael@0 | 699 | /* callback(illegal) */ |
michael@0 | 700 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 701 | cnv->toUBytes[0]=b; |
michael@0 | 702 | cnv->toULength=1; |
michael@0 | 703 | goto endloop; |
michael@0 | 704 | } |
michael@0 | 705 | |
michael@0 | 706 | /* store the first byte of a multibyte sequence in toUBytes[] */ |
michael@0 | 707 | cnv->toUBytes[0]=b; |
michael@0 | 708 | cnv->toULength=1; |
michael@0 | 709 | break; |
michael@0 | 710 | case quotePairOne: |
michael@0 | 711 | byteOne=b; |
michael@0 | 712 | cnv->toUBytes[1]=b; |
michael@0 | 713 | cnv->toULength=2; |
michael@0 | 714 | state=quotePairTwo; |
michael@0 | 715 | break; |
michael@0 | 716 | case quotePairTwo: |
michael@0 | 717 | *target++=(UChar)((byteOne<<8)|b); |
michael@0 | 718 | state=readCommand; |
michael@0 | 719 | goto fastSingle; |
michael@0 | 720 | case quoteOne: |
michael@0 | 721 | if(b<0x80) { |
michael@0 | 722 | /* all static offsets are in the BMP */ |
michael@0 | 723 | *target++=(UChar)(staticOffsets[quoteWindow]+b); |
michael@0 | 724 | } else { |
michael@0 | 725 | /* write from dynamic window */ |
michael@0 | 726 | uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
michael@0 | 727 | if(c<=0xffff) { |
michael@0 | 728 | *target++=(UChar)c; |
michael@0 | 729 | } else { |
michael@0 | 730 | /* output surrogate pair */ |
michael@0 | 731 | *target++=(UChar)(0xd7c0+(c>>10)); |
michael@0 | 732 | if(target<targetLimit) { |
michael@0 | 733 | *target++=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 734 | } else { |
michael@0 | 735 | /* target overflow */ |
michael@0 | 736 | cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
michael@0 | 737 | cnv->UCharErrorBufferLength=1; |
michael@0 | 738 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 739 | goto endloop; |
michael@0 | 740 | } |
michael@0 | 741 | } |
michael@0 | 742 | } |
michael@0 | 743 | state=readCommand; |
michael@0 | 744 | goto fastSingle; |
michael@0 | 745 | case definePairOne: |
michael@0 | 746 | dynamicWindow=(int8_t)((b>>5)&7); |
michael@0 | 747 | byteOne=(uint8_t)(b&0x1f); |
michael@0 | 748 | cnv->toUBytes[1]=b; |
michael@0 | 749 | cnv->toULength=2; |
michael@0 | 750 | state=definePairTwo; |
michael@0 | 751 | break; |
michael@0 | 752 | case definePairTwo: |
michael@0 | 753 | scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); |
michael@0 | 754 | state=readCommand; |
michael@0 | 755 | goto fastSingle; |
michael@0 | 756 | case defineOne: |
michael@0 | 757 | if(b==0) { |
michael@0 | 758 | /* callback(illegal): Reserved window offset value 0 */ |
michael@0 | 759 | cnv->toUBytes[1]=b; |
michael@0 | 760 | cnv->toULength=2; |
michael@0 | 761 | goto endloop; |
michael@0 | 762 | } else if(b<gapThreshold) { |
michael@0 | 763 | scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
michael@0 | 764 | } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
michael@0 | 765 | scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
michael@0 | 766 | } else if(b>=fixedThreshold) { |
michael@0 | 767 | scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
michael@0 | 768 | } else { |
michael@0 | 769 | /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ |
michael@0 | 770 | cnv->toUBytes[1]=b; |
michael@0 | 771 | cnv->toULength=2; |
michael@0 | 772 | goto endloop; |
michael@0 | 773 | } |
michael@0 | 774 | state=readCommand; |
michael@0 | 775 | goto fastSingle; |
michael@0 | 776 | } |
michael@0 | 777 | } |
michael@0 | 778 | } else { |
michael@0 | 779 | /* fast path for Unicode mode */ |
michael@0 | 780 | if(state==readCommand) { |
michael@0 | 781 | fastUnicode: |
michael@0 | 782 | while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { |
michael@0 | 783 | *target++=(UChar)((b<<8)|source[1]); |
michael@0 | 784 | source+=2; |
michael@0 | 785 | } |
michael@0 | 786 | } |
michael@0 | 787 | |
michael@0 | 788 | /* normal state machine for Unicode mode */ |
michael@0 | 789 | /* unicodeByteMode: */ |
michael@0 | 790 | while(source<sourceLimit) { |
michael@0 | 791 | if(target>=targetLimit) { |
michael@0 | 792 | /* target is full */ |
michael@0 | 793 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 794 | break; |
michael@0 | 795 | } |
michael@0 | 796 | b=*source++; |
michael@0 | 797 | switch(state) { |
michael@0 | 798 | case readCommand: |
michael@0 | 799 | if((uint8_t)(b-UC0)>(Urs-UC0)) { |
michael@0 | 800 | byteOne=b; |
michael@0 | 801 | cnv->toUBytes[0]=b; |
michael@0 | 802 | cnv->toULength=1; |
michael@0 | 803 | state=quotePairTwo; |
michael@0 | 804 | } else if(/* UC0<=b && */ b<=UC7) { |
michael@0 | 805 | dynamicWindow=(int8_t)(b-UC0); |
michael@0 | 806 | isSingleByteMode=TRUE; |
michael@0 | 807 | goto fastSingle; |
michael@0 | 808 | } else if(/* UD0<=b && */ b<=UD7) { |
michael@0 | 809 | dynamicWindow=(int8_t)(b-UD0); |
michael@0 | 810 | isSingleByteMode=TRUE; |
michael@0 | 811 | cnv->toUBytes[0]=b; |
michael@0 | 812 | cnv->toULength=1; |
michael@0 | 813 | state=defineOne; |
michael@0 | 814 | goto singleByteMode; |
michael@0 | 815 | } else if(b==UDX) { |
michael@0 | 816 | isSingleByteMode=TRUE; |
michael@0 | 817 | cnv->toUBytes[0]=b; |
michael@0 | 818 | cnv->toULength=1; |
michael@0 | 819 | state=definePairOne; |
michael@0 | 820 | goto singleByteMode; |
michael@0 | 821 | } else if(b==UQU) { |
michael@0 | 822 | cnv->toUBytes[0]=b; |
michael@0 | 823 | cnv->toULength=1; |
michael@0 | 824 | state=quotePairOne; |
michael@0 | 825 | } else /* Urs */ { |
michael@0 | 826 | /* callback(illegal) */ |
michael@0 | 827 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 828 | cnv->toUBytes[0]=b; |
michael@0 | 829 | cnv->toULength=1; |
michael@0 | 830 | goto endloop; |
michael@0 | 831 | } |
michael@0 | 832 | break; |
michael@0 | 833 | case quotePairOne: |
michael@0 | 834 | byteOne=b; |
michael@0 | 835 | cnv->toUBytes[1]=b; |
michael@0 | 836 | cnv->toULength=2; |
michael@0 | 837 | state=quotePairTwo; |
michael@0 | 838 | break; |
michael@0 | 839 | case quotePairTwo: |
michael@0 | 840 | *target++=(UChar)((byteOne<<8)|b); |
michael@0 | 841 | state=readCommand; |
michael@0 | 842 | goto fastUnicode; |
michael@0 | 843 | } |
michael@0 | 844 | } |
michael@0 | 845 | } |
michael@0 | 846 | endloop: |
michael@0 | 847 | |
michael@0 | 848 | /* set the converter state back into UConverter */ |
michael@0 | 849 | if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 850 | /* reset to deal with the next character */ |
michael@0 | 851 | state=readCommand; |
michael@0 | 852 | } else if(state==readCommand) { |
michael@0 | 853 | /* not in a multi-byte sequence, reset toULength */ |
michael@0 | 854 | cnv->toULength=0; |
michael@0 | 855 | } |
michael@0 | 856 | scsu->toUIsSingleByteMode=isSingleByteMode; |
michael@0 | 857 | scsu->toUState=state; |
michael@0 | 858 | scsu->toUQuoteWindow=quoteWindow; |
michael@0 | 859 | scsu->toUDynamicWindow=dynamicWindow; |
michael@0 | 860 | scsu->toUByteOne=byteOne; |
michael@0 | 861 | |
michael@0 | 862 | /* write back the updated pointers */ |
michael@0 | 863 | pArgs->source=(const char *)source; |
michael@0 | 864 | pArgs->target=target; |
michael@0 | 865 | return; |
michael@0 | 866 | } |
michael@0 | 867 | |
michael@0 | 868 | /* SCSU-from-Unicode conversion functions ----------------------------------- */ |
michael@0 | 869 | |
michael@0 | 870 | /* |
michael@0 | 871 | * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve |
michael@0 | 872 | * reasonable results. The lookahead is minimal. |
michael@0 | 873 | * Many cases are simple: |
michael@0 | 874 | * A character fits directly into the current mode, a dynamic or static window, |
michael@0 | 875 | * or is not compressible. These cases are tested first. |
michael@0 | 876 | * Real compression heuristics are applied to the rest, in code branches for |
michael@0 | 877 | * single/Unicode mode and BMP/supplementary code points. |
michael@0 | 878 | * The heuristics used here are extremely simple. |
michael@0 | 879 | */ |
michael@0 | 880 | |
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets: start offsets of 8 windows; a window covers the 0x80 code points
 *          beginning at its offset
 * c:       the code point to look up
 *
 * Returns the index (0..7) of the first window for which c-offsets[i] is in
 * 0..0x7f, or -1 if there is no such window.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /*
         * single unsigned comparison for offsets[i]<=c<=offsets[i]+0x7f:
         * if c is below the offset, the subtraction wraps around to a
         * large value and the test fails
         */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
michael@0 | 892 | |
michael@0 | 893 | /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ |
michael@0 | 894 | static UBool |
michael@0 | 895 | isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { |
michael@0 | 896 | return (UBool)(c<=offset+0x7f && |
michael@0 | 897 | (c>=offset || (c<=0x7f && |
michael@0 | 898 | (c>=0x20 || (1UL<<c)&0x2601)))); |
michael@0 | 899 | /* binary 0010 0110 0000 0001, |
michael@0 | 900 | check for b==0xd || b==0xa || b==9 || b==0 */ |
michael@0 | 901 | } |
michael@0 | 902 | |
michael@0 | 903 | /* |
michael@0 | 904 | * getNextDynamicWindow returns the next dynamic window to be redefined |
michael@0 | 905 | */ |
michael@0 | 906 | static int8_t |
michael@0 | 907 | getNextDynamicWindow(SCSUData *scsu) { |
michael@0 | 908 | int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; |
michael@0 | 909 | if(++scsu->nextWindowUseIndex==8) { |
michael@0 | 910 | scsu->nextWindowUseIndex=0; |
michael@0 | 911 | } |
michael@0 | 912 | return window; |
michael@0 | 913 | } |
michael@0 | 914 | |
michael@0 | 915 | /* |
michael@0 | 916 | * useDynamicWindow() adjusts |
michael@0 | 917 | * windowUse[] and nextWindowUseIndex for the algorithm to choose |
michael@0 | 918 | * the next dynamic window to be defined; |
michael@0 | 919 | * a subclass may override it and provide its own algorithm. |
michael@0 | 920 | */ |
michael@0 | 921 | static void |
michael@0 | 922 | useDynamicWindow(SCSUData *scsu, int8_t window) { |
michael@0 | 923 | /* |
michael@0 | 924 | * move the existing window, which just became the most recently used one, |
michael@0 | 925 | * up in windowUse[] to nextWindowUseIndex-1 |
michael@0 | 926 | */ |
michael@0 | 927 | |
michael@0 | 928 | /* first, find the index of the window - backwards to favor the more recently used windows */ |
michael@0 | 929 | int i, j; |
michael@0 | 930 | |
michael@0 | 931 | i=scsu->nextWindowUseIndex; |
michael@0 | 932 | do { |
michael@0 | 933 | if(--i<0) { |
michael@0 | 934 | i=7; |
michael@0 | 935 | } |
michael@0 | 936 | } while(scsu->windowUse[i]!=window); |
michael@0 | 937 | |
michael@0 | 938 | /* now copy each windowUse[i+1] to [i] */ |
michael@0 | 939 | j=i+1; |
michael@0 | 940 | if(j==8) { |
michael@0 | 941 | j=0; |
michael@0 | 942 | } |
michael@0 | 943 | while(j!=scsu->nextWindowUseIndex) { |
michael@0 | 944 | scsu->windowUse[i]=scsu->windowUse[j]; |
michael@0 | 945 | i=j; |
michael@0 | 946 | if(++j==8) { j=0; } |
michael@0 | 947 | } |
michael@0 | 948 | |
michael@0 | 949 | /* finally, set the window into the most recently used index */ |
michael@0 | 950 | scsu->windowUse[i]=window; |
michael@0 | 951 | } |
michael@0 | 952 | |
michael@0 | 953 | /* |
michael@0 | 954 | * calculate the offset and the code for a dynamic window that contains the character |
michael@0 | 955 | * takes fixed offsets into account |
michael@0 | 956 | * the offset of the window is stored in the offset variable, |
michael@0 | 957 | * the code is returned |
michael@0 | 958 | * |
michael@0 | 959 | * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code |
michael@0 | 960 | */ |
michael@0 | 961 | static int |
michael@0 | 962 | getDynamicOffset(uint32_t c, uint32_t *pOffset) { |
michael@0 | 963 | int i; |
michael@0 | 964 | |
michael@0 | 965 | for(i=0; i<7; ++i) { |
michael@0 | 966 | if((uint32_t)(c-fixedOffsets[i])<=0x7f) { |
michael@0 | 967 | *pOffset=fixedOffsets[i]; |
michael@0 | 968 | return 0xf9+i; |
michael@0 | 969 | } |
michael@0 | 970 | } |
michael@0 | 971 | |
michael@0 | 972 | if(c<0x80) { |
michael@0 | 973 | /* No dynamic window for US-ASCII. */ |
michael@0 | 974 | return -1; |
michael@0 | 975 | } else if(c<0x3400 || |
michael@0 | 976 | (uint32_t)(c-0x10000)<(0x14000-0x10000) || |
michael@0 | 977 | (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) |
michael@0 | 978 | ) { |
michael@0 | 979 | /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ |
michael@0 | 980 | *pOffset=c&0x7fffff80; |
michael@0 | 981 | return (int)(c>>7); |
michael@0 | 982 | } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { |
michael@0 | 983 | /* For these characters we need to take the gapOffset into account. */ |
michael@0 | 984 | *pOffset=c&0x7fffff80; |
michael@0 | 985 | return (int)((c-gapOffset)>>7); |
michael@0 | 986 | } else { |
michael@0 | 987 | return -1; |
michael@0 | 988 | } |
michael@0 | 989 | } |
michael@0 | 990 | |
michael@0 | 991 | /* |
michael@0 | 992 | * Idea for compression: |
michael@0 | 993 | * - save SCSUData and other state before really starting work |
michael@0 | 994 | * - at endloop, see if compression could be better with just unicode mode |
michael@0 | 995 | * - don't do this if a callback has been called |
michael@0 | 996 | * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning |
michael@0 | 997 | * - different buffer handling! |
michael@0 | 998 | * |
michael@0 | 999 | * Drawback or need for corrective handling: |
michael@0 | 1000 | * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and |
michael@0 | 1001 | * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible |
michael@0 | 1002 | * not only for compression but also for HTML/XML documents with following charset/encoding announcers. |
michael@0 | 1003 | * |
michael@0 | 1004 | * How to achieve both? |
michael@0 | 1005 | * - Only replace the result after an SDX or SCU? |
michael@0 | 1006 | */ |
michael@0 | 1007 | |
michael@0 | 1008 | static void |
michael@0 | 1009 | _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
michael@0 | 1010 | UErrorCode *pErrorCode) { |
michael@0 | 1011 | UConverter *cnv; |
michael@0 | 1012 | SCSUData *scsu; |
michael@0 | 1013 | const UChar *source, *sourceLimit; |
michael@0 | 1014 | uint8_t *target; |
michael@0 | 1015 | int32_t targetCapacity; |
michael@0 | 1016 | int32_t *offsets; |
michael@0 | 1017 | |
michael@0 | 1018 | UBool isSingleByteMode; |
michael@0 | 1019 | uint8_t dynamicWindow; |
michael@0 | 1020 | uint32_t currentOffset; |
michael@0 | 1021 | |
michael@0 | 1022 | uint32_t c, delta; |
michael@0 | 1023 | |
michael@0 | 1024 | int32_t sourceIndex, nextSourceIndex; |
michael@0 | 1025 | |
michael@0 | 1026 | int32_t length; |
michael@0 | 1027 | |
michael@0 | 1028 | /* variables for compression heuristics */ |
michael@0 | 1029 | uint32_t offset; |
michael@0 | 1030 | UChar lead, trail; |
michael@0 | 1031 | int code; |
michael@0 | 1032 | int8_t window; |
michael@0 | 1033 | |
michael@0 | 1034 | /* set up the local pointers */ |
michael@0 | 1035 | cnv=pArgs->converter; |
michael@0 | 1036 | scsu=(SCSUData *)cnv->extraInfo; |
michael@0 | 1037 | |
michael@0 | 1038 | /* set up the local pointers */ |
michael@0 | 1039 | source=pArgs->source; |
michael@0 | 1040 | sourceLimit=pArgs->sourceLimit; |
michael@0 | 1041 | target=(uint8_t *)pArgs->target; |
michael@0 | 1042 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
michael@0 | 1043 | offsets=pArgs->offsets; |
michael@0 | 1044 | |
michael@0 | 1045 | /* get the state machine state */ |
michael@0 | 1046 | isSingleByteMode=scsu->fromUIsSingleByteMode; |
michael@0 | 1047 | dynamicWindow=scsu->fromUDynamicWindow; |
michael@0 | 1048 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1049 | |
michael@0 | 1050 | c=cnv->fromUChar32; |
michael@0 | 1051 | |
michael@0 | 1052 | /* sourceIndex=-1 if the current character began in the previous buffer */ |
michael@0 | 1053 | sourceIndex= c==0 ? 0 : -1; |
michael@0 | 1054 | nextSourceIndex=0; |
michael@0 | 1055 | |
michael@0 | 1056 | /* similar conversion "loop" as in toUnicode */ |
michael@0 | 1057 | loop: |
michael@0 | 1058 | if(isSingleByteMode) { |
michael@0 | 1059 | if(c!=0 && targetCapacity>0) { |
michael@0 | 1060 | goto getTrailSingle; |
michael@0 | 1061 | } |
michael@0 | 1062 | |
michael@0 | 1063 | /* state machine for single-byte mode */ |
michael@0 | 1064 | /* singleByteMode: */ |
michael@0 | 1065 | while(source<sourceLimit) { |
michael@0 | 1066 | if(targetCapacity<=0) { |
michael@0 | 1067 | /* target is full */ |
michael@0 | 1068 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1069 | break; |
michael@0 | 1070 | } |
michael@0 | 1071 | c=*source++; |
michael@0 | 1072 | ++nextSourceIndex; |
michael@0 | 1073 | |
michael@0 | 1074 | if((c-0x20)<=0x5f) { |
michael@0 | 1075 | /* pass US-ASCII graphic character through */ |
michael@0 | 1076 | *target++=(uint8_t)c; |
michael@0 | 1077 | if(offsets!=NULL) { |
michael@0 | 1078 | *offsets++=sourceIndex; |
michael@0 | 1079 | } |
michael@0 | 1080 | --targetCapacity; |
michael@0 | 1081 | } else if(c<0x20) { |
michael@0 | 1082 | if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
michael@0 | 1083 | /* CR/LF/TAB/NUL */ |
michael@0 | 1084 | *target++=(uint8_t)c; |
michael@0 | 1085 | if(offsets!=NULL) { |
michael@0 | 1086 | *offsets++=sourceIndex; |
michael@0 | 1087 | } |
michael@0 | 1088 | --targetCapacity; |
michael@0 | 1089 | } else { |
michael@0 | 1090 | /* quote C0 control character */ |
michael@0 | 1091 | c|=SQ0<<8; |
michael@0 | 1092 | length=2; |
michael@0 | 1093 | goto outputBytes; |
michael@0 | 1094 | } |
michael@0 | 1095 | } else if((delta=c-currentOffset)<=0x7f) { |
michael@0 | 1096 | /* use the current dynamic window */ |
michael@0 | 1097 | *target++=(uint8_t)(delta|0x80); |
michael@0 | 1098 | if(offsets!=NULL) { |
michael@0 | 1099 | *offsets++=sourceIndex; |
michael@0 | 1100 | } |
michael@0 | 1101 | --targetCapacity; |
michael@0 | 1102 | } else if(U16_IS_SURROGATE(c)) { |
michael@0 | 1103 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1104 | getTrailSingle: |
michael@0 | 1105 | lead=(UChar)c; |
michael@0 | 1106 | if(source<sourceLimit) { |
michael@0 | 1107 | /* test the following code unit */ |
michael@0 | 1108 | trail=*source; |
michael@0 | 1109 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1110 | ++source; |
michael@0 | 1111 | ++nextSourceIndex; |
michael@0 | 1112 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1113 | /* convert this surrogate code point */ |
michael@0 | 1114 | /* exit this condition tree */ |
michael@0 | 1115 | } else { |
michael@0 | 1116 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 1117 | /* callback(illegal) */ |
michael@0 | 1118 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1119 | goto endloop; |
michael@0 | 1120 | } |
michael@0 | 1121 | } else { |
michael@0 | 1122 | /* no more input */ |
michael@0 | 1123 | break; |
michael@0 | 1124 | } |
michael@0 | 1125 | } else { |
michael@0 | 1126 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 1127 | /* callback(illegal) */ |
michael@0 | 1128 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1129 | goto endloop; |
michael@0 | 1130 | } |
michael@0 | 1131 | |
michael@0 | 1132 | /* compress supplementary character U+10000..U+10ffff */ |
michael@0 | 1133 | if((delta=c-currentOffset)<=0x7f) { |
michael@0 | 1134 | /* use the current dynamic window */ |
michael@0 | 1135 | *target++=(uint8_t)(delta|0x80); |
michael@0 | 1136 | if(offsets!=NULL) { |
michael@0 | 1137 | *offsets++=sourceIndex; |
michael@0 | 1138 | } |
michael@0 | 1139 | --targetCapacity; |
michael@0 | 1140 | } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1141 | /* there is a dynamic window that contains this character, change to it */ |
michael@0 | 1142 | dynamicWindow=window; |
michael@0 | 1143 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1144 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1145 | c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1146 | length=2; |
michael@0 | 1147 | goto outputBytes; |
michael@0 | 1148 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1149 | /* might check if there are more characters in this window to come */ |
michael@0 | 1150 | /* define an extended window with this character */ |
michael@0 | 1151 | code-=0x200; |
michael@0 | 1152 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1153 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1154 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1155 | c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1156 | length=4; |
michael@0 | 1157 | goto outputBytes; |
michael@0 | 1158 | } else { |
michael@0 | 1159 | /* change to Unicode mode and output this (lead, trail) pair */ |
michael@0 | 1160 | isSingleByteMode=FALSE; |
michael@0 | 1161 | *target++=(uint8_t)SCU; |
michael@0 | 1162 | if(offsets!=NULL) { |
michael@0 | 1163 | *offsets++=sourceIndex; |
michael@0 | 1164 | } |
michael@0 | 1165 | --targetCapacity; |
michael@0 | 1166 | c=((uint32_t)lead<<16)|trail; |
michael@0 | 1167 | length=4; |
michael@0 | 1168 | goto outputBytes; |
michael@0 | 1169 | } |
michael@0 | 1170 | } else if(c<0xa0) { |
michael@0 | 1171 | /* quote C1 control character */ |
michael@0 | 1172 | c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
michael@0 | 1173 | length=2; |
michael@0 | 1174 | goto outputBytes; |
michael@0 | 1175 | } else if(c==0xfeff || c>=0xfff0) { |
michael@0 | 1176 | /* quote signature character=byte order mark and specials */ |
michael@0 | 1177 | c|=SQU<<16; |
michael@0 | 1178 | length=3; |
michael@0 | 1179 | goto outputBytes; |
michael@0 | 1180 | } else { |
michael@0 | 1181 | /* compress all other BMP characters */ |
michael@0 | 1182 | if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1183 | /* there is a window defined that contains this character - switch to it or quote from it? */ |
michael@0 | 1184 | if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
michael@0 | 1185 | /* change to dynamic window */ |
michael@0 | 1186 | dynamicWindow=window; |
michael@0 | 1187 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1188 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1189 | c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1190 | length=2; |
michael@0 | 1191 | goto outputBytes; |
michael@0 | 1192 | } else { |
michael@0 | 1193 | /* quote from dynamic window */ |
michael@0 | 1194 | c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
michael@0 | 1195 | length=2; |
michael@0 | 1196 | goto outputBytes; |
michael@0 | 1197 | } |
michael@0 | 1198 | } else if((window=getWindow(staticOffsets, c))>=0) { |
michael@0 | 1199 | /* quote from static window */ |
michael@0 | 1200 | c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
michael@0 | 1201 | length=2; |
michael@0 | 1202 | goto outputBytes; |
michael@0 | 1203 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1204 | /* define a dynamic window with this character */ |
michael@0 | 1205 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1206 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1207 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1208 | c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1209 | length=3; |
michael@0 | 1210 | goto outputBytes; |
michael@0 | 1211 | } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
michael@0 | 1212 | (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
michael@0 | 1213 | ) { |
michael@0 | 1214 | /* |
michael@0 | 1215 | * this character is not compressible (a BMP ideograph or similar); |
michael@0 | 1216 | * switch to Unicode mode if this is the last character in the block |
michael@0 | 1217 | * or there is at least one more ideograph following immediately |
michael@0 | 1218 | */ |
michael@0 | 1219 | isSingleByteMode=FALSE; |
michael@0 | 1220 | c|=SCU<<16; |
michael@0 | 1221 | length=3; |
michael@0 | 1222 | goto outputBytes; |
michael@0 | 1223 | } else { |
michael@0 | 1224 | /* quote Unicode */ |
michael@0 | 1225 | c|=SQU<<16; |
michael@0 | 1226 | length=3; |
michael@0 | 1227 | goto outputBytes; |
michael@0 | 1228 | } |
michael@0 | 1229 | } |
michael@0 | 1230 | |
michael@0 | 1231 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1232 | c=0; |
michael@0 | 1233 | sourceIndex=nextSourceIndex; |
michael@0 | 1234 | } |
michael@0 | 1235 | } else { |
michael@0 | 1236 | if(c!=0 && targetCapacity>0) { |
michael@0 | 1237 | goto getTrailUnicode; |
michael@0 | 1238 | } |
michael@0 | 1239 | |
michael@0 | 1240 | /* state machine for Unicode mode */ |
michael@0 | 1241 | /* unicodeByteMode: */ |
michael@0 | 1242 | while(source<sourceLimit) { |
michael@0 | 1243 | if(targetCapacity<=0) { |
michael@0 | 1244 | /* target is full */ |
michael@0 | 1245 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1246 | break; |
michael@0 | 1247 | } |
michael@0 | 1248 | c=*source++; |
michael@0 | 1249 | ++nextSourceIndex; |
michael@0 | 1250 | |
michael@0 | 1251 | if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
michael@0 | 1252 | /* not compressible, write character directly */ |
michael@0 | 1253 | if(targetCapacity>=2) { |
michael@0 | 1254 | *target++=(uint8_t)(c>>8); |
michael@0 | 1255 | *target++=(uint8_t)c; |
michael@0 | 1256 | if(offsets!=NULL) { |
michael@0 | 1257 | *offsets++=sourceIndex; |
michael@0 | 1258 | *offsets++=sourceIndex; |
michael@0 | 1259 | } |
michael@0 | 1260 | targetCapacity-=2; |
michael@0 | 1261 | } else { |
michael@0 | 1262 | length=2; |
michael@0 | 1263 | goto outputBytes; |
michael@0 | 1264 | } |
michael@0 | 1265 | } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
michael@0 | 1266 | /* compress BMP character if the following one is not an uncompressible ideograph */ |
michael@0 | 1267 | if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
michael@0 | 1268 | if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
michael@0 | 1269 | /* ASCII digit or letter */ |
michael@0 | 1270 | isSingleByteMode=TRUE; |
michael@0 | 1271 | c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
michael@0 | 1272 | length=2; |
michael@0 | 1273 | goto outputBytes; |
michael@0 | 1274 | } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1275 | /* there is a dynamic window that contains this character, change to it */ |
michael@0 | 1276 | isSingleByteMode=TRUE; |
michael@0 | 1277 | dynamicWindow=window; |
michael@0 | 1278 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1279 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1280 | c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1281 | length=2; |
michael@0 | 1282 | goto outputBytes; |
michael@0 | 1283 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1284 | /* define a dynamic window with this character */ |
michael@0 | 1285 | isSingleByteMode=TRUE; |
michael@0 | 1286 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1287 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1288 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1289 | c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1290 | length=3; |
michael@0 | 1291 | goto outputBytes; |
michael@0 | 1292 | } |
michael@0 | 1293 | } |
michael@0 | 1294 | |
michael@0 | 1295 | /* don't know how to compress this character, just write it directly */ |
michael@0 | 1296 | length=2; |
michael@0 | 1297 | goto outputBytes; |
michael@0 | 1298 | } else if(c<0xe000) { |
michael@0 | 1299 | /* c is a surrogate */ |
michael@0 | 1300 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1301 | getTrailUnicode: |
michael@0 | 1302 | lead=(UChar)c; |
michael@0 | 1303 | if(source<sourceLimit) { |
michael@0 | 1304 | /* test the following code unit */ |
michael@0 | 1305 | trail=*source; |
michael@0 | 1306 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1307 | ++source; |
michael@0 | 1308 | ++nextSourceIndex; |
michael@0 | 1309 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1310 | /* convert this surrogate code point */ |
michael@0 | 1311 | /* exit this condition tree */ |
michael@0 | 1312 | } else { |
michael@0 | 1313 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 1314 | /* callback(illegal) */ |
michael@0 | 1315 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1316 | goto endloop; |
michael@0 | 1317 | } |
michael@0 | 1318 | } else { |
michael@0 | 1319 | /* no more input */ |
michael@0 | 1320 | break; |
michael@0 | 1321 | } |
michael@0 | 1322 | } else { |
michael@0 | 1323 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 1324 | /* callback(illegal) */ |
michael@0 | 1325 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1326 | goto endloop; |
michael@0 | 1327 | } |
michael@0 | 1328 | |
michael@0 | 1329 | /* compress supplementary character */ |
michael@0 | 1330 | if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
michael@0 | 1331 | !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
michael@0 | 1332 | ) { |
michael@0 | 1333 | /* |
michael@0 | 1334 | * there is a dynamic window that contains this character and |
michael@0 | 1335 | * the following character is not uncompressible, |
michael@0 | 1336 | * change to the window |
michael@0 | 1337 | */ |
michael@0 | 1338 | isSingleByteMode=TRUE; |
michael@0 | 1339 | dynamicWindow=window; |
michael@0 | 1340 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1341 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1342 | c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1343 | length=2; |
michael@0 | 1344 | goto outputBytes; |
michael@0 | 1345 | } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
michael@0 | 1346 | (code=getDynamicOffset(c, &offset))>=0 |
michael@0 | 1347 | ) { |
michael@0 | 1348 | /* two supplementary characters in (probably) the same window - define an extended one */ |
michael@0 | 1349 | isSingleByteMode=TRUE; |
michael@0 | 1350 | code-=0x200; |
michael@0 | 1351 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1352 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1353 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1354 | c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1355 | length=4; |
michael@0 | 1356 | goto outputBytes; |
michael@0 | 1357 | } else { |
michael@0 | 1358 | /* don't know how to compress this character, just write it directly */ |
michael@0 | 1359 | c=((uint32_t)lead<<16)|trail; |
michael@0 | 1360 | length=4; |
michael@0 | 1361 | goto outputBytes; |
michael@0 | 1362 | } |
michael@0 | 1363 | } else /* 0xe000<=c<0xf300 */ { |
michael@0 | 1364 | /* quote to avoid SCSU tags */ |
michael@0 | 1365 | c|=UQU<<16; |
michael@0 | 1366 | length=3; |
michael@0 | 1367 | goto outputBytes; |
michael@0 | 1368 | } |
michael@0 | 1369 | |
michael@0 | 1370 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1371 | c=0; |
michael@0 | 1372 | sourceIndex=nextSourceIndex; |
michael@0 | 1373 | } |
michael@0 | 1374 | } |
michael@0 | 1375 | endloop: |
michael@0 | 1376 | |
michael@0 | 1377 | /* set the converter state back into UConverter */ |
michael@0 | 1378 | scsu->fromUIsSingleByteMode=isSingleByteMode; |
michael@0 | 1379 | scsu->fromUDynamicWindow=dynamicWindow; |
michael@0 | 1380 | |
michael@0 | 1381 | cnv->fromUChar32=c; |
michael@0 | 1382 | |
michael@0 | 1383 | /* write back the updated pointers */ |
michael@0 | 1384 | pArgs->source=source; |
michael@0 | 1385 | pArgs->target=(char *)target; |
michael@0 | 1386 | pArgs->offsets=offsets; |
michael@0 | 1387 | return; |
michael@0 | 1388 | |
michael@0 | 1389 | outputBytes: |
michael@0 | 1390 | /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
michael@0 | 1391 | /* from the first if in the loop we know that targetCapacity>0 */ |
michael@0 | 1392 | if(length<=targetCapacity) { |
michael@0 | 1393 | if(offsets==NULL) { |
michael@0 | 1394 | switch(length) { |
michael@0 | 1395 | /* each branch falls through to the next one */ |
michael@0 | 1396 | case 4: |
michael@0 | 1397 | *target++=(uint8_t)(c>>24); |
michael@0 | 1398 | case 3: /*fall through*/ |
michael@0 | 1399 | *target++=(uint8_t)(c>>16); |
michael@0 | 1400 | case 2: /*fall through*/ |
michael@0 | 1401 | *target++=(uint8_t)(c>>8); |
michael@0 | 1402 | case 1: /*fall through*/ |
michael@0 | 1403 | *target++=(uint8_t)c; |
michael@0 | 1404 | default: |
michael@0 | 1405 | /* will never occur */ |
michael@0 | 1406 | break; |
michael@0 | 1407 | } |
michael@0 | 1408 | } else { |
michael@0 | 1409 | switch(length) { |
michael@0 | 1410 | /* each branch falls through to the next one */ |
michael@0 | 1411 | case 4: |
michael@0 | 1412 | *target++=(uint8_t)(c>>24); |
michael@0 | 1413 | *offsets++=sourceIndex; |
michael@0 | 1414 | case 3: /*fall through*/ |
michael@0 | 1415 | *target++=(uint8_t)(c>>16); |
michael@0 | 1416 | *offsets++=sourceIndex; |
michael@0 | 1417 | case 2: /*fall through*/ |
michael@0 | 1418 | *target++=(uint8_t)(c>>8); |
michael@0 | 1419 | *offsets++=sourceIndex; |
michael@0 | 1420 | case 1: /*fall through*/ |
michael@0 | 1421 | *target++=(uint8_t)c; |
michael@0 | 1422 | *offsets++=sourceIndex; |
michael@0 | 1423 | default: |
michael@0 | 1424 | /* will never occur */ |
michael@0 | 1425 | break; |
michael@0 | 1426 | } |
michael@0 | 1427 | } |
michael@0 | 1428 | targetCapacity-=length; |
michael@0 | 1429 | |
michael@0 | 1430 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1431 | c=0; |
michael@0 | 1432 | sourceIndex=nextSourceIndex; |
michael@0 | 1433 | goto loop; |
michael@0 | 1434 | } else { |
michael@0 | 1435 | uint8_t *p; |
michael@0 | 1436 | |
michael@0 | 1437 | /* |
michael@0 | 1438 | * We actually do this backwards here: |
michael@0 | 1439 | * In order to save an intermediate variable, we output |
michael@0 | 1440 | * first to the overflow buffer what does not fit into the |
michael@0 | 1441 | * regular target. |
michael@0 | 1442 | */ |
michael@0 | 1443 | /* we know that 0<=targetCapacity<length<=4 */ |
michael@0 | 1444 | /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
michael@0 | 1445 | length-=targetCapacity; |
michael@0 | 1446 | p=(uint8_t *)cnv->charErrorBuffer; |
michael@0 | 1447 | switch(length) { |
michael@0 | 1448 | /* each branch falls through to the next one */ |
michael@0 | 1449 | case 4: |
michael@0 | 1450 | *p++=(uint8_t)(c>>24); |
michael@0 | 1451 | case 3: /*fall through*/ |
michael@0 | 1452 | *p++=(uint8_t)(c>>16); |
michael@0 | 1453 | case 2: /*fall through*/ |
michael@0 | 1454 | *p++=(uint8_t)(c>>8); |
michael@0 | 1455 | case 1: /*fall through*/ |
michael@0 | 1456 | *p=(uint8_t)c; |
michael@0 | 1457 | default: |
michael@0 | 1458 | /* will never occur */ |
michael@0 | 1459 | break; |
michael@0 | 1460 | } |
michael@0 | 1461 | cnv->charErrorBufferLength=(int8_t)length; |
michael@0 | 1462 | |
michael@0 | 1463 | /* now output what fits into the regular target */ |
michael@0 | 1464 | c>>=8*length; /* length was reduced by targetCapacity */ |
michael@0 | 1465 | switch(targetCapacity) { |
michael@0 | 1466 | /* each branch falls through to the next one */ |
michael@0 | 1467 | case 3: |
michael@0 | 1468 | *target++=(uint8_t)(c>>16); |
michael@0 | 1469 | if(offsets!=NULL) { |
michael@0 | 1470 | *offsets++=sourceIndex; |
michael@0 | 1471 | } |
michael@0 | 1472 | case 2: /*fall through*/ |
michael@0 | 1473 | *target++=(uint8_t)(c>>8); |
michael@0 | 1474 | if(offsets!=NULL) { |
michael@0 | 1475 | *offsets++=sourceIndex; |
michael@0 | 1476 | } |
michael@0 | 1477 | case 1: /*fall through*/ |
michael@0 | 1478 | *target++=(uint8_t)c; |
michael@0 | 1479 | if(offsets!=NULL) { |
michael@0 | 1480 | *offsets++=sourceIndex; |
michael@0 | 1481 | } |
michael@0 | 1482 | default: |
michael@0 | 1483 | break; |
michael@0 | 1484 | } |
michael@0 | 1485 | |
michael@0 | 1486 | /* target overflow */ |
michael@0 | 1487 | targetCapacity=0; |
michael@0 | 1488 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1489 | c=0; |
michael@0 | 1490 | goto endloop; |
michael@0 | 1491 | } |
michael@0 | 1492 | } |
michael@0 | 1493 | |
michael@0 | 1494 | /* |
michael@0 | 1495 | * Converts the UTF-16 text in pArgs->source to SCSU bytes in pArgs->target; the single-byte/Unicode mode, the active dynamic window and a pending lead surrogate (cnv->fromUChar32) are carried across calls. |
michael@0 | 1496 | * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full (bytes that do not fit go to cnv->charErrorBuffer) and to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate; updates pArgs->source and pArgs->target on return. |
michael@0 | 1497 | * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. If a change is made in the original function, then either change this function the same way or |
michael@0 | 1498 | * re-copy the original function and remove the variables offsets, sourceIndex, and nextSourceIndex. |
michael@0 | 1499 | * NOTE: the overflow path below skips the shift c>>=8*length when targetCapacity==0 (a shift by 32 bits would be undefined); _SCSUFromUnicodeWithOffsets still has the unguarded shift and should receive the same guard. |
michael@0 | 1500 | */ |
michael@0 | 1501 | static void |
michael@0 | 1502 | _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, |
michael@0 | 1503 | UErrorCode *pErrorCode) { |
michael@0 | 1504 | UConverter *cnv; |
michael@0 | 1505 | SCSUData *scsu; |
michael@0 | 1506 | const UChar *source, *sourceLimit; |
michael@0 | 1507 | uint8_t *target; |
michael@0 | 1508 | int32_t targetCapacity; |
michael@0 | 1509 | |
michael@0 | 1510 | UBool isSingleByteMode; |
michael@0 | 1511 | uint8_t dynamicWindow; |
michael@0 | 1512 | uint32_t currentOffset; |
michael@0 | 1513 | |
michael@0 | 1514 | uint32_t c, delta; |
michael@0 | 1515 | |
michael@0 | 1516 | int32_t length; |
michael@0 | 1517 | |
michael@0 | 1518 | /* variables for compression heuristics */ |
michael@0 | 1519 | uint32_t offset; |
michael@0 | 1520 | UChar lead, trail; |
michael@0 | 1521 | int code; |
michael@0 | 1522 | int8_t window; |
michael@0 | 1523 | |
michael@0 | 1524 | /* set up the converter and SCSU state pointers */ |
michael@0 | 1525 | cnv=pArgs->converter; |
michael@0 | 1526 | scsu=(SCSUData *)cnv->extraInfo; |
michael@0 | 1527 | |
michael@0 | 1528 | /* set up the local source/target pointers */ |
michael@0 | 1529 | source=pArgs->source; |
michael@0 | 1530 | sourceLimit=pArgs->sourceLimit; |
michael@0 | 1531 | target=(uint8_t *)pArgs->target; |
michael@0 | 1532 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
michael@0 | 1533 | |
michael@0 | 1534 | /* get the state machine state */ |
michael@0 | 1535 | isSingleByteMode=scsu->fromUIsSingleByteMode; |
michael@0 | 1536 | dynamicWindow=scsu->fromUDynamicWindow; |
michael@0 | 1537 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1538 | |
michael@0 | 1539 | c=cnv->fromUChar32; |
michael@0 | 1540 | |
michael@0 | 1541 | /* similar conversion "loop" as in toUnicode */ |
michael@0 | 1542 | loop: |
michael@0 | 1543 | if(isSingleByteMode) { |
michael@0 | 1544 | if(c!=0 && targetCapacity>0) { /* resume: c holds a lead surrogate carried over in cnv->fromUChar32 */ |
michael@0 | 1545 | goto getTrailSingle; |
michael@0 | 1546 | } |
michael@0 | 1547 | |
michael@0 | 1548 | /* state machine for single-byte mode */ |
michael@0 | 1549 | /* singleByteMode: */ |
michael@0 | 1550 | while(source<sourceLimit) { |
michael@0 | 1551 | if(targetCapacity<=0) { |
michael@0 | 1552 | /* target is full */ |
michael@0 | 1553 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1554 | break; |
michael@0 | 1555 | } |
michael@0 | 1556 | c=*source++; |
michael@0 | 1557 | |
michael@0 | 1558 | if((c-0x20)<=0x5f) { |
michael@0 | 1559 | /* pass US-ASCII graphic character through (c is unsigned, so this tests 0x20<=c<=0x7f) */ |
michael@0 | 1560 | *target++=(uint8_t)c; |
michael@0 | 1561 | --targetCapacity; |
michael@0 | 1562 | } else if(c<0x20) { |
michael@0 | 1563 | if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
michael@0 | 1564 | /* CR/LF/TAB/NUL */ |
michael@0 | 1565 | *target++=(uint8_t)c; |
michael@0 | 1566 | --targetCapacity; |
michael@0 | 1567 | } else { |
michael@0 | 1568 | /* quote C0 control character */ |
michael@0 | 1569 | c|=SQ0<<8; |
michael@0 | 1570 | length=2; |
michael@0 | 1571 | goto outputBytes; |
michael@0 | 1572 | } |
michael@0 | 1573 | } else if((delta=c-currentOffset)<=0x7f) { |
michael@0 | 1574 | /* use the current dynamic window */ |
michael@0 | 1575 | *target++=(uint8_t)(delta|0x80); |
michael@0 | 1576 | --targetCapacity; |
michael@0 | 1577 | } else if(U16_IS_SURROGATE(c)) { |
michael@0 | 1578 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1579 | getTrailSingle: |
michael@0 | 1580 | lead=(UChar)c; |
michael@0 | 1581 | if(source<sourceLimit) { |
michael@0 | 1582 | /* test the following code unit */ |
michael@0 | 1583 | trail=*source; |
michael@0 | 1584 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1585 | ++source; |
michael@0 | 1586 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1587 | /* convert this surrogate code point */ |
michael@0 | 1588 | /* exit this condition tree */ |
michael@0 | 1589 | } else { |
michael@0 | 1590 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 1591 | /* callback(illegal) */ |
michael@0 | 1592 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1593 | goto endloop; |
michael@0 | 1594 | } |
michael@0 | 1595 | } else { |
michael@0 | 1596 | /* no more input; c keeps the lead surrogate and is saved in cnv->fromUChar32 at endloop */ |
michael@0 | 1597 | break; |
michael@0 | 1598 | } |
michael@0 | 1599 | } else { |
michael@0 | 1600 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 1601 | /* callback(illegal) */ |
michael@0 | 1602 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1603 | goto endloop; |
michael@0 | 1604 | } |
michael@0 | 1605 | |
michael@0 | 1606 | /* compress supplementary character U+10000..U+10ffff */ |
michael@0 | 1607 | if((delta=c-currentOffset)<=0x7f) { |
michael@0 | 1608 | /* use the current dynamic window */ |
michael@0 | 1609 | *target++=(uint8_t)(delta|0x80); |
michael@0 | 1610 | --targetCapacity; |
michael@0 | 1611 | } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1612 | /* there is a dynamic window that contains this character, change to it */ |
michael@0 | 1613 | dynamicWindow=window; |
michael@0 | 1614 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1615 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1616 | c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1617 | length=2; |
michael@0 | 1618 | goto outputBytes; |
michael@0 | 1619 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1620 | /* might check if there are more characters in this window to come */ |
michael@0 | 1621 | /* define an extended window with this character */ |
michael@0 | 1622 | code-=0x200; |
michael@0 | 1623 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1624 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1625 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1626 | c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1627 | length=4; |
michael@0 | 1628 | goto outputBytes; |
michael@0 | 1629 | } else { |
michael@0 | 1630 | /* change to Unicode mode and output this (lead, trail) pair */ |
michael@0 | 1631 | isSingleByteMode=FALSE; |
michael@0 | 1632 | *target++=(uint8_t)SCU; |
michael@0 | 1633 | --targetCapacity; /* may reach 0 here, so outputBytes can be entered with targetCapacity==0 and length==4 */ |
michael@0 | 1634 | c=((uint32_t)lead<<16)|trail; |
michael@0 | 1635 | length=4; |
michael@0 | 1636 | goto outputBytes; |
michael@0 | 1637 | } |
michael@0 | 1638 | } else if(c<0xa0) { |
michael@0 | 1639 | /* quote C1 control character */ |
michael@0 | 1640 | c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
michael@0 | 1641 | length=2; |
michael@0 | 1642 | goto outputBytes; |
michael@0 | 1643 | } else if(c==0xfeff || c>=0xfff0) { |
michael@0 | 1644 | /* quote signature character=byte order mark and specials */ |
michael@0 | 1645 | c|=SQU<<16; |
michael@0 | 1646 | length=3; |
michael@0 | 1647 | goto outputBytes; |
michael@0 | 1648 | } else { |
michael@0 | 1649 | /* compress all other BMP characters */ |
michael@0 | 1650 | if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1651 | /* there is a window defined that contains this character - switch to it or quote from it? */ |
michael@0 | 1652 | if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
michael@0 | 1653 | /* change to dynamic window */ |
michael@0 | 1654 | dynamicWindow=window; |
michael@0 | 1655 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1656 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1657 | c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1658 | length=2; |
michael@0 | 1659 | goto outputBytes; |
michael@0 | 1660 | } else { |
michael@0 | 1661 | /* quote from dynamic window */ |
michael@0 | 1662 | c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
michael@0 | 1663 | length=2; |
michael@0 | 1664 | goto outputBytes; |
michael@0 | 1665 | } |
michael@0 | 1666 | } else if((window=getWindow(staticOffsets, c))>=0) { |
michael@0 | 1667 | /* quote from static window */ |
michael@0 | 1668 | c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
michael@0 | 1669 | length=2; |
michael@0 | 1670 | goto outputBytes; |
michael@0 | 1671 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1672 | /* define a dynamic window with this character */ |
michael@0 | 1673 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1674 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1675 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1676 | c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1677 | length=3; |
michael@0 | 1678 | goto outputBytes; |
michael@0 | 1679 | } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
michael@0 | 1680 | (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
michael@0 | 1681 | ) { |
michael@0 | 1682 | /* |
michael@0 | 1683 | * this character is not compressible (a BMP ideograph or similar); |
michael@0 | 1684 | * switch to Unicode mode if this is the last character in the block |
michael@0 | 1685 | * or there is at least one more ideograph following immediately |
michael@0 | 1686 | */ |
michael@0 | 1687 | isSingleByteMode=FALSE; |
michael@0 | 1688 | c|=SCU<<16; |
michael@0 | 1689 | length=3; |
michael@0 | 1690 | goto outputBytes; |
michael@0 | 1691 | } else { |
michael@0 | 1692 | /* quote Unicode */ |
michael@0 | 1693 | c|=SQU<<16; |
michael@0 | 1694 | length=3; |
michael@0 | 1695 | goto outputBytes; |
michael@0 | 1696 | } |
michael@0 | 1697 | } |
michael@0 | 1698 | |
michael@0 | 1699 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1700 | c=0; |
michael@0 | 1701 | } |
michael@0 | 1702 | } else { |
michael@0 | 1703 | if(c!=0 && targetCapacity>0) { /* resume: c holds a lead surrogate carried over in cnv->fromUChar32 */ |
michael@0 | 1704 | goto getTrailUnicode; |
michael@0 | 1705 | } |
michael@0 | 1706 | |
michael@0 | 1707 | /* state machine for Unicode mode */ |
michael@0 | 1708 | /* unicodeByteMode: */ |
michael@0 | 1709 | while(source<sourceLimit) { |
michael@0 | 1710 | if(targetCapacity<=0) { |
michael@0 | 1711 | /* target is full */ |
michael@0 | 1712 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1713 | break; |
michael@0 | 1714 | } |
michael@0 | 1715 | c=*source++; |
michael@0 | 1716 | |
michael@0 | 1717 | if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
michael@0 | 1718 | /* not compressible, write character directly */ |
michael@0 | 1719 | if(targetCapacity>=2) { |
michael@0 | 1720 | *target++=(uint8_t)(c>>8); |
michael@0 | 1721 | *target++=(uint8_t)c; |
michael@0 | 1722 | targetCapacity-=2; |
michael@0 | 1723 | } else { |
michael@0 | 1724 | length=2; |
michael@0 | 1725 | goto outputBytes; |
michael@0 | 1726 | } |
michael@0 | 1727 | } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
michael@0 | 1728 | /* compress BMP character if the following one is not an uncompressible ideograph */ |
michael@0 | 1729 | if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
michael@0 | 1730 | if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
michael@0 | 1731 | /* ASCII digit or letter */ |
michael@0 | 1732 | isSingleByteMode=TRUE; |
michael@0 | 1733 | c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; /* the trailing |c is redundant with c|= but harmless */ |
michael@0 | 1734 | length=2; |
michael@0 | 1735 | goto outputBytes; |
michael@0 | 1736 | } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
michael@0 | 1737 | /* there is a dynamic window that contains this character, change to it */ |
michael@0 | 1738 | isSingleByteMode=TRUE; |
michael@0 | 1739 | dynamicWindow=window; |
michael@0 | 1740 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1741 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1742 | c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1743 | length=2; |
michael@0 | 1744 | goto outputBytes; |
michael@0 | 1745 | } else if((code=getDynamicOffset(c, &offset))>=0) { |
michael@0 | 1746 | /* define a dynamic window with this character */ |
michael@0 | 1747 | isSingleByteMode=TRUE; |
michael@0 | 1748 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1749 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1750 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1751 | c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1752 | length=3; |
michael@0 | 1753 | goto outputBytes; |
michael@0 | 1754 | } |
michael@0 | 1755 | } |
michael@0 | 1756 | |
michael@0 | 1757 | /* don't know how to compress this character, just write it directly */ |
michael@0 | 1758 | length=2; |
michael@0 | 1759 | goto outputBytes; |
michael@0 | 1760 | } else if(c<0xe000) { |
michael@0 | 1761 | /* c is a surrogate */ |
michael@0 | 1762 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1763 | getTrailUnicode: |
michael@0 | 1764 | lead=(UChar)c; |
michael@0 | 1765 | if(source<sourceLimit) { |
michael@0 | 1766 | /* test the following code unit */ |
michael@0 | 1767 | trail=*source; |
michael@0 | 1768 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1769 | ++source; |
michael@0 | 1770 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1771 | /* convert this surrogate code point */ |
michael@0 | 1772 | /* exit this condition tree */ |
michael@0 | 1773 | } else { |
michael@0 | 1774 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 1775 | /* callback(illegal) */ |
michael@0 | 1776 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1777 | goto endloop; |
michael@0 | 1778 | } |
michael@0 | 1779 | } else { |
michael@0 | 1780 | /* no more input; c keeps the lead surrogate and is saved in cnv->fromUChar32 at endloop */ |
michael@0 | 1781 | break; |
michael@0 | 1782 | } |
michael@0 | 1783 | } else { |
michael@0 | 1784 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 1785 | /* callback(illegal) */ |
michael@0 | 1786 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1787 | goto endloop; |
michael@0 | 1788 | } |
michael@0 | 1789 | |
michael@0 | 1790 | /* compress supplementary character */ |
michael@0 | 1791 | if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
michael@0 | 1792 | !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
michael@0 | 1793 | ) { |
michael@0 | 1794 | /* |
michael@0 | 1795 | * there is a dynamic window that contains this character and |
michael@0 | 1796 | * the following character is not uncompressible, |
michael@0 | 1797 | * change to the window |
michael@0 | 1798 | */ |
michael@0 | 1799 | isSingleByteMode=TRUE; |
michael@0 | 1800 | dynamicWindow=window; |
michael@0 | 1801 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
michael@0 | 1802 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1803 | c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
michael@0 | 1804 | length=2; |
michael@0 | 1805 | goto outputBytes; |
michael@0 | 1806 | } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
michael@0 | 1807 | (code=getDynamicOffset(c, &offset))>=0 |
michael@0 | 1808 | ) { |
michael@0 | 1809 | /* two supplementary characters in (probably) the same window - define an extended one */ |
michael@0 | 1810 | isSingleByteMode=TRUE; |
michael@0 | 1811 | code-=0x200; |
michael@0 | 1812 | dynamicWindow=getNextDynamicWindow(scsu); |
michael@0 | 1813 | currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
michael@0 | 1814 | useDynamicWindow(scsu, dynamicWindow); |
michael@0 | 1815 | c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
michael@0 | 1816 | length=4; |
michael@0 | 1817 | goto outputBytes; |
michael@0 | 1818 | } else { |
michael@0 | 1819 | /* don't know how to compress this character, just write it directly */ |
michael@0 | 1820 | c=((uint32_t)lead<<16)|trail; |
michael@0 | 1821 | length=4; |
michael@0 | 1822 | goto outputBytes; |
michael@0 | 1823 | } |
michael@0 | 1824 | } else /* 0xe000<=c<0xf300 */ { |
michael@0 | 1825 | /* quote to avoid SCSU tags */ |
michael@0 | 1826 | c|=UQU<<16; |
michael@0 | 1827 | length=3; |
michael@0 | 1828 | goto outputBytes; |
michael@0 | 1829 | } |
michael@0 | 1830 | |
michael@0 | 1831 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1832 | c=0; |
michael@0 | 1833 | } |
michael@0 | 1834 | } |
michael@0 | 1835 | endloop: |
michael@0 | 1836 | |
michael@0 | 1837 | /* set the converter state back into UConverter */ |
michael@0 | 1838 | scsu->fromUIsSingleByteMode=isSingleByteMode; |
michael@0 | 1839 | scsu->fromUDynamicWindow=dynamicWindow; |
michael@0 | 1840 | |
michael@0 | 1841 | cnv->fromUChar32=c; |
michael@0 | 1842 | |
michael@0 | 1843 | /* write back the updated pointers */ |
michael@0 | 1844 | pArgs->source=source; |
michael@0 | 1845 | pArgs->target=(char *)target; |
michael@0 | 1846 | return; |
michael@0 | 1847 | |
michael@0 | 1848 | outputBytes: |
michael@0 | 1849 | /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
michael@0 | 1850 | /* from the first if in the loop we know that targetCapacity>0 (except after the SCU byte written in single-byte mode, see below) */ |
michael@0 | 1851 | if(length<=targetCapacity) { |
michael@0 | 1852 | switch(length) { |
michael@0 | 1853 | /* each branch falls through to the next one */ |
michael@0 | 1854 | case 4: |
michael@0 | 1855 | *target++=(uint8_t)(c>>24); |
michael@0 | 1856 | case 3: /*fall through*/ |
michael@0 | 1857 | *target++=(uint8_t)(c>>16); |
michael@0 | 1858 | case 2: /*fall through*/ |
michael@0 | 1859 | *target++=(uint8_t)(c>>8); |
michael@0 | 1860 | case 1: /*fall through*/ |
michael@0 | 1861 | *target++=(uint8_t)c; |
michael@0 | 1862 | default: |
michael@0 | 1863 | /* will never occur */ |
michael@0 | 1864 | break; |
michael@0 | 1865 | } |
michael@0 | 1866 | targetCapacity-=length; |
michael@0 | 1867 | |
michael@0 | 1868 | /* normal end of conversion: prepare for a new character */ |
michael@0 | 1869 | c=0; |
michael@0 | 1870 | goto loop; |
michael@0 | 1871 | } else { |
michael@0 | 1872 | uint8_t *p; |
michael@0 | 1873 | |
michael@0 | 1874 | /* |
michael@0 | 1875 | * We actually do this backwards here: |
michael@0 | 1876 | * In order to save an intermediate variable, we output |
michael@0 | 1877 | * first to the overflow buffer what does not fit into the |
michael@0 | 1878 | * regular target. |
michael@0 | 1879 | */ |
michael@0 | 1880 | /* we know that 0<=targetCapacity<length<=4 */ |
michael@0 | 1881 | /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
michael@0 | 1882 | length-=targetCapacity; |
michael@0 | 1883 | p=(uint8_t *)cnv->charErrorBuffer; |
michael@0 | 1884 | switch(length) { |
michael@0 | 1885 | /* each branch falls through to the next one */ |
michael@0 | 1886 | case 4: |
michael@0 | 1887 | *p++=(uint8_t)(c>>24); |
michael@0 | 1888 | case 3: /*fall through*/ |
michael@0 | 1889 | *p++=(uint8_t)(c>>16); |
michael@0 | 1890 | case 2: /*fall through*/ |
michael@0 | 1891 | *p++=(uint8_t)(c>>8); |
michael@0 | 1892 | case 1: /*fall through*/ |
michael@0 | 1893 | *p=(uint8_t)c; |
michael@0 | 1894 | default: |
michael@0 | 1895 | /* will never occur */ |
michael@0 | 1896 | break; |
michael@0 | 1897 | } |
michael@0 | 1898 | cnv->charErrorBufferLength=(int8_t)length; |
michael@0 | 1899 | |
michael@0 | 1900 | /* now output what fits into the regular target; skip the shift when nothing fits, because length can then be 4 and shifting the 32-bit c by 32 bits is undefined */ |
michael@0 | 1901 | if(targetCapacity>0) { c>>=8*length; } /* length was reduced by targetCapacity */ |
michael@0 | 1902 | switch(targetCapacity) { |
michael@0 | 1903 | /* each branch falls through to the next one */ |
michael@0 | 1904 | case 3: |
michael@0 | 1905 | *target++=(uint8_t)(c>>16); |
michael@0 | 1906 | case 2: /*fall through*/ |
michael@0 | 1907 | *target++=(uint8_t)(c>>8); |
michael@0 | 1908 | case 1: /*fall through*/ |
michael@0 | 1909 | *target++=(uint8_t)c; |
michael@0 | 1910 | default: |
michael@0 | 1911 | break; |
michael@0 | 1912 | } |
michael@0 | 1913 | |
michael@0 | 1914 | /* target overflow */ |
michael@0 | 1915 | targetCapacity=0; |
michael@0 | 1916 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1917 | c=0; |
michael@0 | 1918 | goto endloop; |
michael@0 | 1919 | } |
michael@0 | 1920 | } |
michael@0 | 1921 | |
michael@0 | 1922 | /* miscellaneous ------------------------------------------------------------ */ |
michael@0 | 1923 | |
michael@0 | 1924 | static const char * /* returns a string literal naming this converter, selected by the locale stored in the converter's SCSUData */ |
michael@0 | 1925 | _SCSUGetName(const UConverter *cnv) { |
michael@0 | 1926 | SCSUData *scsu=(SCSUData *)cnv->extraInfo; /* SCSU-specific state hangs off extraInfo */ |
michael@0 | 1927 | |
michael@0 | 1928 | switch(scsu->locale) { |
michael@0 | 1929 | case l_ja: /* the only locale that gets its own name; the name carries the locale option */ |
michael@0 | 1930 | return "SCSU,locale=ja"; |
michael@0 | 1931 | default: /* every other locale value reports the plain name */ |
michael@0 | 1932 | return "SCSU"; |
michael@0 | 1933 | } |
michael@0 | 1934 | } |
michael@0 | 1935 | |
michael@0 | 1936 | /* layout of the clone buffer used by _SCSUSafeClone: its sizeof() is the size reported when preflighting, and the caller-provided buffer is cast to it */ |
michael@0 | 1937 | struct cloneSCSUStruct |
michael@0 | 1938 | { |
michael@0 | 1939 | UConverter cnv; /* the cloned converter itself; first member */ |
michael@0 | 1940 | SCSUData mydata; /* the clone's own copy of the SCSU state, pointed to by cnv.extraInfo after cloning */ |
michael@0 | 1941 | }; |
michael@0 | 1942 | |
michael@0 | 1943 | static UConverter * /* copies the SCSU-specific state into stackBuffer; returns the clone, or 0 if *status is already a failure or if only the required size was requested */ |
michael@0 | 1944 | _SCSUSafeClone(const UConverter *cnv, |
michael@0 | 1945 | void *stackBuffer, |
michael@0 | 1946 | int32_t *pBufferSize, |
michael@0 | 1947 | UErrorCode *status) |
michael@0 | 1948 | { |
michael@0 | 1949 | struct cloneSCSUStruct * localClone; |
michael@0 | 1950 | int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); /* room for the UConverter plus its SCSUData */ |
michael@0 | 1951 | |
michael@0 | 1952 | if (U_FAILURE(*status)){ /* do nothing if an earlier error is pending */ |
michael@0 | 1953 | return 0; |
michael@0 | 1954 | } |
michael@0 | 1955 | |
michael@0 | 1956 | if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
michael@0 | 1957 | *pBufferSize = bufferSizeNeeded; |
michael@0 | 1958 | return 0; |
michael@0 | 1959 | } |
michael@0 | 1960 | |
michael@0 | 1961 | localClone = (struct cloneSCSUStruct *)stackBuffer; /* NOTE(review): neither stackBuffer!=NULL nor *pBufferSize>=bufferSizeNeeded is checked here - presumably guaranteed by the caller; confirm */ |
michael@0 | 1962 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
michael@0 | 1963 | |
michael@0 | 1964 | uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); /* duplicate the source converter's SCSU state */ |
michael@0 | 1965 | localClone->cnv.extraInfo = &localClone->mydata; /* the clone uses its own copy, not the original's */ |
michael@0 | 1966 | localClone->cnv.isExtraLocal = TRUE; /* marks extraInfo as living inside the clone buffer - presumably so it is not freed separately; confirm */ |
michael@0 | 1967 | |
michael@0 | 1968 | return &localClone->cnv; |
michael@0 | 1969 | } |
michael@0 | 1970 | |
michael@0 | 1971 | |
michael@0 | 1972 | static const UConverterImpl _SCSUImpl={ /* implementation table for the SCSU converter; entries are positional, so their order must match UConverterImpl (declared elsewhere) */ |
michael@0 | 1973 | UCNV_SCSU, /* converter type */ |
michael@0 | 1974 | |
michael@0 | 1975 | NULL, /* NULL entries: presumably hooks this converter does not provide - confirm against UConverterImpl */ |
michael@0 | 1976 | NULL, |
michael@0 | 1977 | |
michael@0 | 1978 | _SCSUOpen, |
michael@0 | 1979 | _SCSUClose, |
michael@0 | 1980 | _SCSUReset, |
michael@0 | 1981 | |
michael@0 | 1982 | _SCSUToUnicode, /* the four conversion functions: each direction with and without offsets */ |
michael@0 | 1983 | _SCSUToUnicodeWithOffsets, |
michael@0 | 1984 | _SCSUFromUnicode, |
michael@0 | 1985 | _SCSUFromUnicodeWithOffsets, |
michael@0 | 1986 | NULL, |
michael@0 | 1987 | |
michael@0 | 1988 | NULL, |
michael@0 | 1989 | _SCSUGetName, /* name depends on the converter's locale */ |
michael@0 | 1990 | NULL, |
michael@0 | 1991 | _SCSUSafeClone, /* clones the SCSUData along with the converter */ |
michael@0 | 1992 | ucnv_getCompleteUnicodeSet |
michael@0 | 1993 | }; |
michael@0 | 1994 | |
michael@0 | 1995 | static const UConverterStaticData _SCSUStaticData={ /* constant description of the SCSU converter; entries are positional and follow UConverterStaticData (declared elsewhere) */ |
michael@0 | 1996 | sizeof(UConverterStaticData), |
michael@0 | 1997 | "SCSU", /* converter name */ |
michael@0 | 1998 | 1212, /* CCSID for SCSU */ |
michael@0 | 1999 | UCNV_IBM, UCNV_SCSU, |
michael@0 | 2000 | 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ |
michael@0 | 2001 | /* |
michael@0 | 2002 | * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode |
michael@0 | 2003 | * substitution string. |
michael@0 | 2004 | */ |
michael@0 | 2005 | { 0x0e, 0xff, 0xfd, 0 }, 3, /* 0x0e is SQU (quote a single Unicode character), followed by the two bytes of U+fffd; 3 bytes are used */ |
michael@0 | 2006 | FALSE, FALSE, |
michael@0 | 2007 | 0, |
michael@0 | 2008 | 0, |
michael@0 | 2009 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 2010 | }; |
michael@0 | 2011 | |
michael@0 | 2012 | const UConverterSharedData _SCSUData={ /* shared-data record for SCSU; not static, so it is visible to other translation units */ |
michael@0 | 2013 | sizeof(UConverterSharedData), ~((uint32_t)0), /* structure size, then an all-ones 32-bit value (meaning defined by UConverterSharedData, not visible here) */ |
michael@0 | 2014 | NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, /* ties the static description and the implementation table above together */ |
michael@0 | 2015 | 0 |
michael@0 | 2016 | }; |
michael@0 | 2017 | |
michael@0 | 2018 | #endif |