Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2000-2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: ucnvhz.c |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2000oct16 |
michael@0 | 12 | * created by: Ram Viswanadha |
michael@0 | 13 | * 10/31/2000 Ram Implemented offsets logic function |
michael@0 | 14 | * |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | |
michael@0 | 19 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
michael@0 | 20 | |
michael@0 | 21 | #include "cmemory.h" |
michael@0 | 22 | #include "unicode/ucnv.h" |
michael@0 | 23 | #include "unicode/ucnv_cb.h" |
michael@0 | 24 | #include "unicode/uset.h" |
michael@0 | 25 | #include "unicode/utf16.h" |
michael@0 | 26 | #include "ucnv_bld.h" |
michael@0 | 27 | #include "ucnv_cnv.h" |
michael@0 | 28 | #include "ucnv_imp.h" |
michael@0 | 29 | |
/* Individual bytes that make up the HZ escape sequences. */
#define UCNV_TILDE 0x7E          /* ~ */
#define UCNV_OPEN_BRACE 0x7B     /* { */
#define UCNV_CLOSE_BRACE 0x7D    /* } */
/* Two-byte escape sequences written by the from-Unicode path. */
#define SB_ESCAPE "\x7E\x7D"     /* "~}": emitted before output that is a single byte */
#define DB_ESCAPE "\x7E\x7B"     /* "~{": emitted before output that is a double byte */
#define TILDE_ESCAPE "\x7E\x7E"  /* "~~": emitted for a literal '~' in the input */
#define ESC_LEN 2                /* number of bytes in each escape sequence above */
michael@0 | 37 | |
michael@0 | 38 | |
/*
 * Appends `len` bytes starting at `strToAppend` to the output of a
 * from-Unicode conversion.
 *
 * For each byte:
 *  - if targetIndex < targetLength, the byte is stored at
 *    args->target[targetIndex], targetIndex is incremented and, when
 *    args->offsets is non-NULL, (sourceIndex - 1) is written through the
 *    caller's local `offsets` cursor (which is advanced);
 *  - otherwise the byte is appended to args->converter->charErrorBuffer and
 *    *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * Side effects on the arguments: `len` is counted down (it is -1 afterwards),
 * `strToAppend` is advanced past the copied bytes, `targetIndex` is advanced.
 * All three must therefore be modifiable lvalues.
 *
 * NOTE: the macro deliberately captures a variable named `offsets` from the
 * invoking scope; it is not a macro parameter.
 *
 * The body is wrapped in do { ... } while(0) so that an invocation followed
 * by a semicolon is exactly one statement (safe inside if/else), and every
 * argument is parenthesized.
 */
#define CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex) \
    do { \
        while((len)-- > 0) { \
            if((targetIndex) < (targetLength)) { \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                if((args)->offsets != NULL) { \
                    *(offsets++) = (sourceIndex) - 1; \
                } \
                (targetIndex)++; \
            } else { \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                *(err) = U_BUFFER_OVERFLOW_ERROR; \
            } \
            (strToAppend)++; \
        } \
    } while(0)
michael@0 | 55 | |
michael@0 | 56 | |
/*
 * Per-converter state for HZ, stored in UConverter.extraInfo.
 */
typedef struct{
    UConverter* gbConverter;     /* "GBK" sub-converter opened in _HZOpen; its shared data does the DBCS mapping */
    int32_t targetIndex;         /* only reset in _HZReset within this file */
    int32_t sourceIndex;         /* only reset in _HZReset within this file */
    UBool isEscapeAppended;      /* from-Unicode: TRUE once an SB/DB escape sequence has been written */
    UBool isStateDBCS;           /* to-Unicode: TRUE after "~{" (GB mode), FALSE after "~}" (ASCII mode) */
    UBool isTargetUCharDBCS;     /* from-Unicode: TRUE if the last converted character was double-byte */
    UBool isEmptySegment;        /* to-Unicode: TRUE right after "~{" / "~}" until any other byte is seen */
}UConverterDataHZ;
michael@0 | 66 | |
michael@0 | 67 | |
michael@0 | 68 | |
michael@0 | 69 | static void |
michael@0 | 70 | _HZOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
michael@0 | 71 | UConverter *gbConverter; |
michael@0 | 72 | if(pArgs->onlyTestIsLoadable) { |
michael@0 | 73 | ucnv_canCreateConverter("GBK", errorCode); /* errorCode carries result */ |
michael@0 | 74 | return; |
michael@0 | 75 | } |
michael@0 | 76 | gbConverter = ucnv_open("GBK", errorCode); |
michael@0 | 77 | if(U_FAILURE(*errorCode)) { |
michael@0 | 78 | return; |
michael@0 | 79 | } |
michael@0 | 80 | cnv->toUnicodeStatus = 0; |
michael@0 | 81 | cnv->fromUnicodeStatus= 0; |
michael@0 | 82 | cnv->mode=0; |
michael@0 | 83 | cnv->fromUChar32=0x0000; |
michael@0 | 84 | cnv->extraInfo = uprv_calloc(1, sizeof(UConverterDataHZ)); |
michael@0 | 85 | if(cnv->extraInfo != NULL){ |
michael@0 | 86 | ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = gbConverter; |
michael@0 | 87 | } |
michael@0 | 88 | else { |
michael@0 | 89 | ucnv_close(gbConverter); |
michael@0 | 90 | *errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 91 | return; |
michael@0 | 92 | } |
michael@0 | 93 | } |
michael@0 | 94 | |
michael@0 | 95 | static void |
michael@0 | 96 | _HZClose(UConverter *cnv){ |
michael@0 | 97 | if(cnv->extraInfo != NULL) { |
michael@0 | 98 | ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter); |
michael@0 | 99 | if(!cnv->isExtraLocal) { |
michael@0 | 100 | uprv_free(cnv->extraInfo); |
michael@0 | 101 | } |
michael@0 | 102 | cnv->extraInfo = NULL; |
michael@0 | 103 | } |
michael@0 | 104 | } |
michael@0 | 105 | |
michael@0 | 106 | static void |
michael@0 | 107 | _HZReset(UConverter *cnv, UConverterResetChoice choice){ |
michael@0 | 108 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 109 | cnv->toUnicodeStatus = 0; |
michael@0 | 110 | cnv->mode=0; |
michael@0 | 111 | if(cnv->extraInfo != NULL){ |
michael@0 | 112 | ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; |
michael@0 | 113 | ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; |
michael@0 | 114 | } |
michael@0 | 115 | } |
michael@0 | 116 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 117 | cnv->fromUnicodeStatus= 0; |
michael@0 | 118 | cnv->fromUChar32=0x0000; |
michael@0 | 119 | if(cnv->extraInfo != NULL){ |
michael@0 | 120 | ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE; |
michael@0 | 121 | ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0; |
michael@0 | 122 | ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0; |
michael@0 | 123 | ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE; |
michael@0 | 124 | } |
michael@0 | 125 | } |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | /**************************************HZ Encoding************************************************* |
michael@0 | 129 | * Rules for HZ encoding |
michael@0 | 130 | * |
michael@0 | 131 | * In ASCII mode, a byte is interpreted as an ASCII character, unless a |
michael@0 | 132 | * '~' is encountered. The character '~' is an escape character. By |
michael@0 | 133 | * convention, it must be immediately followed ONLY by '~', '{' or '\n' |
michael@0 | 134 | * (<LF>), with the following special meaning. |
michael@0 | 135 | |
michael@0 | 136 | * 1. The escape sequence '~~' is interpreted as a '~'. |
michael@0 | 137 | * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB. |
michael@0 | 138 | * 3. The escape sequence '~\n' is a line-continuation marker to be |
michael@0 | 139 | * consumed with no output produced. |
michael@0 | 140 | * In GB mode, characters are interpreted two bytes at a time as (pure) |
michael@0 | 141 | * GB codes until the escape-from-GB code '~}' is read. This code |
michael@0 | 142 | * switches the mode from GB back to ASCII. (Note that the escape- |
michael@0 | 143 | * from-GB code '~}' ($7E7D) is outside the defined GB range.) |
michael@0 | 144 | * |
michael@0 | 145 | * Source: RFC 1842 |
michael@0 | 146 | * |
michael@0 | 147 | * Note that the formal syntax in RFC 1842 is invalid. I assume that the |
michael@0 | 148 | * intended definition of single-byte-segment is as follows (pedberg): |
michael@0 | 149 | * single-byte-segment = single-byte-seq 1*single-byte-char |
michael@0 | 150 | */ |
michael@0 | 151 | |
michael@0 | 152 | |
/*
 * Converts HZ-encoded bytes in [args->source, args->sourceLimit) to UChars
 * written at args->target, updating args->source and args->target on exit.
 *
 * State carried across calls:
 *   - args->converter->mode == UCNV_TILDE: a '~' was consumed and the next
 *     byte completes the escape sequence;
 *   - args->converter->toUnicodeStatus: pending DBCS lead byte, stored as
 *     (byte | 0x100) so that it is non-zero even for a 0 byte;
 *   - myData->isStateDBCS: TRUE while in GB (two-byte) mode;
 *   - myData->isEmptySegment: TRUE right after "~{" / "~}" until another
 *     byte is seen; a mode switch seen while it is TRUE is reported as
 *     U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR.
 *
 * Errors reported through *err:
 *   - U_BUFFER_OVERFLOW_ERROR when the target is full and input remains;
 *   - U_ILLEGAL_ESCAPE_SEQUENCE for '~' followed by an unsupported byte or
 *     for an empty segment;
 *   - U_INVALID_CHAR_FOUND when the lookup result is 0xfffe,
 *     U_ILLEGAL_CHAR_FOUND for any other result >= 0xfffe (including the
 *     0xffff assigned here to bytes > 0x7f in ASCII mode and to bad DBCS pairs).
 * For the sequence errors the offending bytes are stored in
 * args->converter->toUBytes / toULength.
 *
 * If args->offsets is non-NULL, the source index of each output UChar is
 * written at the matching position in args->offsets.
 */
static void
UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                      UErrorCode* err){
    char tempBuf[2];
    const char *mySource = ( char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    UChar32 targetUniChar = 0x0000;
    int32_t mySourceChar = 0x0000;
    UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
    tempBuf[0]=0;
    tempBuf[1]=0;

    /* Calling code already handles this situation. */
    /*if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }*/

    while(mySource< mySourceLimit){

        /* the target space check happens before a byte is consumed */
        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            if(args->converter->mode == UCNV_TILDE) {
                /* second byte after ~ */
                args->converter->mode=0;
                switch(mySourceChar) {
                case 0x0A:
                    /* no output for ~\n (line-continuation marker) */
                    continue;
                case UCNV_TILDE:
                    /* "~~" yields a single '~'; its offset is that of the first tilde */
                    if(args->offsets) {
                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
                    }
                    *(myTarget++)=(UChar)mySourceChar;
                    myData->isEmptySegment = FALSE;
                    continue;
                case UCNV_OPEN_BRACE:
                case UCNV_CLOSE_BRACE:
                    /* "~{" enters GB mode, "~}" returns to ASCII mode */
                    myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
                    if (myData->isEmptySegment) {
                        myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toUBytes[0] = UCNV_TILDE;
                        args->converter->toUBytes[1] = mySourceChar;
                        args->converter->toULength = 2;
                        args->target = myTarget;
                        args->source = mySource;
                        return;
                    }
                    myData->isEmptySegment = TRUE;
                    continue;
                default:
                    /* if the first byte is equal to TILDE and the trail byte
                     * is not a valid byte then it is an error condition
                     */
                    /*
                     * Ticket 5691: consistent illegal sequences:
                     * - We include at least the first byte in the illegal sequence.
                     * - If any of the non-initial bytes could be the start of a character,
                     *   we stop the illegal sequence before the first one of those.
                     */
                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUBytes[0] = UCNV_TILDE;
                    if( myData->isStateDBCS ?
                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
                            mySourceChar <= 0x7f
                    ) {
                        /* The current byte could be the start of a character: Back it out. */
                        args->converter->toULength = 1;
                        --mySource;
                    } else {
                        /* Include the current byte in the illegal sequence. */
                        args->converter->toUBytes[1] = mySourceChar;
                        args->converter->toULength = 2;
                    }
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
            } else if(myData->isStateDBCS) {
                if(args->converter->toUnicodeStatus == 0x00){
                    /* lead byte */
                    if(mySourceChar == UCNV_TILDE) {
                        args->converter->mode = UCNV_TILDE;
                    } else {
                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
                        myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
                    }
                    continue;
                }
                else{
                    /* trail byte */
                    int leadIsOk, trailIsOk;
                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
                    targetUniChar = 0xffff;
                    /*
                     * Ticket 5691: consistent illegal sequences:
                     * - We include at least the first byte in the illegal sequence.
                     * - If any of the non-initial bytes could be the start of a character,
                     *   we stop the illegal sequence before the first one of those.
                     *
                     * In HZ DBCS, if the second byte is in the 21..7e range,
                     * we report only the first byte as the illegal sequence.
                     * Otherwise we convert or report the pair of bytes.
                     */
                    /* unsigned-wrap range checks: lead in 21..7d, trail in 21..7e */
                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                    if (leadIsOk && trailIsOk) {
                        /* add 0x80 to both bytes before looking the pair up in the GBK sub-converter's table */
                        tempBuf[0] = (char) (leadByte+0x80) ;
                        tempBuf[1] = (char) (mySourceChar+0x80);
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
                            tempBuf, 2, args->converter->useFallback);
                        /* keep both bytes so that an error below can report the pair */
                        mySourceChar= (leadByte << 8) | mySourceChar;
                    } else if (trailIsOk) {
                        /* report a single illegal byte and continue with the following DBCS starter byte */
                        --mySource;
                        mySourceChar = (int32_t)leadByte;
                    } else {
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                        /* add another bit so that the code below writes 2 bytes in case of error */
                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
                    }
                    args->converter->toUnicodeStatus =0x00;
                }
            }
            else{
                /* ASCII mode */
                if(mySourceChar == UCNV_TILDE) {
                    args->converter->mode = UCNV_TILDE;
                    continue;
                } else if(mySourceChar <= 0x7f) {
                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
                    myData->isEmptySegment = FALSE; /* the segment has something valid */
                } else {
                    targetUniChar = 0xffff;
                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                }
            }
            if(targetUniChar < 0xfffe){
                /* offset of the character's first byte: one extra byte back while in DBCS state */
                if(args->offsets) {
                    args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
                }

                *(myTarget++)=(UChar)targetUniChar;
            }
            else /* targetUniChar>=0xfffe */ {
                if(targetUniChar == 0xfffe){
                    *err = U_INVALID_CHAR_FOUND;
                }
                else{
                    *err = U_ILLEGAL_CHAR_FOUND;
                }
                /* values above 0xff carry two bytes (lead in bits 8..15) */
                if(mySourceChar > 0xff){
                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
                    args->converter->toUBytes[1] = (uint8_t)mySourceChar;
                    args->converter->toULength=2;
                }
                else{
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                    args->converter->toULength=1;
                }
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* publish how far source and target were advanced */
    args->target = myTarget;
    args->source = mySource;
}
michael@0 | 331 | |
michael@0 | 332 | |
michael@0 | 333 | static void |
michael@0 | 334 | UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, |
michael@0 | 335 | UErrorCode * err){ |
michael@0 | 336 | const UChar *mySource = args->source; |
michael@0 | 337 | char *myTarget = args->target; |
michael@0 | 338 | int32_t* offsets = args->offsets; |
michael@0 | 339 | int32_t mySourceIndex = 0; |
michael@0 | 340 | int32_t myTargetIndex = 0; |
michael@0 | 341 | int32_t targetLength = (int32_t)(args->targetLimit - myTarget); |
michael@0 | 342 | int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source); |
michael@0 | 343 | int32_t length=0; |
michael@0 | 344 | uint32_t targetUniChar = 0x0000; |
michael@0 | 345 | UChar32 mySourceChar = 0x0000; |
michael@0 | 346 | UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo; |
michael@0 | 347 | UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS; |
michael@0 | 348 | UBool oldIsTargetUCharDBCS = isTargetUCharDBCS; |
michael@0 | 349 | int len =0; |
michael@0 | 350 | const char* escSeq=NULL; |
michael@0 | 351 | |
michael@0 | 352 | /* Calling code already handles this situation. */ |
michael@0 | 353 | /*if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){ |
michael@0 | 354 | *err = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 355 | return; |
michael@0 | 356 | }*/ |
michael@0 | 357 | if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) { |
michael@0 | 358 | goto getTrail; |
michael@0 | 359 | } |
michael@0 | 360 | /*writing the char to the output stream */ |
michael@0 | 361 | while (mySourceIndex < mySourceLength){ |
michael@0 | 362 | targetUniChar = missingCharMarker; |
michael@0 | 363 | if (myTargetIndex < targetLength){ |
michael@0 | 364 | |
michael@0 | 365 | mySourceChar = (UChar) mySource[mySourceIndex++]; |
michael@0 | 366 | |
michael@0 | 367 | |
michael@0 | 368 | oldIsTargetUCharDBCS = isTargetUCharDBCS; |
michael@0 | 369 | if(mySourceChar ==UCNV_TILDE){ |
michael@0 | 370 | /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/ |
michael@0 | 371 | len = ESC_LEN; |
michael@0 | 372 | escSeq = TILDE_ESCAPE; |
michael@0 | 373 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); |
michael@0 | 374 | continue; |
michael@0 | 375 | } else if(mySourceChar <= 0x7f) { |
michael@0 | 376 | length = 1; |
michael@0 | 377 | targetUniChar = mySourceChar; |
michael@0 | 378 | } else { |
michael@0 | 379 | length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, |
michael@0 | 380 | mySourceChar,&targetUniChar,args->converter->useFallback); |
michael@0 | 381 | /* we can only use lead bytes 21..7D and trail bytes 21..7E */ |
michael@0 | 382 | if( length == 2 && |
michael@0 | 383 | (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && |
michael@0 | 384 | (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) |
michael@0 | 385 | ) { |
michael@0 | 386 | targetUniChar -= 0x8080; |
michael@0 | 387 | } else { |
michael@0 | 388 | targetUniChar = missingCharMarker; |
michael@0 | 389 | } |
michael@0 | 390 | } |
michael@0 | 391 | if (targetUniChar != missingCharMarker){ |
michael@0 | 392 | myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); |
michael@0 | 393 | if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){ |
michael@0 | 394 | /*Shifting from a double byte to single byte mode*/ |
michael@0 | 395 | if(!isTargetUCharDBCS){ |
michael@0 | 396 | len =ESC_LEN; |
michael@0 | 397 | escSeq = SB_ESCAPE; |
michael@0 | 398 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); |
michael@0 | 399 | myConverterData->isEscapeAppended = TRUE; |
michael@0 | 400 | } |
michael@0 | 401 | else{ /* Shifting from a single byte to double byte mode*/ |
michael@0 | 402 | len =ESC_LEN; |
michael@0 | 403 | escSeq = DB_ESCAPE; |
michael@0 | 404 | CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); |
michael@0 | 405 | myConverterData->isEscapeAppended = TRUE; |
michael@0 | 406 | |
michael@0 | 407 | } |
michael@0 | 408 | } |
michael@0 | 409 | |
michael@0 | 410 | if(isTargetUCharDBCS){ |
michael@0 | 411 | if( myTargetIndex <targetLength){ |
michael@0 | 412 | myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); |
michael@0 | 413 | if(offsets){ |
michael@0 | 414 | *(offsets++) = mySourceIndex-1; |
michael@0 | 415 | } |
michael@0 | 416 | if(myTargetIndex < targetLength){ |
michael@0 | 417 | myTarget[myTargetIndex++] =(char) targetUniChar; |
michael@0 | 418 | if(offsets){ |
michael@0 | 419 | *(offsets++) = mySourceIndex-1; |
michael@0 | 420 | } |
michael@0 | 421 | }else{ |
michael@0 | 422 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; |
michael@0 | 423 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 424 | } |
michael@0 | 425 | }else{ |
michael@0 | 426 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); |
michael@0 | 427 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; |
michael@0 | 428 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 429 | } |
michael@0 | 430 | |
michael@0 | 431 | }else{ |
michael@0 | 432 | if( myTargetIndex <targetLength){ |
michael@0 | 433 | myTarget[myTargetIndex++] = (char) (targetUniChar ); |
michael@0 | 434 | if(offsets){ |
michael@0 | 435 | *(offsets++) = mySourceIndex-1; |
michael@0 | 436 | } |
michael@0 | 437 | |
michael@0 | 438 | }else{ |
michael@0 | 439 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; |
michael@0 | 440 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 441 | } |
michael@0 | 442 | } |
michael@0 | 443 | |
michael@0 | 444 | } |
michael@0 | 445 | else{ |
michael@0 | 446 | /* oops.. the code point is unassigned */ |
michael@0 | 447 | /*Handle surrogates */ |
michael@0 | 448 | /*check if the char is a First surrogate*/ |
michael@0 | 449 | if(U16_IS_SURROGATE(mySourceChar)) { |
michael@0 | 450 | if(U16_IS_SURROGATE_LEAD(mySourceChar)) { |
michael@0 | 451 | args->converter->fromUChar32=mySourceChar; |
michael@0 | 452 | getTrail: |
michael@0 | 453 | /*look ahead to find the trail surrogate*/ |
michael@0 | 454 | if(mySourceIndex < mySourceLength) { |
michael@0 | 455 | /* test the following code unit */ |
michael@0 | 456 | UChar trail=(UChar) args->source[mySourceIndex]; |
michael@0 | 457 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 458 | ++mySourceIndex; |
michael@0 | 459 | mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail); |
michael@0 | 460 | args->converter->fromUChar32=0x00; |
michael@0 | 461 | /* there are no surrogates in GB2312*/ |
michael@0 | 462 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 463 | /* exit this condition tree */ |
michael@0 | 464 | } else { |
michael@0 | 465 | /* this is an unmatched lead code unit (1st surrogate) */ |
michael@0 | 466 | /* callback(illegal) */ |
michael@0 | 467 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 468 | } |
michael@0 | 469 | } else { |
michael@0 | 470 | /* no more input */ |
michael@0 | 471 | *err = U_ZERO_ERROR; |
michael@0 | 472 | } |
michael@0 | 473 | } else { |
michael@0 | 474 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 475 | /* callback(illegal) */ |
michael@0 | 476 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 477 | } |
michael@0 | 478 | } else { |
michael@0 | 479 | /* callback(unassigned) for a BMP code point */ |
michael@0 | 480 | *err = U_INVALID_CHAR_FOUND; |
michael@0 | 481 | } |
michael@0 | 482 | |
michael@0 | 483 | args->converter->fromUChar32=mySourceChar; |
michael@0 | 484 | break; |
michael@0 | 485 | } |
michael@0 | 486 | } |
michael@0 | 487 | else{ |
michael@0 | 488 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 489 | break; |
michael@0 | 490 | } |
michael@0 | 491 | targetUniChar=missingCharMarker; |
michael@0 | 492 | } |
michael@0 | 493 | |
michael@0 | 494 | args->target += myTargetIndex; |
michael@0 | 495 | args->source += mySourceIndex; |
michael@0 | 496 | myConverterData->isTargetUCharDBCS = isTargetUCharDBCS; |
michael@0 | 497 | } |
michael@0 | 498 | |
michael@0 | 499 | static void |
michael@0 | 500 | _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { |
michael@0 | 501 | UConverter *cnv = args->converter; |
michael@0 | 502 | UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo; |
michael@0 | 503 | char *p; |
michael@0 | 504 | char buffer[4]; |
michael@0 | 505 | p = buffer; |
michael@0 | 506 | |
michael@0 | 507 | if( convData->isTargetUCharDBCS){ |
michael@0 | 508 | *p++= UCNV_TILDE; |
michael@0 | 509 | *p++= UCNV_CLOSE_BRACE; |
michael@0 | 510 | convData->isTargetUCharDBCS=FALSE; |
michael@0 | 511 | } |
michael@0 | 512 | *p++= (char)cnv->subChars[0]; |
michael@0 | 513 | |
michael@0 | 514 | ucnv_cbFromUWriteBytes(args, |
michael@0 | 515 | buffer, (int32_t)(p - buffer), |
michael@0 | 516 | offsetIndex, err); |
michael@0 | 517 | } |
michael@0 | 518 | |
michael@0 | 519 | /* |
michael@0 | 520 | * Structure for cloning an HZ converter into a single memory block. |
michael@0 | 521 | * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct, |
michael@0 | 522 | * and then ucnv_safeClone() of the sub-converter may additionally align |
michael@0 | 523 | * subCnv inside the cloneHZStruct, for which we need the deadSpace after |
michael@0 | 524 | * subCnv. This is because UAlignedMemory may be larger than the actually |
michael@0 | 525 | * necessary alignment size for the platform. |
michael@0 | 526 | * The other cloneHZStruct fields will not be moved around, |
michael@0 | 527 | * and are aligned properly with cloneHZStruct's alignment. |
michael@0 | 528 | */ |
struct cloneHZStruct
{
    UConverter cnv;              /* the cloned HZ converter itself; _HZ_SafeClone returns its address */
    UConverter subCnv;           /* storage handed to ucnv_safeClone() for the GBK sub-converter */
    UAlignedMemory deadSpace;    /* slack so the sub-converter clone may be re-aligned inside this struct */
    UConverterDataHZ mydata;     /* copy of the HZ state; cnv.extraInfo points here */
};
michael@0 | 536 | |
michael@0 | 537 | |
michael@0 | 538 | static UConverter * |
michael@0 | 539 | _HZ_SafeClone(const UConverter *cnv, |
michael@0 | 540 | void *stackBuffer, |
michael@0 | 541 | int32_t *pBufferSize, |
michael@0 | 542 | UErrorCode *status) |
michael@0 | 543 | { |
michael@0 | 544 | struct cloneHZStruct * localClone; |
michael@0 | 545 | int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct); |
michael@0 | 546 | |
michael@0 | 547 | if (U_FAILURE(*status)){ |
michael@0 | 548 | return 0; |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
michael@0 | 552 | *pBufferSize = bufferSizeNeeded; |
michael@0 | 553 | return 0; |
michael@0 | 554 | } |
michael@0 | 555 | |
michael@0 | 556 | localClone = (struct cloneHZStruct *)stackBuffer; |
michael@0 | 557 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
michael@0 | 558 | |
michael@0 | 559 | uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ)); |
michael@0 | 560 | localClone->cnv.extraInfo = &localClone->mydata; |
michael@0 | 561 | localClone->cnv.isExtraLocal = TRUE; |
michael@0 | 562 | |
michael@0 | 563 | /* deep-clone the sub-converter */ |
michael@0 | 564 | size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ |
michael@0 | 565 | ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter = |
michael@0 | 566 | ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status); |
michael@0 | 567 | |
michael@0 | 568 | return &localClone->cnv; |
michael@0 | 569 | } |
michael@0 | 570 | |
michael@0 | 571 | static void |
michael@0 | 572 | _HZ_GetUnicodeSet(const UConverter *cnv, |
michael@0 | 573 | const USetAdder *sa, |
michael@0 | 574 | UConverterUnicodeSet which, |
michael@0 | 575 | UErrorCode *pErrorCode) { |
michael@0 | 576 | /* HZ converts all of ASCII */ |
michael@0 | 577 | sa->addRange(sa->set, 0, 0x7f); |
michael@0 | 578 | |
michael@0 | 579 | /* add all of the code points that the sub-converter handles */ |
michael@0 | 580 | ucnv_MBCSGetFilteredUnicodeSetForUnicode( |
michael@0 | 581 | ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, |
michael@0 | 582 | sa, which, UCNV_SET_FILTER_HZ, |
michael@0 | 583 | pErrorCode); |
michael@0 | 584 | } |
michael@0 | 585 | |
/*
 * Function table for the HZ converter.
 * NOTE(review): the initializer is positional; the meaning of each slot is
 * defined by UConverterImpl, which is declared outside this file -- confirm
 * against that declaration before reordering anything here.
 */
static const UConverterImpl _HZImpl={

    UCNV_HZ,

    NULL,
    NULL,

    /* lifecycle callbacks defined in this file */
    _HZOpen,
    _HZClose,
    _HZReset,

    /* the same conversion routine is registered twice per direction;
     * presumably the plain and the with-offsets entry points -- TODO confirm */
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    NULL,

    NULL,
    NULL,
    _HZ_WriteSub,
    _HZ_SafeClone,
    _HZ_GetUnicodeSet
};
michael@0 | 609 | |
/*
 * Static description of the HZ converter.
 * NOTE(review): the initializer is positional; field names come from
 * UConverterStaticData, which is declared outside this file. The labels
 * below are assumptions to be confirmed against that declaration.
 */
static const UConverterStaticData _HZStaticData={
    sizeof(UConverterStaticData),
    "HZ",                 /* converter name */
    0,
    UCNV_IBM,
    UCNV_HZ,
    1,                    /* presumably minimum bytes per character -- confirm */
    4,                    /* presumably maximum bytes per character -- confirm */
    { 0x1a, 0, 0, 0 },    /* presumably the substitution bytes; _HZ_WriteSub emits subChars[0] -- confirm */
    1,                    /* presumably the substitution length -- confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */

};
michael@0 | 627 | |
michael@0 | 628 | |
/*
 * Shared-data record for the HZ converter; the only non-static symbol
 * defined in this file. It ties together _HZStaticData and _HZImpl.
 * NOTE(review): the remaining positional fields are defined by
 * UConverterSharedData, declared outside this file -- confirm their meaning there.
 */
const UConverterSharedData _HZData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_HZStaticData,   /* static converter description */
    FALSE,
    &_HZImpl,         /* function table */
    0
};
michael@0 | 639 | |
michael@0 | 640 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |