intl/icu/source/common/ucnvhz.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnvhz.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,640 @@
     1.4 +/*  
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2000-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnvhz.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2000oct16
    1.15 +*   created by: Ram Viswanadha
    1.16 +*   10/31/2000  Ram     Implemented offsets logic function
    1.17 +*   
    1.18 +*/
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +
    1.22 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
    1.23 +
    1.24 +#include "cmemory.h"
    1.25 +#include "unicode/ucnv.h"
    1.26 +#include "unicode/ucnv_cb.h"
    1.27 +#include "unicode/uset.h"
    1.28 +#include "unicode/utf16.h"
    1.29 +#include "ucnv_bld.h"
    1.30 +#include "ucnv_cnv.h"
    1.31 +#include "ucnv_imp.h"
    1.32 +
    1.33 +#define UCNV_TILDE 0x7E          /* ~ */
    1.34 +#define UCNV_OPEN_BRACE 0x7B     /* { */
    1.35 +#define UCNV_CLOSE_BRACE 0x7D   /* } */
    1.36 +#define SB_ESCAPE    "\x7E\x7D"
    1.37 +#define DB_ESCAPE    "\x7E\x7B"
    1.38 +#define TILDE_ESCAPE "\x7E\x7E"
    1.39 +#define ESC_LEN       2
    1.40 +
    1.41 +
    1.42 +#define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex){                             \
    1.43 +    while(len-->0){                                                                                                         \
    1.44 +        if(targetIndex < targetLength){                                                                                     \
    1.45 +            args->target[targetIndex] = (unsigned char) *strToAppend;                                                       \
    1.46 +            if(args->offsets!=NULL){                                                                                        \
    1.47 +                *(offsets++) = sourceIndex-1;                                                                               \
    1.48 +            }                                                                                                               \
    1.49 +            targetIndex++;                                                                                                  \
    1.50 +        }                                                                                                                   \
    1.51 +        else{                                                                                                               \
    1.52 +            args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
    1.53 +            *err =U_BUFFER_OVERFLOW_ERROR;                                                                                  \
    1.54 +        }                                                                                                                   \
    1.55 +        strToAppend++;                                                                                                      \
    1.56 +    }                                                                                                                       \
    1.57 +}
    1.58 +
    1.59 +
    1.60 +typedef struct{
    1.61 +    UConverter* gbConverter;
    1.62 +    int32_t targetIndex;
    1.63 +    int32_t sourceIndex;
    1.64 +    UBool isEscapeAppended;
    1.65 +    UBool isStateDBCS;
    1.66 +    UBool isTargetUCharDBCS;
    1.67 +    UBool isEmptySegment;
    1.68 +}UConverterDataHZ;
    1.69 +
    1.70 +
    1.71 +
    1.72 +static void 
    1.73 +_HZOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
    1.74 +    UConverter *gbConverter;
    1.75 +    if(pArgs->onlyTestIsLoadable) {
    1.76 +        ucnv_canCreateConverter("GBK", errorCode);  /* errorCode carries result */
    1.77 +        return;
    1.78 +    }
    1.79 +    gbConverter = ucnv_open("GBK", errorCode);
    1.80 +    if(U_FAILURE(*errorCode)) {
    1.81 +        return;
    1.82 +    }
    1.83 +    cnv->toUnicodeStatus = 0;
    1.84 +    cnv->fromUnicodeStatus= 0;
    1.85 +    cnv->mode=0;
    1.86 +    cnv->fromUChar32=0x0000;
    1.87 +    cnv->extraInfo = uprv_calloc(1, sizeof(UConverterDataHZ));
    1.88 +    if(cnv->extraInfo != NULL){
    1.89 +        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = gbConverter;
    1.90 +    }
    1.91 +    else {
    1.92 +        ucnv_close(gbConverter);
    1.93 +        *errorCode = U_MEMORY_ALLOCATION_ERROR;
    1.94 +        return;
    1.95 +    }
    1.96 +}
    1.97 +
    1.98 +static void 
    1.99 +_HZClose(UConverter *cnv){
   1.100 +    if(cnv->extraInfo != NULL) {
   1.101 +        ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
   1.102 +        if(!cnv->isExtraLocal) {
   1.103 +            uprv_free(cnv->extraInfo);
   1.104 +        }
   1.105 +        cnv->extraInfo = NULL;
   1.106 +    }
   1.107 +}
   1.108 +
   1.109 +static void 
   1.110 +_HZReset(UConverter *cnv, UConverterResetChoice choice){
   1.111 +    if(choice<=UCNV_RESET_TO_UNICODE) {
   1.112 +        cnv->toUnicodeStatus = 0;
   1.113 +        cnv->mode=0;
   1.114 +        if(cnv->extraInfo != NULL){
   1.115 +            ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
   1.116 +            ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
   1.117 +        }
   1.118 +    }
   1.119 +    if(choice!=UCNV_RESET_TO_UNICODE) {
   1.120 +        cnv->fromUnicodeStatus= 0;
   1.121 +        cnv->fromUChar32=0x0000; 
   1.122 +        if(cnv->extraInfo != NULL){
   1.123 +            ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
   1.124 +            ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
   1.125 +            ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
   1.126 +            ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
   1.127 +        }
   1.128 +    }
   1.129 +}
   1.130 +
   1.131 +/**************************************HZ Encoding*************************************************
   1.132 +* Rules for HZ encoding
   1.133 +* 
   1.134 +*   In ASCII mode, a byte is interpreted as an ASCII character, unless a
   1.135 +*   '~' is encountered. The character '~' is an escape character. By
   1.136 +*   convention, it must be immediately followed ONLY by '~', '{' or '\n'
   1.137 +*   (<LF>), with the following special meaning.
   1.138 +
   1.139 +*   1. The escape sequence '~~' is interpreted as a '~'.
   1.140 +*   2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
   1.141 +*   3. The escape sequence '~\n' is a line-continuation marker to be
   1.142 +*     consumed with no output produced.
   1.143 +*   In GB mode, characters are interpreted two bytes at a time as (pure)
   1.144 +*   GB codes until the escape-from-GB code '~}' is read. This code
   1.145 +*   switches the mode from GB back to ASCII.  (Note that the escape-
   1.146 +*   from-GB code '~}' ($7E7D) is outside the defined GB range.)
   1.147 +*
   1.148 +*   Source: RFC 1842
   1.149 +*
   1.150 +*   Note that the formal syntax in RFC 1842 is invalid. I assume that the
   1.151 +*   intended definition of single-byte-segment is as follows (pedberg):
   1.152 +*   single-byte-segment = single-byte-seq 1*single-byte-char
   1.153 +*/
   1.154 +
   1.155 +
   1.156 +static void 
   1.157 +UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   1.158 +                                                            UErrorCode* err){
   1.159 +    char tempBuf[2];
   1.160 +    const char *mySource = ( char *) args->source;
   1.161 +    UChar *myTarget = args->target;
   1.162 +    const char *mySourceLimit = args->sourceLimit;
   1.163 +    UChar32 targetUniChar = 0x0000;
   1.164 +    int32_t mySourceChar = 0x0000;
   1.165 +    UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
   1.166 +    tempBuf[0]=0; 
   1.167 +    tempBuf[1]=0;
   1.168 +
   1.169 +    /* Calling code already handles this situation. */
   1.170 +    /*if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
   1.171 +        *err = U_ILLEGAL_ARGUMENT_ERROR;
   1.172 +        return;
   1.173 +    }*/
   1.174 +    
   1.175 +    while(mySource< mySourceLimit){
   1.176 +        
   1.177 +        if(myTarget < args->targetLimit){
   1.178 +            
   1.179 +            mySourceChar= (unsigned char) *mySource++;
   1.180 +
   1.181 +            if(args->converter->mode == UCNV_TILDE) {
   1.182 +                /* second byte after ~ */
   1.183 +                args->converter->mode=0;
   1.184 +                switch(mySourceChar) {
   1.185 +                case 0x0A:
   1.186 +                    /* no output for ~\n (line-continuation marker) */
   1.187 +                    continue;
   1.188 +                case UCNV_TILDE:
   1.189 +                    if(args->offsets) {
   1.190 +                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
   1.191 +                    }
   1.192 +                    *(myTarget++)=(UChar)mySourceChar;
   1.193 +                    myData->isEmptySegment = FALSE;
   1.194 +                    continue;
   1.195 +                case UCNV_OPEN_BRACE:
   1.196 +                case UCNV_CLOSE_BRACE:
   1.197 +                    myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
   1.198 +                    if (myData->isEmptySegment) {
   1.199 +                        myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
   1.200 +                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.201 +                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
   1.202 +                        args->converter->toUBytes[0] = UCNV_TILDE;
   1.203 +                        args->converter->toUBytes[1] = mySourceChar;
   1.204 +                        args->converter->toULength = 2;
   1.205 +                        args->target = myTarget;
   1.206 +                        args->source = mySource;
   1.207 +                        return;
   1.208 +                    }
   1.209 +                    myData->isEmptySegment = TRUE;
   1.210 +                    continue;
   1.211 +                default:
   1.212 +                     /* if the first byte is equal to TILDE and the trail byte
   1.213 +                     * is not a valid byte then it is an error condition
   1.214 +                     */
   1.215 +                    /*
   1.216 +                     * Ticket 5691: consistent illegal sequences:
   1.217 +                     * - We include at least the first byte in the illegal sequence.
   1.218 +                     * - If any of the non-initial bytes could be the start of a character,
   1.219 +                     *   we stop the illegal sequence before the first one of those.
   1.220 +                     */
   1.221 +                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
   1.222 +                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1.223 +                    args->converter->toUBytes[0] = UCNV_TILDE;
   1.224 +                    if( myData->isStateDBCS ?
   1.225 +                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
   1.226 +                            mySourceChar <= 0x7f
   1.227 +                    ) {
   1.228 +                        /* The current byte could be the start of a character: Back it out. */
   1.229 +                        args->converter->toULength = 1;
   1.230 +                        --mySource;
   1.231 +                    } else {
   1.232 +                        /* Include the current byte in the illegal sequence. */
   1.233 +                        args->converter->toUBytes[1] = mySourceChar;
   1.234 +                        args->converter->toULength = 2;
   1.235 +                    }
   1.236 +                    args->target = myTarget;
   1.237 +                    args->source = mySource;
   1.238 +                    return;
   1.239 +                }
   1.240 +            } else if(myData->isStateDBCS) {
   1.241 +                if(args->converter->toUnicodeStatus == 0x00){
   1.242 +                    /* lead byte */
   1.243 +                    if(mySourceChar == UCNV_TILDE) {
   1.244 +                        args->converter->mode = UCNV_TILDE;
   1.245 +                    } else {
   1.246 +                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
   1.247 +                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
   1.248 +                        myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
   1.249 +                    }
   1.250 +                    continue;
   1.251 +                }
   1.252 +                else{
   1.253 +                    /* trail byte */
   1.254 +                    int leadIsOk, trailIsOk;
   1.255 +                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
   1.256 +                    targetUniChar = 0xffff;
   1.257 +                    /*
   1.258 +                     * Ticket 5691: consistent illegal sequences:
   1.259 +                     * - We include at least the first byte in the illegal sequence.
   1.260 +                     * - If any of the non-initial bytes could be the start of a character,
   1.261 +                     *   we stop the illegal sequence before the first one of those.
   1.262 +                     *
   1.263 +                     * In HZ DBCS, if the second byte is in the 21..7e range,
   1.264 +                     * we report only the first byte as the illegal sequence.
   1.265 +                     * Otherwise we convert or report the pair of bytes.
   1.266 +                     */
   1.267 +                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
   1.268 +                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   1.269 +                    if (leadIsOk && trailIsOk) {
   1.270 +                        tempBuf[0] = (char) (leadByte+0x80) ;
   1.271 +                        tempBuf[1] = (char) (mySourceChar+0x80);
   1.272 +                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
   1.273 +                            tempBuf, 2, args->converter->useFallback);
   1.274 +                        mySourceChar= (leadByte << 8) | mySourceChar;
   1.275 +                    } else if (trailIsOk) {
   1.276 +                        /* report a single illegal byte and continue with the following DBCS starter byte */
   1.277 +                        --mySource;
   1.278 +                        mySourceChar = (int32_t)leadByte;
   1.279 +                    } else {
   1.280 +                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   1.281 +                        /* add another bit so that the code below writes 2 bytes in case of error */
   1.282 +                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
   1.283 +                    }
   1.284 +                    args->converter->toUnicodeStatus =0x00;
   1.285 +                }
   1.286 +            }
   1.287 +            else{
   1.288 +                if(mySourceChar == UCNV_TILDE) {
   1.289 +                    args->converter->mode = UCNV_TILDE;
   1.290 +                    continue;
   1.291 +                } else if(mySourceChar <= 0x7f) {
   1.292 +                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
   1.293 +                    myData->isEmptySegment = FALSE; /* the segment has something valid */
   1.294 +                } else {
   1.295 +                    targetUniChar = 0xffff;
   1.296 +                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
   1.297 +                }
   1.298 +            }
   1.299 +            if(targetUniChar < 0xfffe){
   1.300 +                if(args->offsets) {
   1.301 +                    args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
   1.302 +                }
   1.303 +
   1.304 +                *(myTarget++)=(UChar)targetUniChar;
   1.305 +            }
   1.306 +            else /* targetUniChar>=0xfffe */ {
   1.307 +                if(targetUniChar == 0xfffe){
   1.308 +                    *err = U_INVALID_CHAR_FOUND;
   1.309 +                }
   1.310 +                else{
   1.311 +                    *err = U_ILLEGAL_CHAR_FOUND;
   1.312 +                }
   1.313 +                if(mySourceChar > 0xff){
   1.314 +                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
   1.315 +                    args->converter->toUBytes[1] = (uint8_t)mySourceChar;
   1.316 +                    args->converter->toULength=2;
   1.317 +                }
   1.318 +                else{
   1.319 +                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   1.320 +                    args->converter->toULength=1;
   1.321 +                }
   1.322 +                break;
   1.323 +            }
   1.324 +        }
   1.325 +        else{
   1.326 +            *err =U_BUFFER_OVERFLOW_ERROR;
   1.327 +            break;
   1.328 +        }
   1.329 +    }
   1.330 +
   1.331 +    args->target = myTarget;
   1.332 +    args->source = mySource;
   1.333 +}
   1.334 +
   1.335 +
   1.336 +static void 
   1.337 +UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
   1.338 +                                                      UErrorCode * err){
   1.339 +    const UChar *mySource = args->source;
   1.340 +    char *myTarget = args->target;
   1.341 +    int32_t* offsets = args->offsets;
   1.342 +    int32_t mySourceIndex = 0;
   1.343 +    int32_t myTargetIndex = 0;
   1.344 +    int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
   1.345 +    int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
   1.346 +    int32_t length=0;
   1.347 +    uint32_t targetUniChar = 0x0000;
   1.348 +    UChar32 mySourceChar = 0x0000;
   1.349 +    UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
   1.350 +    UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
   1.351 +    UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
   1.352 +    int len =0;
   1.353 +    const char* escSeq=NULL;
   1.354 +    
   1.355 +    /* Calling code already handles this situation. */
   1.356 +    /*if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
   1.357 +        *err = U_ILLEGAL_ARGUMENT_ERROR;
   1.358 +        return;
   1.359 +    }*/
   1.360 +    if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
   1.361 +        goto getTrail;
   1.362 +    }
   1.363 +    /*writing the char to the output stream */
   1.364 +    while (mySourceIndex < mySourceLength){
   1.365 +        targetUniChar = missingCharMarker;
   1.366 +        if (myTargetIndex < targetLength){
   1.367 +            
   1.368 +            mySourceChar = (UChar) mySource[mySourceIndex++];
   1.369 +            
   1.370 +
   1.371 +            oldIsTargetUCharDBCS = isTargetUCharDBCS;
   1.372 +            if(mySourceChar ==UCNV_TILDE){
   1.373 +                /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
   1.374 +                len = ESC_LEN;
   1.375 +                escSeq = TILDE_ESCAPE;
   1.376 +                CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
   1.377 +                continue;
   1.378 +            } else if(mySourceChar <= 0x7f) {
   1.379 +                length = 1;
   1.380 +                targetUniChar = mySourceChar;
   1.381 +            } else {
   1.382 +                length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
   1.383 +                    mySourceChar,&targetUniChar,args->converter->useFallback);
   1.384 +                /* we can only use lead bytes 21..7D and trail bytes 21..7E */
   1.385 +                if( length == 2 &&
   1.386 +                    (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
   1.387 +                    (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
   1.388 +                ) {
   1.389 +                    targetUniChar -= 0x8080;
   1.390 +                } else {
   1.391 +                    targetUniChar = missingCharMarker;
   1.392 +                }
   1.393 +            }
   1.394 +            if (targetUniChar != missingCharMarker){
   1.395 +               myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);     
   1.396 +                 if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
   1.397 +                    /*Shifting from a double byte to single byte mode*/
   1.398 +                    if(!isTargetUCharDBCS){
   1.399 +                        len =ESC_LEN;
   1.400 +                        escSeq = SB_ESCAPE;
   1.401 +                        CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
   1.402 +                        myConverterData->isEscapeAppended = TRUE;
   1.403 +                    }
   1.404 +                    else{ /* Shifting from a single byte to double byte mode*/
   1.405 +                        len =ESC_LEN;
   1.406 +                        escSeq = DB_ESCAPE;
   1.407 +                        CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
   1.408 +                        myConverterData->isEscapeAppended = TRUE;
   1.409 +                        
   1.410 +                    }
   1.411 +                }
   1.412 +            
   1.413 +                if(isTargetUCharDBCS){
   1.414 +                    if( myTargetIndex <targetLength){
   1.415 +                        myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
   1.416 +                        if(offsets){
   1.417 +                            *(offsets++) = mySourceIndex-1;
   1.418 +                        }
   1.419 +                        if(myTargetIndex < targetLength){
   1.420 +                            myTarget[myTargetIndex++] =(char) targetUniChar;
   1.421 +                            if(offsets){
   1.422 +                                *(offsets++) = mySourceIndex-1;
   1.423 +                            }
   1.424 +                        }else{
   1.425 +                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
   1.426 +                            *err = U_BUFFER_OVERFLOW_ERROR;
   1.427 +                        } 
   1.428 +                    }else{
   1.429 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
   1.430 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
   1.431 +                        *err = U_BUFFER_OVERFLOW_ERROR;
   1.432 +                    }
   1.433 +
   1.434 +                }else{
   1.435 +                    if( myTargetIndex <targetLength){
   1.436 +                        myTarget[myTargetIndex++] = (char) (targetUniChar );
   1.437 +                        if(offsets){
   1.438 +                            *(offsets++) = mySourceIndex-1;
   1.439 +                        }
   1.440 +                        
   1.441 +                    }else{
   1.442 +                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
   1.443 +                        *err = U_BUFFER_OVERFLOW_ERROR;
   1.444 +                    }
   1.445 +                }
   1.446 +
   1.447 +            }
   1.448 +            else{
   1.449 +                /* oops.. the code point is unassigned */
   1.450 +                /*Handle surrogates */
   1.451 +                /*check if the char is a First surrogate*/
   1.452 +                if(U16_IS_SURROGATE(mySourceChar)) {
   1.453 +                    if(U16_IS_SURROGATE_LEAD(mySourceChar)) {
   1.454 +                        args->converter->fromUChar32=mySourceChar;
   1.455 +getTrail:
   1.456 +                        /*look ahead to find the trail surrogate*/
   1.457 +                        if(mySourceIndex <  mySourceLength) {
   1.458 +                            /* test the following code unit */
   1.459 +                            UChar trail=(UChar) args->source[mySourceIndex];
   1.460 +                            if(U16_IS_TRAIL(trail)) {
   1.461 +                                ++mySourceIndex;
   1.462 +                                mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail);
   1.463 +                                args->converter->fromUChar32=0x00;
   1.464 +                                /* there are no surrogates in GB2312*/
   1.465 +                                *err = U_INVALID_CHAR_FOUND;
   1.466 +                                /* exit this condition tree */
   1.467 +                            } else {
   1.468 +                                /* this is an unmatched lead code unit (1st surrogate) */
   1.469 +                                /* callback(illegal) */
   1.470 +                                *err=U_ILLEGAL_CHAR_FOUND;
   1.471 +                            }
   1.472 +                        } else {
   1.473 +                            /* no more input */
   1.474 +                            *err = U_ZERO_ERROR;
   1.475 +                        }
   1.476 +                    } else {
   1.477 +                        /* this is an unmatched trail code unit (2nd surrogate) */
   1.478 +                        /* callback(illegal) */
   1.479 +                        *err=U_ILLEGAL_CHAR_FOUND;
   1.480 +                    }
   1.481 +                } else {
   1.482 +                    /* callback(unassigned) for a BMP code point */
   1.483 +                    *err = U_INVALID_CHAR_FOUND;
   1.484 +                }
   1.485 +
   1.486 +                args->converter->fromUChar32=mySourceChar;
   1.487 +                break;
   1.488 +            }
   1.489 +        }
   1.490 +        else{
   1.491 +            *err = U_BUFFER_OVERFLOW_ERROR;
   1.492 +            break;
   1.493 +        }
   1.494 +        targetUniChar=missingCharMarker;
   1.495 +    }
   1.496 +
   1.497 +    args->target += myTargetIndex;
   1.498 +    args->source += mySourceIndex;
   1.499 +    myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
   1.500 +}
   1.501 +
   1.502 +static void
   1.503 +_HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
   1.504 +    UConverter *cnv = args->converter;
   1.505 +    UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
   1.506 +    char *p;
   1.507 +    char buffer[4];
   1.508 +    p = buffer;
   1.509 +    
   1.510 +    if( convData->isTargetUCharDBCS){
   1.511 +        *p++= UCNV_TILDE;
   1.512 +        *p++= UCNV_CLOSE_BRACE;
   1.513 +        convData->isTargetUCharDBCS=FALSE;
   1.514 +    }
   1.515 +    *p++= (char)cnv->subChars[0];
   1.516 +
   1.517 +    ucnv_cbFromUWriteBytes(args,
   1.518 +                           buffer, (int32_t)(p - buffer),
   1.519 +                           offsetIndex, err);
   1.520 +}
   1.521 +
   1.522 +/*
   1.523 + * Structure for cloning an HZ converter into a single memory block.
   1.524 + * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
   1.525 + * and then ucnv_safeClone() of the sub-converter may additionally align
   1.526 + * subCnv inside the cloneHZStruct, for which we need the deadSpace after
   1.527 + * subCnv. This is because UAlignedMemory may be larger than the actually
   1.528 + * necessary alignment size for the platform.
   1.529 + * The other cloneHZStruct fields will not be moved around,
   1.530 + * and are aligned properly with cloneHZStruct's alignment.
   1.531 + */
   1.532 +struct cloneHZStruct
   1.533 +{
   1.534 +    UConverter cnv;
   1.535 +    UConverter subCnv;
   1.536 +    UAlignedMemory deadSpace;
   1.537 +    UConverterDataHZ mydata;
   1.538 +};
   1.539 +
   1.540 +
   1.541 +static UConverter * 
   1.542 +_HZ_SafeClone(const UConverter *cnv, 
   1.543 +              void *stackBuffer, 
   1.544 +              int32_t *pBufferSize, 
   1.545 +              UErrorCode *status)
   1.546 +{
   1.547 +    struct cloneHZStruct * localClone;
   1.548 +    int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
   1.549 +
   1.550 +    if (U_FAILURE(*status)){
   1.551 +        return 0;
   1.552 +    }
   1.553 +
   1.554 +    if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
   1.555 +        *pBufferSize = bufferSizeNeeded;
   1.556 +        return 0;
   1.557 +    }
   1.558 +
   1.559 +    localClone = (struct cloneHZStruct *)stackBuffer;
   1.560 +    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
   1.561 +
   1.562 +    uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
   1.563 +    localClone->cnv.extraInfo = &localClone->mydata;
   1.564 +    localClone->cnv.isExtraLocal = TRUE;
   1.565 +
   1.566 +    /* deep-clone the sub-converter */
   1.567 +    size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
   1.568 +    ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
   1.569 +        ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
   1.570 +
   1.571 +    return &localClone->cnv;
   1.572 +}
   1.573 +
   1.574 +static void
   1.575 +_HZ_GetUnicodeSet(const UConverter *cnv,
   1.576 +                  const USetAdder *sa,
   1.577 +                  UConverterUnicodeSet which,
   1.578 +                  UErrorCode *pErrorCode) {
   1.579 +    /* HZ converts all of ASCII */
   1.580 +    sa->addRange(sa->set, 0, 0x7f);
   1.581 +
   1.582 +    /* add all of the code points that the sub-converter handles */
   1.583 +    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
   1.584 +        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
   1.585 +        sa, which, UCNV_SET_FILTER_HZ,
   1.586 +        pErrorCode);
   1.587 +}
   1.588 +
   1.589 +static const UConverterImpl _HZImpl={
   1.590 +
   1.591 +    UCNV_HZ,
   1.592 +    
   1.593 +    NULL,
   1.594 +    NULL,
   1.595 +    
   1.596 +    _HZOpen,
   1.597 +    _HZClose,
   1.598 +    _HZReset,
   1.599 +    
   1.600 +    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
   1.601 +    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
   1.602 +    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
   1.603 +    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
   1.604 +    NULL,
   1.605 +    
   1.606 +    NULL,
   1.607 +    NULL,
   1.608 +    _HZ_WriteSub,
   1.609 +    _HZ_SafeClone,
   1.610 +    _HZ_GetUnicodeSet
   1.611 +};
   1.612 +
   1.613 +static const UConverterStaticData _HZStaticData={
   1.614 +    sizeof(UConverterStaticData),
   1.615 +        "HZ",
   1.616 +         0, 
   1.617 +         UCNV_IBM, 
   1.618 +         UCNV_HZ, 
   1.619 +         1, 
   1.620 +         4,
   1.621 +        { 0x1a, 0, 0, 0 },
   1.622 +        1,
   1.623 +        FALSE, 
   1.624 +        FALSE,
   1.625 +        0,
   1.626 +        0,
   1.627 +        { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
   1.628 +
   1.629 +};
   1.630 +            
   1.631 +            
   1.632 +const UConverterSharedData _HZData={
   1.633 +    sizeof(UConverterSharedData),
   1.634 +        ~((uint32_t) 0),
   1.635 +        NULL, 
   1.636 +        NULL, 
   1.637 +        &_HZStaticData, 
   1.638 +        FALSE, 
   1.639 +        &_HZImpl, 
   1.640 +        0
   1.641 +};
   1.642 +
   1.643 +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

mercurial