1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnv_u7.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1484 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2002-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: ucnv_u7.c 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2002jul01 1.15 +* created by: Markus W. Scherer 1.16 +* 1.17 +* UTF-7 converter implementation. Used to be in ucnv_utf.c. 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 + 1.22 +#if !UCONFIG_NO_CONVERSION 1.23 + 1.24 +#include "unicode/ucnv.h" 1.25 +#include "ucnv_bld.h" 1.26 +#include "ucnv_cnv.h" 1.27 +#include "uassert.h" 1.28 + 1.29 +/* UTF-7 -------------------------------------------------------------------- */ 1.30 + 1.31 +/* 1.32 + * UTF-7 is a stateful encoding of Unicode. 1.33 + * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt) 1.34 + * It was intended for use in Internet email systems, using in its bytewise 1.35 + * encoding only a subset of 7-bit US-ASCII. 1.36 + * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still 1.37 + * occasionally used. 1.38 + * 1.39 + * For converting Unicode to UTF-7, the RFC allows some US-ASCII 1.40 + * characters to be encoded directly or in base64. In particular, the characters in set O 1.41 + * as defined in the RFC (see below) may be encoded directly but are not 1.42 + * allowed in, e.g., email headers. 1.43 + * By default, the ICU UTF-7 converter encodes set O directly. 1.44 + * By choosing the option "version=1", set O will be escaped instead. 1.45 + * For example: 1.46 + * utf7Converter=ucnv_open("UTF-7,version=1"); 1.47 + * 1.48 + * For details about email headers see RFC 2047.
1.49 + */ 1.50 + 1.51 +/* 1.52 + * Tests for US-ASCII characters belonging to character classes 1.53 + * defined in UTF-7. 1.54 + * 1.55 + * Set D (directly encoded characters) consists of the following 1.56 + * characters: the upper and lower case letters A through Z 1.57 + * and a through z, the 10 digits 0-9, and the following nine special 1.58 + * characters (note that "+" and "=" are omitted): 1.59 + * '(),-./:? 1.60 + * 1.61 + * Set O (optional direct characters) consists of the following 1.62 + * characters (note that "\" and "~" are omitted): 1.63 + * !"#$%&*;<=>@[]^_`{|} 1.64 + * 1.65 + * According to the rules in RFC 2152, the byte values for the following 1.66 + * US-ASCII characters are not used in UTF-7 and are therefore illegal: 1.67 + * - all C0 control codes except for CR LF TAB 1.68 + * - BACKSLASH 1.69 + * - TILDE 1.70 + * - DEL 1.71 + * - all codes beyond US-ASCII, i.e. all >127 1.72 + */ 1.73 +#define inSetD(c) \ 1.74 + ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \ 1.75 + (uint8_t)((c)-48)<10 || /* digits */ \ 1.76 + (uint8_t)((c)-39)<3 || /* '() */ \ 1.77 + (uint8_t)((c)-44)<4 || /* ,-./ */ \ 1.78 + (c)==58 || (c)==63 /* :? 
*/ \ 1.79 + ) 1.80 + 1.81 +#define inSetO(c) \ 1.82 + ((uint8_t)((c)-33)<6 || /* !"#$%& */ \ 1.83 + (uint8_t)((c)-59)<4 || /* ;<=> */ \ 1.84 + (uint8_t)((c)-93)<4 || /* ]^_` */ \ 1.85 + (uint8_t)((c)-123)<3 || /* {|} */ \ 1.86 + (c)==42 || (c)==64 || (c)==91 /* *@[ */ \ 1.87 + ) 1.88 + 1.89 +#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9) 1.90 +#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9) 1.91 + 1.92 +#define PLUS 43 1.93 +#define MINUS 45 1.94 +#define BACKSLASH 92 1.95 +#define TILDE 126 1.96 + 1.97 +/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */ 1.98 +#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c)) 1.99 + 1.100 +/* encode directly sets D and O and CR LF SP TAB */ 1.101 +static const UBool encodeDirectlyMaximum[128]={ 1.102 + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 1.103 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1.104 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.105 + 1.106 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1.107 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.108 + 1.109 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.110 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1.111 + 1.112 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.113 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 1.114 +}; 1.115 + 1.116 +/* encode directly set D and CR LF SP TAB but not set O */ 1.117 +static const UBool encodeDirectlyRestricted[128]={ 1.118 + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 1.119 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1.120 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.121 + 1.122 + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1.123 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1.124 + 1.125 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.126 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1.127 + 1.128 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.129 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 0, 0, 0, 0, 0 1.130 +}; 1.131 + 1.132 +static const uint8_t 1.133 +toBase64[64]={ 1.134 + /* A-Z */ 1.135 + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 1.136 + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1.137 + /* a-z */ 1.138 + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 1.139 + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 1.140 + /* 0-9 */ 1.141 + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 1.142 + /* +/ */ 1.143 + 43, 47 1.144 +}; 1.145 + 1.146 +static const int8_t 1.147 +fromBase64[128]={ 1.148 + /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */ 1.149 + -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3, 1.150 + -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, 1.151 + 1.152 + /* general punctuation with + and / and a special value (-2) for - */ 1.153 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63, 1.154 + /* digits */ 1.155 + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, 1.156 + 1.157 + /* A-Z */ 1.158 + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1.159 + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1, 1.160 + 1.161 + /* a-z */ 1.162 + -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 1.163 + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3 1.164 +}; 1.165 + 1.166 +/* 1.167 + * converter status values: 1.168 + * 1.169 + * toUnicodeStatus: 1.170 + * 24 inDirectMode (boolean) 1.171 + * 23..16 base64Counter (-1..7) 1.172 + * 15..0 bits (up to 14 bits incoming base64) 1.173 + * 1.174 + * fromUnicodeStatus: 1.175 + * 31..28 version (0: set O direct 1: set O escaped) 1.176 + * 24 inDirectMode (boolean) 1.177 + * 23..16 base64Counter (0..2) 1.178 + * 7..0 bits (6 bits outgoing base64) 1.179 + * 1.180 + */ 1.181 + 1.182 +static void 1.183 +_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) { 1.184 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.185 + /* reset toUnicode */ 1.186 + 
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */ 1.187 + cnv->toULength=0; 1.188 + } 1.189 + if(choice!=UCNV_RESET_TO_UNICODE) { 1.190 + /* reset fromUnicode */ 1.191 + cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 1.192 + } 1.193 +} 1.194 +/* Open callback for the UTF-7 converter: accepts only UCNV_GET_VERSION(cnv) of 0 or 1, stores that version in bits 31..28 of fromUnicodeStatus and resets both conversion directions; any other version sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR. pArgs is not used here. */ 1.195 +static void 1.196 +_UTF7Open(UConverter *cnv, 1.197 + UConverterLoadArgs *pArgs, 1.198 + UErrorCode *pErrorCode) { 1.199 + if(UCNV_GET_VERSION(cnv)<=1) { 1.200 + /* TODO(markus): Should just use cnv->options rather than copying the version number. */ 1.201 + cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28; 1.202 + _UTF7Reset(cnv, UCNV_RESET_BOTH); 1.203 + } else { 1.204 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.205 + } 1.206 +} 1.207 +/* UTF-7 to Unicode conversion: pArgs supplies the converter, the source bytes, the UChar target buffer and an offsets array that may be NULL; errors are reported through *pErrorCode. */ 1.208 +static void 1.209 +_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.210 + UErrorCode *pErrorCode) { 1.211 + UConverter *cnv; 1.212 + const uint8_t *source, *sourceLimit; 1.213 + UChar *target; 1.214 + const UChar *targetLimit; 1.215 + int32_t *offsets; 1.216 + 1.217 + uint8_t *bytes; 1.218 + uint8_t byteIndex; 1.219 + 1.220 + int32_t length, targetCapacity; 1.221 + 1.222 + /* UTF-7 state */ 1.223 + uint16_t bits; 1.224 + int8_t base64Counter; 1.225 + UBool inDirectMode; 1.226 + 1.227 + int8_t base64Value; 1.228 + 1.229 + int32_t sourceIndex, nextSourceIndex; 1.230 + 1.231 + uint8_t b; 1.232 + /* set up the local pointers */ 1.233 + cnv=pArgs->converter; 1.234 + 1.235 + source=(const uint8_t *)pArgs->source; 1.236 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.237 + target=pArgs->target; 1.238 + targetLimit=pArgs->targetLimit; 1.239 + offsets=pArgs->offsets; 1.240 + /* get the state machine state: bit 24 is inDirectMode, bits 23..16 are base64Counter, bits 15..0 are the pending base64 bits */ 1.241 + { 1.242 + uint32_t status=cnv->toUnicodeStatus; 1.243 + inDirectMode=(UBool)((status>>24)&1); 1.244 + base64Counter=(int8_t)(status>>16); 1.245 + bits=(uint16_t)status; 1.246 + } 1.247 + bytes=cnv->toUBytes; 1.248 + byteIndex=cnv->toULength; 1.249 + 1.250 + /* sourceIndex=-1 if the current
character began in the previous buffer */ 1.251 + sourceIndex=byteIndex==0 ? 0 : -1; 1.252 + nextSourceIndex=0; 1.253 + 1.254 + if(inDirectMode) { 1.255 +directMode: 1.256 + /* 1.257 + * In Direct Mode, most US-ASCII characters are encoded directly, i.e., 1.258 + * with their US-ASCII byte values. 1.259 + * Backslash and Tilde and most control characters are not allowed in UTF-7. 1.260 + * A plus sign starts Unicode (or "escape") Mode. 1.261 + * 1.262 + * In Direct Mode, only the sourceIndex is used. 1.263 + */ 1.264 + byteIndex=0; 1.265 + length=(int32_t)(sourceLimit-source); 1.266 + targetCapacity=(int32_t)(targetLimit-target); 1.267 + if(length>targetCapacity) { 1.268 + length=targetCapacity; 1.269 + } 1.270 + while(length>0) { 1.271 + b=*source++; 1.272 + if(!isLegalUTF7(b)) { 1.273 + /* illegal */ 1.274 + bytes[0]=b; 1.275 + byteIndex=1; 1.276 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.277 + break; 1.278 + } else if(b!=PLUS) { 1.279 + /* write directly encoded character */ 1.280 + *target++=b; 1.281 + if(offsets!=NULL) { 1.282 + *offsets++=sourceIndex++; 1.283 + } 1.284 + } else /* PLUS */ { 1.285 + /* switch to Unicode mode */ 1.286 + nextSourceIndex=++sourceIndex; 1.287 + inDirectMode=FALSE; 1.288 + byteIndex=0; 1.289 + bits=0; 1.290 + base64Counter=-1; 1.291 + goto unicodeMode; 1.292 + } 1.293 + --length; 1.294 + } 1.295 + if(source<sourceLimit && target>=targetLimit) { 1.296 + /* target is full */ 1.297 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.298 + } 1.299 + } else { 1.300 +unicodeMode: 1.301 + /* 1.302 + * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 1.303 + * The base64 sequence ends with any character that is not in the base64 alphabet. 1.304 + * A terminating minus sign is consumed. 1.305 + * 1.306 + * In Unicode Mode, the sourceIndex has the index to the start of the current 1.307 + * base64 bytes, while nextSourceIndex is precisely parallel to source, 1.308 + * keeping the index to the following byte. 
1.309 + * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 1.310 + */ 1.311 + while(source<sourceLimit) { 1.312 + if(target<targetLimit) { 1.313 + bytes[byteIndex++]=b=*source++; 1.314 + ++nextSourceIndex; 1.315 + base64Value = -3; /* initialize as illegal */ 1.316 + if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) { 1.317 + /* either 1.318 + * base64Value==-1 for any legal character except base64 and minus sign, or 1.319 + * base64Value==-3 for illegal characters: 1.320 + * 1. In either case, leave Unicode mode. 1.321 + * 2.1. If we ended with an incomplete UChar or none after the +, then 1.322 + * generate an error for the preceding erroneous sequence and deal with 1.323 + * the current (possibly illegal) character next time through. 1.324 + * 2.2. Else the current char comes after a complete UChar, which was already 1.325 + * pushed to the output buf, so: 1.326 + * 2.2.1. If the current char is legal, just save it for processing next time. 1.327 + * It may be for example, a plus which we need to deal with in direct mode. 1.328 + * 2.2.2. Else if the current char is illegal, we might as well deal with it here. 
1.329 + */ 1.330 + inDirectMode=TRUE; 1.331 + if(base64Counter==-1) { 1.332 + /* illegal: + immediately followed by something other than base64 or minus sign */ 1.333 + /* include the plus sign in the reported sequence, but not the subsequent char */ 1.334 + --source; 1.335 + bytes[0]=PLUS; 1.336 + byteIndex=1; 1.337 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.338 + break; 1.339 + } else if(bits!=0) { 1.340 + /* bits are illegally left over, a UChar is incomplete */ 1.341 + /* don't include current char (legal or illegal) in error seq */ 1.342 + --source; 1.343 + --byteIndex; 1.344 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.345 + break; 1.346 + } else { 1.347 + /* previous UChar was complete */ 1.348 + if(base64Value==-3) { 1.349 + /* current character is illegal, deal with it here */ 1.350 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.351 + break; 1.352 + } else { 1.353 + /* un-read the current character in case it is a plus sign */ 1.354 + --source; 1.355 + sourceIndex=nextSourceIndex-1; 1.356 + goto directMode; 1.357 + } 1.358 + } 1.359 + } else if(base64Value>=0) { 1.360 + /* collect base64 bytes into UChars */ 1.361 + switch(base64Counter) { 1.362 + case -1: /* -1 is immediately after the + */ 1.363 + case 0: 1.364 + bits=base64Value; 1.365 + base64Counter=1; 1.366 + break; 1.367 + case 1: 1.368 + case 3: 1.369 + case 4: 1.370 + case 6: 1.371 + bits=(uint16_t)((bits<<6)|base64Value); 1.372 + ++base64Counter; 1.373 + break; 1.374 + case 2: 1.375 + *target++=(UChar)((bits<<4)|(base64Value>>2)); 1.376 + if(offsets!=NULL) { 1.377 + *offsets++=sourceIndex; 1.378 + sourceIndex=nextSourceIndex-1; 1.379 + } 1.380 + bytes[0]=b; /* keep this byte in case an error occurs */ 1.381 + byteIndex=1; 1.382 + bits=(uint16_t)(base64Value&3); 1.383 + base64Counter=3; 1.384 + break; 1.385 + case 5: 1.386 + *target++=(UChar)((bits<<2)|(base64Value>>4)); 1.387 + if(offsets!=NULL) { 1.388 + *offsets++=sourceIndex; 1.389 + sourceIndex=nextSourceIndex-1; 1.390 + } 1.391 + bytes[0]=b; /* keep this byte in 
case an error occurs */ 1.392 + byteIndex=1; 1.393 + bits=(uint16_t)(base64Value&15); 1.394 + base64Counter=6; 1.395 + break; 1.396 + case 7: 1.397 + *target++=(UChar)((bits<<6)|base64Value); 1.398 + if(offsets!=NULL) { 1.399 + *offsets++=sourceIndex; 1.400 + sourceIndex=nextSourceIndex; 1.401 + } 1.402 + byteIndex=0; 1.403 + bits=0; 1.404 + base64Counter=0; 1.405 + break; 1.406 + default: 1.407 + /* will never occur */ 1.408 + break; 1.409 + } 1.410 + } else /*base64Value==-2*/ { 1.411 + /* minus sign terminates the base64 sequence */ 1.412 + inDirectMode=TRUE; 1.413 + if(base64Counter==-1) { 1.414 + /* +- i.e. a minus immediately following a plus */ 1.415 + *target++=PLUS; 1.416 + if(offsets!=NULL) { 1.417 + *offsets++=sourceIndex-1; 1.418 + } 1.419 + } else { 1.420 + /* absorb the minus and leave the Unicode Mode */ 1.421 + if(bits!=0) { 1.422 + /* bits are illegally left over, a UChar is incomplete */ 1.423 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.424 + break; 1.425 + } 1.426 + } 1.427 + sourceIndex=nextSourceIndex; 1.428 + goto directMode; 1.429 + } 1.430 + } else { 1.431 + /* target is full */ 1.432 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.433 + break; 1.434 + } 1.435 + } 1.436 + } 1.437 + 1.438 + if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) { 1.439 + /* 1.440 + * if we are in Unicode mode, then the byteIndex might not be 0, 1.441 + * but that is ok if bits==0 1.442 + * -> we set byteIndex=0 at the end of the stream to avoid a truncated error 1.443 + * (not true for IMAP-mailbox-name where we must end in direct mode) 1.444 + */ 1.445 + byteIndex=0; 1.446 + } 1.447 + 1.448 + /* set the converter state back into UConverter */ 1.449 + cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 1.450 + cnv->toULength=byteIndex; 1.451 + 1.452 + /* write back the updated pointers */ 1.453 + pArgs->source=(const char *)source; 1.454 + pArgs->target=target; 1.455 + pArgs->offsets=offsets; 
1.456 + return; 1.457 +} 1.458 +/* Unicode to UTF-7 conversion: loads the state from cnv->fromUnicodeStatus, writes bytes to pArgs->target and, when pArgs->offsets is not NULL, one source index per byte written; sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full. */ 1.459 +static void 1.460 +_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.461 + UErrorCode *pErrorCode) { 1.462 + UConverter *cnv; 1.463 + const UChar *source, *sourceLimit; 1.464 + uint8_t *target, *targetLimit; 1.465 + int32_t *offsets; 1.466 + 1.467 + int32_t length, targetCapacity, sourceIndex; 1.468 + UChar c; 1.469 + 1.470 + /* UTF-7 state */ 1.471 + const UBool *encodeDirectly; 1.472 + uint8_t bits; 1.473 + int8_t base64Counter; 1.474 + UBool inDirectMode; 1.475 + 1.476 + /* set up the local pointers */ 1.477 + cnv=pArgs->converter; 1.478 + 1.479 + /* set up the source, target and offsets pointers */ 1.480 + source=pArgs->source; 1.481 + sourceLimit=pArgs->sourceLimit; 1.482 + target=(uint8_t *)pArgs->target; 1.483 + targetLimit=(uint8_t *)pArgs->targetLimit; 1.484 + offsets=pArgs->offsets; 1.485 + 1.486 + /* get the state machine state: a nonzero version in bits 31..28 selects the restricted table (set O escaped) */ 1.487 + { 1.488 + uint32_t status=cnv->fromUnicodeStatus; 1.489 + encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted; 1.490 + inDirectMode=(UBool)((status>>24)&1); 1.491 + base64Counter=(int8_t)(status>>16); 1.492 + bits=(uint8_t)status; 1.493 + U_ASSERT(bits<sizeof(toBase64)/sizeof(toBase64[0])); /* bits is used as an index into toBase64[], so it must be strictly below its length */ 1.494 + } 1.495 + 1.496 + /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 1.497 + sourceIndex=0; 1.498 + 1.499 + if(inDirectMode) { 1.500 +directMode: 1.501 + length=(int32_t)(sourceLimit-source); 1.502 + targetCapacity=(int32_t)(targetLimit-target); 1.503 + if(length>targetCapacity) { 1.504 + length=targetCapacity; 1.505 + } 1.506 + while(length>0) { 1.507 + c=*source++; 1.508 + /* currently always encode CR LF SP TAB directly */ 1.509 + if(c<=127 && encodeDirectly[c]) { 1.510 + /* encode directly */ 1.511 + *target++=(uint8_t)c; 1.512 + if(offsets!=NULL) { 1.513 + *offsets++=sourceIndex++; 1.514 + } 1.515 + } else if(c==PLUS) { 1.516 + /* output +- for + */ 1.517 + *target++=PLUS; 1.518 + if(target<targetLimit) { 1.519 +
*target++=MINUS; 1.520 + if(offsets!=NULL) { 1.521 + *offsets++=sourceIndex; 1.522 + *offsets++=sourceIndex++; 1.523 + } 1.524 + /* realign length and targetCapacity */ 1.525 + goto directMode; 1.526 + } else { 1.527 + if(offsets!=NULL) { 1.528 + *offsets++=sourceIndex++; 1.529 + } 1.530 + cnv->charErrorBuffer[0]=MINUS; 1.531 + cnv->charErrorBufferLength=1; 1.532 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.533 + break; 1.534 + } 1.535 + } else { 1.536 + /* un-read this character and switch to Unicode Mode */ 1.537 + --source; 1.538 + *target++=PLUS; 1.539 + if(offsets!=NULL) { 1.540 + *offsets++=sourceIndex; 1.541 + } 1.542 + inDirectMode=FALSE; 1.543 + base64Counter=0; 1.544 + goto unicodeMode; 1.545 + } 1.546 + --length; 1.547 + } 1.548 + if(source<sourceLimit && target>=targetLimit) { 1.549 + /* target is full */ 1.550 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.551 + } 1.552 + } else { 1.553 +unicodeMode: 1.554 + while(source<sourceLimit) { 1.555 + if(target<targetLimit) { 1.556 + c=*source++; 1.557 + if(c<=127 && encodeDirectly[c]) { 1.558 + /* encode directly */ 1.559 + inDirectMode=TRUE; 1.560 + 1.561 + /* trick: back out this character to make this easier */ 1.562 + --source; 1.563 + 1.564 + /* terminate the base64 sequence */ 1.565 + if(base64Counter!=0) { 1.566 + /* write remaining bits for the previous character */ 1.567 + *target++=toBase64[bits]; 1.568 + if(offsets!=NULL) { 1.569 + *offsets++=sourceIndex-1; 1.570 + } 1.571 + } 1.572 + if(fromBase64[c]!=-1) { 1.573 + /* need to terminate with a minus */ 1.574 + if(target<targetLimit) { 1.575 + *target++=MINUS; 1.576 + if(offsets!=NULL) { 1.577 + *offsets++=sourceIndex-1; 1.578 + } 1.579 + } else { 1.580 + cnv->charErrorBuffer[0]=MINUS; 1.581 + cnv->charErrorBufferLength=1; 1.582 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.583 + break; 1.584 + } 1.585 + } 1.586 + goto directMode; 1.587 + } else { 1.588 + /* 1.589 + * base64 this character: 1.590 + * Output 2 or 3 base64 bytes for the remaining bits of the 
previous character 1.591 + * and the bits of this character, each implicitly in UTF-16BE. 1.592 + * 1.593 + * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 1.594 + * character to the next. The actual 2 or 4 bits are shifted to the left edge 1.595 + * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 1.596 + */ 1.597 + switch(base64Counter) { 1.598 + case 0: 1.599 + *target++=toBase64[c>>10]; 1.600 + if(target<targetLimit) { 1.601 + *target++=toBase64[(c>>4)&0x3f]; 1.602 + if(offsets!=NULL) { 1.603 + *offsets++=sourceIndex; 1.604 + *offsets++=sourceIndex++; 1.605 + } 1.606 + } else { 1.607 + if(offsets!=NULL) { 1.608 + *offsets++=sourceIndex++; 1.609 + } 1.610 + cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f]; 1.611 + cnv->charErrorBufferLength=1; 1.612 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.613 + } 1.614 + bits=(uint8_t)((c&15)<<2); 1.615 + base64Counter=1; 1.616 + break; 1.617 + case 1: 1.618 + *target++=toBase64[bits|(c>>14)]; 1.619 + if(target<targetLimit) { 1.620 + *target++=toBase64[(c>>8)&0x3f]; 1.621 + if(target<targetLimit) { 1.622 + *target++=toBase64[(c>>2)&0x3f]; 1.623 + if(offsets!=NULL) { 1.624 + *offsets++=sourceIndex; 1.625 + *offsets++=sourceIndex; 1.626 + *offsets++=sourceIndex++; 1.627 + } 1.628 + } else { 1.629 + if(offsets!=NULL) { 1.630 + *offsets++=sourceIndex; 1.631 + *offsets++=sourceIndex++; 1.632 + } 1.633 + cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f]; 1.634 + cnv->charErrorBufferLength=1; 1.635 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.636 + } 1.637 + } else { 1.638 + if(offsets!=NULL) { 1.639 + *offsets++=sourceIndex++; 1.640 + } 1.641 + cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f]; 1.642 + cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f]; 1.643 + cnv->charErrorBufferLength=2; 1.644 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.645 + } 1.646 + bits=(uint8_t)((c&3)<<4); 1.647 + base64Counter=2; 1.648 + break; 1.649 + case 2: 1.650 + *target++=toBase64[bits|(c>>12)]; 1.651 + 
if(target<targetLimit) { 1.652 + *target++=toBase64[(c>>6)&0x3f]; 1.653 + if(target<targetLimit) { 1.654 + *target++=toBase64[c&0x3f]; 1.655 + if(offsets!=NULL) { 1.656 + *offsets++=sourceIndex; 1.657 + *offsets++=sourceIndex; 1.658 + *offsets++=sourceIndex++; 1.659 + } 1.660 + } else { 1.661 + if(offsets!=NULL) { 1.662 + *offsets++=sourceIndex; 1.663 + *offsets++=sourceIndex++; 1.664 + } 1.665 + cnv->charErrorBuffer[0]=toBase64[c&0x3f]; 1.666 + cnv->charErrorBufferLength=1; 1.667 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.668 + } 1.669 + } else { 1.670 + if(offsets!=NULL) { 1.671 + *offsets++=sourceIndex++; 1.672 + } 1.673 + cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f]; 1.674 + cnv->charErrorBuffer[1]=toBase64[c&0x3f]; 1.675 + cnv->charErrorBufferLength=2; 1.676 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.677 + } 1.678 + bits=0; 1.679 + base64Counter=0; 1.680 + break; 1.681 + default: 1.682 + /* will never occur */ 1.683 + break; 1.684 + } 1.685 + } 1.686 + } else { 1.687 + /* target is full */ 1.688 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.689 + break; 1.690 + } 1.691 + } 1.692 + } 1.693 + 1.694 + if(pArgs->flush && source>=sourceLimit) { 1.695 + /* flush remaining bits to the target */ 1.696 + if(!inDirectMode) { 1.697 + if (base64Counter!=0) { 1.698 + if(target<targetLimit) { 1.699 + *target++=toBase64[bits]; 1.700 + if(offsets!=NULL) { 1.701 + *offsets++=sourceIndex-1; 1.702 + } 1.703 + } else { 1.704 + cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits]; 1.705 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.706 + } 1.707 + } 1.708 + /* Add final MINUS to terminate unicodeMode */ 1.709 + if(target<targetLimit) { 1.710 + *target++=MINUS; 1.711 + if(offsets!=NULL) { 1.712 + *offsets++=sourceIndex-1; 1.713 + } 1.714 + } else { 1.715 + cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS; 1.716 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.717 + } 1.718 + } 1.719 + /* reset the state for the next conversion */ 1.720 + 
cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 1.721 + } else { 1.722 + /* set the converter state back into UConverter */ 1.723 + cnv->fromUnicodeStatus= 1.724 + (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 1.725 + ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 1.726 + } 1.727 + 1.728 + /* write back the updated pointers */ 1.729 + pArgs->source=source; 1.730 + pArgs->target=(char *)target; 1.731 + pArgs->offsets=offsets; 1.732 + return; 1.733 +} 1.734 + 1.735 +static const char * 1.736 +_UTF7GetName(const UConverter *cnv) { 1.737 + switch(cnv->fromUnicodeStatus>>28) { 1.738 + case 1: 1.739 + return "UTF-7,version=1"; 1.740 + default: 1.741 + return "UTF-7"; 1.742 + } 1.743 +} 1.744 + 1.745 +static const UConverterImpl _UTF7Impl={ 1.746 + UCNV_UTF7, 1.747 + 1.748 + NULL, 1.749 + NULL, 1.750 + 1.751 + _UTF7Open, 1.752 + NULL, 1.753 + _UTF7Reset, 1.754 + 1.755 + _UTF7ToUnicodeWithOffsets, 1.756 + _UTF7ToUnicodeWithOffsets, 1.757 + _UTF7FromUnicodeWithOffsets, 1.758 + _UTF7FromUnicodeWithOffsets, 1.759 + NULL, 1.760 + 1.761 + NULL, 1.762 + _UTF7GetName, 1.763 + NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 1.764 + NULL, 1.765 + ucnv_getCompleteUnicodeSet 1.766 +}; 1.767 + 1.768 +static const UConverterStaticData _UTF7StaticData={ 1.769 + sizeof(UConverterStaticData), 1.770 + "UTF-7", 1.771 + 0, /* TODO CCSID for UTF-7 */ 1.772 + UCNV_IBM, UCNV_UTF7, 1.773 + 1, 4, 1.774 + { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 1.775 + FALSE, FALSE, 1.776 + 0, 1.777 + 0, 1.778 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.779 +}; 1.780 + 1.781 +const UConverterSharedData _UTF7Data={ 1.782 + sizeof(UConverterSharedData), ~((uint32_t)0), 1.783 + NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl, 1.784 + 0 1.785 +}; 1.786 + 1.787 +/* IMAP mailbox name encoding ----------------------------------------------- */ 1.788 + 1.789 
+/* 1.790 + * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 1.791 + * http://www.ietf.org/rfc/rfc2060.txt 1.792 + * 1.793 + * 5.1.3. Mailbox International Naming Convention 1.794 + * 1.795 + * By convention, international mailbox names are specified using a 1.796 + * modified version of the UTF-7 encoding described in [UTF-7]. The 1.797 + * purpose of these modifications is to correct the following problems 1.798 + * with UTF-7: 1.799 + * 1.800 + * 1) UTF-7 uses the "+" character for shifting; this conflicts with 1.801 + * the common use of "+" in mailbox names, in particular USENET 1.802 + * newsgroup names. 1.803 + * 1.804 + * 2) UTF-7's encoding is BASE64 which uses the "/" character; this 1.805 + * conflicts with the use of "/" as a popular hierarchy delimiter. 1.806 + * 1.807 + * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with 1.808 + * the use of "\" as a popular hierarchy delimiter. 1.809 + * 1.810 + * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with 1.811 + * the use of "~" in some servers as a home directory indicator. 1.812 + * 1.813 + * 5) UTF-7 permits multiple alternate forms to represent the same 1.814 + * string; in particular, printable US-ASCII characters can be 1.815 + * represented in encoded form. 1.816 + * 1.817 + * In modified UTF-7, printable US-ASCII characters except for "&" 1.818 + * represent themselves; that is, characters with octet values 0x20-0x25 1.819 + * and 0x27-0x7e. The character "&" (0x26) is represented by the two- 1.820 + * octet sequence "&-". 1.821 + * 1.822 + * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all 1.823 + * Unicode 16-bit octets) are represented in modified BASE64, with a 1.824 + * further modification from [UTF-7] that "," is used instead of "/". 1.825 + * Modified BASE64 MUST NOT be used to represent any printing US-ASCII 1.826 + * character which can represent itself.
1.827 + * 1.828 + * "&" is used to shift to modified BASE64 and "-" to shift back to US- 1.829 + * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that 1.830 + * is, a name that ends with a Unicode 16-bit octet MUST end with a "- 1.831 + * "). 1.832 + * 1.833 + * For example, here is a mailbox name which mixes English, Japanese, 1.834 + * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw- 1.835 + */ 1.836 + 1.837 +/* 1.838 + * Tests for US-ASCII characters belonging to character classes 1.839 + * defined in UTF-7. 1.840 + * 1.841 + * Set D (directly encoded characters) consists of the following 1.842 + * characters: the upper and lower case letters A through Z 1.843 + * and a through z, the 10 digits 0-9, and the following nine special 1.844 + * characters (note that "+" and "=" are omitted): 1.845 + * '(),-./:? 1.846 + * 1.847 + * Set O (optional direct characters) consists of the following 1.848 + * characters (note that "\" and "~" are omitted): 1.849 + * !"#$%&*;<=>@[]^_`{|} 1.850 + * 1.851 + * According to the rules in RFC 2152, the byte values for the following 1.852 + * US-ASCII characters are not used in UTF-7 and are therefore illegal: 1.853 + * - all C0 control codes except for CR LF TAB 1.854 + * - BACKSLASH 1.855 + * - TILDE 1.856 + * - DEL 1.857 + * - all codes beyond US-ASCII, i.e. all >127 1.858 + */ 1.859 + 1.860 +/* uses '&' not '+' to start a base64 sequence */ 1.861 +#define AMPERSAND 0x26 1.862 +#define COMMA 0x2c 1.863 +#define SLASH 0x2f 1.864 + 1.865 +/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */ 1.866 +#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e) 1.867 + 1.868 +/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26; the argument is parenthesized so that expression arguments keep their meaning */ 1.869 +#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND) 1.870 +/* modified base64 for IMAP: value 63 maps to ',' instead of '/', and '/' is not a base64 character */ 1.871 +#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA) 1.872 +#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ?
-1 : fromBase64[c]) 1.873 + 1.874 +/* 1.875 + * converter status values: 1.876 + * 1.877 + * toUnicodeStatus: 1.878 + * 24 inDirectMode (boolean) 1.879 + * 23..16 base64Counter (-1..7) 1.880 + * 15..0 bits (up to 14 bits incoming base64) 1.881 + * 1.882 + * fromUnicodeStatus: 1.883 + * 24 inDirectMode (boolean) 1.884 + * 23..16 base64Counter (0..2) 1.885 + * 7..0 bits (6 bits outgoing base64) 1.886 + * 1.887 + * ignore bits 31..25 1.888 + */ 1.889 + 1.890 +static void 1.891 +_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.892 + UErrorCode *pErrorCode) { 1.893 + UConverter *cnv; 1.894 + const uint8_t *source, *sourceLimit; 1.895 + UChar *target; 1.896 + const UChar *targetLimit; 1.897 + int32_t *offsets; 1.898 + 1.899 + uint8_t *bytes; 1.900 + uint8_t byteIndex; 1.901 + 1.902 + int32_t length, targetCapacity; 1.903 + 1.904 + /* UTF-7 state */ 1.905 + uint16_t bits; 1.906 + int8_t base64Counter; 1.907 + UBool inDirectMode; 1.908 + 1.909 + int8_t base64Value; 1.910 + 1.911 + int32_t sourceIndex, nextSourceIndex; 1.912 + 1.913 + UChar c; 1.914 + uint8_t b; 1.915 + 1.916 + /* set up the local pointers */ 1.917 + cnv=pArgs->converter; 1.918 + 1.919 + source=(const uint8_t *)pArgs->source; 1.920 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.921 + target=pArgs->target; 1.922 + targetLimit=pArgs->targetLimit; 1.923 + offsets=pArgs->offsets; 1.924 + /* get the state machine state */ 1.925 + { 1.926 + uint32_t status=cnv->toUnicodeStatus; 1.927 + inDirectMode=(UBool)((status>>24)&1); 1.928 + base64Counter=(int8_t)(status>>16); 1.929 + bits=(uint16_t)status; 1.930 + } 1.931 + bytes=cnv->toUBytes; 1.932 + byteIndex=cnv->toULength; 1.933 + 1.934 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.935 + sourceIndex=byteIndex==0 ? 
0 : -1; 1.936 + nextSourceIndex=0; 1.937 + 1.938 + if(inDirectMode) { 1.939 +directMode: 1.940 + /* 1.941 + * In Direct Mode, US-ASCII characters are encoded directly, i.e., 1.942 + * with their US-ASCII byte values. 1.943 + * An ampersand starts Unicode (or "escape") Mode. 1.944 + * 1.945 + * In Direct Mode, only the sourceIndex is used. 1.946 + */ 1.947 + byteIndex=0; 1.948 + length=(int32_t)(sourceLimit-source); 1.949 + targetCapacity=(int32_t)(targetLimit-target); 1.950 + if(length>targetCapacity) { 1.951 + length=targetCapacity; 1.952 + } 1.953 + while(length>0) { 1.954 + b=*source++; 1.955 + if(!isLegalIMAP(b)) { 1.956 + /* illegal */ 1.957 + bytes[0]=b; 1.958 + byteIndex=1; 1.959 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.960 + break; 1.961 + } else if(b!=AMPERSAND) { 1.962 + /* write directly encoded character */ 1.963 + *target++=b; 1.964 + if(offsets!=NULL) { 1.965 + *offsets++=sourceIndex++; 1.966 + } 1.967 + } else /* AMPERSAND */ { 1.968 + /* switch to Unicode mode */ 1.969 + nextSourceIndex=++sourceIndex; 1.970 + inDirectMode=FALSE; 1.971 + byteIndex=0; 1.972 + bits=0; 1.973 + base64Counter=-1; 1.974 + goto unicodeMode; 1.975 + } 1.976 + --length; 1.977 + } 1.978 + if(source<sourceLimit && target>=targetLimit) { 1.979 + /* target is full */ 1.980 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.981 + } 1.982 + } else { 1.983 +unicodeMode: 1.984 + /* 1.985 + * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded. 1.986 + * The base64 sequence ends with any character that is not in the base64 alphabet. 1.987 + * A terminating minus sign is consumed. 1.988 + * US-ASCII must not be base64-ed. 1.989 + * 1.990 + * In Unicode Mode, the sourceIndex has the index to the start of the current 1.991 + * base64 bytes, while nextSourceIndex is precisely parallel to source, 1.992 + * keeping the index to the following byte. 1.993 + * Note that in 2 out of 3 cases, UChars overlap within a base64 byte. 
1.994 + */ 1.995 + while(source<sourceLimit) { 1.996 + if(target<targetLimit) { 1.997 + bytes[byteIndex++]=b=*source++; 1.998 + ++nextSourceIndex; 1.999 + if(b>0x7e) { 1.1000 + /* illegal - test other illegal US-ASCII values by base64Value==-3 */ 1.1001 + inDirectMode=TRUE; 1.1002 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1003 + break; 1.1004 + } else if((base64Value=FROM_BASE64_IMAP(b))>=0) { 1.1005 + /* collect base64 bytes into UChars */ 1.1006 + switch(base64Counter) { 1.1007 + case -1: /* -1 is immediately after the & */ 1.1008 + case 0: 1.1009 + bits=base64Value; 1.1010 + base64Counter=1; 1.1011 + break; 1.1012 + case 1: 1.1013 + case 3: 1.1014 + case 4: 1.1015 + case 6: 1.1016 + bits=(uint16_t)((bits<<6)|base64Value); 1.1017 + ++base64Counter; 1.1018 + break; 1.1019 + case 2: 1.1020 + c=(UChar)((bits<<4)|(base64Value>>2)); 1.1021 + if(isLegalIMAP(c)) { 1.1022 + /* illegal */ 1.1023 + inDirectMode=TRUE; 1.1024 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1025 + goto endloop; 1.1026 + } 1.1027 + *target++=c; 1.1028 + if(offsets!=NULL) { 1.1029 + *offsets++=sourceIndex; 1.1030 + sourceIndex=nextSourceIndex-1; 1.1031 + } 1.1032 + bytes[0]=b; /* keep this byte in case an error occurs */ 1.1033 + byteIndex=1; 1.1034 + bits=(uint16_t)(base64Value&3); 1.1035 + base64Counter=3; 1.1036 + break; 1.1037 + case 5: 1.1038 + c=(UChar)((bits<<2)|(base64Value>>4)); 1.1039 + if(isLegalIMAP(c)) { 1.1040 + /* illegal */ 1.1041 + inDirectMode=TRUE; 1.1042 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1043 + goto endloop; 1.1044 + } 1.1045 + *target++=c; 1.1046 + if(offsets!=NULL) { 1.1047 + *offsets++=sourceIndex; 1.1048 + sourceIndex=nextSourceIndex-1; 1.1049 + } 1.1050 + bytes[0]=b; /* keep this byte in case an error occurs */ 1.1051 + byteIndex=1; 1.1052 + bits=(uint16_t)(base64Value&15); 1.1053 + base64Counter=6; 1.1054 + break; 1.1055 + case 7: 1.1056 + c=(UChar)((bits<<6)|base64Value); 1.1057 + if(isLegalIMAP(c)) { 1.1058 + /* illegal */ 1.1059 + inDirectMode=TRUE; 1.1060 + 
*pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1061 + goto endloop; 1.1062 + } 1.1063 + *target++=c; 1.1064 + if(offsets!=NULL) { 1.1065 + *offsets++=sourceIndex; 1.1066 + sourceIndex=nextSourceIndex; 1.1067 + } 1.1068 + byteIndex=0; 1.1069 + bits=0; 1.1070 + base64Counter=0; 1.1071 + break; 1.1072 + default: 1.1073 + /* will never occur */ 1.1074 + break; 1.1075 + } 1.1076 + } else if(base64Value==-2) { 1.1077 + /* minus sign terminates the base64 sequence */ 1.1078 + inDirectMode=TRUE; 1.1079 + if(base64Counter==-1) { 1.1080 + /* &- i.e. a minus immediately following an ampersand */ 1.1081 + *target++=AMPERSAND; 1.1082 + if(offsets!=NULL) { 1.1083 + *offsets++=sourceIndex-1; 1.1084 + } 1.1085 + } else { 1.1086 + /* absorb the minus and leave the Unicode Mode */ 1.1087 + if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) { 1.1088 + /* bits are illegally left over, a UChar is incomplete */ 1.1089 + /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */ 1.1090 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1091 + break; 1.1092 + } 1.1093 + } 1.1094 + sourceIndex=nextSourceIndex; 1.1095 + goto directMode; 1.1096 + } else { 1.1097 + if(base64Counter==-1) { 1.1098 + /* illegal: & immediately followed by something other than base64 or minus sign */ 1.1099 + /* include the ampersand in the reported sequence */ 1.1100 + --sourceIndex; 1.1101 + bytes[0]=AMPERSAND; 1.1102 + bytes[1]=b; 1.1103 + byteIndex=2; 1.1104 + } 1.1105 + /* base64Value==-1 for characters that are illegal only in Unicode mode */ 1.1106 + /* base64Value==-3 for illegal characters */ 1.1107 + /* illegal */ 1.1108 + inDirectMode=TRUE; 1.1109 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1110 + break; 1.1111 + } 1.1112 + } else { 1.1113 + /* target is full */ 1.1114 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1115 + break; 1.1116 + } 1.1117 + } 1.1118 + } 1.1119 +endloop: 1.1120 + 1.1121 + /* 1.1122 + * the end of the input stream and detection of truncated input 1.1123 + * are handled 
by the framework, but here we must check if we are in Unicode 1.1124 + * mode and byteIndex==0 because we must end in direct mode 1.1125 + * 1.1126 + * conditions: 1.1127 + * successful 1.1128 + * in Unicode mode and byteIndex==0 1.1129 + * end of input and no truncated input 1.1130 + */ 1.1131 + if( U_SUCCESS(*pErrorCode) && 1.1132 + !inDirectMode && byteIndex==0 && 1.1133 + pArgs->flush && source>=sourceLimit 1.1134 + ) { 1.1135 + if(base64Counter==-1) { 1.1136 + /* & at the very end of the input */ 1.1137 + /* make the ampersand the reported sequence */ 1.1138 + bytes[0]=AMPERSAND; 1.1139 + byteIndex=1; 1.1140 + } 1.1141 + /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */ 1.1142 + 1.1143 + inDirectMode=TRUE; /* avoid looping */ 1.1144 + *pErrorCode=U_TRUNCATED_CHAR_FOUND; 1.1145 + } 1.1146 + 1.1147 + /* set the converter state back into UConverter */ 1.1148 + cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits; 1.1149 + cnv->toULength=byteIndex; 1.1150 + 1.1151 + /* write back the updated pointers */ 1.1152 + pArgs->source=(const char *)source; 1.1153 + pArgs->target=target; 1.1154 + pArgs->offsets=offsets; 1.1155 + return; 1.1156 +} 1.1157 + 1.1158 +static void 1.1159 +_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.1160 + UErrorCode *pErrorCode) { 1.1161 + UConverter *cnv; 1.1162 + const UChar *source, *sourceLimit; 1.1163 + uint8_t *target, *targetLimit; 1.1164 + int32_t *offsets; 1.1165 + 1.1166 + int32_t length, targetCapacity, sourceIndex; 1.1167 + UChar c; 1.1168 + uint8_t b; 1.1169 + 1.1170 + /* UTF-7 state */ 1.1171 + uint8_t bits; 1.1172 + int8_t base64Counter; 1.1173 + UBool inDirectMode; 1.1174 + 1.1175 + /* set up the local pointers */ 1.1176 + cnv=pArgs->converter; 1.1177 + 1.1178 + /* set up the local pointers */ 1.1179 + source=pArgs->source; 1.1180 + sourceLimit=pArgs->sourceLimit; 1.1181 + target=(uint8_t *)pArgs->target; 1.1182 
+ targetLimit=(uint8_t *)pArgs->targetLimit; 1.1183 + offsets=pArgs->offsets; 1.1184 + 1.1185 + /* get the state machine state */ 1.1186 + { 1.1187 + uint32_t status=cnv->fromUnicodeStatus; 1.1188 + inDirectMode=(UBool)((status>>24)&1); 1.1189 + base64Counter=(int8_t)(status>>16); 1.1190 + bits=(uint8_t)status; 1.1191 + } 1.1192 + 1.1193 + /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */ 1.1194 + sourceIndex=0; 1.1195 + 1.1196 + if(inDirectMode) { 1.1197 +directMode: 1.1198 + length=(int32_t)(sourceLimit-source); 1.1199 + targetCapacity=(int32_t)(targetLimit-target); 1.1200 + if(length>targetCapacity) { 1.1201 + length=targetCapacity; 1.1202 + } 1.1203 + while(length>0) { 1.1204 + c=*source++; 1.1205 + /* encode 0x20..0x7e except '&' directly */ 1.1206 + if(inSetDIMAP(c)) { 1.1207 + /* encode directly */ 1.1208 + *target++=(uint8_t)c; 1.1209 + if(offsets!=NULL) { 1.1210 + *offsets++=sourceIndex++; 1.1211 + } 1.1212 + } else if(c==AMPERSAND) { 1.1213 + /* output &- for & */ 1.1214 + *target++=AMPERSAND; 1.1215 + if(target<targetLimit) { 1.1216 + *target++=MINUS; 1.1217 + if(offsets!=NULL) { 1.1218 + *offsets++=sourceIndex; 1.1219 + *offsets++=sourceIndex++; 1.1220 + } 1.1221 + /* realign length and targetCapacity */ 1.1222 + goto directMode; 1.1223 + } else { 1.1224 + if(offsets!=NULL) { 1.1225 + *offsets++=sourceIndex++; 1.1226 + } 1.1227 + cnv->charErrorBuffer[0]=MINUS; 1.1228 + cnv->charErrorBufferLength=1; 1.1229 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1230 + break; 1.1231 + } 1.1232 + } else { 1.1233 + /* un-read this character and switch to Unicode Mode */ 1.1234 + --source; 1.1235 + *target++=AMPERSAND; 1.1236 + if(offsets!=NULL) { 1.1237 + *offsets++=sourceIndex; 1.1238 + } 1.1239 + inDirectMode=FALSE; 1.1240 + base64Counter=0; 1.1241 + goto unicodeMode; 1.1242 + } 1.1243 + --length; 1.1244 + } 1.1245 + if(source<sourceLimit && target>=targetLimit) { 1.1246 + /* target is full */ 1.1247 + 
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1248 + } 1.1249 + } else { 1.1250 +unicodeMode: 1.1251 + while(source<sourceLimit) { 1.1252 + if(target<targetLimit) { 1.1253 + c=*source++; 1.1254 + if(isLegalIMAP(c)) { 1.1255 + /* encode directly */ 1.1256 + inDirectMode=TRUE; 1.1257 + 1.1258 + /* trick: back out this character to make this easier */ 1.1259 + --source; 1.1260 + 1.1261 + /* terminate the base64 sequence */ 1.1262 + if(base64Counter!=0) { 1.1263 + /* write remaining bits for the previous character */ 1.1264 + *target++=TO_BASE64_IMAP(bits); 1.1265 + if(offsets!=NULL) { 1.1266 + *offsets++=sourceIndex-1; 1.1267 + } 1.1268 + } 1.1269 + /* need to terminate with a minus */ 1.1270 + if(target<targetLimit) { 1.1271 + *target++=MINUS; 1.1272 + if(offsets!=NULL) { 1.1273 + *offsets++=sourceIndex-1; 1.1274 + } 1.1275 + } else { 1.1276 + cnv->charErrorBuffer[0]=MINUS; 1.1277 + cnv->charErrorBufferLength=1; 1.1278 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1279 + break; 1.1280 + } 1.1281 + goto directMode; 1.1282 + } else { 1.1283 + /* 1.1284 + * base64 this character: 1.1285 + * Output 2 or 3 base64 bytes for the remaining bits of the previous character 1.1286 + * and the bits of this character, each implicitly in UTF-16BE. 1.1287 + * 1.1288 + * Here, bits is an 8-bit variable because only 6 bits need to be kept from one 1.1289 + * character to the next. The actual 2 or 4 bits are shifted to the left edge 1.1290 + * of the 6-bits field 5..0 to make the termination of the base64 sequence easier. 
1.1291 + */ 1.1292 + switch(base64Counter) { 1.1293 + case 0: 1.1294 + b=(uint8_t)(c>>10); 1.1295 + *target++=TO_BASE64_IMAP(b); 1.1296 + if(target<targetLimit) { 1.1297 + b=(uint8_t)((c>>4)&0x3f); 1.1298 + *target++=TO_BASE64_IMAP(b); 1.1299 + if(offsets!=NULL) { 1.1300 + *offsets++=sourceIndex; 1.1301 + *offsets++=sourceIndex++; 1.1302 + } 1.1303 + } else { 1.1304 + if(offsets!=NULL) { 1.1305 + *offsets++=sourceIndex++; 1.1306 + } 1.1307 + b=(uint8_t)((c>>4)&0x3f); 1.1308 + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1.1309 + cnv->charErrorBufferLength=1; 1.1310 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1311 + } 1.1312 + bits=(uint8_t)((c&15)<<2); 1.1313 + base64Counter=1; 1.1314 + break; 1.1315 + case 1: 1.1316 + b=(uint8_t)(bits|(c>>14)); 1.1317 + *target++=TO_BASE64_IMAP(b); 1.1318 + if(target<targetLimit) { 1.1319 + b=(uint8_t)((c>>8)&0x3f); 1.1320 + *target++=TO_BASE64_IMAP(b); 1.1321 + if(target<targetLimit) { 1.1322 + b=(uint8_t)((c>>2)&0x3f); 1.1323 + *target++=TO_BASE64_IMAP(b); 1.1324 + if(offsets!=NULL) { 1.1325 + *offsets++=sourceIndex; 1.1326 + *offsets++=sourceIndex; 1.1327 + *offsets++=sourceIndex++; 1.1328 + } 1.1329 + } else { 1.1330 + if(offsets!=NULL) { 1.1331 + *offsets++=sourceIndex; 1.1332 + *offsets++=sourceIndex++; 1.1333 + } 1.1334 + b=(uint8_t)((c>>2)&0x3f); 1.1335 + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1.1336 + cnv->charErrorBufferLength=1; 1.1337 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1338 + } 1.1339 + } else { 1.1340 + if(offsets!=NULL) { 1.1341 + *offsets++=sourceIndex++; 1.1342 + } 1.1343 + b=(uint8_t)((c>>8)&0x3f); 1.1344 + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1.1345 + b=(uint8_t)((c>>2)&0x3f); 1.1346 + cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1.1347 + cnv->charErrorBufferLength=2; 1.1348 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1349 + } 1.1350 + bits=(uint8_t)((c&3)<<4); 1.1351 + base64Counter=2; 1.1352 + break; 1.1353 + case 2: 1.1354 + b=(uint8_t)(bits|(c>>12)); 1.1355 + *target++=TO_BASE64_IMAP(b); 1.1356 + 
if(target<targetLimit) { 1.1357 + b=(uint8_t)((c>>6)&0x3f); 1.1358 + *target++=TO_BASE64_IMAP(b); 1.1359 + if(target<targetLimit) { 1.1360 + b=(uint8_t)(c&0x3f); 1.1361 + *target++=TO_BASE64_IMAP(b); 1.1362 + if(offsets!=NULL) { 1.1363 + *offsets++=sourceIndex; 1.1364 + *offsets++=sourceIndex; 1.1365 + *offsets++=sourceIndex++; 1.1366 + } 1.1367 + } else { 1.1368 + if(offsets!=NULL) { 1.1369 + *offsets++=sourceIndex; 1.1370 + *offsets++=sourceIndex++; 1.1371 + } 1.1372 + b=(uint8_t)(c&0x3f); 1.1373 + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1.1374 + cnv->charErrorBufferLength=1; 1.1375 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1376 + } 1.1377 + } else { 1.1378 + if(offsets!=NULL) { 1.1379 + *offsets++=sourceIndex++; 1.1380 + } 1.1381 + b=(uint8_t)((c>>6)&0x3f); 1.1382 + cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b); 1.1383 + b=(uint8_t)(c&0x3f); 1.1384 + cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b); 1.1385 + cnv->charErrorBufferLength=2; 1.1386 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1387 + } 1.1388 + bits=0; 1.1389 + base64Counter=0; 1.1390 + break; 1.1391 + default: 1.1392 + /* will never occur */ 1.1393 + break; 1.1394 + } 1.1395 + } 1.1396 + } else { 1.1397 + /* target is full */ 1.1398 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1399 + break; 1.1400 + } 1.1401 + } 1.1402 + } 1.1403 + 1.1404 + if(pArgs->flush && source>=sourceLimit) { 1.1405 + /* flush remaining bits to the target */ 1.1406 + if(!inDirectMode) { 1.1407 + if(base64Counter!=0) { 1.1408 + if(target<targetLimit) { 1.1409 + *target++=TO_BASE64_IMAP(bits); 1.1410 + if(offsets!=NULL) { 1.1411 + *offsets++=sourceIndex-1; 1.1412 + } 1.1413 + } else { 1.1414 + cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits); 1.1415 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1416 + } 1.1417 + } 1.1418 + /* need to terminate with a minus */ 1.1419 + if(target<targetLimit) { 1.1420 + *target++=MINUS; 1.1421 + if(offsets!=NULL) { 1.1422 + *offsets++=sourceIndex-1; 1.1423 + } 1.1424 + } else { 1.1425 + 
cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS; 1.1426 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1427 + } 1.1428 + } 1.1429 + /* reset the state for the next conversion */ 1.1430 + cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */ 1.1431 + } else { 1.1432 + /* set the converter state back into UConverter */ 1.1433 + cnv->fromUnicodeStatus= 1.1434 + (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/ 1.1435 + ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits; 1.1436 + } 1.1437 + 1.1438 + /* write back the updated pointers */ 1.1439 + pArgs->source=source; 1.1440 + pArgs->target=(char *)target; 1.1441 + pArgs->offsets=offsets; 1.1442 + return; 1.1443 +} 1.1444 + 1.1445 +static const UConverterImpl _IMAPImpl={ 1.1446 + UCNV_IMAP_MAILBOX, 1.1447 + 1.1448 + NULL, 1.1449 + NULL, 1.1450 + 1.1451 + _UTF7Open, 1.1452 + NULL, 1.1453 + _UTF7Reset, 1.1454 + 1.1455 + _IMAPToUnicodeWithOffsets, 1.1456 + _IMAPToUnicodeWithOffsets, 1.1457 + _IMAPFromUnicodeWithOffsets, 1.1458 + _IMAPFromUnicodeWithOffsets, 1.1459 + NULL, 1.1460 + 1.1461 + NULL, 1.1462 + NULL, 1.1463 + NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */ 1.1464 + NULL, 1.1465 + ucnv_getCompleteUnicodeSet 1.1466 +}; 1.1467 + 1.1468 +static const UConverterStaticData _IMAPStaticData={ 1.1469 + sizeof(UConverterStaticData), 1.1470 + "IMAP-mailbox-name", 1.1471 + 0, /* TODO CCSID for IMAP-mailbox-name */ 1.1472 + UCNV_IBM, UCNV_IMAP_MAILBOX, 1.1473 + 1, 4, 1.1474 + { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */ 1.1475 + FALSE, FALSE, 1.1476 + 0, 1.1477 + 0, 1.1478 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1479 +}; 1.1480 + 1.1481 +const UConverterSharedData _IMAPData={ 1.1482 + sizeof(UConverterSharedData), ~((uint32_t)0), 1.1483 + NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl, 1.1484 + 0 1.1485 +}; 1.1486 + 1.1487 +#endif