intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic depending on private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
Open question: how to get FindCookie callers to test the condition
and pass a nullptr when double-key logic is disabled.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "nsUTF16ToUnicode.h"
michael@0 7 #include "nsCharTraits.h"
michael@0 8 #include "mozilla/Endian.h"
michael@0 9
// Decoder states stored in mState.  The values are bit flags:
// STATE_SECOND_BYTE deliberately shares the STATE_HALF_CODE_POINT bit, so a
// test of that bit is true for both states.
enum {
  // Nothing is pending between calls.
  STATE_NORMAL             = 0,
  // One byte of a 16-bit code unit has been seen; its partner is still due.
  STATE_HALF_CODE_POINT    = 1 << 0,
  // No input has been examined since the last Reset().
  STATE_FIRST_CALL         = 1 << 1,
  // The first byte of a possible BOM has been seen; the second is still due.
  STATE_SECOND_BYTE        = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
  // A complete surrogate pair is buffered because the output had no room.
  STATE_ODD_SURROGATE_PAIR = 1 << 2
};
michael@0 17
michael@0 18 nsresult
michael@0 19 nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
michael@0 20 int32_t * aSrcLength,
michael@0 21 char16_t * aDest,
michael@0 22 int32_t * aDestLength,
michael@0 23 bool aSwapBytes)
michael@0 24 {
michael@0 25 const char* src = aSrc;
michael@0 26 const char* srcEnd = aSrc + *aSrcLength;
michael@0 27 char16_t* dest = aDest;
michael@0 28 char16_t* destEnd = aDest + *aDestLength;
michael@0 29 char16_t oddHighSurrogate;
michael@0 30
michael@0 31 switch(mState) {
michael@0 32 case STATE_FIRST_CALL:
michael@0 33 NS_ASSERTION(*aSrcLength > 1, "buffer too short");
michael@0 34 src+=2;
michael@0 35 mState = STATE_NORMAL;
michael@0 36 break;
michael@0 37
michael@0 38 case STATE_SECOND_BYTE:
michael@0 39 NS_ASSERTION(*aSrcLength > 0, "buffer too short");
michael@0 40 src++;
michael@0 41 mState = STATE_NORMAL;
michael@0 42 break;
michael@0 43
michael@0 44 case STATE_ODD_SURROGATE_PAIR:
michael@0 45 if (*aDestLength < 2)
michael@0 46 goto error;
michael@0 47 else {
michael@0 48 *dest++ = mOddHighSurrogate;
michael@0 49 *dest++ = mOddLowSurrogate;
michael@0 50 mOddHighSurrogate = mOddLowSurrogate = 0;
michael@0 51 mState = STATE_NORMAL;
michael@0 52 }
michael@0 53 break;
michael@0 54
michael@0 55 case STATE_NORMAL:
michael@0 56 case STATE_HALF_CODE_POINT:
michael@0 57 default:
michael@0 58 break;
michael@0 59 }
michael@0 60
michael@0 61 oddHighSurrogate = mOddHighSurrogate;
michael@0 62
michael@0 63 if (src == srcEnd) {
michael@0 64 *aDestLength = dest - aDest;
michael@0 65 return (mState != STATE_NORMAL || oddHighSurrogate) ?
michael@0 66 NS_OK_UDEC_MOREINPUT : NS_OK;
michael@0 67 }
michael@0 68
michael@0 69 const char* srcEvenEnd;
michael@0 70
michael@0 71 char16_t u;
michael@0 72 if (mState == STATE_HALF_CODE_POINT) {
michael@0 73 if (dest == destEnd)
michael@0 74 goto error;
michael@0 75
michael@0 76 // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
michael@0 77 // previous run while the 2nd byte has to come from |*src|.
michael@0 78 mState = STATE_NORMAL;
michael@0 79 #if MOZ_BIG_ENDIAN
michael@0 80 u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
michael@0 81 #else
michael@0 82 u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
michael@0 83 #endif
michael@0 84 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
michael@0 85 goto have_codepoint;
michael@0 86 } else {
michael@0 87 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
michael@0 88 }
michael@0 89
michael@0 90 while (src != srcEvenEnd) {
michael@0 91 if (dest == destEnd)
michael@0 92 goto error;
michael@0 93
michael@0 94 #if !defined(__sparc__) && !defined(__arm__)
michael@0 95 u = *(const char16_t*)src;
michael@0 96 #else
michael@0 97 memcpy(&u, src, 2);
michael@0 98 #endif
michael@0 99 src += 2;
michael@0 100
michael@0 101 have_codepoint:
michael@0 102 if (aSwapBytes)
michael@0 103 u = u << 8 | u >> 8;
michael@0 104
michael@0 105 if (!IS_SURROGATE(u)) {
michael@0 106 if (oddHighSurrogate) {
michael@0 107 if (mErrBehavior == kOnError_Signal) {
michael@0 108 goto error2;
michael@0 109 }
michael@0 110 *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0 111 if (dest == destEnd)
michael@0 112 goto error;
michael@0 113 oddHighSurrogate = 0;
michael@0 114 }
michael@0 115 *dest++ = u;
michael@0 116 } else if (NS_IS_HIGH_SURROGATE(u)) {
michael@0 117 if (oddHighSurrogate) {
michael@0 118 if (mErrBehavior == kOnError_Signal) {
michael@0 119 goto error2;
michael@0 120 }
michael@0 121 *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0 122 if (dest == destEnd)
michael@0 123 goto error;
michael@0 124 }
michael@0 125 oddHighSurrogate = u;
michael@0 126 }
michael@0 127 else /* if (NS_IS_LOW_SURROGATE(u)) */ {
michael@0 128 if (oddHighSurrogate && *aDestLength > 1) {
michael@0 129 if (dest + 1 >= destEnd) {
michael@0 130 mOddLowSurrogate = u;
michael@0 131 mOddHighSurrogate = oddHighSurrogate;
michael@0 132 mState = STATE_ODD_SURROGATE_PAIR;
michael@0 133 goto error;
michael@0 134 }
michael@0 135 *dest++ = oddHighSurrogate;
michael@0 136 *dest++ = u;
michael@0 137 } else {
michael@0 138 if (mErrBehavior == kOnError_Signal) {
michael@0 139 goto error2;
michael@0 140 }
michael@0 141 *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0 142 }
michael@0 143 oddHighSurrogate = 0;
michael@0 144 }
michael@0 145 }
michael@0 146 if (src != srcEnd) {
michael@0 147 // store the lead byte of a 16-bit unit for the next run.
michael@0 148 mOddByte = *src++;
michael@0 149 mState = STATE_HALF_CODE_POINT;
michael@0 150 }
michael@0 151
michael@0 152 mOddHighSurrogate = oddHighSurrogate;
michael@0 153
michael@0 154 *aDestLength = dest - aDest;
michael@0 155 *aSrcLength = src - aSrc;
michael@0 156 return (mState != STATE_NORMAL || oddHighSurrogate) ?
michael@0 157 NS_OK_UDEC_MOREINPUT : NS_OK;
michael@0 158
michael@0 159 error:
michael@0 160 *aDestLength = dest - aDest;
michael@0 161 *aSrcLength = src - aSrc;
michael@0 162 return NS_OK_UDEC_MOREOUTPUT;
michael@0 163
michael@0 164 error2:
michael@0 165 *aDestLength = dest - aDest;
michael@0 166 *aSrcLength = --src - aSrc;
michael@0 167 return NS_ERROR_ILLEGAL_INPUT;
michael@0 168 }
michael@0 169
michael@0 170 NS_IMETHODIMP
michael@0 171 nsUTF16ToUnicodeBase::Reset()
michael@0 172 {
michael@0 173 mState = STATE_FIRST_CALL;
michael@0 174 mOddByte = 0;
michael@0 175 mOddHighSurrogate = 0;
michael@0 176 mOddLowSurrogate = 0;
michael@0 177 return NS_OK;
michael@0 178 }
michael@0 179
michael@0 180 NS_IMETHODIMP
michael@0 181 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
michael@0 182 int32_t * aDestLength)
michael@0 183 {
michael@0 184 // the left-over data of the previous run have to be taken into account.
michael@0 185 *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2;
michael@0 186 if (mOddHighSurrogate)
michael@0 187 (*aDestLength)++;
michael@0 188 if (mOddLowSurrogate)
michael@0 189 (*aDestLength)++;
michael@0 190 return NS_OK;
michael@0 191 }
michael@0 192
michael@0 193
michael@0 194 NS_IMETHODIMP
michael@0 195 nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0 196 char16_t * aDest, int32_t * aDestLength)
michael@0 197 {
michael@0 198 switch (mState) {
michael@0 199 case STATE_FIRST_CALL:
michael@0 200 if (*aSrcLength < 2) {
michael@0 201 if (*aSrcLength < 1) {
michael@0 202 *aDestLength = 0;
michael@0 203 return NS_OK;
michael@0 204 }
michael@0 205 if (uint8_t(*aSrc) != 0xFE) {
michael@0 206 mState = STATE_NORMAL;
michael@0 207 break;
michael@0 208 }
michael@0 209 *aDestLength = 0;
michael@0 210 mState = STATE_SECOND_BYTE;
michael@0 211 return NS_OK_UDEC_MOREINPUT;
michael@0 212 }
michael@0 213 #if MOZ_LITTLE_ENDIAN
michael@0 214 // on LE machines, BE BOM is 0xFFFE
michael@0 215 if (0xFFFE != *((char16_t*)aSrc)) {
michael@0 216 mState = STATE_NORMAL;
michael@0 217 }
michael@0 218 #else
michael@0 219 if (0xFEFF != *((char16_t*)aSrc)) {
michael@0 220 mState = STATE_NORMAL;
michael@0 221 }
michael@0 222 #endif
michael@0 223 break;
michael@0 224
michael@0 225 case STATE_SECOND_BYTE:
michael@0 226 if (*aSrcLength < 1) {
michael@0 227 *aDestLength = 0;
michael@0 228 return NS_OK_UDEC_MOREINPUT;
michael@0 229 }
michael@0 230 if (uint8_t(*aSrc) != 0xFF) {
michael@0 231 mOddByte = 0xFE;
michael@0 232 mState = STATE_HALF_CODE_POINT;
michael@0 233 }
michael@0 234 break;
michael@0 235 }
michael@0 236
michael@0 237 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0 238 bool(MOZ_LITTLE_ENDIAN));
michael@0 239 }
michael@0 240
michael@0 241 NS_IMETHODIMP
michael@0 242 nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0 243 char16_t * aDest, int32_t * aDestLength)
michael@0 244 {
michael@0 245 switch (mState) {
michael@0 246 case STATE_FIRST_CALL:
michael@0 247 if (*aSrcLength < 2) {
michael@0 248 if (*aSrcLength < 1) {
michael@0 249 *aDestLength = 0;
michael@0 250 return NS_OK;
michael@0 251 }
michael@0 252 if (uint8_t(*aSrc) != 0xFF) {
michael@0 253 mState = STATE_NORMAL;
michael@0 254 break;
michael@0 255 }
michael@0 256 *aDestLength = 0;
michael@0 257 mState = STATE_SECOND_BYTE;
michael@0 258 return NS_OK_UDEC_MOREINPUT;
michael@0 259 }
michael@0 260 #if MOZ_BIG_ENDIAN
michael@0 261 // on BE machines, LE BOM is 0xFFFE
michael@0 262 if (0xFFFE != *((char16_t*)aSrc)) {
michael@0 263 mState = STATE_NORMAL;
michael@0 264 }
michael@0 265 #else
michael@0 266 if (0xFEFF != *((char16_t*)aSrc)) {
michael@0 267 mState = STATE_NORMAL;
michael@0 268 }
michael@0 269 #endif
michael@0 270 break;
michael@0 271
michael@0 272 case STATE_SECOND_BYTE:
michael@0 273 if (*aSrcLength < 1) {
michael@0 274 *aDestLength = 0;
michael@0 275 return NS_OK_UDEC_MOREINPUT;
michael@0 276 }
michael@0 277 if (uint8_t(*aSrc) != 0xFE) {
michael@0 278 mOddByte = 0xFF;
michael@0 279 mState = STATE_HALF_CODE_POINT;
michael@0 280 }
michael@0 281 break;
michael@0 282 }
michael@0 283
michael@0 284 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0 285 bool(MOZ_BIG_ENDIAN));
michael@0 286 }
michael@0 287
michael@0 288 NS_IMETHODIMP
michael@0 289 nsUTF16ToUnicode::Reset()
michael@0 290 {
michael@0 291 mEndian = kUnknown;
michael@0 292 mFoundBOM = false;
michael@0 293 return nsUTF16ToUnicodeBase::Reset();
michael@0 294 }
michael@0 295
michael@0 296 NS_IMETHODIMP
michael@0 297 nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0 298 char16_t * aDest, int32_t * aDestLength)
michael@0 299 {
michael@0 300 if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
michael@0 301 {
michael@0 302 nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
michael@0 303 *aSrcLength=0;
michael@0 304 *aDestLength=0;
michael@0 305 return res;
michael@0 306 }
michael@0 307 if(STATE_FIRST_CALL == mState) // first time called
michael@0 308 {
michael@0 309 // check if BOM (0xFEFF) is at the beginning, remove it if found, and
michael@0 310 // set mEndian accordingly.
michael@0 311 if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
michael@0 312 mEndian = kLittleEndian;
michael@0 313 mFoundBOM = true;
michael@0 314 }
michael@0 315 else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
michael@0 316 mEndian = kBigEndian;
michael@0 317 mFoundBOM = true;
michael@0 318 }
michael@0 319 // BOM is not found, but we can use a simple heuristic to determine
michael@0 320 // the endianness. Assume the first character is [U+0001, U+00FF].
michael@0 321 // Not always valid, but it's very likely to hold for html/xml/css.
michael@0 322 else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00)
michael@0 323 mState = STATE_NORMAL;
michael@0 324 mEndian = kBigEndian;
michael@0 325 }
michael@0 326 else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00)
michael@0 327 mState = STATE_NORMAL;
michael@0 328 mEndian = kLittleEndian;
michael@0 329 }
michael@0 330 else { // Neither BOM nor 'plausible' byte patterns at the beginning.
michael@0 331 // Just assume it's BE (following Unicode standard)
michael@0 332 // and let the garbage show up in the browser. (security concern?)
michael@0 333 // (bug 246194)
michael@0 334 mState = STATE_NORMAL;
michael@0 335 mEndian = kBigEndian;
michael@0 336 }
michael@0 337 }
michael@0 338
michael@0 339 nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0 340 #if MOZ_BIG_ENDIAN
michael@0 341 (mEndian == kLittleEndian)
michael@0 342 #else
michael@0 343 (mEndian == kBigEndian)
michael@0 344 #endif
michael@0 345 );
michael@0 346
michael@0 347 // If BOM is not found and we're to return NS_OK, signal that BOM
michael@0 348 // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
michael@0 349 return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
michael@0 350 }

mercurial