gfx/2d/ImageScalingSSE2.cpp

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Tue, 06 Jan 2015 21:39:09 +0100
branch:      TOR_BUG_9701
changeset:   8:97036ab72558
permissions: -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
this solves Tor bug #9701, complying with the disk-avoidance requirement documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is then ((carry << 1) + sum) >> 1, or,
 * assuming we have eliminated overflows and underflows, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = (((sum >> 1) + carry + (d >> 1)) >> 1)
 * avg = ((((a + b + c) >> 1) + (d >> 1)) >> 1)
 * avg = ((a + b + c + d) >> 2)
 *
 * An additional fact used in the SSE versions is the concept that we can
 * trivially convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
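
// Illustrative compile-time sanity checks of the identities above (they
// assume C++11 static_assert and are not used by the scaling code). The first
// verifies the four-input sum/carry average for a = 1, b = 2, c = 3, d = 6;
// the second verifies that complementing the inputs and the result turns the
// rounded 8-bit average into the truncated one for a = 5, b = 8.
static_assert((((((1 ^ 2 ^ 3) + 6) >> 1) + ((1 & 2) | (1 & 3) | (2 & 3))) >> 1) ==
              ((1 + 2 + 3 + 6) >> 2),
              "four-input average via sum and carry");
static_assert((0xff & ~(((0xff & ~5) + (0xff & ~8) + 1) >> 1)) == ((5 + 8) >> 1),
              "~f(~a, ~b) == g(a, b) for 8-bit values");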

// Bitwise NOT of a 128-bit vector; used below to turn the rounding
// _mm_avg_epu8 into a truncating average (see the proof above).
MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile-time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))

  // Deinterleave the two rows: afterwards *a and *c hold the even-indexed
  // pixels of the upper and lower row, while *b and *d hold the odd-indexed
  // ones, so an element-wise average of the four vectors is the 2x2 block
  // average.
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  // Bitwise sum and carry of the first three inputs, as derived above.
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  // _mm_avg_epu8 rounds up; complementing its inputs and output makes it a
  // truncating average.
  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}
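
// A minimal usage sketch (the helper name is hypothetical and it is not used
// by the scaler below): halve one 8x2 block of 32-bit pixels into four output
// pixels. It assumes both input rows are 16-byte aligned; each lane of the
// result is the truncated per-channel average of a 2x2 block of source pixels.
MOZ_ALWAYS_INLINE void HalveBlock8x2_Example(const uint8_t *aUpperRow,
                                             const uint8_t *aLowerRow,
                                             uint8_t *aOut)
{
  __m128i a = _mm_load_si128((const __m128i*)aUpperRow);
  __m128i b = _mm_load_si128((const __m128i*)aUpperRow + 1);
  __m128i c = _mm_load_si128((const __m128i*)aLowerRow);
  __m128i d = _mm_load_si128((const __m128i*)aLowerRow + 1);
  _mm_storeu_si128((__m128i*)aOut, avg_sse2_8x2(&a, &b, &c, &d));
}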

// Averages two rows of four pixels into four pixels (4x2 -> 4x1): an
// element-wise truncated average of a and b.
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

// Averages one row of eight pixels into four pixels (8x1 -> 4x1): the
// shuffles gather the odd- and even-indexed pixels so each output lane is
// the truncated average of two horizontally adjacent pixels.
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

// Scalar average of a 2x2 block of 32-bit pixels, using the same sum/carry
// scheme as the SSE2 code above.
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Not having a byte-based average instruction means we have to mask before
  // shifting so that one byte's low bit doesn't leak into the byte below it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}
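
// Worked example (illustrative): Avg2x2(8, 2, 6, 4) forms sum = 8 ^ 2 ^ 6 = 12
// and carry = (8 & 2) | (8 & 6) | (2 & 6) = 2. The first masked step yields
// (((12 ^ 4) & 0xfe) >> 1) + (12 & 4) = 4 + 4 = 8, and the second yields
// (((8 ^ 2) & 0xfe) >> 1) + (8 & 2) = 5 + 0 = 5, which equals
// (8 + 2 + 6 + 4) >> 2.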

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
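
// For example (illustrative): Avg2(0x00000100, 0x00000000) is 0x00000000, the
// correct per-byte result, whereas a plain 32-bit (a + b) >> 1 would shift
// bit 8 down into the low byte and produce 0x00000080. The 0xfefefefe mask
// drops each byte's lowest bit before the shift so nothing crosses a byte
// boundary.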

namespace mozilla {
namespace gfx {

void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'SIMD' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overflowing by
    // 8 pixels in an earlier row would appear to be harmless as it doesn't
    // touch invalid memory, even when reading from and writing to the same
    // surface. In practice we only do this when doing an additional downscale
    // pass, and in that situation we have unused stride to write into
    // harmlessly. I do not believe the additional code complexity would be
    // worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

} // namespace gfx
} // namespace mozilla
