gfx/angle/src/libGLESv2/renderer/ImageSSE2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/angle/src/libGLESv2/renderer/ImageSSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,100 @@
     1.4 +#include "precompiled.h"
     1.5 +//
     1.6 +// Copyright (c) 2002-2012 The ANGLE Project Authors. All rights reserved.
     1.7 +// Use of this source code is governed by a BSD-style license that can be
     1.8 +// found in the LICENSE file.
     1.9 +//
    1.10 +
    1.11 +// ImageSSE2.cpp: Implements the SSE2-based functions of the rx::Image class.
    1.12 +// They live in a separate file because GCC can enable SSE usage only per file,
    1.13 +// not for individual code blocks that use SSE2 explicitly.
    1.14 +
    1.15 +#include "libGLESv2/Texture.h"
    1.16 +#include "libGLESv2/renderer/Image.h"
    1.17 +
    1.18 +namespace rx
    1.19 +{
    1.20 +
    1.21 +void Image::loadRGBAUByteDataToBGRASSE2(GLsizei width, GLsizei height,
    1.22 +                                        int inputPitch, const void *input, size_t outputPitch, void *output)
    1.23 +{
    1.24 +    const unsigned int *source = NULL;
    1.25 +    unsigned int *dest = NULL;
    1.26 +    __m128i brMask = _mm_set1_epi32(0x00ff00ff);
    1.27 +
    1.28 +    for (int y = 0; y < height; y++)
    1.29 +    {
    1.30 +        source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputPitch);
    1.31 +        dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
    1.32 +        int x = 0;
    1.33 +
    1.34 +        // Make output writes aligned
    1.35 +        for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
    1.36 +        {
    1.37 +            unsigned int rgba = source[x];
    1.38 +            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
    1.39 +        }
    1.40 +
    1.41 +        for (; x + 3 < width; x += 4)
    1.42 +        {
    1.43 +            __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
    1.44 +            // Mask out g and a, which don't change
    1.45 +            __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
    1.46 +            // Mask out b and r
    1.47 +            __m128i brComponents = _mm_and_si128(sourceData, brMask);
    1.48 +            // Swap b and r
    1.49 +            __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
    1.50 +            __m128i result = _mm_or_si128(gaComponents, brSwapped);
    1.51 +            _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
    1.52 +        }
    1.53 +
    1.54 +        // Perform leftover writes
    1.55 +        for (; x < width; x++)
    1.56 +        {
    1.57 +            unsigned int rgba = source[x];
    1.58 +            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
    1.59 +        }
    1.60 +    }
    1.61 +}
    1.62 +
    1.63 +void Image::loadAlphaDataToBGRASSE2(GLsizei width, GLsizei height,
    1.64 +                                    int inputPitch, const void *input, size_t outputPitch, void *output)
    1.65 +{
    1.66 +    const unsigned char *source = NULL;
    1.67 +    unsigned int *dest = NULL;
    1.68 +    __m128i zeroWide = _mm_setzero_si128();
    1.69 +
    1.70 +    for (int y = 0; y < height; y++)
    1.71 +    {
    1.72 +        source = static_cast<const unsigned char*>(input) + y * inputPitch;
    1.73 +        dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
    1.74 +
    1.75 +        int x;
    1.76 +        // Make output writes aligned
    1.77 +        for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
    1.78 +        {
    1.79 +            dest[x] = static_cast<unsigned int>(source[x]) << 24;
    1.80 +        }
    1.81 +
    1.82 +        for (; x + 7 < width; x += 8)
    1.83 +        {
    1.84 +            __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
    1.85 +            // Interleave each byte to 16bit, make the lower byte to zero
    1.86 +            sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
    1.87 +            // Interleave each 16bit to 32bit, make the lower 16bit to zero
    1.88 +            __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
    1.89 +            __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
    1.90 +
    1.91 +            _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
    1.92 +            _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
    1.93 +        }
    1.94 +
    1.95 +        // Handle the remainder
    1.96 +        for (; x < width; x++)
    1.97 +        {
    1.98 +            dest[x] = static_cast<unsigned int>(source[x]) << 24;
    1.99 +        }
   1.100 +    }
   1.101 +}
   1.102 +
   1.103 +}

mercurial