gfx/ycbcr/yuv_row_win64.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/ycbcr/yuv_row_win64.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,205 @@
     1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
     1.5 +// Use of this source code is governed by a BSD-style license that can be
     1.6 +// found in the LICENSE file.
     1.7 +
     1.8 +#include "yuv_row.h"
     1.9 +
    1.10 +extern "C" {
    1.11 +
    1.12 +// The x64 compiler supports neither MMX nor inline assembly, so SSE2 intrinsics are used instead.
    1.13 +
    1.14 +#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)  // U table: byte offset 2048 into kCoefficientsRgbY, i.e. past 256 entries of 8 bytes (lookups below use 8 * <uint8 index>).
    1.15 +#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)  // V table: byte offset 4096, i.e. directly after the 256 U entries.
    1.16 +
    1.17 +#include <emmintrin.h>
    1.18 +
    1.19 +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,  // Converts one row of Y/U/V samples to 32-bit pixels via table lookups. y_buf: one luma byte consumed per output pixel.
    1.20 +                                          const uint8* u_buf,  // u_buf: one chroma byte consumed per pair of output pixels.
    1.21 +                                          const uint8* v_buf,  // v_buf: one chroma byte consumed per pair of output pixels.
    1.22 +                                          uint8* rgb_buf,  // rgb_buf: output; 4 bytes are written per pixel.
    1.23 +                                          int width) {  // width: number of output pixels to produce.
    1.24 +  __m128i xmm0, xmmY1, xmmY2;  // xmm0: combined U+V term; xmmY1/xmmY2: the two pixels of the current pair.
    1.25 +  __m128  xmmY;  // Both pixels merged into one register; float-typed only because _mm_shuffle_ps does the merge.
    1.26 +
    1.27 +  while (width >= 2) {  // Main loop: two pixels per iteration, sharing a single U and a single V sample.
    1.28 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),  // Saturating add of the U and V table rows (8 bytes = 4 x int16 each).
    1.29 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
    1.30 +
    1.31 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));  // First pixel: load its Y table row ...
    1.32 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);  // ... and add the shared chroma term (saturating).
    1.33 +
    1.34 +    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));  // Second pixel of the pair: next luma sample, same chroma term.
    1.35 +    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
    1.36 +
    1.37 +    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),  // 0x44 puts the low 64 bits of xmmY1 in the low half and the low 64 bits of xmmY2 in the high half.
    1.38 +                          0x44);
    1.39 +    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);  // Arithmetic shift right by 6 on every 16-bit lane (presumably drops the tables' fractional bits -- TODO confirm against the table definition).
    1.40 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);  // Pack to unsigned bytes with saturation; the low 8 bytes now hold both pixels.
    1.41 +
    1.42 +    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);  // Store 8 bytes (two pixels).
    1.43 +    rgb_buf += 8;
    1.44 +    width -= 2;
    1.45 +  }
    1.46 +
    1.47 +  if (width) {  // Remaining pixel for odd widths. NOTE(review): any nonzero width reaching here is accepted, including negative values.
    1.48 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
    1.49 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
    1.50 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
    1.51 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    1.52 +    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    1.53 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    1.54 +    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);  // Write only the low 32 bits, i.e. a single pixel.
    1.55 +  }
    1.56 +}
    1.57 +
    1.58 +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,  // Horizontally scales one row by picking single source samples (no blending) and converts it to 32-bit pixels. y_buf: source luma row.
    1.59 +                                    const uint8* u_buf,  // u_buf: source chroma row, indexed at half the luma position.
    1.60 +                                    const uint8* v_buf,  // v_buf: source chroma row, indexed at half the luma position.
    1.61 +                                    uint8* rgb_buf,  // rgb_buf: output; 4 bytes are written per pixel.
    1.62 +                                    int width,  // width: number of output pixels to produce.
    1.63 +                                    int source_dx) {  // source_dx: step through the source per output pixel, fixed point with 16 fractional bits (x >> 16 is the luma index).
    1.64 +  __m128i xmm0, xmmY1, xmmY2;  // xmm0: combined U+V term; xmmY1/xmmY2: the two pixels of the current pair.
    1.65 +  __m128  xmmY;  // Both pixels merged into one register via _mm_shuffle_ps.
    1.66 +  uint8 u, v, y;  // Source samples selected for the current pixel.
    1.67 +  int x = 0;  // Current source position in 16.16 fixed point.
    1.68 +
    1.69 +  while (width >= 2) {  // Main loop: two output pixels per iteration.
    1.70 +    u = u_buf[x >> 17];  // Chroma index is half the luma index (x >> 17 versus x >> 16).
    1.71 +    v = v_buf[x >> 17];
    1.72 +    y = y_buf[x >> 16];
    1.73 +    x += source_dx;  // Advance to the source position of the second pixel.
    1.74 +
    1.75 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),  // Saturating add of the U and V table rows (8 bytes = 4 x int16 each).
    1.76 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    1.77 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    1.78 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    1.79 +
    1.80 +    y = y_buf[x >> 16];  // Second pixel re-samples luma only; the chroma term in xmm0 is reused.
    1.81 +    x += source_dx;
    1.82 +
    1.83 +    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    1.84 +    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
    1.85 +
    1.86 +    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),  // 0x44 puts the low 64 bits of xmmY1 in the low half and the low 64 bits of xmmY2 in the high half.
    1.87 +                          0x44);
    1.88 +    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);  // Arithmetic shift right by 6 on every 16-bit lane.
    1.89 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);  // Pack to unsigned bytes with saturation; the low 8 bytes hold both pixels.
    1.90 +
    1.91 +    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);  // Store 8 bytes (two pixels).
    1.92 +    rgb_buf += 8;
    1.93 +    width -= 2;
    1.94 +  }
    1.95 +
    1.96 +  if (width) {  // Remaining pixel for odd widths. NOTE(review): any nonzero width reaching here is accepted, including negative values.
    1.97 +    u = u_buf[x >> 17];
    1.98 +    v = v_buf[x >> 17];
    1.99 +    y = y_buf[x >> 16];
   1.100 +
   1.101 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
   1.102 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.103 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.104 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.105 +    xmmY1 = _mm_srai_epi16(xmmY1, 6);
   1.106 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
   1.107 +    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);  // Write only the low 32 bits, i.e. a single pixel.
   1.108 +  }
   1.109 +}
   1.110 +
   1.111 +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,  // Horizontally scales one row, blending each sample with its right-hand neighbour, and converts it to 32-bit pixels. y_buf: source luma row.
   1.112 +                                          const uint8* u_buf,  // u_buf: source chroma row, indexed at half the luma position.
   1.113 +                                          const uint8* v_buf,  // v_buf: source chroma row, indexed at half the luma position.
   1.114 +                                          uint8* rgb_buf,  // rgb_buf: output; 4 bytes are written per pixel.
   1.115 +                                          int width,  // width: number of output pixels to produce.
   1.116 +                                          int source_dx) {  // source_dx: step through the source per output pixel, fixed point with 16 fractional bits.
   1.117 +  __m128i xmm0, xmmY1, xmmY2;  // xmm0: combined U+V term; xmmY1/xmmY2: the two pixels of the current pair.
   1.118 +  __m128  xmmY;  // Both pixels merged into one register via _mm_shuffle_ps.
   1.119 +  uint8 u0, u1, v0, v1, y0, y1;  // Neighbouring source samples: *0 at the computed index, *1 at index + 1.
   1.120 +  uint32 uv_frac, y_frac, u, v, y;  // Blend weights and the blended sample values.
   1.121 +  int x = 0;  // Current source position in 16.16 fixed point.
   1.122 +
   1.123 +  if (source_dx >= 0x20000) {  // Step of at least 2.0 source pixels per output pixel.
   1.124 +    x = 32768;  // Start half a source pixel (0.5 in 16.16) into the row.
   1.125 +  }
   1.126 +
   1.127 +  while(width >= 2) {  // Main loop: two output pixels per iteration, sharing one blended U/V pair.
   1.128 +    u0 = u_buf[x >> 17];
   1.129 +    u1 = u_buf[(x >> 17) + 1];  // Reads index + 1. NOTE(review): the caller must guarantee this element is readable -- confirm buffer padding.
   1.130 +    v0 = v_buf[x >> 17];
   1.131 +    v1 = v_buf[(x >> 17) + 1];
   1.132 +    y0 = y_buf[x >> 16];
   1.133 +    y1 = y_buf[(x >> 16) + 1];  // Same index + 1 read on the luma row.
   1.134 +    uv_frac = (x & 0x1fffe);  // 17-bit fraction of the chroma position with the lowest bit cleared.
   1.135 +    y_frac = (x & 0xffff);  // 16-bit fraction of the luma position.
   1.136 +    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;  // Weighted average; (uv_frac ^ 0x1fffe) equals 0x1fffe - uv_frac, the weight of u0.
   1.137 +    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
   1.138 +    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;  // Weighted average; (y_frac ^ 0xffff) equals 0xffff - y_frac, the weight of y0.
   1.139 +    x += source_dx;  // Advance to the source position of the second pixel.
   1.140 +
   1.141 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),  // Saturating add of the U and V table rows (8 bytes = 4 x int16 each).
   1.142 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.143 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.144 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.145 +
   1.146 +    y0 = y_buf[x >> 16];  // Second pixel: blend luma only; the chroma term in xmm0 is reused.
   1.147 +    y1 = y_buf[(x >> 16) + 1];
   1.148 +    y_frac = (x & 0xffff);
   1.149 +    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
   1.150 +    x += source_dx;
   1.151 +
   1.152 +    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.153 +    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
   1.154 +
   1.155 +    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),  // 0x44 puts the low 64 bits of xmmY1 in the low half and the low 64 bits of xmmY2 in the high half.
   1.156 +                          0x44);
   1.157 +    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);  // Arithmetic shift right by 6 on every 16-bit lane.
   1.158 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);  // Pack to unsigned bytes with saturation; the low 8 bytes hold both pixels.
   1.159 +
   1.160 +    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);  // Store 8 bytes (two pixels).
   1.161 +    rgb_buf += 8;
   1.162 +    width -= 2;
   1.163 +  }
   1.164 +
   1.165 +  if (width) {  // Remaining pixel for odd widths: single samples only, no blending and no index + 1 read. NOTE(review): negative widths also enter here.
   1.166 +    u = u_buf[x >> 17];
   1.167 +    v = v_buf[x >> 17];
   1.168 +    y = y_buf[x >> 16];
   1.169 +
   1.170 +    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
   1.171 +                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.172 +    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.173 +
   1.174 +    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.175 +    xmmY1 = _mm_srai_epi16(xmmY1, 6);
   1.176 +    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
   1.177 +    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);  // Write only the low 32 bits, i.e. a single pixel.
   1.178 +  }
   1.179 +}
   1.180 +
   1.181 +void FastConvertYUVToRGB32Row(const uint8* y_buf,  // Entry point with C linkage (inside the extern "C" block); forwards unchanged to the SSE2 implementation.
   1.182 +                              const uint8* u_buf,
   1.183 +                              const uint8* v_buf,
   1.184 +                              uint8* rgb_buf,
   1.185 +                              int width) {
   1.186 +  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);  // No runtime CPU check is performed here.
   1.187 +}
   1.188 +
   1.189 +void ScaleYUVToRGB32Row(const uint8* y_buf,  // Entry point with C linkage (inside the extern "C" block); forwards unchanged to the SSE2 implementation.
   1.190 +                        const uint8* u_buf,
   1.191 +                        const uint8* v_buf,
   1.192 +                        uint8* rgb_buf,
   1.193 +                        int width,
   1.194 +                        int source_dx) {
   1.195 +  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // No runtime CPU check is performed here.
   1.196 +}
   1.197 +
   1.198 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,  // Entry point with C linkage (inside the extern "C" block); forwards unchanged to the SSE2 implementation.
   1.199 +                              const uint8* u_buf,
   1.200 +                              const uint8* v_buf,
   1.201 +                              uint8* rgb_buf,
   1.202 +                              int width,
   1.203 +                              int source_dx) {
   1.204 +  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,  // No runtime CPU check is performed here.
   1.205 +                                source_dx);
   1.206 +}
   1.207 +
   1.208 +} // extern "C"

mercurial