gfx/ycbcr/win64.patch

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/ycbcr/win64.patch	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,210 @@
     1.4 +diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
     1.5 +new file mode 100644
     1.6 +--- /dev/null
     1.7 ++++ b/gfx/ycbcr/yuv_row_win64.cpp
     1.8 +@@ -0,0 +1,205 @@
     1.9 ++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
    1.10 ++// Use of this source code is governed by a BSD-style license that can be
    1.11 ++// found in the LICENSE file.
    1.12 ++
    1.13 ++#include "yuv_row.h"
    1.14 ++
    1.15 ++extern "C" {
    1.16 ++
    1.17 ++// The x64 compiler supports neither MMX nor inline assembler, so use SSE2 intrinsics.
    1.18 ++// The U and V coefficient entries are addressed at byte offsets 2048 and 4096 from kCoefficientsRgbY.
    1.19 ++#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
    1.20 ++#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
    1.21 ++
    1.22 ++#include <emmintrin.h>
    1.23 ++// Converts `width` pixels from Y/U/V bytes into 4-byte output pixels via 8-byte kCoefficientsRgb* table entries.
    1.24 ++static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
    1.25 ++                                          const uint8* u_buf,
    1.26 ++                                          const uint8* v_buf,
    1.27 ++                                          uint8* rgb_buf,
    1.28 ++                                          int width) {
    1.29 ++  __m128i xmm0, xmmY1, xmmY2;
    1.30 ++  __m128  xmmY;
    1.31 ++  // Two pixels per iteration: one U byte and one V byte are consumed and shared by two Y bytes.
    1.32 ++  while (width >= 2) {
    1.33 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
    1.34 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
    1.35 ++
    1.36 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    1.37 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    1.38 ++
    1.39 ++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    1.40 ++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
    1.41 ++    // Join the low 64 bits of both pixels (0x44), shift each 16-bit lane right by 6, then saturate to unsigned bytes.
    1.42 ++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
    1.43 ++                          0x44);
    1.44 ++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    1.45 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    1.46 ++
    1.47 ++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    1.48 ++    rgb_buf += 8;
    1.49 ++    width -= 2;
    1.50 ++  }
    1.51 ++  // Leftover pixel when width is odd: same lookup/add/shift/pack, but only 4 bytes are stored.
    1.52 ++  if (width) {
    1.53 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
    1.54 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
    1.55 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
    1.56 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    1.57 ++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    1.58 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    1.59 ++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
    1.60 ++  }
    1.61 ++}
    1.62 ++// Point-samples Y/U/V at a fixed-point position x that advances by source_dx per pixel; writes `width` 4-byte pixels.
    1.63 ++static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
    1.64 ++                                    const uint8* u_buf,
    1.65 ++                                    const uint8* v_buf,
    1.66 ++                                    uint8* rgb_buf,
    1.67 ++                                    int width,
    1.68 ++                                    int source_dx) {
    1.69 ++  __m128i xmm0, xmmY1, xmmY2;
    1.70 ++  __m128  xmmY;
    1.71 ++  uint8 u, v, y;
    1.72 ++  int x = 0;
    1.73 ++  // x >> 16 is the Y index and x >> 17 the U/V index; each iteration emits two pixels sharing one U/V pair.
    1.74 ++  while (width >= 2) {
    1.75 ++    u = u_buf[x >> 17];
    1.76 ++    v = v_buf[x >> 17];
    1.77 ++    y = y_buf[x >> 16];
    1.78 ++    x += source_dx;
    1.79 ++
    1.80 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
    1.81 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    1.82 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    1.83 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    1.84 ++    // Second pixel: a new Y sample at the advanced x, combined with the same U/V term held in xmm0.
    1.85 ++    y = y_buf[x >> 16];
    1.86 ++    x += source_dx;
    1.87 ++
    1.88 ++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    1.89 ++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
    1.90 ++    // Join the low 64 bits of both pixels (0x44), shift each 16-bit lane right by 6, then saturate to unsigned bytes.
    1.91 ++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
    1.92 ++                          0x44);
    1.93 ++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    1.94 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    1.95 ++
    1.96 ++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    1.97 ++    rgb_buf += 8;
    1.98 ++    width -= 2;
    1.99 ++  }
   1.100 ++  // Leftover pixel when width is odd: sampled at the current x and stored as 4 bytes.
   1.101 ++  if (width) {
   1.102 ++    u = u_buf[x >> 17];
   1.103 ++    v = v_buf[x >> 17];
   1.104 ++    y = y_buf[x >> 16];
   1.105 ++
   1.106 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
   1.107 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.108 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.109 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.110 ++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
   1.111 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
   1.112 ++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
   1.113 ++  }
   1.114 ++}
   1.115 ++// Scaling conversion that blends each Y/U/V sample with its right-hand neighbour by the fraction of x before lookup.
   1.116 ++static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
   1.117 ++                                          const uint8* u_buf,
   1.118 ++                                          const uint8* v_buf,
   1.119 ++                                          uint8* rgb_buf,
   1.120 ++                                          int width,
   1.121 ++                                          int source_dx) {
   1.122 ++  __m128i xmm0, xmmY1, xmmY2;
   1.123 ++  __m128  xmmY;
   1.124 ++  uint8 u0, u1, v0, v1, y0, y1;
   1.125 ++  uint32 uv_frac, y_frac, u, v, y;
   1.126 ++  int x = 0;
   1.127 ++  // When the step is at least 0x20000, start at x = 32768 (half of the 1 << 16 Y-index unit) instead of 0.
   1.128 ++  if (source_dx >= 0x20000) {
   1.129 ++    x = 32768;
   1.130 ++  }
   1.131 ++  // Each blend reads index and index + 1. NOTE(review): presumably callers keep that extra sample readable - confirm.
   1.132 ++  while(width >= 2) {
   1.133 ++    u0 = u_buf[x >> 17];
   1.134 ++    u1 = u_buf[(x >> 17) + 1];
   1.135 ++    v0 = v_buf[x >> 17];
   1.136 ++    v1 = v_buf[(x >> 17) + 1];
   1.137 ++    y0 = y_buf[x >> 16];
   1.138 ++    y1 = y_buf[(x >> 16) + 1];
   1.139 ++    uv_frac = (x & 0x1fffe);  // low 17 bits of x with bit 0 cleared
   1.140 ++    y_frac = (x & 0xffff);  // low 16 bits of x
   1.141 ++    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
   1.142 ++    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
   1.143 ++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
   1.144 ++    x += source_dx;
   1.145 ++
   1.146 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
   1.147 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.148 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.149 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.150 ++    // Second pixel: Y is re-blended at the advanced x; the U/V term in xmm0 is reused unchanged.
   1.151 ++    y0 = y_buf[x >> 16];
   1.152 ++    y1 = y_buf[(x >> 16) + 1];
   1.153 ++    y_frac = (x & 0xffff);
   1.154 ++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
   1.155 ++    x += source_dx;
   1.156 ++
   1.157 ++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.158 ++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
   1.159 ++    // Join the low 64 bits of both pixels (0x44), shift each 16-bit lane right by 6, then saturate to unsigned bytes.
   1.160 ++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
   1.161 ++                          0x44);
   1.162 ++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
   1.163 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
   1.164 ++
   1.165 ++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
   1.166 ++    rgb_buf += 8;
   1.167 ++    width -= 2;
   1.168 ++  }
   1.169 ++  // Leftover pixel when width is odd: taken directly at x with no blending against index + 1; 4 bytes are stored.
   1.170 ++  if (width) {
   1.171 ++    u = u_buf[x >> 17];
   1.172 ++    v = v_buf[x >> 17];
   1.173 ++    y = y_buf[x >> 16];
   1.174 ++
   1.175 ++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
   1.176 ++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
   1.177 ++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
   1.178 ++
   1.179 ++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
   1.180 ++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
   1.181 ++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
   1.182 ++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
   1.183 ++  }
   1.184 ++}
   1.185 ++// Non-static entry point in the extern "C" block; forwards every argument unchanged to FastConvertYUVToRGB32Row_SSE2.
   1.186 ++void FastConvertYUVToRGB32Row(const uint8* y_buf,
   1.187 ++                              const uint8* u_buf,
   1.188 ++                              const uint8* v_buf,
   1.189 ++                              uint8* rgb_buf,
   1.190 ++                              int width) {
   1.191 ++  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
   1.192 ++}
   1.193 ++// Non-static entry point in the extern "C" block; forwards every argument unchanged to ScaleYUVToRGB32Row_SSE2.
   1.194 ++void ScaleYUVToRGB32Row(const uint8* y_buf,
   1.195 ++                        const uint8* u_buf,
   1.196 ++                        const uint8* v_buf,
   1.197 ++                        uint8* rgb_buf,
   1.198 ++                        int width,
   1.199 ++                        int source_dx) {
   1.200 ++  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   1.201 ++}
   1.202 ++// Non-static entry point in the extern "C" block; forwards every argument unchanged to LinearScaleYUVToRGB32Row_SSE2.
   1.203 ++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   1.204 ++                              const uint8* u_buf,
   1.205 ++                              const uint8* v_buf,
   1.206 ++                              uint8* rgb_buf,
   1.207 ++                              int width,
   1.208 ++                              int source_dx) {
   1.209 ++  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
   1.210 ++                                source_dx);
   1.211 ++}
   1.212 ++
   1.213 ++} // extern "C"

mercurial