--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/ycbcr/win64.patch	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,210 @@
+diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
+new file mode 100644
+--- /dev/null
++++ b/gfx/ycbcr/yuv_row_win64.cpp
+@@ -0,0 +1,205 @@
++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
++// Use of this source code is governed by a BSD-style license that can be
++// found in the LICENSE file.
++
++#include "yuv_row.h"
++
++extern "C" {
++
++// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.
++
++#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
++#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
++
++#include <emmintrin.h>
++
++static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                          const uint8* u_buf,
++                                          const uint8* v_buf,
++                                          uint8* rgb_buf,
++                                          int width) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128 xmmY;
++
++  while (width >= 2) {
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
++
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                    const uint8* u_buf,
++                                    const uint8* v_buf,
++                                    uint8* rgb_buf,
++                                    int width,
++                                    int source_dx) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128 xmmY;
++  uint8 u, v, y;
++  int x = 0;
++
++  while (width >= 2) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++    x += source_dx;
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    y = y_buf[x >> 16];
++    x += source_dx;
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
++                                          const uint8* u_buf,
++                                          const uint8* v_buf,
++                                          uint8* rgb_buf,
++                                          int width,
++                                          int source_dx) {
++  __m128i xmm0, xmmY1, xmmY2;
++  __m128 xmmY;
++  uint8 u0, u1, v0, v1, y0, y1;
++  uint32 uv_frac, y_frac, u, v, y;
++  int x = 0;
++
++  if (source_dx >= 0x20000) {
++    x = 32768;
++  }
++
++  while(width >= 2) {
++    u0 = u_buf[x >> 17];
++    u1 = u_buf[(x >> 17) + 1];
++    v0 = v_buf[x >> 17];
++    v1 = v_buf[(x >> 17) + 1];
++    y0 = y_buf[x >> 16];
++    y1 = y_buf[(x >> 16) + 1];
++    uv_frac = (x & 0x1fffe);
++    y_frac = (x & 0xffff);
++    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
++    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
++    x += source_dx;
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++
++    y0 = y_buf[x >> 16];
++    y1 = y_buf[(x >> 16) + 1];
++    y_frac = (x & 0xffff);
++    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
++    x += source_dx;
++
++    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
++
++    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
++                          0x44);
++    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++
++    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
++    rgb_buf += 8;
++    width -= 2;
++  }
++
++  if (width) {
++    u = u_buf[x >> 17];
++    v = v_buf[x >> 17];
++    y = y_buf[x >> 16];
++
++    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
++                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
++    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
++
++    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
++    xmmY1 = _mm_srai_epi16(xmmY1, 6);
++    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
++    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
++  }
++}
++
++void FastConvertYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width) {
++  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
++}
++
++void ScaleYUVToRGB32Row(const uint8* y_buf,
++                        const uint8* u_buf,
++                        const uint8* v_buf,
++                        uint8* rgb_buf,
++                        int width,
++                        int source_dx) {
++  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
++}
++
++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
++                              const uint8* u_buf,
++                              const uint8* v_buf,
++                              uint8* rgb_buf,
++                              int width,
++                              int source_dx) {
++  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
++                                source_dx);
++}
++
++} // extern "C"