1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/ycbcr/yuv_row_win64.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,205 @@ 1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.5 +// Use of this source code is governed by a BSD-style license that can be 1.6 +// found in the LICENSE file. 1.7 + 1.8 +#include "yuv_row.h" 1.9 + 1.10 +extern "C" { 1.11 + 1.12 +// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. 1.13 + 1.14 +#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048) 1.15 +#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096) 1.16 + 1.17 +#include <emmintrin.h> 1.18 + 1.19 +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, 1.20 + const uint8* u_buf, 1.21 + const uint8* v_buf, 1.22 + uint8* rgb_buf, 1.23 + int width) { 1.24 + __m128i xmm0, xmmY1, xmmY2; 1.25 + __m128 xmmY; 1.26 + 1.27 + while (width >= 2) { 1.28 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), 1.29 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); 1.30 + 1.31 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); 1.32 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.33 + 1.34 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); 1.35 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 1.36 + 1.37 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 1.38 + 0x44); 1.39 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 1.40 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.41 + 1.42 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 1.43 + rgb_buf += 8; 1.44 + width -= 2; 1.45 + } 1.46 + 1.47 + if (width) { 1.48 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), 1.49 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); 1.50 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); 1.51 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.52 + xmmY1 = _mm_srai_epi16(xmmY1, 6); 1.53 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.54 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 1.55 + } 1.56 +} 1.57 + 1.58 +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, 1.59 + const uint8* u_buf, 1.60 + const uint8* v_buf, 1.61 + uint8* rgb_buf, 1.62 + int width, 1.63 + int source_dx) { 1.64 + __m128i xmm0, xmmY1, xmmY2; 1.65 + __m128 xmmY; 1.66 + uint8 u, v, y; 1.67 + int x = 0; 1.68 + 1.69 + while (width >= 2) { 1.70 + u = u_buf[x >> 17]; 1.71 + v = v_buf[x >> 17]; 1.72 + y = y_buf[x >> 16]; 1.73 + x += source_dx; 1.74 + 1.75 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), 1.76 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); 1.77 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.78 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.79 + 1.80 + y = y_buf[x >> 16]; 1.81 + x += source_dx; 1.82 + 1.83 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.84 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 1.85 + 1.86 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 1.87 + 0x44); 1.88 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 1.89 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.90 + 1.91 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 1.92 + rgb_buf += 8; 1.93 + width -= 2; 1.94 + } 1.95 + 1.96 + if (width) { 1.97 + u = u_buf[x >> 17]; 1.98 + v = v_buf[x >> 17]; 1.99 + y = y_buf[x >> 16]; 1.100 + 1.101 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), 1.102 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); 1.103 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.104 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.105 + xmmY1 = _mm_srai_epi16(xmmY1, 6); 1.106 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.107 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 1.108 + } 1.109 +} 1.110 + 1.111 +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, 1.112 + const uint8* u_buf, 1.113 + const uint8* v_buf, 1.114 + uint8* rgb_buf, 1.115 + int width, 1.116 + int source_dx) { 1.117 + __m128i xmm0, xmmY1, xmmY2; 1.118 + __m128 xmmY; 1.119 + uint8 u0, u1, v0, v1, y0, y1; 1.120 + uint32 uv_frac, y_frac, u, v, y; 1.121 + int x = 0; 1.122 + 1.123 + if (source_dx >= 0x20000) { 1.124 + x = 32768; 1.125 + } 1.126 + 1.127 + while(width >= 2) { 1.128 + u0 = u_buf[x >> 17]; 1.129 + u1 = u_buf[(x >> 17) + 1]; 1.130 + v0 = v_buf[x >> 17]; 1.131 + v1 = v_buf[(x >> 17) + 1]; 1.132 + y0 = y_buf[x >> 16]; 1.133 + y1 = y_buf[(x >> 16) + 1]; 1.134 + uv_frac = (x & 0x1fffe); 1.135 + y_frac = (x & 0xffff); 1.136 + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; 1.137 + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; 1.138 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; 1.139 + x += source_dx; 1.140 + 1.141 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), 1.142 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); 1.143 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.144 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.145 + 1.146 + y0 = y_buf[x >> 16]; 1.147 + y1 = y_buf[(x >> 16) + 1]; 1.148 + y_frac = (x & 0xffff); 1.149 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; 1.150 + x += source_dx; 1.151 + 1.152 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.153 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 1.154 + 1.155 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 1.156 + 0x44); 1.157 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 1.158 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.159 + 1.160 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 1.161 + rgb_buf += 8; 1.162 + width -= 2; 1.163 + } 1.164 + 1.165 + if (width) { 1.166 + u = u_buf[x >> 17]; 1.167 + v = v_buf[x >> 17]; 1.168 + y = y_buf[x >> 16]; 1.169 + 1.170 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), 1.171 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); 1.172 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); 1.173 + 1.174 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 1.175 + xmmY1 = _mm_srai_epi16(xmmY1, 6); 1.176 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 1.177 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 1.178 + } 1.179 +} 1.180 + 1.181 +void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.182 + const uint8* u_buf, 1.183 + const uint8* v_buf, 1.184 + uint8* rgb_buf, 1.185 + int width) { 1.186 + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); 1.187 +} 1.188 + 1.189 +void ScaleYUVToRGB32Row(const uint8* y_buf, 1.190 + const uint8* u_buf, 1.191 + const uint8* v_buf, 1.192 + uint8* rgb_buf, 1.193 + int width, 1.194 + int source_dx) { 1.195 + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.196 +} 1.197 + 1.198 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.199 + const uint8* u_buf, 1.200 + const uint8* v_buf, 1.201 + uint8* rgb_buf, 1.202 + int width, 1.203 + int source_dx) { 1.204 + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, 1.205 + source_dx); 1.206 +} 1.207 + 1.208 +} // extern "C"