--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/row_win.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,7284 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+static const vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
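Editorial note (illustration, not part of the patch): the `pmaddubsw`/`phaddw` pairs later in this file use the vectors above as fixed-point BT.601 weights scaled by 128. A hedged scalar sketch of what `ARGBToYRow` computes per pixel, with `13, 65, 33` taken from `kARGBToY` and the `+ 16` from `kAddY16` (the function name is illustrative, not from libyuv):

```c
#include <stdint.h>

// Scalar sketch: 13/128 ~= 0.098 (B), 65/128 ~= 0.508 (G),
// 33/128 ~= 0.258 (R); the >> 7 matches psrlw 7 and the + 16 matches
// the paddb with kAddY16. ARGB pixel bytes are B, G, R, A in memory.
static uint8_t ARGBPixelToY(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
```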
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_y
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
+    pslld      xmm5, 24
+
+    align      4
+  convertloop:
+    movq       xmm0, qword ptr [eax]
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm0
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0
+    punpckhwd  xmm1, xmm1
+    por        xmm0, xmm5
+    por        xmm1, xmm5
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_y
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
+    pslld      xmm5, 24
+
+    align      4
+  convertloop:
+    movq       xmm0, qword ptr [eax]
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm0
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0
+    punpckhwd  xmm1, xmm1
+    por        xmm0, xmm5
+    por        xmm1, xmm5
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_rgb24
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
+    pslld      xmm5, 24
+    movdqa     xmm4, kShuffleMaskRGB24ToARGB
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm3, [eax + 32]
+    lea        eax, [eax + 48]
+    movdqa     xmm2, xmm3
+    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb     xmm2, xmm4
+    por        xmm2, xmm5
+    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    pshufb     xmm0, xmm4
+    movdqa     [edx + 32], xmm2
+    por        xmm0, xmm5
+    pshufb     xmm1, xmm4
+    movdqa     [edx], xmm0
+    por        xmm1, xmm5
+    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb     xmm3, xmm4
+    movdqa     [edx + 16], xmm1
+    por        xmm3, xmm5
+    sub        ecx, 16
+    movdqa     [edx + 48], xmm3
+    lea        edx, [edx + 64]
+    jg         convertloop
+    ret
+  }
+}
+
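As an aside (illustration only), the shuffle mask plus `palignr` dance in `RGB24ToARGBRow_SSSE3` amounts to this scalar loop, expanding each 3-byte BGR triple into a 4-byte BGRA pixel with opaque alpha:

```c
#include <stdint.h>

// Hedged scalar equivalent of RGB24ToARGBRow_SSSE3: the por against
// the 0xff000000 mask becomes an explicit alpha store.
static void RGB24ToARGBRow_C_sketch(const uint8_t* src_rgb24,
                                    uint8_t* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_rgb24[0];  // B
    dst_argb[1] = src_rgb24[1];  // G
    dst_argb[2] = src_rgb24[2];  // R
    dst_argb[3] = 0xff;          // A
    src_rgb24 += 3;
    dst_argb += 4;
  }
}
```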
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                        int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_raw
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
+    pslld      xmm5, 24
+    movdqa     xmm4, kShuffleMaskRAWToARGB
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm3, [eax + 32]
+    lea        eax, [eax + 48]
+    movdqa     xmm2, xmm3
+    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb     xmm2, xmm4
+    por        xmm2, xmm5
+    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    pshufb     xmm0, xmm4
+    movdqa     [edx + 32], xmm2
+    por        xmm0, xmm5
+    pshufb     xmm1, xmm4
+    movdqa     [edx], xmm0
+    por        xmm1, xmm5
+    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb     xmm3, xmm4
+    movdqa     [edx + 16], xmm1
+    por        xmm3, xmm5
+    sub        ecx, 16
+    movdqa     [edx + 48], xmm3
+    lea        edx, [edx + 64]
+    jg         convertloop
+    ret
+  }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd       xmm5, eax
+    pshufd     xmm5, xmm5, 0
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0
+    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw      xmm3, 11
+    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw      xmm4, 10
+    psrlw      xmm4, 5
+    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw      xmm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // fetch 8 pixels of bgr565
+    movdqa     xmm1, xmm0
+    movdqa     xmm2, xmm0
+    pand       xmm1, xmm3       // R in upper 5 bits
+    psllw      xmm2, 11         // B in upper 5 bits
+    pmulhuw    xmm1, xmm5       // * (256 + 8)
+    pmulhuw    xmm2, xmm5       // * (256 + 8)
+    psllw      xmm1, 8
+    por        xmm1, xmm2       // RB
+    pand       xmm0, xmm4       // G in middle 6 bits
+    pmulhuw    xmm0, xmm6       // << 5 * (256 + 4)
+    por        xmm0, xmm7       // AG
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm0
+    punpckhbw  xmm2, xmm0
+    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
+    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea        eax, [eax + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
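To see why the multipliers work (a sketch, not part of the patch): a 5-bit channel v sitting in the top bits of a 16-bit word, multiplied by 0x0108 = 264 with `pmulhuw`, yields ((v << 11) * 264) >> 16 = v * 8 + v / 4, which is exactly the standard replication (v << 3) | (v >> 2). The scalar picture:

```c
#include <stdint.h>

// Scalar sketch of RGB565ToARGBRow_SSE2's bit replication. The SIMD
// version computes the same widening with pmulhuw against 0x0108
// (5-bit fields) and 0x2080 (the 6-bit green field, shift folded in).
static void RGB565PixelToARGB(uint16_t rgb565, uint8_t argb[4]) {
  uint8_t b = (uint8_t)(rgb565 & 0x1f);
  uint8_t g = (uint8_t)((rgb565 >> 5) & 0x3f);
  uint8_t r = (uint8_t)((rgb565 >> 11) & 0x1f);
  argb[0] = (uint8_t)((b << 3) | (b >> 2));  // replicate 5 bits to 8
  argb[1] = (uint8_t)((g << 2) | (g >> 4));  // replicate 6 bits to 8
  argb[2] = (uint8_t)((r << 3) | (r >> 2));
  argb[3] = 0xff;
}
```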
+// 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd       xmm5, eax
+    pshufd     xmm5, xmm5, 0
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0
+    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw      xmm3, 11
+    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    psrlw      xmm4, 6
+    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw      xmm7, 8
+
+    mov        eax, [esp + 4]   // src_argb1555
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // fetch 8 pixels of 1555
+    movdqa     xmm1, xmm0
+    movdqa     xmm2, xmm0
+    psllw      xmm1, 1          // R in upper 5 bits
+    psllw      xmm2, 11         // B in upper 5 bits
+    pand       xmm1, xmm3
+    pmulhuw    xmm2, xmm5       // * (256 + 8)
+    pmulhuw    xmm1, xmm5       // * (256 + 8)
+    psllw      xmm1, 8
+    por        xmm1, xmm2       // RB
+    movdqa     xmm2, xmm0
+    pand       xmm0, xmm4       // G in middle 5 bits
+    psraw      xmm2, 8          // A
+    pmulhuw    xmm0, xmm6       // << 6 * (256 + 8)
+    pand       xmm2, xmm7
+    por        xmm0, xmm2       // AG
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm0
+    punpckhbw  xmm2, xmm0
+    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
+    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea        eax, [eax + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
+// 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    movd       xmm4, eax
+    pshufd     xmm4, xmm4, 0
+    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    pslld      xmm5, 4
+    mov        eax, [esp + 4]   // src_argb4444
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // fetch 8 pixels of bgra4444
+    movdqa     xmm2, xmm0
+    pand       xmm0, xmm4       // mask low nibbles
+    pand       xmm2, xmm5       // mask high nibbles
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    psllw      xmm1, 4
+    psrlw      xmm3, 4
+    por        xmm0, xmm1
+    por        xmm2, xmm3
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2
+    punpckhbw  xmm1, xmm2
+    movdqa     [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
+    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea        eax, [eax + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
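The 4444 case is the simplest of the three expansions: widening a 4-bit field to 8 bits is (v << 4) | v, i.e. v * 17, which the psllw/psrlw + por pair above applies to whole nibble-masked registers at once. A hedged scalar rendering:

```c
#include <stdint.h>

// Scalar sketch of ARGB4444ToARGBRow_SSE2: each nibble (B, G, R, A
// from low bits to high) is replicated into both halves of its byte.
static void ARGB4444PixelToARGB(uint16_t pix4444, uint8_t argb[4]) {
  for (int c = 0; c < 4; ++c) {
    uint8_t v = (uint8_t)((pix4444 >> (4 * c)) & 0xf);
    argb[c] = (uint8_t)((v << 4) | v);  // v * 17
  }
}
```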
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    movdqa     xmm6, kShuffleMaskARGBToRGB24
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax, [eax + 64]
+    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb     xmm1, xmm6
+    pshufb     xmm2, xmm6
+    pshufb     xmm3, xmm6
+    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
+    psrldq     xmm1, 4          // 8 bytes from 1
+    pslldq     xmm4, 12         // 4 bytes from 1 for 0
+    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
+    por        xmm0, xmm4       // 4 bytes from 1 for 0
+    pslldq     xmm5, 8          // 8 bytes from 2 for 1
+    movdqu     [edx], xmm0      // store 0
+    por        xmm1, xmm5       // 8 bytes from 2 for 1
+    psrldq     xmm2, 8          // 4 bytes from 2
+    pslldq     xmm3, 4          // 12 bytes from 3 for 2
+    por        xmm2, xmm3       // 12 bytes from 3 for 2
+    movdqu     [edx + 16], xmm1 // store 1
+    movdqu     [edx + 32], xmm2 // store 2
+    lea        edx, [edx + 48]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    movdqa     xmm6, kShuffleMaskARGBToRAW
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax, [eax + 64]
+    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb     xmm1, xmm6
+    pshufb     xmm2, xmm6
+    pshufb     xmm3, xmm6
+    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
+    psrldq     xmm1, 4          // 8 bytes from 1
+    pslldq     xmm4, 12         // 4 bytes from 1 for 0
+    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
+    por        xmm0, xmm4       // 4 bytes from 1 for 0
+    pslldq     xmm5, 8          // 8 bytes from 2 for 1
+    movdqu     [edx], xmm0      // store 0
+    por        xmm1, xmm5       // 8 bytes from 2 for 1
+    psrldq     xmm2, 8          // 4 bytes from 2
+    pslldq     xmm3, 4          // 12 bytes from 3 for 2
+    por        xmm2, xmm3       // 12 bytes from 3 for 2
+    movdqu     [edx + 16], xmm1 // store 1
+    movdqu     [edx + 32], xmm2 // store 2
+    lea        edx, [edx + 48]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
+    psrld      xmm3, 27
+    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
+    psrld      xmm4, 26
+    pslld      xmm4, 5
+    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
+    pslld      xmm5, 11
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
+    movdqa     xmm1, xmm0       // B
+    movdqa     xmm2, xmm0       // G
+    pslld      xmm0, 8          // R
+    psrld      xmm1, 3          // B
+    psrld      xmm2, 5          // G
+    psrad      xmm0, 16         // R
+    pand       xmm1, xmm3       // B
+    pand       xmm2, xmm4       // G
+    pand       xmm0, xmm5       // R
+    por        xmm1, xmm2       // BG
+    por        xmm0, xmm1       // BGR
+    packssdw   xmm0, xmm0
+    lea        eax, [eax + 16]
+    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea        edx, [edx + 8]
+    sub        ecx, 4
+    jg         convertloop
+    ret
+  }
+}
+
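Going the other way, `ARGBToRGB565Row_SSE2` just truncates: its shift/mask/por sequence per dword is equivalent to this scalar packing (a sketch only, not the patch's code):

```c
#include <stdint.h>

// Scalar sketch of ARGBToRGB565Row_SSE2: keep the top 5/6/5 bits of
// B/G/R and pack them into one 16-bit pixel; alpha is dropped.
static uint16_t ARGBPixelToRGB565(const uint8_t argb[4]) {
  uint16_t b = (uint16_t)(argb[0] >> 3);
  uint16_t g = (uint16_t)(argb[1] >> 2);
  uint16_t r = (uint16_t)(argb[2] >> 3);
  return (uint16_t)(b | (g << 5) | (r << 11));
}
```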
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
+    psrld      xmm4, 27
+    movdqa     xmm5, xmm4       // generate mask 0x000003e0
+    pslld      xmm5, 5
+    movdqa     xmm6, xmm4       // generate mask 0x00007c00
+    pslld      xmm6, 10
+    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
+    pslld      xmm7, 15
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
+    movdqa     xmm1, xmm0       // B
+    movdqa     xmm2, xmm0       // G
+    movdqa     xmm3, xmm0       // R
+    psrad      xmm0, 16         // A
+    psrld      xmm1, 3          // B
+    psrld      xmm2, 6          // G
+    psrld      xmm3, 9          // R
+    pand       xmm0, xmm7       // A
+    pand       xmm1, xmm4       // B
+    pand       xmm2, xmm5       // G
+    pand       xmm3, xmm6       // R
+    por        xmm0, xmm1       // BA
+    por        xmm2, xmm3       // GR
+    por        xmm0, xmm2       // BGRA
+    packssdw   xmm0, xmm0
+    lea        eax, [eax + 16]
+    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    lea        edx, [edx + 8]
+    sub        ecx, 4
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
+    psllw      xmm4, 12
+    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
+    psrlw      xmm3, 8
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm3       // low nibble
+    pand       xmm1, xmm4       // high nibble
+    psrld      xmm0, 4
+    psrld      xmm1, 8
+    por        xmm0, xmm1
+    packuswb   xmm0, xmm0
+    lea        eax, [eax + 16]
+    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
+    lea        edx, [edx + 8]
+    sub        ecx, 4
+    jg         convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kARGBToY
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5       // Add .5 for rounding.
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToY
+    vbroadcastf128 ymm5, kAddY16
+    vmovdqa    ymm6, kPermdARGBToY_AVX
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
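A note on the "mutates" comments (editorial, with an illustrative intrinsics sketch): the 256-bit vphaddw and vpackuswb operate within each 128-bit lane, so the 32 packed Y bytes come out with the 4-byte groups of the two lanes interleaved rather than in source order. The vpermd against kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7} is the step that undoes this:

```c
#include <immintrin.h>

// Hedged sketch of just the fix-up step: restore source order after
// per-lane hadd/pack. kPermd mirrors kPermdARGBToY_AVX above.
static __m256i FixHaddPackOrder(__m256i y_packed) {
  const __m256i kPermd = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
  return _mm256_permutevar8x32_epi32(y_packed, kPermd);
}
```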
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToYJ
+    vbroadcastf128 ymm5, kAddYJ64
+    vmovdqa    ymm6, kPermdARGBToY_AVX
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kARGBToY
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y,
+                                 int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kBGRAToY
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kBGRAToY
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kABGRToY
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kABGRToY
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kRGBAToY
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kRGBAToY
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
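For reference, a sketch of one U output of ARGBToUVRow, under the assumption of an ideal 2x2 box filter (the SIMD code uses two rounds of pavgb, whose rounding can differ by one LSB):

```c
#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Scalar sketch of one U sample: average a 2x2 block of ARGB pixels
// (row0/row1 each point at two adjacent BGRA pixels), apply the
// kARGBToU weights 112, -74, -38 (0.875, -0.578, -0.296 scaled by
// 128), then bias the signed result to unsigned as the paddb with
// kAddUV128 does.
static uint8_t ARGBBlockToU(const uint8_t* row0, const uint8_t* row1) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  return Clamp255(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
```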
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vbroadcastf128 ymm5, kAddUV128
+    vbroadcastf128 ymm6, kARGBToV
+    vbroadcastf128 ymm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax, [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88
+    vshufps    ymm0, ymm0, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub        ecx, 32
+    vextractf128 [edx], ymm0, 0        // U
+    vextractf128 [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* convert to U and V */
+    movdqa     xmm0, [eax]          // U
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+
+    movdqa     xmm0, [eax]          // V
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax, [eax + 64]
+    movdqa     [edx + edi], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* convert to U and V */
+    movdqu     xmm0, [eax]          // U
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+
+    movdqu     xmm0, [eax]          // V
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax, [eax + 64]
+    movdqu     [edx + edi], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kBGRAToU
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kBGRAToU
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kABGRToU
+    movdqa     xmm6, kABGRToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kABGRToU
+    movdqa     xmm6, kABGRToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5  // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kRGBAToU
+    movdqa     xmm6, kRGBAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax, [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kRGBAToU
    movdqa xmm6, kRGBAToV
    movdqa xmm5, kAddUV128
    sub edi, edx             // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* min(63,(int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#ifdef HAS_I422TOARGBROW_AVX2

static const lvec8 kUVToB_AVX = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const lvec8 kUVToR_AVX = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const lvec8 kUVToG_AVX = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const lvec16 kYToRgb_AVX = {
  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
};
static const lvec16 kYSub16_AVX = {
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
static const lvec16 kUVBiasB_AVX = {
  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
};
static const lvec16 kUVBiasG_AVX = {
  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
};
static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};

// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
    vpxor ymm4, ymm4, ymm4

    align 4
  convertloop:
    vmovq xmm0, qword ptr [esi]        // U
    vmovq xmm1, qword ptr [esi + edi]  // V
    lea esi, [esi + 8]
    vpunpcklbw ymm0, ymm0, ymm1  // UV
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm0, ymm0, ymm0  // UVUV
    vpmaddubsw ymm2, ymm0, kUVToB_AVX  // scale B UV
    vpmaddubsw ymm1, ymm0, kUVToG_AVX  // scale G UV
    vpmaddubsw ymm0, ymm0, kUVToR_AVX  // scale R UV
    vpsubw ymm2, ymm2, kUVBiasB_AVX    // unbias back to signed
    vpsubw ymm1, ymm1, kUVBiasG_AVX
    vpsubw ymm0, ymm0, kUVBiasR_AVX

    // Step 2: Find Y contribution to 16 R,G,B values
    vmovdqu xmm3, [eax]  // NOLINT
    lea eax, [eax + 16]
    vpermq ymm3, ymm3, 0xd8
    vpunpcklbw ymm3, ymm3, ymm4
    vpsubsw ymm3, ymm3, kYSub16_AVX
    vpmullw ymm3, ymm3, kYToRgb_AVX
    vpaddsw ymm2, ymm2, ymm3  // B += Y
    vpaddsw ymm1, ymm1, ymm3  // G += Y
    vpaddsw ymm0, ymm0, ymm3  // R += Y
    vpsraw ymm2, ymm2, 6
    vpsraw ymm1, ymm1, 6
    vpsraw ymm0, ymm0, 6
    vpackuswb ymm2, ymm2, ymm2  // B
    vpackuswb ymm1, ymm1, ymm1  // G
    vpackuswb ymm0, ymm0, ymm0  // R

    // Step 3: Weave into ARGB
    vpunpcklbw ymm2, ymm2, ymm1  // BG
    vpermq ymm2, ymm2, 0xd8
    vpunpcklbw ymm0, ymm0, ymm5  // RA
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm1, ymm2, ymm0  // BGRA first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm0  // BGRA next 8 pixels
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2

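// Editor's note (illustrative sketch, not part of the original source): the
// BB/BG/BR bias constants exist because pmaddubsw treats U and V as unsigned
// 0..255 values while the coefficients are signed. For a channel with
// coefficients (CU, CV), the kernels effectively compute
//   val = (CU*u + CV*v - (CU*128 + CV*128) + (y - 16) * YG) >> 6
// so subtracting the bias simply re-centers u and v around 128. A hedged
// scalar sketch of one channel (hypothetical helper name):
static int YuvChannel_Sketch(int y, int u, int v, int cu, int cv) {
  int val = (cu * u + cv * v - (cu * 128 + cv * 128) + (y - 16) * YG) >> 6;
  return val < 0 ? 0 : (val > 255 ? 255 : val);  // packuswb-style saturation
}
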
#ifdef HAS_I422TOARGBROW_SSSE3

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };

// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                                     \
    __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */                      \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */                \
    __asm lea esi, [esi + 8]                                                   \
    __asm punpcklbw xmm0, xmm1 /* UV */                                        \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                                     \
    __asm movd xmm0, [esi] /* U */                                             \
    __asm movd xmm1, [esi + edi] /* V */                                       \
    __asm lea esi, [esi + 4]                                                   \
    __asm punpcklbw xmm0, xmm1 /* UV */                                        \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */                           \
  }

// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                                     \
    __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */                       \
    __asm movd xmm0, ebx                                                       \
    __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */                 \
    __asm movd xmm1, ebx                                                       \
    __asm lea esi, [esi + 2]                                                   \
    __asm punpcklbw xmm0, xmm1 /* UV */                                        \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */                           \
    __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */                           \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                                       \
    __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */                     \
    __asm lea esi, [esi + 8]                                                   \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */                           \
  }

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa xmm1, xmm0                                                    \
    __asm movdqa xmm2, xmm0                                                    \
    __asm pmaddubsw xmm0, kUVToB /* scale B UV */                              \
    __asm pmaddubsw xmm1, kUVToG /* scale G UV */                              \
    __asm pmaddubsw xmm2, kUVToR /* scale R UV */                              \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */                     \
    __asm psubw xmm1, kUVBiasG                                                 \
    __asm psubw xmm2, kUVBiasR                                                 \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */                              \
    __asm lea eax, [eax + 8]                                                   \
    __asm punpcklbw xmm3, xmm4                                                 \
    __asm psubsw xmm3, kYSub16                                                 \
    __asm pmullw xmm3, kYToRgb                                                 \
    __asm paddsw xmm0, xmm3 /* B += Y */                                       \
    __asm paddsw xmm1, xmm3 /* G += Y */                                       \
    __asm paddsw xmm2, xmm3 /* R += Y */                                       \
    __asm psraw xmm0, 6                                                        \
    __asm psraw xmm1, 6                                                        \
    __asm psraw xmm2, 6                                                        \
    __asm packuswb xmm0, xmm0 /* B */                                          \
    __asm packuswb xmm1, xmm1 /* G */                                          \
    __asm packuswb xmm2, xmm2 /* R */                                          \
  }

// Convert 8 pixels: 8 VU and 8 Y.
#define YVUTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa xmm1, xmm0                                                    \
    __asm movdqa xmm2, xmm0                                                    \
    __asm pmaddubsw xmm0, kVUToB /* scale B UV */                              \
    __asm pmaddubsw xmm1, kVUToG /* scale G UV */                              \
    __asm pmaddubsw xmm2, kVUToR /* scale R UV */                              \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */                     \
    __asm psubw xmm1, kUVBiasG                                                 \
    __asm psubw xmm2, kUVBiasR                                                 \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */                              \
    __asm lea eax, [eax + 8]                                                   \
    __asm punpcklbw xmm3, xmm4                                                 \
    __asm psubsw xmm3, kYSub16                                                 \
    __asm pmullw xmm3, kYToRgb                                                 \
    __asm paddsw xmm0, xmm3 /* B += Y */                                       \
    __asm paddsw xmm1, xmm3 /* G += Y */                                       \
    __asm paddsw xmm2, xmm3 /* R += Y */                                       \
    __asm psraw xmm0, 6                                                        \
    __asm psraw xmm1, 6                                                        \
    __asm psraw xmm2, 6                                                        \
    __asm packuswb xmm0, xmm0 /* B */                                          \
    __asm packuswb xmm1, xmm1 /* G */                                          \
    __asm packuswb xmm2, xmm2 /* R */                                          \
  }

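// Editor's note (illustrative sketch, not part of the original source): a
// hedged scalar equivalent of READYUV422 + YUVTORGB + the ARGB weave, written
// with the UB..VR, BB..BR and YG macros defined above. It mirrors the
// fixed-point math, not the exact SIMD rounding. The helper name is
// hypothetical.
static void I422ToARGBRow_C_Sketch(const uint8* y_buf, const uint8* u_buf,
                                   const uint8* v_buf, uint8* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int u = u_buf[x >> 1];  // 422: one UV pair covers 2 pixels
    int v = v_buf[x >> 1];
    int y1 = (y_buf[x] - 16) * YG;
    int b = (UB * u + VB * v - (BB) + y1) >> 6;
    int g = (UG * u + VG * v - (BG) + y1) >> 6;
    int r = (UR * u + VR * v - (BR) + y1) >> 6;
    dst_argb[x * 4 + 0] = (uint8)(b < 0 ? 0 : (b > 255 ? 255 : b));
    dst_argb[x * 4 + 1] = (uint8)(g < 0 ? 0 : (g > 255 ? 255 : g));
    dst_argb[x * 4 + 2] = (uint8)(r < 0 ? 0 : (r > 255 ? 255 : r));
    dst_argb[x * 4 + 3] = 255;  // opaque alpha
  }
}
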
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgb24
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4
    movdqa xmm5, kShuffleMaskARGBToRGB24_0
    movdqa xmm6, kShuffleMaskARGBToRGB24

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm2  // RR
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRR first 4 pixels
    punpckhwd xmm1, xmm2  // BGRR next 4 pixels
    pshufb xmm0, xmm5  // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6  // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12  // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0  // First 8 bytes
    movdqu [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

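// Editor's note (illustrative sketch, not part of the original source): the
// two shuffle masks plus palignr above pack 8 BGRA dwords into 24 RGB24
// bytes. A hedged scalar equivalent of that packing step (hypothetical name):
static void ARGBToRGB24Row_C_Sketch(const uint8* src_argb, uint8* dst_rgb24,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb24[x * 3 + 0] = src_argb[x * 4 + 0];  // B
    dst_rgb24[x * 3 + 1] = src_argb[x * 4 + 1];  // G
    dst_rgb24[x * 3 + 2] = src_argb[x * 4 + 2];  // R; alpha byte is dropped
  }
}
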
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_raw,
                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // raw
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4
    movdqa xmm5, kShuffleMaskARGBToRAW_0
    movdqa xmm6, kShuffleMaskARGBToRAW

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm2  // RR
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRR first 4 pixels
    punpckhwd xmm1, xmm2  // BGRR next 4 pixels
    pshufb xmm0, xmm5  // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6  // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12  // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0  // First 8 bytes
    movdqu [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb565_buf,
                           int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgb565
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4
    pcmpeqb xmm5, xmm5  // generate mask 0x0000001f
    psrld xmm5, 27
    pcmpeqb xmm6, xmm6  // generate mask 0x000007e0
    psrld xmm6, 26
    pslld xmm6, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xfffff800
    pslld xmm7, 11

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm2  // RR
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRR first 4 pixels
    punpckhwd xmm1, xmm2  // BGRR next 4 pixels

    // Step 3b: RRGB -> RGB565
    movdqa xmm3, xmm0  // B first 4 pixels of argb
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8      // R
    psrld xmm3, 3      // B
    psrld xmm2, 5      // G
    psrad xmm0, 16     // R
    pand xmm3, xmm5    // B
    pand xmm2, xmm6    // G
    pand xmm0, xmm7    // R
    por xmm3, xmm2     // BG
    por xmm0, xmm3     // BGR
    movdqa xmm3, xmm1  // B next 4 pixels of argb
    movdqa xmm2, xmm1  // G
    pslld xmm1, 8      // R
    psrld xmm3, 3      // B
    psrld xmm2, 5      // G
    psrad xmm1, 16     // R
    pand xmm3, xmm5    // B
    pand xmm2, xmm6    // G
    pand xmm1, xmm7    // R
    por xmm3, xmm2     // BG
    por xmm1, xmm3     // BGR
    packssdw xmm0, xmm1
    sub ecx, 8
    movdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

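// Editor's note (illustrative sketch, not part of the original source): the
// three masks built above (0x1f, 0x7e0, 0xf800) implement 8:8:8 -> 5:6:5
// packing. A hedged scalar equivalent for one pixel; the helper name is
// hypothetical and uint16 is assumed to be libyuv's 16-bit typedef:
static uint16 ARGBToRGB565Pixel_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
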
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]   // Y
    mov esi, [esp + 12 + 8]   // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ecx, [esp + 12 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV411  // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // UV
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // VU
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]   // Y
    mov esi, [esp + 12 + 8]   // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ecx, [esp + 12 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV411  // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // UV
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // VU
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_bgra,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_bgra,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_abgr,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // abgr
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm2, xmm1  // RG
    punpcklbw xmm0, xmm5  // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0  // RGBA first 4 pixels
    punpckhwd xmm1, xmm0  // RGBA next 4 pixels
    movdqa [edx], xmm2
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_abgr,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // abgr
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm2, xmm1  // RG
    punpcklbw xmm0, xmm5  // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0  // RGBA first 4 pixels
    punpckhwd xmm1, xmm0  // RGBA next 4 pixels
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_rgba,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgba
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2  // GR
    punpcklbw xmm5, xmm0  // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // RGBA first 4 pixels
    punpckhwd xmm0, xmm1  // RGBA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_rgba,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgba
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 4
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2  // GR
    punpcklbw xmm5, xmm0  // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // RGBA first 4 pixels
    punpckhwd xmm0, xmm1  // RGBA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  __asm {
    pxor xmm5, xmm5
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24
    mov eax, 0x00100010
    movd xmm3, eax
    pshufd xmm3, xmm3, 0
    mov eax, 0x004a004a  // 74
    movd xmm2, eax
    pshufd xmm2, xmm2, 0
    mov eax, [esp + 4]   // Y
    mov edx, [esp + 8]   // rgb
    mov ecx, [esp + 12]  // width

    align 4
  convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm5  // 0.Y
    psubusw xmm0, xmm3
    pmullw xmm0, xmm2
    psrlw xmm0, 6
    packuswb xmm0, xmm0  // G

    // Step 2: Weave into ARGB
    punpcklbw xmm0, xmm0  // GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
    punpckhwd xmm1, xmm1  // BGRA next 4 pixels
    por xmm0, xmm4
    por xmm1, xmm4
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_YTOARGBROW_SSE2

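// Editor's note (illustrative sketch, not part of the original source):
// YToARGBRow above expands luma to gray ARGB with the same (y - 16) * 1.164
// fixed-point scale (74/64). A hedged scalar equivalent (hypothetical name):
static void YToARGBRow_C_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = y_buf[x] < 16 ? 0 : y_buf[x] - 16;  // psubusw saturates at zero
    int g = (y * 74) >> 6;
    uint8 v = (uint8)(g > 255 ? 255 : g);       // packuswb saturation
    rgb_buf[x * 4 + 0] = v;    // B
    rgb_buf[x * 4 + 1] = v;    // G
    rgb_buf[x * 4 + 2] = v;    // R
    rgb_buf[x * 4 + 3] = 255;  // alpha, as or'd in with the 0xff000000 mask
  }
}
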
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // width
    movdqa xmm5, kShuffleMirror
    lea eax, [eax - 16]

    align 4
  convertloop:
    movdqa xmm0, [eax + ecx]
    pshufb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec8 kShuffleMirror_AVX2 = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // width
    vmovdqa ymm5, kShuffleMirror_AVX2
    lea eax, [eax - 32]

    align 4
  convertloop:
    vmovdqu ymm0, [eax + ecx]
    vpshufb ymm0, ymm0, ymm5
    vpermq ymm0, ymm0, 0x4e  // swap high and low halves
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
// version cannot.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 16]

    align 4
  convertloop:
    movdqu xmm0, [eax + ecx]
    movdqa xmm1, xmm0  // swap bytes
    psllw xmm0, 8
    psrlw xmm1, 8
    por xmm0, xmm1
    pshuflw xmm0, xmm0, 0x1b  // swap words
    pshufhw xmm0, xmm0, 0x1b
    pshufd xmm0, xmm0, 0x4e  // swap qwords
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSE2

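// Editor's note (illustrative sketch, not part of the original source): all
// three mirror kernels above reduce to a byte reversal; SSSE3 uses a single
// pshufb, SSE2 composes it from shift/or plus word and dword shuffles.
// A hedged scalar equivalent (hypothetical name):
static void MirrorRow_C_Sketch(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;  // start at the last source byte
  for (x = 0; x < width; ++x) {
    dst[x] = src[-x];
  }
}
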
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked) __declspec(align(16))
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm1, kShuffleMirrorUV
    lea eax, [eax + ecx * 2 - 16]
    sub edi, edx

    align 4
  convertloop:
    movdqa xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm1
    sub ecx, 8
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [edx + edi], xmm0
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 16 + ecx * 4]  // last 4 pixels.
    movdqa xmm5, kARGBShuffleMirror

    align 4
  convertloop:
    movdqa xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 32]
    vmovdqa ymm5, kARGBShuffleMirror_AVX2

    align 4
  convertloop:
    vpermd ymm0, ymm5, [eax + ecx * 4]  // permute dword order
    sub ecx, 8
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
__declspec(naked) __declspec(align(16))
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_uv
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqa [edx], xmm0
    movdqa [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_uv
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [edx], xmm0
    movdqu [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16))
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_uv
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm2, ymm0, 8  // odd bytes
    vpsrlw ymm3, ymm1, 8
    vpand ymm0, ymm0, ymm5  // even bytes
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vpermq ymm0, ymm0, 0xd8
    vpermq ymm2, ymm2, 0xd8
    vmovdqu [edx], ymm0
    vmovdqu [edx + edi], ymm2
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2

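// Editor's note (illustrative sketch, not part of the original source): the
// SplitUV kernels above deinterleave UV pairs with a 0x00ff00ff mask (even
// bytes) and a psrlw 8 (odd bytes). A hedged scalar equivalent
// (hypothetical name):
static void SplitUVRow_C_Sketch(const uint8* src_uv, uint8* dst_u,
                                uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes
  }
}
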
#ifdef HAS_MERGEUVROW_SSE2
__declspec(naked) __declspec(align(16))
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_u
    mov edx, [esp + 4 + 8]   // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax

    align 4
  convertloop:
    movdqa xmm0, [eax]        // read 16 U's
    movdqa xmm1, [eax + edx]  // and 16 V's
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    punpcklbw xmm0, xmm1  // first 8 UV pairs
    punpckhbw xmm2, xmm1  // next 8 UV pairs
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_u
    mov edx, [esp + 4 + 8]   // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax]        // read 16 U's
    movdqu xmm1, [eax + edx]  // and 16 V's
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    punpcklbw xmm0, xmm1  // first 8 UV pairs
    punpckhbw xmm2, xmm1  // next 8 UV pairs
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
__declspec(naked) __declspec(align(16))
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_u
    mov edx, [esp + 4 + 8]   // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax

    align 4
  convertloop:
    vmovdqu ymm0, [eax]        // read 32 U's
    vmovdqu ymm1, [eax + edx]  // and 32 V's
    lea eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
    vmovdqu [edi], ymm1
    vmovdqu [edi + 32], ymm2
    lea edi, [edi + 64]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2

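// Editor's note (illustrative sketch, not part of the original source):
// MergeUV is the inverse of SplitUV; the AVX2 version needs the vperm2i128
// pair because vpunpck{l,h}bw interleave within 128-bit lanes, not across the
// whole register. A hedged scalar equivalent (hypothetical name):
static void MergeUVRow_C_Sketch(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}
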
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

// Unaligned Multiple of 1.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, esi
    mov edx, edi
    mov esi, [esp + 4]   // src
    mov edi, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    rep movsb
    mov edi, edx
    mov esi, eax
    ret
  }
}

#ifdef HAS_COPYROW_X86
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, esi
    mov edx, edi
    mov esi, [esp + 4]   // src
    mov edi, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    shr ecx, 2
    rep movsd
    mov edi, edx
    mov esi, eax
    ret
  }
}
#endif  // HAS_COPYROW_X86

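// Editor's note (illustrative sketch, not part of the original source):
// CopyRow_ERMS relies on the Enhanced REP MOVSB/STOSB fast path, which makes
// a plain rep movsb competitive with SSE copies on ERMS-capable CPUs. A
// hedged sketch of the same idea using the MSVC __movsb intrinsic (declared
// in <intrin.h>); the EXAMPLE_SKETCH guard is hypothetical so this block
// stays illustrative rather than compiled:
#ifdef EXAMPLE_SKETCH
#include <intrin.h>
static void CopyRow_ERMS_Sketch(const uint8* src, uint8* dst, int count) {
  __movsb(dst, src, (size_t)count);  // emits rep movsb
}
#endif
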
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

    align 4
  convertloop:
    movdqa xmm2, [eax]
    movdqa xmm3, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm4, [edx]
    movdqa xmm5, [edx + 16]
    pand xmm2, xmm0
    pand xmm3, xmm0
    pand xmm4, xmm1
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqa [edx], xmm2
    movdqa [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

    align 4
  convertloop:
    vmovdqu ymm1, [eax]
    vmovdqu ymm2, [eax + 32]
    lea eax, [eax + 64]
    vpblendvb ymm1, ymm1, [edx], ymm0
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

    align 4
  convertloop:
    movq xmm2, qword ptr [eax]  // 8 Y's
    lea eax, [eax + 8]
    punpcklbw xmm2, xmm2
    punpckhwd xmm3, xmm2
    punpcklwd xmm2, xmm2
    movdqa xmm4, [edx]
    movdqa xmm5, [edx + 16]
    pand xmm2, xmm0
    pand xmm3, xmm0
    pand xmm4, xmm1
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqa [edx], xmm2
    movdqa [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]   // src
    mov edx, [esp + 8]   // dst
    mov ecx, [esp + 12]  // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

    align 4
  convertloop:
    vpmovzxbd ymm1, qword ptr [eax]
    vpmovzxbd ymm2, qword ptr [eax + 8]
    lea eax, [eax + 16]
    vpslld ymm1, ymm1, 24
    vpslld ymm2, ymm2, 24
    vpblendvb ymm1, ymm1, [edx], ymm0
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

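// Editor's note (illustrative sketch, not part of the original source): the
// CopyYToAlpha kernels above splat each Y byte into the alpha lane and blend
// it with the destination's existing BGR bytes. A hedged scalar equivalent
// (hypothetical name):
static void ARGBCopyYToAlphaRow_C_Sketch(const uint8* src_y, uint8* dst_argb,
                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[x * 4 + 3] = src_y[x];  // alpha = Y; B, G, R left untouched
  }
}
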
#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow_X86(uint8* dst, uint32 v32, int count) {
  __asm {
    mov edx, edi
    mov edi, [esp + 4]   // dst
    mov eax, [esp + 8]   // v32
    mov ecx, [esp + 12]  // count
    shr ecx, 2
    rep stosd
    mov edi, edx
    ret
  }
}

// SetRow32 writes 'count' words using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  __asm {
    push esi
    push edi
    push ebp
    mov edi, [esp + 12 + 4]   // dst
    mov eax, [esp + 12 + 8]   // v32
    mov ebp, [esp + 12 + 12]  // width
    mov edx, [esp + 12 + 16]  // dst_stride
    mov esi, [esp + 12 + 20]  // height
    lea ecx, [ebp * 4]
    sub edx, ecx  // stride - width * 4

    align 4
  convertloop:
    mov ecx, ebp
    rep stosd
    add edi, edx
    sub esi, 1
    jg convertloop

    pop ebp
    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_yuy2
    mov edx, [esp + 8]   // dst_y
    mov ecx, [esp + 12]  // pix
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5  // even bytes are Y
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop
    vzeroupper
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_yuy2
    mov esi, [esp + 8 + 8]   // stride_yuy2
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // YUYV -> UVUV
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5  // U
    vpsrlw ymm0, ymm0, 8    // V
    vpackuswb ymm1, ymm1, ymm1  // mutates.
    vpackuswb ymm0, ymm0, ymm0  // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}

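// Editor's note (illustrative sketch, not part of the original source): YUY2
// stores pixels as Y0 U0 Y1 V0, so U and V live at the odd byte offsets that
// the psrlw/pand sequence above selects; the two-row UVRow variant also
// averages vertically with vpavgb. A hedged scalar equivalent of the 422
// extraction (hypothetical name):
static void YUY2ToUV422Row_C_Sketch(const uint8* src_yuy2, uint8* dst_u,
                                    uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_u[x >> 1] = src_yuy2[x * 2 + 1];  // U0
    dst_v[x >> 1] = src_yuy2[x * 2 + 3];  // V0
  }
}
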
1.3854 + vpermq ymm1, ymm1, 0xd8
1.3855 + vpermq ymm0, ymm0, 0xd8
1.3856 + vextractf128 [edx], ymm1, 0 // U
1.3857 + vextractf128 [edx + edi], ymm0, 0 // V
1.3858 + lea edx, [edx + 16]
1.3859 + sub ecx, 32
1.3860 + jg convertloop
1.3861 +
1.3862 + pop edi
1.3863 + pop esi
1.3864 + vzeroupper
1.3865 + ret
1.3866 + }
1.3867 +}
1.3868 +
1.3869 +__declspec(naked) __declspec(align(16))
1.3870 +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
1.3871 + uint8* dst_u, uint8* dst_v, int pix) {
1.3872 + __asm {
1.3873 + push edi
1.3874 + mov eax, [esp + 4 + 4] // src_yuy2
1.3875 + mov edx, [esp + 4 + 8] // dst_u
1.3876 + mov edi, [esp + 4 + 12] // dst_v
1.3877 + mov ecx, [esp + 4 + 16] // pix
1.3878 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
1.3879 + vpsrlw ymm5, ymm5, 8
1.3880 + sub edi, edx
1.3881 +
1.3882 + align 4
1.3883 + convertloop:
1.3884 + vmovdqu ymm0, [eax]
1.3885 + vmovdqu ymm1, [eax + 32]
1.3886 + lea eax, [eax + 64]
1.3887 + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
1.3888 + vpsrlw ymm1, ymm1, 8
1.3889 + vpackuswb ymm0, ymm0, ymm1 // mutates.
1.3890 + vpermq ymm0, ymm0, 0xd8
1.3891 + vpand ymm1, ymm0, ymm5 // U
1.3892 + vpsrlw ymm0, ymm0, 8 // V
1.3893 + vpackuswb ymm1, ymm1, ymm1 // mutates.
1.3894 + vpackuswb ymm0, ymm0, ymm0 // mutates.
1.3895 + vpermq ymm1, ymm1, 0xd8
1.3896 + vpermq ymm0, ymm0, 0xd8
1.3897 + vextractf128 [edx], ymm1, 0 // U
1.3898 + vextractf128 [edx + edi], ymm0, 0 // V
1.3899 + lea edx, [edx + 16]
1.3900 + sub ecx, 32
1.3901 + jg convertloop
1.3902 +
1.3903 + pop edi
1.3904 + vzeroupper
1.3905 + ret
1.3906 + }
1.3907 +}
1.3908 +
1.3909 +__declspec(naked) __declspec(align(16))
1.3910 +void UYVYToYRow_AVX2(const uint8* src_uyvy,
1.3911 + uint8* dst_y, int pix) {
1.3912 + __asm {
1.3913 + mov eax, [esp + 4] // src_uyvy
1.3914 + mov edx, [esp + 8] // dst_y
1.3915 + mov ecx, [esp + 12] // pix
1.3916 +
1.3917 + align 4
1.3918 + convertloop:
1.3919 + vmovdqu ymm0, [eax]
1.3920 + vmovdqu ymm1, [eax + 32]
1.3921 + lea eax, [eax + 64]
1.3922 + vpsrlw ymm0, ymm0, 8 // odd bytes are Y
1.3923 + vpsrlw ymm1, ymm1, 8
1.3924 + vpackuswb ymm0, ymm0, ymm1 // mutates.
1.3925 + vpermq ymm0, ymm0, 0xd8
1.3926 + sub ecx, 32
1.3927 + vmovdqu [edx], ymm0
1.3928 + lea edx, [edx + 32]
1.3929 + jg convertloop
1.3930 + vzeroupper
1.3931 + ret
1.3932 + }
1.3933 +}
1.3934 +
1.3935 +__declspec(naked) __declspec(align(16))
1.3936 +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
1.3937 + uint8* dst_u, uint8* dst_v, int pix) {
1.3938 + __asm {
1.3939 + push esi
1.3940 + push edi
1.3941 + mov eax, [esp + 8 + 4] // src_uyvy
1.3942 + mov esi, [esp + 8 + 8] // stride_uyvy
1.3943 + mov edx, [esp + 8 + 12] // dst_u
1.3944 + mov edi, [esp + 8 + 16] // dst_v
1.3945 + mov ecx, [esp + 8 + 20] // pix
1.3946 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
1.3947 + vpsrlw ymm5, ymm5, 8
1.3948 + sub edi, edx
1.3949 +
1.3950 + align 4
1.3951 + convertloop:
1.3952 + vmovdqu ymm0, [eax]
1.3953 + vmovdqu ymm1, [eax + 32]
1.3954 + vpavgb ymm0, ymm0, [eax + esi]
1.3955 + vpavgb ymm1, ymm1, [eax + esi + 32]
1.3956 + lea eax, [eax + 64]
1.3957 + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
1.3958 + vpand ymm1, ymm1, ymm5
1.3959 + vpackuswb ymm0, ymm0, ymm1 // mutates.
1.3960 + vpermq ymm0, ymm0, 0xd8
1.3961 + vpand ymm1, ymm0, ymm5 // U
1.3962 + vpsrlw ymm0, ymm0, 8 // V
1.3963 + vpackuswb ymm1, ymm1, ymm1 // mutates.
1.3964 + vpackuswb ymm0, ymm0, ymm0 // mutates.
1.3965 + vpermq ymm1, ymm1, 0xd8
1.3966 + vpermq ymm0, ymm0, 0xd8
1.3967 + vextractf128 [edx], ymm1, 0 // U
1.3968 + vextractf128 [edx + edi], ymm0, 0 // V
1.3969 + lea edx, [edx + 16]
1.3970 + sub ecx, 32
1.3971 + jg convertloop
1.3972 +
1.3973 + pop edi
1.3974 + pop esi
1.3975 + vzeroupper
1.3976 + ret
1.3977 + }
1.3978 +}
1.3979 +
1.3980 +__declspec(naked) __declspec(align(16))
1.3981 +void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
1.3982 + uint8* dst_u, uint8* dst_v, int pix) {
1.3983 + __asm {
1.3984 + push edi
1.3985 + mov eax, [esp + 4 + 4] // src_uyvy
1.3986 + mov edx, [esp + 4 + 8] // dst_u
1.3987 + mov edi, [esp + 4 + 12] // dst_v
1.3988 + mov ecx, [esp + 4 + 16] // pix
1.3989 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
1.3990 + vpsrlw ymm5, ymm5, 8
1.3991 + sub edi, edx
1.3992 +
1.3993 + align 4
1.3994 + convertloop:
1.3995 + vmovdqu ymm0, [eax]
1.3996 + vmovdqu ymm1, [eax + 32]
1.3997 + lea eax, [eax + 64]
1.3998 + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
1.3999 + vpand ymm1, ymm1, ymm5
1.4000 + vpackuswb ymm0, ymm0, ymm1 // mutates.
1.4001 + vpermq ymm0, ymm0, 0xd8
1.4002 + vpand ymm1, ymm0, ymm5 // U
1.4003 + vpsrlw ymm0, ymm0, 8 // V
1.4004 + vpackuswb ymm1, ymm1, ymm1 // mutates.
1.4005 + vpackuswb ymm0, ymm0, ymm0 // mutates.
1.4006 + vpermq ymm1, ymm1, 0xd8
1.4007 + vpermq ymm0, ymm0, 0xd8
1.4008 + vextractf128 [edx], ymm1, 0 // U
1.4009 + vextractf128 [edx + edi], ymm0, 0 // V
1.4010 + lea edx, [edx + 16]
1.4011 + sub ecx, 32
1.4012 + jg convertloop
1.4013 +
1.4014 + pop edi
1.4015 + vzeroupper
1.4016 + ret
1.4017 + }
1.4018 +}
1.4019 +#endif // HAS_YUY2TOYROW_AVX2
1.4020 +
1.4021 +#ifdef HAS_YUY2TOYROW_SSE2
1.4022 +__declspec(naked) __declspec(align(16))
1.4023 +void YUY2ToYRow_SSE2(const uint8* src_yuy2,
1.4024 + uint8* dst_y, int pix) {
1.4025 + __asm {
1.4026 + mov eax, [esp + 4] // src_yuy2
1.4027 + mov edx, [esp + 8] // dst_y
1.4028 + mov ecx, [esp + 12] // pix
1.4029 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4030 + psrlw xmm5, 8
1.4031 +
1.4032 + align 4
1.4033 + convertloop:
1.4034 + movdqa xmm0, [eax]
1.4035 + movdqa xmm1, [eax + 16]
1.4036 + lea eax, [eax + 32]
1.4037 + pand xmm0, xmm5 // even bytes are Y
1.4038 + pand xmm1, xmm5
1.4039 + packuswb xmm0, xmm1
1.4040 + sub ecx, 16
1.4041 + movdqa [edx], xmm0
1.4042 + lea edx, [edx + 16]
1.4043 + jg convertloop
1.4044 + ret
1.4045 + }
1.4046 +}
1.4047 +
1.4048 +__declspec(naked) __declspec(align(16))
1.4049 +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1.4050 + uint8* dst_u, uint8* dst_v, int pix) {
1.4051 + __asm {
1.4052 + push esi
1.4053 + push edi
1.4054 + mov eax, [esp + 8 + 4] // src_yuy2
1.4055 + mov esi, [esp + 8 + 8] // stride_yuy2
1.4056 + mov edx, [esp + 8 + 12] // dst_u
1.4057 + mov edi, [esp + 8 + 16] // dst_v
1.4058 + mov ecx, [esp + 8 + 20] // pix
1.4059 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4060 + psrlw xmm5, 8
1.4061 + sub edi, edx
1.4062 +
1.4063 + align 4
1.4064 + convertloop:
1.4065 + movdqa xmm0, [eax]
1.4066 + movdqa xmm1, [eax + 16]
1.4067 + movdqa xmm2, [eax + esi]
1.4068 + movdqa xmm3, [eax + esi + 16]
1.4069 + lea eax, [eax + 32]
1.4070 + pavgb xmm0, xmm2
1.4071 + pavgb xmm1, xmm3
1.4072 + psrlw xmm0, 8 // YUYV -> UVUV
1.4073 + psrlw xmm1, 8
1.4074 + packuswb xmm0, xmm1
1.4075 + movdqa xmm1, xmm0
1.4076 + pand xmm0, xmm5 // U
1.4077 + packuswb xmm0, xmm0
1.4078 + psrlw xmm1, 8 // V
1.4079 + packuswb xmm1, xmm1
1.4080 + movq qword ptr [edx], xmm0
1.4081 + movq qword ptr [edx + edi], xmm1
1.4082 + lea edx, [edx + 8]
1.4083 + sub ecx, 16
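    // Each iteration averages 16 YUY2 pixels (32 bytes) from two rows and
    // emits 8 U and 8 V bytes; ecx counts pixels, hence the step of 16.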
1.4084 + jg convertloop 1.4085 + 1.4086 + pop edi 1.4087 + pop esi 1.4088 + ret 1.4089 + } 1.4090 +} 1.4091 + 1.4092 +__declspec(naked) __declspec(align(16)) 1.4093 +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 1.4094 + uint8* dst_u, uint8* dst_v, int pix) { 1.4095 + __asm { 1.4096 + push edi 1.4097 + mov eax, [esp + 4 + 4] // src_yuy2 1.4098 + mov edx, [esp + 4 + 8] // dst_u 1.4099 + mov edi, [esp + 4 + 12] // dst_v 1.4100 + mov ecx, [esp + 4 + 16] // pix 1.4101 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 1.4102 + psrlw xmm5, 8 1.4103 + sub edi, edx 1.4104 + 1.4105 + align 4 1.4106 + convertloop: 1.4107 + movdqa xmm0, [eax] 1.4108 + movdqa xmm1, [eax + 16] 1.4109 + lea eax, [eax + 32] 1.4110 + psrlw xmm0, 8 // YUYV -> UVUV 1.4111 + psrlw xmm1, 8 1.4112 + packuswb xmm0, xmm1 1.4113 + movdqa xmm1, xmm0 1.4114 + pand xmm0, xmm5 // U 1.4115 + packuswb xmm0, xmm0 1.4116 + psrlw xmm1, 8 // V 1.4117 + packuswb xmm1, xmm1 1.4118 + movq qword ptr [edx], xmm0 1.4119 + movq qword ptr [edx + edi], xmm1 1.4120 + lea edx, [edx + 8] 1.4121 + sub ecx, 16 1.4122 + jg convertloop 1.4123 + 1.4124 + pop edi 1.4125 + ret 1.4126 + } 1.4127 +} 1.4128 + 1.4129 +__declspec(naked) __declspec(align(16)) 1.4130 +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, 1.4131 + uint8* dst_y, int pix) { 1.4132 + __asm { 1.4133 + mov eax, [esp + 4] // src_yuy2 1.4134 + mov edx, [esp + 8] // dst_y 1.4135 + mov ecx, [esp + 12] // pix 1.4136 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 1.4137 + psrlw xmm5, 8 1.4138 + 1.4139 + align 4 1.4140 + convertloop: 1.4141 + movdqu xmm0, [eax] 1.4142 + movdqu xmm1, [eax + 16] 1.4143 + lea eax, [eax + 32] 1.4144 + pand xmm0, xmm5 // even bytes are Y 1.4145 + pand xmm1, xmm5 1.4146 + packuswb xmm0, xmm1 1.4147 + sub ecx, 16 1.4148 + movdqu [edx], xmm0 1.4149 + lea edx, [edx + 16] 1.4150 + jg convertloop 1.4151 + ret 1.4152 + } 1.4153 +} 1.4154 + 1.4155 +__declspec(naked) __declspec(align(16)) 1.4156 +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, 1.4157 + uint8* dst_u, uint8* dst_v, int pix) { 1.4158 + __asm { 1.4159 + push esi 1.4160 + push edi 1.4161 + mov eax, [esp + 8 + 4] // src_yuy2 1.4162 + mov esi, [esp + 8 + 8] // stride_yuy2 1.4163 + mov edx, [esp + 8 + 12] // dst_u 1.4164 + mov edi, [esp + 8 + 16] // dst_v 1.4165 + mov ecx, [esp + 8 + 20] // pix 1.4166 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 1.4167 + psrlw xmm5, 8 1.4168 + sub edi, edx 1.4169 + 1.4170 + align 4 1.4171 + convertloop: 1.4172 + movdqu xmm0, [eax] 1.4173 + movdqu xmm1, [eax + 16] 1.4174 + movdqu xmm2, [eax + esi] 1.4175 + movdqu xmm3, [eax + esi + 16] 1.4176 + lea eax, [eax + 32] 1.4177 + pavgb xmm0, xmm2 1.4178 + pavgb xmm1, xmm3 1.4179 + psrlw xmm0, 8 // YUYV -> UVUV 1.4180 + psrlw xmm1, 8 1.4181 + packuswb xmm0, xmm1 1.4182 + movdqa xmm1, xmm0 1.4183 + pand xmm0, xmm5 // U 1.4184 + packuswb xmm0, xmm0 1.4185 + psrlw xmm1, 8 // V 1.4186 + packuswb xmm1, xmm1 1.4187 + movq qword ptr [edx], xmm0 1.4188 + movq qword ptr [edx + edi], xmm1 1.4189 + lea edx, [edx + 8] 1.4190 + sub ecx, 16 1.4191 + jg convertloop 1.4192 + 1.4193 + pop edi 1.4194 + pop esi 1.4195 + ret 1.4196 + } 1.4197 +} 1.4198 + 1.4199 +__declspec(naked) __declspec(align(16)) 1.4200 +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, 1.4201 + uint8* dst_u, uint8* dst_v, int pix) { 1.4202 + __asm { 1.4203 + push edi 1.4204 + mov eax, [esp + 4 + 4] // src_yuy2 1.4205 + mov edx, [esp + 4 + 8] // dst_u 1.4206 + mov edi, [esp + 4 + 12] // dst_v 1.4207 + mov ecx, [esp + 4 + 16] // pix 1.4208 + pcmpeqb 
xmm5, xmm5 // generate mask 0x00ff00ff
1.4209 + psrlw xmm5, 8
1.4210 + sub edi, edx
1.4211 +
1.4212 + align 4
1.4213 + convertloop:
1.4214 + movdqu xmm0, [eax]
1.4215 + movdqu xmm1, [eax + 16]
1.4216 + lea eax, [eax + 32]
1.4217 + psrlw xmm0, 8 // YUYV -> UVUV
1.4218 + psrlw xmm1, 8
1.4219 + packuswb xmm0, xmm1
1.4220 + movdqa xmm1, xmm0
1.4221 + pand xmm0, xmm5 // U
1.4222 + packuswb xmm0, xmm0
1.4223 + psrlw xmm1, 8 // V
1.4224 + packuswb xmm1, xmm1
1.4225 + movq qword ptr [edx], xmm0
1.4226 + movq qword ptr [edx + edi], xmm1
1.4227 + lea edx, [edx + 8]
1.4228 + sub ecx, 16
1.4229 + jg convertloop
1.4230 +
1.4231 + pop edi
1.4232 + ret
1.4233 + }
1.4234 +}
1.4235 +
1.4236 +__declspec(naked) __declspec(align(16))
1.4237 +void UYVYToYRow_SSE2(const uint8* src_uyvy,
1.4238 + uint8* dst_y, int pix) {
1.4239 + __asm {
1.4240 + mov eax, [esp + 4] // src_uyvy
1.4241 + mov edx, [esp + 8] // dst_y
1.4242 + mov ecx, [esp + 12] // pix
1.4243 +
1.4244 + align 4
1.4245 + convertloop:
1.4246 + movdqa xmm0, [eax]
1.4247 + movdqa xmm1, [eax + 16]
1.4248 + lea eax, [eax + 32]
1.4249 + psrlw xmm0, 8 // odd bytes are Y
1.4250 + psrlw xmm1, 8
1.4251 + packuswb xmm0, xmm1
1.4252 + sub ecx, 16
1.4253 + movdqa [edx], xmm0
1.4254 + lea edx, [edx + 16]
1.4255 + jg convertloop
1.4256 + ret
1.4257 + }
1.4258 +}
1.4259 +
1.4260 +__declspec(naked) __declspec(align(16))
1.4261 +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1.4262 + uint8* dst_u, uint8* dst_v, int pix) {
1.4263 + __asm {
1.4264 + push esi
1.4265 + push edi
1.4266 + mov eax, [esp + 8 + 4] // src_uyvy
1.4267 + mov esi, [esp + 8 + 8] // stride_uyvy
1.4268 + mov edx, [esp + 8 + 12] // dst_u
1.4269 + mov edi, [esp + 8 + 16] // dst_v
1.4270 + mov ecx, [esp + 8 + 20] // pix
1.4271 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4272 + psrlw xmm5, 8
1.4273 + sub edi, edx
1.4274 +
1.4275 + align 4
1.4276 + convertloop:
1.4277 + movdqa xmm0, [eax]
1.4278 + movdqa xmm1, [eax + 16]
1.4279 + movdqa xmm2, [eax + esi]
1.4280 + movdqa xmm3, [eax + esi + 16]
1.4281 + lea eax, [eax + 32]
1.4282 + pavgb xmm0, xmm2
1.4283 + pavgb xmm1, xmm3
1.4284 + pand xmm0, xmm5 // UYVY -> UVUV
1.4285 + pand xmm1, xmm5
1.4286 + packuswb xmm0, xmm1
1.4287 + movdqa xmm1, xmm0
1.4288 + pand xmm0, xmm5 // U
1.4289 + packuswb xmm0, xmm0
1.4290 + psrlw xmm1, 8 // V
1.4291 + packuswb xmm1, xmm1
1.4292 + movq qword ptr [edx], xmm0
1.4293 + movq qword ptr [edx + edi], xmm1
1.4294 + lea edx, [edx + 8]
1.4295 + sub ecx, 16
1.4296 + jg convertloop
1.4297 +
1.4298 + pop edi
1.4299 + pop esi
1.4300 + ret
1.4301 + }
1.4302 +}
1.4303 +
1.4304 +__declspec(naked) __declspec(align(16))
1.4305 +void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
1.4306 + uint8* dst_u, uint8* dst_v, int pix) {
1.4307 + __asm {
1.4308 + push edi
1.4309 + mov eax, [esp + 4 + 4] // src_uyvy
1.4310 + mov edx, [esp + 4 + 8] // dst_u
1.4311 + mov edi, [esp + 4 + 12] // dst_v
1.4312 + mov ecx, [esp + 4 + 16] // pix
1.4313 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4314 + psrlw xmm5, 8
1.4315 + sub edi, edx
1.4316 +
1.4317 + align 4
1.4318 + convertloop:
1.4319 + movdqa xmm0, [eax]
1.4320 + movdqa xmm1, [eax + 16]
1.4321 + lea eax, [eax + 32]
1.4322 + pand xmm0, xmm5 // UYVY -> UVUV
1.4323 + pand xmm1, xmm5
1.4324 + packuswb xmm0, xmm1
1.4325 + movdqa xmm1, xmm0
1.4326 + pand xmm0, xmm5 // U
1.4327 + packuswb xmm0, xmm0
1.4328 + psrlw xmm1, 8 // V
1.4329 + packuswb xmm1, xmm1
1.4330 + movq qword ptr [edx], xmm0
1.4331 + movq qword ptr [edx + edi], xmm1
1.4332 + lea edx, [edx + 8]
1.4333 + sub ecx, 16
1.4334
+ jg convertloop
1.4335 +
1.4336 + pop edi
1.4337 + ret
1.4338 + }
1.4339 +}
1.4340 +
1.4341 +__declspec(naked) __declspec(align(16))
1.4342 +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
1.4343 + uint8* dst_y, int pix) {
1.4344 + __asm {
1.4345 + mov eax, [esp + 4] // src_uyvy
1.4346 + mov edx, [esp + 8] // dst_y
1.4347 + mov ecx, [esp + 12] // pix
1.4348 +
1.4349 + align 4
1.4350 + convertloop:
1.4351 + movdqu xmm0, [eax]
1.4352 + movdqu xmm1, [eax + 16]
1.4353 + lea eax, [eax + 32]
1.4354 + psrlw xmm0, 8 // odd bytes are Y
1.4355 + psrlw xmm1, 8
1.4356 + packuswb xmm0, xmm1
1.4357 + sub ecx, 16
1.4358 + movdqu [edx], xmm0
1.4359 + lea edx, [edx + 16]
1.4360 + jg convertloop
1.4361 + ret
1.4362 + }
1.4363 +}
1.4364 +
1.4365 +__declspec(naked) __declspec(align(16))
1.4366 +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1.4367 + uint8* dst_u, uint8* dst_v, int pix) {
1.4368 + __asm {
1.4369 + push esi
1.4370 + push edi
1.4371 + mov eax, [esp + 8 + 4] // src_uyvy
1.4372 + mov esi, [esp + 8 + 8] // stride_uyvy
1.4373 + mov edx, [esp + 8 + 12] // dst_u
1.4374 + mov edi, [esp + 8 + 16] // dst_v
1.4375 + mov ecx, [esp + 8 + 20] // pix
1.4376 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4377 + psrlw xmm5, 8
1.4378 + sub edi, edx
1.4379 +
1.4380 + align 4
1.4381 + convertloop:
1.4382 + movdqu xmm0, [eax]
1.4383 + movdqu xmm1, [eax + 16]
1.4384 + movdqu xmm2, [eax + esi]
1.4385 + movdqu xmm3, [eax + esi + 16]
1.4386 + lea eax, [eax + 32]
1.4387 + pavgb xmm0, xmm2
1.4388 + pavgb xmm1, xmm3
1.4389 + pand xmm0, xmm5 // UYVY -> UVUV
1.4390 + pand xmm1, xmm5
1.4391 + packuswb xmm0, xmm1
1.4392 + movdqa xmm1, xmm0
1.4393 + pand xmm0, xmm5 // U
1.4394 + packuswb xmm0, xmm0
1.4395 + psrlw xmm1, 8 // V
1.4396 + packuswb xmm1, xmm1
1.4397 + movq qword ptr [edx], xmm0
1.4398 + movq qword ptr [edx + edi], xmm1
1.4399 + lea edx, [edx + 8]
1.4400 + sub ecx, 16
1.4401 + jg convertloop
1.4402 +
1.4403 + pop edi
1.4404 + pop esi
1.4405 + ret
1.4406 + }
1.4407 +}
1.4408 +
1.4409 +__declspec(naked) __declspec(align(16))
1.4410 +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
1.4411 + uint8* dst_u, uint8* dst_v, int pix) {
1.4412 + __asm {
1.4413 + push edi
1.4414 + mov eax, [esp + 4 + 4] // src_uyvy
1.4415 + mov edx, [esp + 4 + 8] // dst_u
1.4416 + mov edi, [esp + 4 + 12] // dst_v
1.4417 + mov ecx, [esp + 4 + 16] // pix
1.4418 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
1.4419 + psrlw xmm5, 8
1.4420 + sub edi, edx
1.4421 +
1.4422 + align 4
1.4423 + convertloop:
1.4424 + movdqu xmm0, [eax]
1.4425 + movdqu xmm1, [eax + 16]
1.4426 + lea eax, [eax + 32]
1.4427 + pand xmm0, xmm5 // UYVY -> UVUV
1.4428 + pand xmm1, xmm5
1.4429 + packuswb xmm0, xmm1
1.4430 + movdqa xmm1, xmm0
1.4431 + pand xmm0, xmm5 // U
1.4432 + packuswb xmm0, xmm0
1.4433 + psrlw xmm1, 8 // V
1.4434 + packuswb xmm1, xmm1
1.4435 + movq qword ptr [edx], xmm0
1.4436 + movq qword ptr [edx + edi], xmm1
1.4437 + lea edx, [edx + 8]
1.4438 + sub ecx, 16
1.4439 + jg convertloop
1.4440 +
1.4441 + pop edi
1.4442 + ret
1.4443 + }
1.4444 +}
1.4445 +#endif // HAS_YUY2TOYROW_SSE2
1.4446 +
1.4447 +#ifdef HAS_ARGBBLENDROW_SSE2
1.4448 +// Blend 4 pixels at a time.
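// For reference, a scalar C model of the blend the SSE2/SSSE3 rows below
// implement (illustrative only; ARGBBlendRowC is a hypothetical name):
// out = src + dst * (256 - src_alpha) / 256, channels saturated, alpha = 255.
#if 0  // reference sketch, not compiled.
static void ARGBBlendRowC(const uint8* src, const uint8* dst_in,
                          uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 ia = 256 - src[3];               // ~alpha + 1, as in the asm.
    int c;
    for (c = 0; c < 3; ++c) {               // B, G, R.
      uint32 v = src[c] + ((dst_in[c] * ia) >> 8);
      dst[c] = (uint8)(v > 255 ? 255 : v);  // paddusb saturation.
    }
    dst[3] = 255;                           // alpha forced opaque.
    src += 4;
    dst_in += 4;
    dst += 4;
  }
}
#endif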
1.4449 +__declspec(naked) __declspec(align(16)) 1.4450 +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 1.4451 + uint8* dst_argb, int width) { 1.4452 + __asm { 1.4453 + push esi 1.4454 + mov eax, [esp + 4 + 4] // src_argb0 1.4455 + mov esi, [esp + 4 + 8] // src_argb1 1.4456 + mov edx, [esp + 4 + 12] // dst_argb 1.4457 + mov ecx, [esp + 4 + 16] // width 1.4458 + pcmpeqb xmm7, xmm7 // generate constant 1 1.4459 + psrlw xmm7, 15 1.4460 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 1.4461 + psrlw xmm6, 8 1.4462 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 1.4463 + psllw xmm5, 8 1.4464 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 1.4465 + pslld xmm4, 24 1.4466 + 1.4467 + sub ecx, 1 1.4468 + je convertloop1 // only 1 pixel? 1.4469 + jl convertloop1b 1.4470 + 1.4471 + // 1 pixel loop until destination pointer is aligned. 1.4472 + alignloop1: 1.4473 + test edx, 15 // aligned? 1.4474 + je alignloop1b 1.4475 + movd xmm3, [eax] 1.4476 + lea eax, [eax + 4] 1.4477 + movdqa xmm0, xmm3 // src argb 1.4478 + pxor xmm3, xmm4 // ~alpha 1.4479 + movd xmm2, [esi] // _r_b 1.4480 + psrlw xmm3, 8 // alpha 1.4481 + pshufhw xmm3, xmm3, 0F5h // 8 alpha words 1.4482 + pshuflw xmm3, xmm3, 0F5h 1.4483 + pand xmm2, xmm6 // _r_b 1.4484 + paddw xmm3, xmm7 // 256 - alpha 1.4485 + pmullw xmm2, xmm3 // _r_b * alpha 1.4486 + movd xmm1, [esi] // _a_g 1.4487 + lea esi, [esi + 4] 1.4488 + psrlw xmm1, 8 // _a_g 1.4489 + por xmm0, xmm4 // set alpha to 255 1.4490 + pmullw xmm1, xmm3 // _a_g * alpha 1.4491 + psrlw xmm2, 8 // _r_b convert to 8 bits again 1.4492 + paddusb xmm0, xmm2 // + src argb 1.4493 + pand xmm1, xmm5 // a_g_ convert to 8 bits again 1.4494 + paddusb xmm0, xmm1 // + src argb 1.4495 + sub ecx, 1 1.4496 + movd [edx], xmm0 1.4497 + lea edx, [edx + 4] 1.4498 + jge alignloop1 1.4499 + 1.4500 + alignloop1b: 1.4501 + add ecx, 1 - 4 1.4502 + jl convertloop4b 1.4503 + 1.4504 + // 4 pixel loop. 1.4505 + convertloop4: 1.4506 + movdqu xmm3, [eax] // src argb 1.4507 + lea eax, [eax + 16] 1.4508 + movdqa xmm0, xmm3 // src argb 1.4509 + pxor xmm3, xmm4 // ~alpha 1.4510 + movdqu xmm2, [esi] // _r_b 1.4511 + psrlw xmm3, 8 // alpha 1.4512 + pshufhw xmm3, xmm3, 0F5h // 8 alpha words 1.4513 + pshuflw xmm3, xmm3, 0F5h 1.4514 + pand xmm2, xmm6 // _r_b 1.4515 + paddw xmm3, xmm7 // 256 - alpha 1.4516 + pmullw xmm2, xmm3 // _r_b * alpha 1.4517 + movdqu xmm1, [esi] // _a_g 1.4518 + lea esi, [esi + 16] 1.4519 + psrlw xmm1, 8 // _a_g 1.4520 + por xmm0, xmm4 // set alpha to 255 1.4521 + pmullw xmm1, xmm3 // _a_g * alpha 1.4522 + psrlw xmm2, 8 // _r_b convert to 8 bits again 1.4523 + paddusb xmm0, xmm2 // + src argb 1.4524 + pand xmm1, xmm5 // a_g_ convert to 8 bits again 1.4525 + paddusb xmm0, xmm1 // + src argb 1.4526 + sub ecx, 4 1.4527 + movdqa [edx], xmm0 1.4528 + lea edx, [edx + 16] 1.4529 + jge convertloop4 1.4530 + 1.4531 + convertloop4b: 1.4532 + add ecx, 4 - 1 1.4533 + jl convertloop1b 1.4534 + 1.4535 + // 1 pixel loop. 
1.4536 + convertloop1:
1.4537 + movd xmm3, [eax] // src argb
1.4538 + lea eax, [eax + 4]
1.4539 + movdqa xmm0, xmm3 // src argb
1.4540 + pxor xmm3, xmm4 // ~alpha
1.4541 + movd xmm2, [esi] // _r_b
1.4542 + psrlw xmm3, 8 // alpha
1.4543 + pshufhw xmm3, xmm3, 0F5h // 8 alpha words
1.4544 + pshuflw xmm3, xmm3, 0F5h
1.4545 + pand xmm2, xmm6 // _r_b
1.4546 + paddw xmm3, xmm7 // 256 - alpha
1.4547 + pmullw xmm2, xmm3 // _r_b * alpha
1.4548 + movd xmm1, [esi] // _a_g
1.4549 + lea esi, [esi + 4]
1.4550 + psrlw xmm1, 8 // _a_g
1.4551 + por xmm0, xmm4 // set alpha to 255
1.4552 + pmullw xmm1, xmm3 // _a_g * alpha
1.4553 + psrlw xmm2, 8 // _r_b convert to 8 bits again
1.4554 + paddusb xmm0, xmm2 // + src argb
1.4555 + pand xmm1, xmm5 // a_g_ convert to 8 bits again
1.4556 + paddusb xmm0, xmm1 // + src argb
1.4557 + sub ecx, 1
1.4558 + movd [edx], xmm0
1.4559 + lea edx, [edx + 4]
1.4560 + jge convertloop1
1.4561 +
1.4562 + convertloop1b:
1.4563 + pop esi
1.4564 + ret
1.4565 + }
1.4566 +}
1.4567 +#endif // HAS_ARGBBLENDROW_SSE2
1.4568 +
1.4569 +#ifdef HAS_ARGBBLENDROW_SSSE3
1.4570 +// Shuffle table for isolating alpha.
1.4571 +static const uvec8 kShuffleAlpha = {
1.4572 + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
1.4573 + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
1.4574 +};
1.4575 +// Same as SSE2, but replaces:
1.4576 +// psrlw xmm3, 8 // alpha
1.4577 +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
1.4578 +// pshuflw xmm3, xmm3, 0F5h
1.4579 +// with:
1.4580 +// pshufb xmm3, kShuffleAlpha // alpha
1.4581 +// Blend 4 pixels at a time.
1.4582 +
1.4583 +__declspec(naked) __declspec(align(16))
1.4584 +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
1.4585 + uint8* dst_argb, int width) {
1.4586 + __asm {
1.4587 + push esi
1.4588 + mov eax, [esp + 4 + 4] // src_argb0
1.4589 + mov esi, [esp + 4 + 8] // src_argb1
1.4590 + mov edx, [esp + 4 + 12] // dst_argb
1.4591 + mov ecx, [esp + 4 + 16] // width
1.4592 + pcmpeqb xmm7, xmm7 // generate constant 0x0001
1.4593 + psrlw xmm7, 15
1.4594 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
1.4595 + psrlw xmm6, 8
1.4596 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
1.4597 + psllw xmm5, 8
1.4598 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000
1.4599 + pslld xmm4, 24
1.4600 +
1.4601 + sub ecx, 1
1.4602 + je convertloop1 // only 1 pixel?
1.4603 + jl convertloop1b
1.4604 +
1.4605 + // 1 pixel loop until destination pointer is aligned.
1.4606 + alignloop1:
1.4607 + test edx, 15 // aligned?
1.4608 + je alignloop1b
1.4609 + movd xmm3, [eax]
1.4610 + lea eax, [eax + 4]
1.4611 + movdqa xmm0, xmm3 // src argb
1.4612 + pxor xmm3, xmm4 // ~alpha
1.4613 + movd xmm2, [esi] // _r_b
1.4614 + pshufb xmm3, kShuffleAlpha // alpha
1.4615 + pand xmm2, xmm6 // _r_b
1.4616 + paddw xmm3, xmm7 // 256 - alpha
1.4617 + pmullw xmm2, xmm3 // _r_b * alpha
1.4618 + movd xmm1, [esi] // _a_g
1.4619 + lea esi, [esi + 4]
1.4620 + psrlw xmm1, 8 // _a_g
1.4621 + por xmm0, xmm4 // set alpha to 255
1.4622 + pmullw xmm1, xmm3 // _a_g * alpha
1.4623 + psrlw xmm2, 8 // _r_b convert to 8 bits again
1.4624 + paddusb xmm0, xmm2 // + src argb
1.4625 + pand xmm1, xmm5 // a_g_ convert to 8 bits again
1.4626 + paddusb xmm0, xmm1 // + src argb
1.4627 + sub ecx, 1
1.4628 + movd [edx], xmm0
1.4629 + lea edx, [edx + 4]
1.4630 + jge alignloop1
1.4631 +
1.4632 + alignloop1b:
1.4633 + add ecx, 1 - 4
1.4634 + jl convertloop4b
1.4635 +
1.4636 + test eax, 15 // unaligned?
1.4637 + jne convertuloop4
1.4638 + test esi, 15 // unaligned?
1.4639 + jne convertuloop4
1.4640 +
1.4641 + // 4 pixel loop.
1.4642 + convertloop4: 1.4643 + movdqa xmm3, [eax] // src argb 1.4644 + lea eax, [eax + 16] 1.4645 + movdqa xmm0, xmm3 // src argb 1.4646 + pxor xmm3, xmm4 // ~alpha 1.4647 + movdqa xmm2, [esi] // _r_b 1.4648 + pshufb xmm3, kShuffleAlpha // alpha 1.4649 + pand xmm2, xmm6 // _r_b 1.4650 + paddw xmm3, xmm7 // 256 - alpha 1.4651 + pmullw xmm2, xmm3 // _r_b * alpha 1.4652 + movdqa xmm1, [esi] // _a_g 1.4653 + lea esi, [esi + 16] 1.4654 + psrlw xmm1, 8 // _a_g 1.4655 + por xmm0, xmm4 // set alpha to 255 1.4656 + pmullw xmm1, xmm3 // _a_g * alpha 1.4657 + psrlw xmm2, 8 // _r_b convert to 8 bits again 1.4658 + paddusb xmm0, xmm2 // + src argb 1.4659 + pand xmm1, xmm5 // a_g_ convert to 8 bits again 1.4660 + paddusb xmm0, xmm1 // + src argb 1.4661 + sub ecx, 4 1.4662 + movdqa [edx], xmm0 1.4663 + lea edx, [edx + 16] 1.4664 + jge convertloop4 1.4665 + jmp convertloop4b 1.4666 + 1.4667 + // 4 pixel unaligned loop. 1.4668 + convertuloop4: 1.4669 + movdqu xmm3, [eax] // src argb 1.4670 + lea eax, [eax + 16] 1.4671 + movdqa xmm0, xmm3 // src argb 1.4672 + pxor xmm3, xmm4 // ~alpha 1.4673 + movdqu xmm2, [esi] // _r_b 1.4674 + pshufb xmm3, kShuffleAlpha // alpha 1.4675 + pand xmm2, xmm6 // _r_b 1.4676 + paddw xmm3, xmm7 // 256 - alpha 1.4677 + pmullw xmm2, xmm3 // _r_b * alpha 1.4678 + movdqu xmm1, [esi] // _a_g 1.4679 + lea esi, [esi + 16] 1.4680 + psrlw xmm1, 8 // _a_g 1.4681 + por xmm0, xmm4 // set alpha to 255 1.4682 + pmullw xmm1, xmm3 // _a_g * alpha 1.4683 + psrlw xmm2, 8 // _r_b convert to 8 bits again 1.4684 + paddusb xmm0, xmm2 // + src argb 1.4685 + pand xmm1, xmm5 // a_g_ convert to 8 bits again 1.4686 + paddusb xmm0, xmm1 // + src argb 1.4687 + sub ecx, 4 1.4688 + movdqa [edx], xmm0 1.4689 + lea edx, [edx + 16] 1.4690 + jge convertuloop4 1.4691 + 1.4692 + convertloop4b: 1.4693 + add ecx, 4 - 1 1.4694 + jl convertloop1b 1.4695 + 1.4696 + // 1 pixel loop. 1.4697 + convertloop1: 1.4698 + movd xmm3, [eax] // src argb 1.4699 + lea eax, [eax + 4] 1.4700 + movdqa xmm0, xmm3 // src argb 1.4701 + pxor xmm3, xmm4 // ~alpha 1.4702 + movd xmm2, [esi] // _r_b 1.4703 + pshufb xmm3, kShuffleAlpha // alpha 1.4704 + pand xmm2, xmm6 // _r_b 1.4705 + paddw xmm3, xmm7 // 256 - alpha 1.4706 + pmullw xmm2, xmm3 // _r_b * alpha 1.4707 + movd xmm1, [esi] // _a_g 1.4708 + lea esi, [esi + 4] 1.4709 + psrlw xmm1, 8 // _a_g 1.4710 + por xmm0, xmm4 // set alpha to 255 1.4711 + pmullw xmm1, xmm3 // _a_g * alpha 1.4712 + psrlw xmm2, 8 // _r_b convert to 8 bits again 1.4713 + paddusb xmm0, xmm2 // + src argb 1.4714 + pand xmm1, xmm5 // a_g_ convert to 8 bits again 1.4715 + paddusb xmm0, xmm1 // + src argb 1.4716 + sub ecx, 1 1.4717 + movd [edx], xmm0 1.4718 + lea edx, [edx + 4] 1.4719 + jge convertloop1 1.4720 + 1.4721 + convertloop1b: 1.4722 + pop esi 1.4723 + ret 1.4724 + } 1.4725 +} 1.4726 +#endif // HAS_ARGBBLENDROW_SSSE3 1.4727 + 1.4728 +#ifdef HAS_ARGBATTENUATEROW_SSE2 1.4729 +// Attenuate 4 pixels at a time. 1.4730 +// Aligned to 16 bytes. 
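// For reference, a scalar C model of the attenuate rows below (illustrative
// only; ARGBAttenuateRowC is a hypothetical name). Duplicating each byte
// (v -> v * 0x0101) before pmulhuw is how the asm approximates v * a / 255
// without a divide.
#if 0  // reference sketch, not compiled.
static void ARGBAttenuateRowC(const uint8* src_argb, uint8* dst_argb,
                              int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 aw = src_argb[3] * 0x0101u;  // broadcast alpha word.
    int c;
    for (c = 0; c < 3; ++c) {           // B, G, R attenuated; A copied.
      uint32 v = src_argb[c] * 0x0101u;       // punpcklbw xmm0, xmm0.
      dst_argb[c] = (uint8)((v * aw) >> 24);  // pmulhuw, then psrlw 8.
    }
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif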
1.4731 +__declspec(naked) __declspec(align(16)) 1.4732 +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 1.4733 + __asm { 1.4734 + mov eax, [esp + 4] // src_argb0 1.4735 + mov edx, [esp + 8] // dst_argb 1.4736 + mov ecx, [esp + 12] // width 1.4737 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 1.4738 + pslld xmm4, 24 1.4739 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 1.4740 + psrld xmm5, 8 1.4741 + 1.4742 + align 4 1.4743 + convertloop: 1.4744 + movdqa xmm0, [eax] // read 4 pixels 1.4745 + punpcklbw xmm0, xmm0 // first 2 1.4746 + pshufhw xmm2, xmm0, 0FFh // 8 alpha words 1.4747 + pshuflw xmm2, xmm2, 0FFh 1.4748 + pmulhuw xmm0, xmm2 // rgb * a 1.4749 + movdqa xmm1, [eax] // read 4 pixels 1.4750 + punpckhbw xmm1, xmm1 // next 2 pixels 1.4751 + pshufhw xmm2, xmm1, 0FFh // 8 alpha words 1.4752 + pshuflw xmm2, xmm2, 0FFh 1.4753 + pmulhuw xmm1, xmm2 // rgb * a 1.4754 + movdqa xmm2, [eax] // alphas 1.4755 + lea eax, [eax + 16] 1.4756 + psrlw xmm0, 8 1.4757 + pand xmm2, xmm4 1.4758 + psrlw xmm1, 8 1.4759 + packuswb xmm0, xmm1 1.4760 + pand xmm0, xmm5 // keep original alphas 1.4761 + por xmm0, xmm2 1.4762 + sub ecx, 4 1.4763 + movdqa [edx], xmm0 1.4764 + lea edx, [edx + 16] 1.4765 + jg convertloop 1.4766 + 1.4767 + ret 1.4768 + } 1.4769 +} 1.4770 +#endif // HAS_ARGBATTENUATEROW_SSE2 1.4771 + 1.4772 +#ifdef HAS_ARGBATTENUATEROW_SSSE3 1.4773 +// Shuffle table duplicating alpha. 1.4774 +static const uvec8 kShuffleAlpha0 = { 1.4775 + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 1.4776 +}; 1.4777 +static const uvec8 kShuffleAlpha1 = { 1.4778 + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 1.4779 + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 1.4780 +}; 1.4781 +__declspec(naked) __declspec(align(16)) 1.4782 +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 1.4783 + __asm { 1.4784 + mov eax, [esp + 4] // src_argb0 1.4785 + mov edx, [esp + 8] // dst_argb 1.4786 + mov ecx, [esp + 12] // width 1.4787 + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 1.4788 + pslld xmm3, 24 1.4789 + movdqa xmm4, kShuffleAlpha0 1.4790 + movdqa xmm5, kShuffleAlpha1 1.4791 + 1.4792 + align 4 1.4793 + convertloop: 1.4794 + movdqu xmm0, [eax] // read 4 pixels 1.4795 + pshufb xmm0, xmm4 // isolate first 2 alphas 1.4796 + movdqu xmm1, [eax] // read 4 pixels 1.4797 + punpcklbw xmm1, xmm1 // first 2 pixel rgbs 1.4798 + pmulhuw xmm0, xmm1 // rgb * a 1.4799 + movdqu xmm1, [eax] // read 4 pixels 1.4800 + pshufb xmm1, xmm5 // isolate next 2 alphas 1.4801 + movdqu xmm2, [eax] // read 4 pixels 1.4802 + punpckhbw xmm2, xmm2 // next 2 pixel rgbs 1.4803 + pmulhuw xmm1, xmm2 // rgb * a 1.4804 + movdqu xmm2, [eax] // mask original alpha 1.4805 + lea eax, [eax + 16] 1.4806 + pand xmm2, xmm3 1.4807 + psrlw xmm0, 8 1.4808 + psrlw xmm1, 8 1.4809 + packuswb xmm0, xmm1 1.4810 + por xmm0, xmm2 // copy original alpha 1.4811 + sub ecx, 4 1.4812 + movdqu [edx], xmm0 1.4813 + lea edx, [edx + 16] 1.4814 + jg convertloop 1.4815 + 1.4816 + ret 1.4817 + } 1.4818 +} 1.4819 +#endif // HAS_ARGBATTENUATEROW_SSSE3 1.4820 + 1.4821 +#ifdef HAS_ARGBATTENUATEROW_AVX2 1.4822 +// Shuffle table duplicating alpha. 
1.4823 +static const ulvec8 kShuffleAlpha_AVX2 = {
1.4824 + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
1.4825 + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
1.4826 + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
1.4827 + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
1.4828 +};
1.4829 +__declspec(naked) __declspec(align(16))
1.4830 +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
1.4831 + __asm {
1.4832 + mov eax, [esp + 4] // src_argb0
1.4833 + mov edx, [esp + 8] // dst_argb
1.4834 + mov ecx, [esp + 12] // width
1.4835 + sub edx, eax
1.4836 + vmovdqa ymm4, kShuffleAlpha_AVX2
1.4837 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
1.4838 + vpslld ymm5, ymm5, 24
1.4839 +
1.4840 + align 4
1.4841 + convertloop:
1.4842 + vmovdqu ymm6, [eax] // read 8 pixels.
1.4843 + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
1.4844 + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
1.4845 + vpshufb ymm2, ymm0, ymm4 // low 4 alphas
1.4846 + vpshufb ymm3, ymm1, ymm4 // high 4 alphas
1.4847 + vpmulhuw ymm0, ymm0, ymm2 // rgb * a
1.4848 + vpmulhuw ymm1, ymm1, ymm3 // rgb * a
1.4849 + vpand ymm6, ymm6, ymm5 // isolate alpha
1.4850 + vpsrlw ymm0, ymm0, 8
1.4851 + vpsrlw ymm1, ymm1, 8
1.4852 + vpackuswb ymm0, ymm0, ymm1 // unmutated.
1.4853 + vpor ymm0, ymm0, ymm6 // copy original alpha
1.4854 + sub ecx, 8
1.4855 + vmovdqu [eax + edx], ymm0
1.4856 + lea eax, [eax + 32]
1.4857 + jg convertloop
1.4858 +
1.4859 + vzeroupper
1.4860 + ret
1.4861 + }
1.4862 +}
1.4863 +#endif // HAS_ARGBATTENUATEROW_AVX2
1.4864 +
1.4865 +#ifdef HAS_ARGBUNATTENUATEROW_SSE2
1.4866 +// Unattenuate 4 pixels at a time.
1.4867 +// Aligned to 16 bytes.
1.4868 +__declspec(naked) __declspec(align(16))
1.4869 +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
1.4870 + int width) {
1.4871 + __asm {
1.4872 + push esi
1.4873 + push edi
1.4874 + mov eax, [esp + 8 + 4] // src_argb0
1.4875 + mov edx, [esp + 8 + 8] // dst_argb
1.4876 + mov ecx, [esp + 8 + 12] // width
1.4877 +
1.4878 + align 4
1.4879 + convertloop:
1.4880 + movdqu xmm0, [eax] // read 4 pixels
1.4881 + movzx esi, byte ptr [eax + 3] // first alpha
1.4882 + movzx edi, byte ptr [eax + 7] // second alpha
1.4883 + punpcklbw xmm0, xmm0 // first 2
1.4884 + movd xmm2, dword ptr fixed_invtbl8[esi * 4]
1.4885 + movd xmm3, dword ptr fixed_invtbl8[edi * 4]
1.4886 + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
1.4887 + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
1.4888 + movlhps xmm2, xmm3
1.4889 + pmulhuw xmm0, xmm2 // rgb * a
1.4890 +
1.4891 + movdqu xmm1, [eax] // read 4 pixels
1.4892 + movzx esi, byte ptr [eax + 11] // third alpha
1.4893 + movzx edi, byte ptr [eax + 15] // fourth alpha
1.4894 + punpckhbw xmm1, xmm1 // next 2
1.4895 + movd xmm2, dword ptr fixed_invtbl8[esi * 4]
1.4896 + movd xmm3, dword ptr fixed_invtbl8[edi * 4]
1.4897 + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
1.4898 + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
1.4899 + movlhps xmm2, xmm3
1.4900 + pmulhuw xmm1, xmm2 // rgb * a
1.4901 + lea eax, [eax + 16]
1.4902 +
1.4903 + packuswb xmm0, xmm1
1.4904 + sub ecx, 4
1.4905 + movdqu [edx], xmm0
1.4906 + lea edx, [edx + 16]
1.4907 + jg convertloop
1.4908 + pop edi
1.4909 + pop esi
1.4910 + ret
1.4911 + }
1.4912 +}
1.4913 +#endif // HAS_ARGBUNATTENUATEROW_SSE2
1.4914 +
1.4915 +#ifdef HAS_ARGBUNATTENUATEROW_AVX2
1.4916 +// Shuffle table duplicating alpha.
1.4917 +static const ulvec8 kUnattenShuffleAlpha_AVX2 = { 1.4918 + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, 1.4919 + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, 1.4920 +}; 1.4921 +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 1.4922 +// USE_GATHER is not on by default, due to being a slow instruction. 1.4923 +#ifdef USE_GATHER 1.4924 +__declspec(naked) __declspec(align(16)) 1.4925 +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 1.4926 + int width) { 1.4927 + __asm { 1.4928 + mov eax, [esp + 4] // src_argb0 1.4929 + mov edx, [esp + 8] // dst_argb 1.4930 + mov ecx, [esp + 12] // width 1.4931 + sub edx, eax 1.4932 + vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 1.4933 + 1.4934 + align 4 1.4935 + convertloop: 1.4936 + vmovdqu ymm6, [eax] // read 8 pixels. 1.4937 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. 1.4938 + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. 1.4939 + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 1.4940 + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 1.4941 + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a 1.4942 + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 1.4943 + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 1.4944 + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a 1.4945 + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas 1.4946 + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 1.4947 + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 1.4948 + vpackuswb ymm0, ymm0, ymm1 // unmutated. 1.4949 + sub ecx, 8 1.4950 + vmovdqu [eax + edx], ymm0 1.4951 + lea eax, [eax + 32] 1.4952 + jg convertloop 1.4953 + 1.4954 + vzeroupper 1.4955 + ret 1.4956 + } 1.4957 +} 1.4958 +#else // USE_GATHER 1.4959 +__declspec(naked) __declspec(align(16)) 1.4960 +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 1.4961 + int width) { 1.4962 + __asm { 1.4963 + 1.4964 + mov eax, [esp + 4] // src_argb0 1.4965 + mov edx, [esp + 8] // dst_argb 1.4966 + mov ecx, [esp + 12] // width 1.4967 + sub edx, eax 1.4968 + vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 1.4969 + 1.4970 + push esi 1.4971 + push edi 1.4972 + 1.4973 + align 4 1.4974 + convertloop: 1.4975 + // replace VPGATHER 1.4976 + movzx esi, byte ptr [eax + 3] // alpha0 1.4977 + movzx edi, byte ptr [eax + 7] // alpha1 1.4978 + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] 1.4979 + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] 1.4980 + movzx esi, byte ptr [eax + 11] // alpha2 1.4981 + movzx edi, byte ptr [eax + 15] // alpha3 1.4982 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] 1.4983 + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] 1.4984 + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] 1.4985 + movzx esi, byte ptr [eax + 19] // alpha4 1.4986 + movzx edi, byte ptr [eax + 23] // alpha5 1.4987 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] 1.4988 + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] 1.4989 + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] 1.4990 + movzx esi, byte ptr [eax + 27] // alpha6 1.4991 + movzx edi, byte ptr [eax + 31] // alpha7 1.4992 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] 1.4993 + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] 1.4994 + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] 1.4995 + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] 1.4996 + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] 1.4997 + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] 1.4998 + 
vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
1.4999 + // end of VPGATHER
1.5000 +
1.5001 + vmovdqu ymm6, [eax] // read 8 pixels.
1.5002 + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
1.5003 + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
1.5004 + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
1.5005 + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
1.5006 + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
1.5007 + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
1.5008 + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
1.5009 + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
1.5010 + vpackuswb ymm0, ymm0, ymm1 // unmutated.
1.5011 + sub ecx, 8
1.5012 + vmovdqu [eax + edx], ymm0
1.5013 + lea eax, [eax + 32]
1.5014 + jg convertloop
1.5015 +
1.5016 + pop edi
1.5017 + pop esi
1.5018 + vzeroupper
1.5019 + ret
1.5020 + }
1.5021 +}
1.5022 +#endif // USE_GATHER
1.5023 +#endif // HAS_ARGBUNATTENUATEROW_AVX2
1.5024 +
1.5025 +#ifdef HAS_ARGBGRAYROW_SSSE3
1.5026 +// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
1.5027 +__declspec(naked) __declspec(align(16))
1.5028 +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
1.5029 + __asm {
1.5030 + mov eax, [esp + 4] /* src_argb */
1.5031 + mov edx, [esp + 8] /* dst_argb */
1.5032 + mov ecx, [esp + 12] /* width */
1.5033 + movdqa xmm4, kARGBToYJ
1.5034 + movdqa xmm5, kAddYJ64
1.5035 +
1.5036 + align 4
1.5037 + convertloop:
1.5038 + movdqa xmm0, [eax] // G
1.5039 + movdqa xmm1, [eax + 16]
1.5040 + pmaddubsw xmm0, xmm4
1.5041 + pmaddubsw xmm1, xmm4
1.5042 + phaddw xmm0, xmm1
1.5043 + paddw xmm0, xmm5 // Add .5 for rounding.
1.5044 + psrlw xmm0, 7
1.5045 + packuswb xmm0, xmm0 // 8 G bytes
1.5046 + movdqa xmm2, [eax] // A
1.5047 + movdqa xmm3, [eax + 16]
1.5048 + lea eax, [eax + 32]
1.5049 + psrld xmm2, 24
1.5050 + psrld xmm3, 24
1.5051 + packuswb xmm2, xmm3
1.5052 + packuswb xmm2, xmm2 // 8 A bytes
1.5053 + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
1.5054 + punpcklbw xmm0, xmm0 // 8 GG words
1.5055 + punpcklbw xmm3, xmm2 // 8 GA words
1.5056 + movdqa xmm1, xmm0
1.5057 + punpcklwd xmm0, xmm3 // GGGA first 4
1.5058 + punpckhwd xmm1, xmm3 // GGGA next 4
1.5059 + sub ecx, 8
1.5060 + movdqa [edx], xmm0
1.5061 + movdqa [edx + 16], xmm1
1.5062 + lea edx, [edx + 32]
1.5063 + jg convertloop
1.5064 + ret
1.5065 + }
1.5066 +}
1.5067 +#endif // HAS_ARGBGRAYROW_SSSE3
1.5068 +
1.5069 +#ifdef HAS_ARGBSEPIAROW_SSSE3
1.5070 +// b = (r * 35 + g * 68 + b * 17) >> 7
1.5071 +// g = (r * 45 + g * 88 + b * 22) >> 7
1.5072 +// r = (r * 50 + g * 98 + b * 24) >> 7
1.5073 +// Constant for ARGB color to sepia tone.
1.5074 +static const vec8 kARGBToSepiaB = {
1.5075 + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
1.5076 +};
1.5077 +
1.5078 +static const vec8 kARGBToSepiaG = {
1.5079 + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
1.5080 +};
1.5081 +
1.5082 +static const vec8 kARGBToSepiaR = {
1.5083 + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
1.5084 +};
1.5085 +
1.5086 +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
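// For reference, a scalar C model of the sepia row below using the same
// weights and >>7 scaling (illustrative only; hypothetical name):
#if 0  // reference sketch, not compiled.
static void ARGBSepiaRowC(uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;  // may exceed 255; saturate.
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
    dst_argb += 4;  // alpha byte is left unchanged.
  }
}
#endif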
1.5087 +__declspec(naked) __declspec(align(16))
1.5088 +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
1.5089 + __asm {
1.5090 + mov eax, [esp + 4] /* dst_argb */
1.5091 + mov ecx, [esp + 8] /* width */
1.5092 + movdqa xmm2, kARGBToSepiaB
1.5093 + movdqa xmm3, kARGBToSepiaG
1.5094 + movdqa xmm4, kARGBToSepiaR
1.5095 +
1.5096 + align 4
1.5097 + convertloop:
1.5098 + movdqa xmm0, [eax] // B
1.5099 + movdqa xmm6, [eax + 16]
1.5100 + pmaddubsw xmm0, xmm2
1.5101 + pmaddubsw xmm6, xmm2
1.5102 + phaddw xmm0, xmm6
1.5103 + psrlw xmm0, 7
1.5104 + packuswb xmm0, xmm0 // 8 B values
1.5105 + movdqa xmm5, [eax] // G
1.5106 + movdqa xmm1, [eax + 16]
1.5107 + pmaddubsw xmm5, xmm3
1.5108 + pmaddubsw xmm1, xmm3
1.5109 + phaddw xmm5, xmm1
1.5110 + psrlw xmm5, 7
1.5111 + packuswb xmm5, xmm5 // 8 G values
1.5112 + punpcklbw xmm0, xmm5 // 8 BG values
1.5113 + movdqa xmm5, [eax] // R
1.5114 + movdqa xmm1, [eax + 16]
1.5115 + pmaddubsw xmm5, xmm4
1.5116 + pmaddubsw xmm1, xmm4
1.5117 + phaddw xmm5, xmm1
1.5118 + psrlw xmm5, 7
1.5119 + packuswb xmm5, xmm5 // 8 R values
1.5120 + movdqa xmm6, [eax] // A
1.5121 + movdqa xmm1, [eax + 16]
1.5122 + psrld xmm6, 24
1.5123 + psrld xmm1, 24
1.5124 + packuswb xmm6, xmm1
1.5125 + packuswb xmm6, xmm6 // 8 A values
1.5126 + punpcklbw xmm5, xmm6 // 8 RA values
1.5127 + movdqa xmm1, xmm0 // Weave BG, RA together
1.5128 + punpcklwd xmm0, xmm5 // BGRA first 4
1.5129 + punpckhwd xmm1, xmm5 // BGRA next 4
1.5130 + sub ecx, 8
1.5131 + movdqa [eax], xmm0
1.5132 + movdqa [eax + 16], xmm1
1.5133 + lea eax, [eax + 32]
1.5134 + jg convertloop
1.5135 + ret
1.5136 + }
1.5137 +}
1.5138 +#endif // HAS_ARGBSEPIAROW_SSSE3
1.5139 +
1.5140 +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
1.5141 +// Transform 8 ARGB pixels (32 bytes) with color matrix.
1.5142 +// Same as Sepia except matrix is provided.
1.5143 +// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
1.5144 +// and B into a high and low, then G/A, unpckl/hbw and then unpckl/hwd.
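// For reference, a scalar C model of the color-matrix row below
// (illustrative only; hypothetical name). Each output channel is a dot
// product of the B,G,R,A input bytes with one 4-coefficient row of
// matrix_argb, scaled by >>6 (psraw 6) and saturated to 8 bits.
#if 0  // reference sketch, not compiled.
static void ARGBColorMatrixRowC(const uint8* src_argb, uint8* dst_argb,
                                const int8* matrix_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {  // B, G, R, A outputs in turn.
      const int8* m = matrix_argb + c * 4;
      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] +
               src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6;
      dst_argb[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif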
1.5145 +__declspec(naked) __declspec(align(16)) 1.5146 +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.5147 + const int8* matrix_argb, int width) { 1.5148 + __asm { 1.5149 + mov eax, [esp + 4] /* src_argb */ 1.5150 + mov edx, [esp + 8] /* dst_argb */ 1.5151 + mov ecx, [esp + 12] /* matrix_argb */ 1.5152 + movdqu xmm5, [ecx] 1.5153 + pshufd xmm2, xmm5, 0x00 1.5154 + pshufd xmm3, xmm5, 0x55 1.5155 + pshufd xmm4, xmm5, 0xaa 1.5156 + pshufd xmm5, xmm5, 0xff 1.5157 + mov ecx, [esp + 16] /* width */ 1.5158 + 1.5159 + align 4 1.5160 + convertloop: 1.5161 + movdqa xmm0, [eax] // B 1.5162 + movdqa xmm7, [eax + 16] 1.5163 + pmaddubsw xmm0, xmm2 1.5164 + pmaddubsw xmm7, xmm2 1.5165 + movdqa xmm6, [eax] // G 1.5166 + movdqa xmm1, [eax + 16] 1.5167 + pmaddubsw xmm6, xmm3 1.5168 + pmaddubsw xmm1, xmm3 1.5169 + phaddsw xmm0, xmm7 // B 1.5170 + phaddsw xmm6, xmm1 // G 1.5171 + psraw xmm0, 6 // B 1.5172 + psraw xmm6, 6 // G 1.5173 + packuswb xmm0, xmm0 // 8 B values 1.5174 + packuswb xmm6, xmm6 // 8 G values 1.5175 + punpcklbw xmm0, xmm6 // 8 BG values 1.5176 + movdqa xmm1, [eax] // R 1.5177 + movdqa xmm7, [eax + 16] 1.5178 + pmaddubsw xmm1, xmm4 1.5179 + pmaddubsw xmm7, xmm4 1.5180 + phaddsw xmm1, xmm7 // R 1.5181 + movdqa xmm6, [eax] // A 1.5182 + movdqa xmm7, [eax + 16] 1.5183 + pmaddubsw xmm6, xmm5 1.5184 + pmaddubsw xmm7, xmm5 1.5185 + phaddsw xmm6, xmm7 // A 1.5186 + psraw xmm1, 6 // R 1.5187 + psraw xmm6, 6 // A 1.5188 + packuswb xmm1, xmm1 // 8 R values 1.5189 + packuswb xmm6, xmm6 // 8 A values 1.5190 + punpcklbw xmm1, xmm6 // 8 RA values 1.5191 + movdqa xmm6, xmm0 // Weave BG, RA together 1.5192 + punpcklwd xmm0, xmm1 // BGRA first 4 1.5193 + punpckhwd xmm6, xmm1 // BGRA next 4 1.5194 + sub ecx, 8 1.5195 + movdqa [edx], xmm0 1.5196 + movdqa [edx + 16], xmm6 1.5197 + lea eax, [eax + 32] 1.5198 + lea edx, [edx + 32] 1.5199 + jg convertloop 1.5200 + ret 1.5201 + } 1.5202 +} 1.5203 +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 1.5204 + 1.5205 +#ifdef HAS_ARGBQUANTIZEROW_SSE2 1.5206 +// Quantize 4 ARGB pixels (16 bytes). 1.5207 +// Aligned to 16 bytes. 
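// For reference, a scalar C model of the quantize row below (illustrative
// only; hypothetical name). scale is presumably a 16.16 fixed-point
// reciprocal of interval_size, so (v * scale) >> 16 selects an interval
// index that is mapped back to index * interval_size + interval_offset.
#if 0  // reference sketch, not compiled.
static void ARGBQuantizeRowC(uint8* dst_argb, int scale, int interval_size,
                             int interval_offset, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int c;
    for (c = 0; c < 3; ++c) {  // B, G, R quantized; alpha preserved.
      int v = dst_argb[c];
      dst_argb[c] = (uint8)((v * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}
#endif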
1.5208 +__declspec(naked) __declspec(align(16))
1.5209 +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
1.5210 + int interval_offset, int width) {
1.5211 + __asm {
1.5212 + mov eax, [esp + 4] /* dst_argb */
1.5213 + movd xmm2, [esp + 8] /* scale */
1.5214 + movd xmm3, [esp + 12] /* interval_size */
1.5215 + movd xmm4, [esp + 16] /* interval_offset */
1.5216 + mov ecx, [esp + 20] /* width */
1.5217 + pshuflw xmm2, xmm2, 040h
1.5218 + pshufd xmm2, xmm2, 044h
1.5219 + pshuflw xmm3, xmm3, 040h
1.5220 + pshufd xmm3, xmm3, 044h
1.5221 + pshuflw xmm4, xmm4, 040h
1.5222 + pshufd xmm4, xmm4, 044h
1.5223 + pxor xmm5, xmm5 // constant 0
1.5224 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000
1.5225 + pslld xmm6, 24
1.5226 +
1.5227 + align 4
1.5228 + convertloop:
1.5229 + movdqa xmm0, [eax] // read 4 pixels
1.5230 + punpcklbw xmm0, xmm5 // first 2 pixels
1.5231 + pmulhuw xmm0, xmm2 // pixel * scale >> 16
1.5232 + movdqa xmm1, [eax] // read 4 pixels
1.5233 + punpckhbw xmm1, xmm5 // next 2 pixels
1.5234 + pmulhuw xmm1, xmm2
1.5235 + pmullw xmm0, xmm3 // * interval_size
1.5236 + movdqa xmm7, [eax] // read 4 pixels
1.5237 + pmullw xmm1, xmm3
1.5238 + pand xmm7, xmm6 // mask alpha
1.5239 + paddw xmm0, xmm4 // + interval_offset
1.5240 + paddw xmm1, xmm4
1.5241 + packuswb xmm0, xmm1
1.5242 + por xmm0, xmm7
1.5243 + sub ecx, 4
1.5244 + movdqa [eax], xmm0
1.5245 + lea eax, [eax + 16]
1.5246 + jg convertloop
1.5247 + ret
1.5248 + }
1.5249 +}
1.5250 +#endif // HAS_ARGBQUANTIZEROW_SSE2
1.5251 +
1.5252 +#ifdef HAS_ARGBSHADEROW_SSE2
1.5253 +// Shade 4 pixels at a time by specified value.
1.5254 +// Aligned to 16 bytes.
1.5255 +__declspec(naked) __declspec(align(16))
1.5256 +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
1.5257 + uint32 value) {
1.5258 + __asm {
1.5259 + mov eax, [esp + 4] // src_argb
1.5260 + mov edx, [esp + 8] // dst_argb
1.5261 + mov ecx, [esp + 12] // width
1.5262 + movd xmm2, [esp + 16] // value
1.5263 + punpcklbw xmm2, xmm2
1.5264 + punpcklqdq xmm2, xmm2
1.5265 +
1.5266 + align 4
1.5267 + convertloop:
1.5268 + movdqa xmm0, [eax] // read 4 pixels
1.5269 + lea eax, [eax + 16]
1.5270 + movdqa xmm1, xmm0
1.5271 + punpcklbw xmm0, xmm0 // first 2
1.5272 + punpckhbw xmm1, xmm1 // next 2
1.5273 + pmulhuw xmm0, xmm2 // argb * value
1.5274 + pmulhuw xmm1, xmm2 // argb * value
1.5275 + psrlw xmm0, 8
1.5276 + psrlw xmm1, 8
1.5277 + packuswb xmm0, xmm1
1.5278 + sub ecx, 4
1.5279 + movdqa [edx], xmm0
1.5280 + lea edx, [edx + 16]
1.5281 + jg convertloop
1.5282 +
1.5283 + ret
1.5284 + }
1.5285 +}
1.5286 +#endif // HAS_ARGBSHADEROW_SSE2
1.5287 +
1.5288 +#ifdef HAS_ARGBMULTIPLYROW_SSE2
1.5289 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
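// For reference, a scalar C model of the multiply rows below (illustrative
// only; hypothetical name). One input is widened as v * 0x0101 and the
// other as plain v, so pmulhuw returns (v0 * 0x0101 * v1) >> 16, a close
// approximation of v0 * v1 / 255.
#if 0  // reference sketch, not compiled.
static void ARGBMultiplyRowC(const uint8* src_argb0, const uint8* src_argb1,
                             uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {      // all 4 channels, alpha included.
    uint32 v0 = src_argb0[i] * 0x0101u;  // punpcklbw xmm0, xmm0.
    uint32 v1 = src_argb1[i];            // punpcklbw xmm2, xmm5 (zero).
    dst_argb[i] = (uint8)((v0 * v1) >> 16);
  }
}
#endif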
1.5290 +__declspec(naked) __declspec(align(16))
1.5291 +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
1.5292 + uint8* dst_argb, int width) {
1.5293 + __asm {
1.5294 + push esi
1.5295 + mov eax, [esp + 4 + 4] // src_argb0
1.5296 + mov esi, [esp + 4 + 8] // src_argb1
1.5297 + mov edx, [esp + 4 + 12] // dst_argb
1.5298 + mov ecx, [esp + 4 + 16] // width
1.5299 + pxor xmm5, xmm5 // constant 0
1.5300 +
1.5301 + align 4
1.5302 + convertloop:
1.5303 + movdqu xmm0, [eax] // read 4 pixels from src_argb0
1.5304 + movdqu xmm2, [esi] // read 4 pixels from src_argb1
1.5305 + movdqu xmm1, xmm0
1.5306 + movdqu xmm3, xmm2
1.5307 + punpcklbw xmm0, xmm0 // first 2
1.5308 + punpckhbw xmm1, xmm1 // next 2
1.5309 + punpcklbw xmm2, xmm5 // first 2
1.5310 + punpckhbw xmm3, xmm5 // next 2
1.5311 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
1.5312 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
1.5313 + lea eax, [eax + 16]
1.5314 + lea esi, [esi + 16]
1.5315 + packuswb xmm0, xmm1
1.5316 + sub ecx, 4
1.5317 + movdqu [edx], xmm0
1.5318 + lea edx, [edx + 16]
1.5319 + jg convertloop
1.5320 +
1.5321 + pop esi
1.5322 + ret
1.5323 + }
1.5324 +}
1.5325 +#endif // HAS_ARGBMULTIPLYROW_SSE2
1.5326 +
1.5327 +#ifdef HAS_ARGBADDROW_SSE2
1.5328 +// Add 2 rows of ARGB pixels together, 4 pixels at a time.
1.5329 +// TODO(fbarchard): Port this to posix, neon and other math functions.
1.5330 +__declspec(naked) __declspec(align(16))
1.5331 +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
1.5332 + uint8* dst_argb, int width) {
1.5333 + __asm {
1.5334 + push esi
1.5335 + mov eax, [esp + 4 + 4] // src_argb0
1.5336 + mov esi, [esp + 4 + 8] // src_argb1
1.5337 + mov edx, [esp + 4 + 12] // dst_argb
1.5338 + mov ecx, [esp + 4 + 16] // width
1.5339 +
1.5340 + sub ecx, 4
1.5341 + jl convertloop49
1.5342 +
1.5343 + align 4
1.5344 + convertloop4:
1.5345 + movdqu xmm0, [eax] // read 4 pixels from src_argb0
1.5346 + lea eax, [eax + 16]
1.5347 + movdqu xmm1, [esi] // read 4 pixels from src_argb1
1.5348 + lea esi, [esi + 16]
1.5349 + paddusb xmm0, xmm1 // src_argb0 + src_argb1
1.5350 + sub ecx, 4
1.5351 + movdqu [edx], xmm0
1.5352 + lea edx, [edx + 16]
1.5353 + jge convertloop4
1.5354 +
1.5355 + convertloop49:
1.5356 + add ecx, 4 - 1
1.5357 + jl convertloop19
1.5358 +
1.5359 + convertloop1:
1.5360 + movd xmm0, [eax] // read 1 pixel from src_argb0
1.5361 + lea eax, [eax + 4]
1.5362 + movd xmm1, [esi] // read 1 pixel from src_argb1
1.5363 + lea esi, [esi + 4]
1.5364 + paddusb xmm0, xmm1 // src_argb0 + src_argb1
1.5365 + sub ecx, 1
1.5366 + movd [edx], xmm0
1.5367 + lea edx, [edx + 4]
1.5368 + jge convertloop1
1.5369 +
1.5370 + convertloop19:
1.5371 + pop esi
1.5372 + ret
1.5373 + }
1.5374 +}
1.5375 +#endif // HAS_ARGBADDROW_SSE2
1.5376 +
1.5377 +#ifdef HAS_ARGBSUBTRACTROW_SSE2
1.5378 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
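// For reference, psubusb/paddusb are per-byte saturating operations; a
// scalar C model of the subtract row below (illustrative only):
#if 0  // reference sketch, not compiled.
static void ARGBSubtractRowC(const uint8* src_argb0, const uint8* src_argb1,
                             uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int v = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(v < 0 ? 0 : v);  // psubusb clamps at 0.
  }
}
#endif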
1.5379 +__declspec(naked) __declspec(align(16)) 1.5380 +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 1.5381 + uint8* dst_argb, int width) { 1.5382 + __asm { 1.5383 + push esi 1.5384 + mov eax, [esp + 4 + 4] // src_argb0 1.5385 + mov esi, [esp + 4 + 8] // src_argb1 1.5386 + mov edx, [esp + 4 + 12] // dst_argb 1.5387 + mov ecx, [esp + 4 + 16] // width 1.5388 + 1.5389 + align 4 1.5390 + convertloop: 1.5391 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 1.5392 + lea eax, [eax + 16] 1.5393 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 1.5394 + lea esi, [esi + 16] 1.5395 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 1.5396 + sub ecx, 4 1.5397 + movdqu [edx], xmm0 1.5398 + lea edx, [edx + 16] 1.5399 + jg convertloop 1.5400 + 1.5401 + pop esi 1.5402 + ret 1.5403 + } 1.5404 +} 1.5405 +#endif // HAS_ARGBSUBTRACTROW_SSE2 1.5406 + 1.5407 +#ifdef HAS_ARGBMULTIPLYROW_AVX2 1.5408 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 1.5409 +__declspec(naked) __declspec(align(16)) 1.5410 +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 1.5411 + uint8* dst_argb, int width) { 1.5412 + __asm { 1.5413 + push esi 1.5414 + mov eax, [esp + 4 + 4] // src_argb0 1.5415 + mov esi, [esp + 4 + 8] // src_argb1 1.5416 + mov edx, [esp + 4 + 12] // dst_argb 1.5417 + mov ecx, [esp + 4 + 16] // width 1.5418 + vpxor ymm5, ymm5, ymm5 // constant 0 1.5419 + 1.5420 + align 4 1.5421 + convertloop: 1.5422 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 1.5423 + lea eax, [eax + 32] 1.5424 + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 1.5425 + lea esi, [esi + 32] 1.5426 + vpunpcklbw ymm0, ymm1, ymm1 // low 4 1.5427 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 1.5428 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 1.5429 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 1.5430 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 1.5431 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 1.5432 + vpackuswb ymm0, ymm0, ymm1 1.5433 + vmovdqu [edx], ymm0 1.5434 + lea edx, [edx + 32] 1.5435 + sub ecx, 8 1.5436 + jg convertloop 1.5437 + 1.5438 + pop esi 1.5439 + vzeroupper 1.5440 + ret 1.5441 + } 1.5442 +} 1.5443 +#endif // HAS_ARGBMULTIPLYROW_AVX2 1.5444 + 1.5445 +#ifdef HAS_ARGBADDROW_AVX2 1.5446 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. 1.5447 +__declspec(naked) __declspec(align(16)) 1.5448 +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 1.5449 + uint8* dst_argb, int width) { 1.5450 + __asm { 1.5451 + push esi 1.5452 + mov eax, [esp + 4 + 4] // src_argb0 1.5453 + mov esi, [esp + 4 + 8] // src_argb1 1.5454 + mov edx, [esp + 4 + 12] // dst_argb 1.5455 + mov ecx, [esp + 4 + 16] // width 1.5456 + 1.5457 + align 4 1.5458 + convertloop: 1.5459 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 1.5460 + lea eax, [eax + 32] 1.5461 + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 1.5462 + lea esi, [esi + 32] 1.5463 + vmovdqu [edx], ymm0 1.5464 + lea edx, [edx + 32] 1.5465 + sub ecx, 8 1.5466 + jg convertloop 1.5467 + 1.5468 + pop esi 1.5469 + vzeroupper 1.5470 + ret 1.5471 + } 1.5472 +} 1.5473 +#endif // HAS_ARGBADDROW_AVX2 1.5474 + 1.5475 +#ifdef HAS_ARGBSUBTRACTROW_AVX2 1.5476 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
1.5477 +__declspec(naked) __declspec(align(16)) 1.5478 +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 1.5479 + uint8* dst_argb, int width) { 1.5480 + __asm { 1.5481 + push esi 1.5482 + mov eax, [esp + 4 + 4] // src_argb0 1.5483 + mov esi, [esp + 4 + 8] // src_argb1 1.5484 + mov edx, [esp + 4 + 12] // dst_argb 1.5485 + mov ecx, [esp + 4 + 16] // width 1.5486 + 1.5487 + align 4 1.5488 + convertloop: 1.5489 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 1.5490 + lea eax, [eax + 32] 1.5491 + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 1.5492 + lea esi, [esi + 32] 1.5493 + vmovdqu [edx], ymm0 1.5494 + lea edx, [edx + 32] 1.5495 + sub ecx, 8 1.5496 + jg convertloop 1.5497 + 1.5498 + pop esi 1.5499 + vzeroupper 1.5500 + ret 1.5501 + } 1.5502 +} 1.5503 +#endif // HAS_ARGBSUBTRACTROW_AVX2 1.5504 + 1.5505 +#ifdef HAS_SOBELXROW_SSE2 1.5506 +// SobelX as a matrix is 1.5507 +// -1 0 1 1.5508 +// -2 0 2 1.5509 +// -1 0 1 1.5510 +__declspec(naked) __declspec(align(16)) 1.5511 +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 1.5512 + const uint8* src_y2, uint8* dst_sobelx, int width) { 1.5513 + __asm { 1.5514 + push esi 1.5515 + push edi 1.5516 + mov eax, [esp + 8 + 4] // src_y0 1.5517 + mov esi, [esp + 8 + 8] // src_y1 1.5518 + mov edi, [esp + 8 + 12] // src_y2 1.5519 + mov edx, [esp + 8 + 16] // dst_sobelx 1.5520 + mov ecx, [esp + 8 + 20] // width 1.5521 + sub esi, eax 1.5522 + sub edi, eax 1.5523 + sub edx, eax 1.5524 + pxor xmm5, xmm5 // constant 0 1.5525 + 1.5526 + align 4 1.5527 + convertloop: 1.5528 + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 1.5529 + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 1.5530 + punpcklbw xmm0, xmm5 1.5531 + punpcklbw xmm1, xmm5 1.5532 + psubw xmm0, xmm1 1.5533 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 1.5534 + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 1.5535 + punpcklbw xmm1, xmm5 1.5536 + punpcklbw xmm2, xmm5 1.5537 + psubw xmm1, xmm2 1.5538 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] 1.5539 + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] 1.5540 + punpcklbw xmm2, xmm5 1.5541 + punpcklbw xmm3, xmm5 1.5542 + psubw xmm2, xmm3 1.5543 + paddw xmm0, xmm2 1.5544 + paddw xmm0, xmm1 1.5545 + paddw xmm0, xmm1 1.5546 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw 1.5547 + psubw xmm1, xmm0 1.5548 + pmaxsw xmm0, xmm1 1.5549 + packuswb xmm0, xmm0 1.5550 + sub ecx, 8 1.5551 + movq qword ptr [eax + edx], xmm0 1.5552 + lea eax, [eax + 8] 1.5553 + jg convertloop 1.5554 + 1.5555 + pop edi 1.5556 + pop esi 1.5557 + ret 1.5558 + } 1.5559 +} 1.5560 +#endif // HAS_SOBELXROW_SSE2 1.5561 + 1.5562 +#ifdef HAS_SOBELYROW_SSE2 1.5563 +// SobelY as a matrix is 1.5564 +// -1 -2 -1 1.5565 +// 0 0 0 1.5566 +// 1 2 1 1.5567 +__declspec(naked) __declspec(align(16)) 1.5568 +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 1.5569 + uint8* dst_sobely, int width) { 1.5570 + __asm { 1.5571 + push esi 1.5572 + mov eax, [esp + 4 + 4] // src_y0 1.5573 + mov esi, [esp + 4 + 8] // src_y1 1.5574 + mov edx, [esp + 4 + 12] // dst_sobely 1.5575 + mov ecx, [esp + 4 + 16] // width 1.5576 + sub esi, eax 1.5577 + sub edx, eax 1.5578 + pxor xmm5, xmm5 // constant 0 1.5579 + 1.5580 + align 4 1.5581 + convertloop: 1.5582 + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 1.5583 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 1.5584 + punpcklbw xmm0, xmm5 1.5585 + punpcklbw xmm1, xmm5 1.5586 + psubw xmm0, xmm1 1.5587 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] 1.5588 + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] 1.5589 + punpcklbw xmm1, xmm5 1.5590 + punpcklbw xmm2, xmm5 1.5591 + psubw xmm1, xmm2 1.5592 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 1.5593 + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 1.5594 + punpcklbw xmm2, xmm5 1.5595 + punpcklbw xmm3, xmm5 1.5596 + psubw xmm2, xmm3 1.5597 + paddw xmm0, xmm2 1.5598 + paddw xmm0, xmm1 1.5599 + paddw xmm0, xmm1 1.5600 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw 1.5601 + psubw xmm1, xmm0 1.5602 + pmaxsw xmm0, xmm1 1.5603 + packuswb xmm0, xmm0 1.5604 + sub ecx, 8 1.5605 + movq qword ptr [eax + edx], xmm0 1.5606 + lea eax, [eax + 8] 1.5607 + jg convertloop 1.5608 + 1.5609 + pop esi 1.5610 + ret 1.5611 + } 1.5612 +} 1.5613 +#endif // HAS_SOBELYROW_SSE2 1.5614 + 1.5615 +#ifdef HAS_SOBELROW_SSE2 1.5616 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
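// The combined edge value is the saturating byte sum sobelx + sobely
// (paddusb below), replicated into the B, G and R channels: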
1.5617 +// A = 255
1.5618 +// R = Sobel
1.5619 +// G = Sobel
1.5620 +// B = Sobel
1.5621 +__declspec(naked) __declspec(align(16))
1.5622 +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
1.5623 +                   uint8* dst_argb, int width) {
1.5624 +  __asm {
1.5625 +    push esi
1.5626 +    mov eax, [esp + 4 + 4]   // src_sobelx
1.5627 +    mov esi, [esp + 4 + 8]   // src_sobely
1.5628 +    mov edx, [esp + 4 + 12]  // dst_argb
1.5629 +    mov ecx, [esp + 4 + 16]  // width
1.5630 +    sub esi, eax
1.5631 +    pcmpeqb xmm5, xmm5  // alpha 255
1.5632 +    pslld xmm5, 24      // 0xff000000
1.5633 +
1.5634 +    align 4
1.5635 +  convertloop:
1.5636 +    movdqa xmm0, [eax]        // read 16 pixels src_sobelx
1.5637 +    movdqa xmm1, [eax + esi]  // read 16 pixels src_sobely
1.5638 +    lea eax, [eax + 16]
1.5639 +    paddusb xmm0, xmm1        // sobel = sobelx + sobely
1.5640 +    movdqa xmm2, xmm0         // GG
1.5641 +    punpcklbw xmm2, xmm0      // First 8
1.5642 +    punpckhbw xmm0, xmm0      // Next 8
1.5643 +    movdqa xmm1, xmm2         // GGGG
1.5644 +    punpcklwd xmm1, xmm2      // First 4
1.5645 +    punpckhwd xmm2, xmm2      // Next 4
1.5646 +    por xmm1, xmm5            // GGGA
1.5647 +    por xmm2, xmm5
1.5648 +    movdqa xmm3, xmm0         // GGGG
1.5649 +    punpcklwd xmm3, xmm0      // Next 4
1.5650 +    punpckhwd xmm0, xmm0      // Last 4
1.5651 +    por xmm3, xmm5            // GGGA
1.5652 +    por xmm0, xmm5
1.5653 +    sub ecx, 16
1.5654 +    movdqa [edx], xmm1
1.5655 +    movdqa [edx + 16], xmm2
1.5656 +    movdqa [edx + 32], xmm3
1.5657 +    movdqa [edx + 48], xmm0
1.5658 +    lea edx, [edx + 64]
1.5659 +    jg convertloop
1.5660 +
1.5661 +    pop esi
1.5662 +    ret
1.5663 +  }
1.5664 +}
1.5665 +#endif // HAS_SOBELROW_SSE2
1.5666 +
1.5667 +#ifdef HAS_SOBELTOPLANEROW_SSE2
1.5668 +// Adds Sobel X and Sobel Y and stores Sobel into a plane.
1.5669 +__declspec(naked) __declspec(align(16))
1.5670 +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
1.5671 +                          uint8* dst_y, int width) {
1.5672 +  __asm {
1.5673 +    push esi
1.5674 +    mov eax, [esp + 4 + 4]   // src_sobelx
1.5675 +    mov esi, [esp + 4 + 8]   // src_sobely
1.5676 +    mov edx, [esp + 4 + 12]  // dst_y
1.5677 +    mov ecx, [esp + 4 + 16]  // width
1.5678 +    sub esi, eax
1.5679 +
1.5680 +    align 4
1.5681 +  convertloop:
1.5682 +    movdqa xmm0, [eax]        // read 16 pixels src_sobelx
1.5683 +    movdqa xmm1, [eax + esi]  // read 16 pixels src_sobely
1.5684 +    lea eax, [eax + 16]
1.5685 +    paddusb xmm0, xmm1        // sobel = sobelx + sobely
1.5686 +    sub ecx, 16
1.5687 +    movdqa [edx], xmm0
1.5688 +    lea edx, [edx + 16]
1.5689 +    jg convertloop
1.5690 +
1.5691 +    pop esi
1.5692 +    ret
1.5693 +  }
1.5694 +}
1.5695 +#endif // HAS_SOBELTOPLANEROW_SSE2
1.5696 +
1.5697 +#ifdef HAS_SOBELXYROW_SSE2
1.5698 +// Mixes Sobel X, Sobel Y and Sobel into ARGB.
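+// Per pixel (illustrative): s = min(src_sobelx[i] + src_sobely[i], 255),
+// keeping the individual X and Y edge responses in R and B: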
1.5699 +// A = 255 1.5700 +// R = Sobel X 1.5701 +// G = Sobel 1.5702 +// B = Sobel Y 1.5703 +__declspec(naked) __declspec(align(16)) 1.5704 +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 1.5705 + uint8* dst_argb, int width) { 1.5706 + __asm { 1.5707 + push esi 1.5708 + mov eax, [esp + 4 + 4] // src_sobelx 1.5709 + mov esi, [esp + 4 + 8] // src_sobely 1.5710 + mov edx, [esp + 4 + 12] // dst_argb 1.5711 + mov ecx, [esp + 4 + 16] // width 1.5712 + sub esi, eax 1.5713 + pcmpeqb xmm5, xmm5 // alpha 255 1.5714 + 1.5715 + align 4 1.5716 + convertloop: 1.5717 + movdqa xmm0, [eax] // read 16 pixels src_sobelx 1.5718 + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely 1.5719 + lea eax, [eax + 16] 1.5720 + movdqa xmm2, xmm0 1.5721 + paddusb xmm2, xmm1 // sobel = sobelx + sobely 1.5722 + movdqa xmm3, xmm0 // XA 1.5723 + punpcklbw xmm3, xmm5 1.5724 + punpckhbw xmm0, xmm5 1.5725 + movdqa xmm4, xmm1 // YS 1.5726 + punpcklbw xmm4, xmm2 1.5727 + punpckhbw xmm1, xmm2 1.5728 + movdqa xmm6, xmm4 // YSXA 1.5729 + punpcklwd xmm6, xmm3 // First 4 1.5730 + punpckhwd xmm4, xmm3 // Next 4 1.5731 + movdqa xmm7, xmm1 // YSXA 1.5732 + punpcklwd xmm7, xmm0 // Next 4 1.5733 + punpckhwd xmm1, xmm0 // Last 4 1.5734 + sub ecx, 16 1.5735 + movdqa [edx], xmm6 1.5736 + movdqa [edx + 16], xmm4 1.5737 + movdqa [edx + 32], xmm7 1.5738 + movdqa [edx + 48], xmm1 1.5739 + lea edx, [edx + 64] 1.5740 + jg convertloop 1.5741 + 1.5742 + pop esi 1.5743 + ret 1.5744 + } 1.5745 +} 1.5746 +#endif // HAS_SOBELXYROW_SSE2 1.5747 + 1.5748 +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 1.5749 +// Consider float CumulativeSum. 1.5750 +// Consider calling CumulativeSum one row at time as needed. 1.5751 +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. 1.5752 +// Convert cumulative sum for an area to an average for 1 pixel. 1.5753 +// topleft is pointer to top left of CumulativeSum buffer for area. 1.5754 +// botleft is pointer to bottom left of CumulativeSum buffer. 1.5755 +// width is offset from left to right of area in CumulativeSum buffer measured 1.5756 +// in number of ints. 1.5757 +// area is the number of pixels in the area being averaged. 1.5758 +// dst points to pixel to store result to. 1.5759 +// count is number of averaged pixels to produce. 1.5760 +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte 1.5761 +// aligned. 1.5762 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 1.5763 + int width, int area, uint8* dst, 1.5764 + int count) { 1.5765 + __asm { 1.5766 + mov eax, topleft // eax topleft 1.5767 + mov esi, botleft // esi botleft 1.5768 + mov edx, width 1.5769 + movd xmm5, area 1.5770 + mov edi, dst 1.5771 + mov ecx, count 1.5772 + cvtdq2ps xmm5, xmm5 1.5773 + rcpss xmm4, xmm5 // 1.0f / area 1.5774 + pshufd xmm4, xmm4, 0 1.5775 + sub ecx, 4 1.5776 + jl l4b 1.5777 + 1.5778 + cmp area, 128 // 128 pixels will not overflow 15 bits. 1.5779 + ja l4 1.5780 + 1.5781 + pshufd xmm5, xmm5, 0 // area 1.5782 + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 1.5783 + psrld xmm6, 16 1.5784 + cvtdq2ps xmm6, xmm6 1.5785 + addps xmm5, xmm6 // (65536.0 + area - 1) 1.5786 + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area 1.5787 + cvtps2dq xmm5, xmm5 // 0.16 fixed point 1.5788 + packssdw xmm5, xmm5 // 16 bit shorts 1.5789 + 1.5790 + // 4 pixel loop small blocks. 
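+    // At this point xmm5 holds (65536 + area - 1) / area in 0.16 fixed
+    // point in every 16-bit lane, so the pmulhuw below divides each 16-bit
+    // sum by area. That is only safe while the sums fit in 15 bits, hence
+    // the area <= 128 guard above.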
1.5791 + align 4 1.5792 + s4: 1.5793 + // top left 1.5794 + movdqa xmm0, [eax] 1.5795 + movdqa xmm1, [eax + 16] 1.5796 + movdqa xmm2, [eax + 32] 1.5797 + movdqa xmm3, [eax + 48] 1.5798 + 1.5799 + // - top right 1.5800 + psubd xmm0, [eax + edx * 4] 1.5801 + psubd xmm1, [eax + edx * 4 + 16] 1.5802 + psubd xmm2, [eax + edx * 4 + 32] 1.5803 + psubd xmm3, [eax + edx * 4 + 48] 1.5804 + lea eax, [eax + 64] 1.5805 + 1.5806 + // - bottom left 1.5807 + psubd xmm0, [esi] 1.5808 + psubd xmm1, [esi + 16] 1.5809 + psubd xmm2, [esi + 32] 1.5810 + psubd xmm3, [esi + 48] 1.5811 + 1.5812 + // + bottom right 1.5813 + paddd xmm0, [esi + edx * 4] 1.5814 + paddd xmm1, [esi + edx * 4 + 16] 1.5815 + paddd xmm2, [esi + edx * 4 + 32] 1.5816 + paddd xmm3, [esi + edx * 4 + 48] 1.5817 + lea esi, [esi + 64] 1.5818 + 1.5819 + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers 1.5820 + packssdw xmm2, xmm3 1.5821 + 1.5822 + pmulhuw xmm0, xmm5 1.5823 + pmulhuw xmm2, xmm5 1.5824 + 1.5825 + packuswb xmm0, xmm2 1.5826 + movdqu [edi], xmm0 1.5827 + lea edi, [edi + 16] 1.5828 + sub ecx, 4 1.5829 + jge s4 1.5830 + 1.5831 + jmp l4b 1.5832 + 1.5833 + // 4 pixel loop 1.5834 + align 4 1.5835 + l4: 1.5836 + // top left 1.5837 + movdqa xmm0, [eax] 1.5838 + movdqa xmm1, [eax + 16] 1.5839 + movdqa xmm2, [eax + 32] 1.5840 + movdqa xmm3, [eax + 48] 1.5841 + 1.5842 + // - top right 1.5843 + psubd xmm0, [eax + edx * 4] 1.5844 + psubd xmm1, [eax + edx * 4 + 16] 1.5845 + psubd xmm2, [eax + edx * 4 + 32] 1.5846 + psubd xmm3, [eax + edx * 4 + 48] 1.5847 + lea eax, [eax + 64] 1.5848 + 1.5849 + // - bottom left 1.5850 + psubd xmm0, [esi] 1.5851 + psubd xmm1, [esi + 16] 1.5852 + psubd xmm2, [esi + 32] 1.5853 + psubd xmm3, [esi + 48] 1.5854 + 1.5855 + // + bottom right 1.5856 + paddd xmm0, [esi + edx * 4] 1.5857 + paddd xmm1, [esi + edx * 4 + 16] 1.5858 + paddd xmm2, [esi + edx * 4 + 32] 1.5859 + paddd xmm3, [esi + edx * 4 + 48] 1.5860 + lea esi, [esi + 64] 1.5861 + 1.5862 + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 1.5863 + cvtdq2ps xmm1, xmm1 1.5864 + mulps xmm0, xmm4 1.5865 + mulps xmm1, xmm4 1.5866 + cvtdq2ps xmm2, xmm2 1.5867 + cvtdq2ps xmm3, xmm3 1.5868 + mulps xmm2, xmm4 1.5869 + mulps xmm3, xmm4 1.5870 + cvtps2dq xmm0, xmm0 1.5871 + cvtps2dq xmm1, xmm1 1.5872 + cvtps2dq xmm2, xmm2 1.5873 + cvtps2dq xmm3, xmm3 1.5874 + packssdw xmm0, xmm1 1.5875 + packssdw xmm2, xmm3 1.5876 + packuswb xmm0, xmm2 1.5877 + movdqu [edi], xmm0 1.5878 + lea edi, [edi + 16] 1.5879 + sub ecx, 4 1.5880 + jge l4 1.5881 + 1.5882 + l4b: 1.5883 + add ecx, 4 - 1 1.5884 + jl l1b 1.5885 + 1.5886 + // 1 pixel loop 1.5887 + align 4 1.5888 + l1: 1.5889 + movdqa xmm0, [eax] 1.5890 + psubd xmm0, [eax + edx * 4] 1.5891 + lea eax, [eax + 16] 1.5892 + psubd xmm0, [esi] 1.5893 + paddd xmm0, [esi + edx * 4] 1.5894 + lea esi, [esi + 16] 1.5895 + cvtdq2ps xmm0, xmm0 1.5896 + mulps xmm0, xmm4 1.5897 + cvtps2dq xmm0, xmm0 1.5898 + packssdw xmm0, xmm0 1.5899 + packuswb xmm0, xmm0 1.5900 + movd dword ptr [edi], xmm0 1.5901 + lea edi, [edi + 4] 1.5902 + sub ecx, 1 1.5903 + jge l1 1.5904 + l1b: 1.5905 + } 1.5906 +} 1.5907 +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 1.5908 + 1.5909 +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 1.5910 +// Creates a table of cumulative sums where each value is a sum of all values 1.5911 +// above and to the left of the value. 
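+// In scalar form this is (an illustrative sketch, not part of the original
+// source), with a running sum per ARGB channel:
+//   int32 sum[4] = {0, 0, 0, 0};
+//   for (int x = 0; x < width; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       sum[c] += row[x * 4 + c];
+//       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+//     }
+//   }
+// The SSE2 version below keeps the four running sums in xmm0 and does
+// 4 pixels per iteration when the output pointer is 16 byte aligned.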
1.5912 +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 1.5913 + const int32* previous_cumsum, int width) { 1.5914 + __asm { 1.5915 + mov eax, row 1.5916 + mov edx, cumsum 1.5917 + mov esi, previous_cumsum 1.5918 + mov ecx, width 1.5919 + pxor xmm0, xmm0 1.5920 + pxor xmm1, xmm1 1.5921 + 1.5922 + sub ecx, 4 1.5923 + jl l4b 1.5924 + test edx, 15 1.5925 + jne l4b 1.5926 + 1.5927 + // 4 pixel loop 1.5928 + align 4 1.5929 + l4: 1.5930 + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 1.5931 + lea eax, [eax + 16] 1.5932 + movdqa xmm4, xmm2 1.5933 + 1.5934 + punpcklbw xmm2, xmm1 1.5935 + movdqa xmm3, xmm2 1.5936 + punpcklwd xmm2, xmm1 1.5937 + punpckhwd xmm3, xmm1 1.5938 + 1.5939 + punpckhbw xmm4, xmm1 1.5940 + movdqa xmm5, xmm4 1.5941 + punpcklwd xmm4, xmm1 1.5942 + punpckhwd xmm5, xmm1 1.5943 + 1.5944 + paddd xmm0, xmm2 1.5945 + movdqa xmm2, [esi] // previous row above. 1.5946 + paddd xmm2, xmm0 1.5947 + 1.5948 + paddd xmm0, xmm3 1.5949 + movdqa xmm3, [esi + 16] 1.5950 + paddd xmm3, xmm0 1.5951 + 1.5952 + paddd xmm0, xmm4 1.5953 + movdqa xmm4, [esi + 32] 1.5954 + paddd xmm4, xmm0 1.5955 + 1.5956 + paddd xmm0, xmm5 1.5957 + movdqa xmm5, [esi + 48] 1.5958 + lea esi, [esi + 64] 1.5959 + paddd xmm5, xmm0 1.5960 + 1.5961 + movdqa [edx], xmm2 1.5962 + movdqa [edx + 16], xmm3 1.5963 + movdqa [edx + 32], xmm4 1.5964 + movdqa [edx + 48], xmm5 1.5965 + 1.5966 + lea edx, [edx + 64] 1.5967 + sub ecx, 4 1.5968 + jge l4 1.5969 + 1.5970 + l4b: 1.5971 + add ecx, 4 - 1 1.5972 + jl l1b 1.5973 + 1.5974 + // 1 pixel loop 1.5975 + align 4 1.5976 + l1: 1.5977 + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 1.5978 + lea eax, [eax + 4] 1.5979 + punpcklbw xmm2, xmm1 1.5980 + punpcklwd xmm2, xmm1 1.5981 + paddd xmm0, xmm2 1.5982 + movdqu xmm2, [esi] 1.5983 + lea esi, [esi + 16] 1.5984 + paddd xmm2, xmm0 1.5985 + movdqu [edx], xmm2 1.5986 + lea edx, [edx + 16] 1.5987 + sub ecx, 1 1.5988 + jge l1 1.5989 + 1.5990 + l1b: 1.5991 + } 1.5992 +} 1.5993 +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 1.5994 + 1.5995 +#ifdef HAS_ARGBAFFINEROW_SSE2 1.5996 +// Copy ARGB pixels from source image with slope to a row of destination. 1.5997 +__declspec(naked) __declspec(align(16)) 1.5998 +LIBYUV_API 1.5999 +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 1.6000 + uint8* dst_argb, const float* uv_dudv, int width) { 1.6001 + __asm { 1.6002 + push esi 1.6003 + push edi 1.6004 + mov eax, [esp + 12] // src_argb 1.6005 + mov esi, [esp + 16] // stride 1.6006 + mov edx, [esp + 20] // dst_argb 1.6007 + mov ecx, [esp + 24] // pointer to uv_dudv 1.6008 + movq xmm2, qword ptr [ecx] // uv 1.6009 + movq xmm7, qword ptr [ecx + 8] // dudv 1.6010 + mov ecx, [esp + 28] // width 1.6011 + shl esi, 16 // 4, stride 1.6012 + add esi, 4 1.6013 + movd xmm5, esi 1.6014 + sub ecx, 4 1.6015 + jl l4b 1.6016 + 1.6017 + // setup for 4 pixel loop 1.6018 + pshufd xmm7, xmm7, 0x44 // dup dudv 1.6019 + pshufd xmm5, xmm5, 0 // dup 4, stride 1.6020 + movdqa xmm0, xmm2 // x0, y0, x1, y1 1.6021 + addps xmm0, xmm7 1.6022 + movlhps xmm2, xmm0 1.6023 + movdqa xmm4, xmm7 1.6024 + addps xmm4, xmm4 // dudv *= 2 1.6025 + movdqa xmm3, xmm2 // x2, y2, x3, y3 1.6026 + addps xmm3, xmm4 1.6027 + addps xmm4, xmm4 // dudv *= 4 1.6028 + 1.6029 + // 4 pixel loop 1.6030 + align 4 1.6031 + l4: 1.6032 + cvttps2dq xmm0, xmm2 // x, y float to int first 2 1.6033 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 1.6034 + packssdw xmm0, xmm1 // x, y as 8 shorts 1.6035 + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
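+    // xmm5 was loaded with the word pair {4, stride} replicated across the
+    // register, so the pmaddwd above collapses each (x, y) pair of shorts
+    // into a single byte offset into the source image.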
1.6036 + movd esi, xmm0 1.6037 + pshufd xmm0, xmm0, 0x39 // shift right 1.6038 + movd edi, xmm0 1.6039 + pshufd xmm0, xmm0, 0x39 // shift right 1.6040 + movd xmm1, [eax + esi] // read pixel 0 1.6041 + movd xmm6, [eax + edi] // read pixel 1 1.6042 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 1.6043 + addps xmm2, xmm4 // x, y += dx, dy first 2 1.6044 + movq qword ptr [edx], xmm1 1.6045 + movd esi, xmm0 1.6046 + pshufd xmm0, xmm0, 0x39 // shift right 1.6047 + movd edi, xmm0 1.6048 + movd xmm6, [eax + esi] // read pixel 2 1.6049 + movd xmm0, [eax + edi] // read pixel 3 1.6050 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 1.6051 + addps xmm3, xmm4 // x, y += dx, dy next 2 1.6052 + sub ecx, 4 1.6053 + movq qword ptr 8[edx], xmm6 1.6054 + lea edx, [edx + 16] 1.6055 + jge l4 1.6056 + 1.6057 + l4b: 1.6058 + add ecx, 4 - 1 1.6059 + jl l1b 1.6060 + 1.6061 + // 1 pixel loop 1.6062 + align 4 1.6063 + l1: 1.6064 + cvttps2dq xmm0, xmm2 // x, y float to int 1.6065 + packssdw xmm0, xmm0 // x, y as shorts 1.6066 + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 1.6067 + addps xmm2, xmm7 // x, y += dx, dy 1.6068 + movd esi, xmm0 1.6069 + movd xmm0, [eax + esi] // copy a pixel 1.6070 + sub ecx, 1 1.6071 + movd [edx], xmm0 1.6072 + lea edx, [edx + 4] 1.6073 + jge l1 1.6074 + l1b: 1.6075 + pop edi 1.6076 + pop esi 1.6077 + ret 1.6078 + } 1.6079 +} 1.6080 +#endif // HAS_ARGBAFFINEROW_SSE2 1.6081 + 1.6082 +#ifdef HAS_INTERPOLATEROW_AVX2 1.6083 +// Bilinear filter 16x2 -> 16x1 1.6084 +__declspec(naked) __declspec(align(16)) 1.6085 +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 1.6086 + ptrdiff_t src_stride, int dst_width, 1.6087 + int source_y_fraction) { 1.6088 + __asm { 1.6089 + push esi 1.6090 + push edi 1.6091 + mov edi, [esp + 8 + 4] // dst_ptr 1.6092 + mov esi, [esp + 8 + 8] // src_ptr 1.6093 + mov edx, [esp + 8 + 12] // src_stride 1.6094 + mov ecx, [esp + 8 + 16] // dst_width 1.6095 + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1.6096 + shr eax, 1 1.6097 + // Dispatch to specialized filters if applicable. 1.6098 + cmp eax, 0 1.6099 + je xloop100 // 0 / 128. Blend 100 / 0. 1.6100 + sub edi, esi 1.6101 + cmp eax, 32 1.6102 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 1.6103 + cmp eax, 64 1.6104 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 1.6105 + cmp eax, 96 1.6106 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 1.6107 + 1.6108 + vmovd xmm0, eax // high fraction 0..127 1.6109 + neg eax 1.6110 + add eax, 128 1.6111 + vmovd xmm5, eax // low fraction 128..1 1.6112 + vpunpcklbw xmm5, xmm5, xmm0 1.6113 + vpunpcklwd xmm5, xmm5, xmm5 1.6114 + vpxor ymm0, ymm0, ymm0 1.6115 + vpermd ymm5, ymm0, ymm5 1.6116 + 1.6117 + align 4 1.6118 + xloop: 1.6119 + vmovdqu ymm0, [esi] 1.6120 + vmovdqu ymm2, [esi + edx] 1.6121 + vpunpckhbw ymm1, ymm0, ymm2 // mutates 1.6122 + vpunpcklbw ymm0, ymm0, ymm2 // mutates 1.6123 + vpmaddubsw ymm0, ymm0, ymm5 1.6124 + vpmaddubsw ymm1, ymm1, ymm5 1.6125 + vpsrlw ymm0, ymm0, 7 1.6126 + vpsrlw ymm1, ymm1, 7 1.6127 + vpackuswb ymm0, ymm0, ymm1 // unmutates 1.6128 + sub ecx, 32 1.6129 + vmovdqu [esi + edi], ymm0 1.6130 + lea esi, [esi + 32] 1.6131 + jg xloop 1.6132 + jmp xloop99 1.6133 + 1.6134 + // Blend 25 / 75. 1.6135 + align 4 1.6136 + xloop25: 1.6137 + vmovdqu ymm0, [esi] 1.6138 + vpavgb ymm0, ymm0, [esi + edx] 1.6139 + vpavgb ymm0, ymm0, [esi + edx] 1.6140 + sub ecx, 32 1.6141 + vmovdqu [esi + edi], ymm0 1.6142 + lea esi, [esi + 32] 1.6143 + jg xloop25 1.6144 + jmp xloop99 1.6145 + 1.6146 + // Blend 50 / 50. 
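+    // vpavgb computes (a + b + 1) >> 1, an exact 50 / 50 blend; the
+    // 25 / 75 and 75 / 25 cases above and below apply it twice, biased
+    // toward one of the two rows.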
1.6147 + align 4 1.6148 + xloop50: 1.6149 + vmovdqu ymm0, [esi] 1.6150 + vpavgb ymm0, ymm0, [esi + edx] 1.6151 + sub ecx, 32 1.6152 + vmovdqu [esi + edi], ymm0 1.6153 + lea esi, [esi + 32] 1.6154 + jg xloop50 1.6155 + jmp xloop99 1.6156 + 1.6157 + // Blend 75 / 25. 1.6158 + align 4 1.6159 + xloop75: 1.6160 + vmovdqu ymm0, [esi + edx] 1.6161 + vpavgb ymm0, ymm0, [esi] 1.6162 + vpavgb ymm0, ymm0, [esi] 1.6163 + sub ecx, 32 1.6164 + vmovdqu [esi + edi], ymm0 1.6165 + lea esi, [esi + 32] 1.6166 + jg xloop75 1.6167 + jmp xloop99 1.6168 + 1.6169 + // Blend 100 / 0 - Copy row unchanged. 1.6170 + align 4 1.6171 + xloop100: 1.6172 + rep movsb 1.6173 + 1.6174 + xloop99: 1.6175 + pop edi 1.6176 + pop esi 1.6177 + vzeroupper 1.6178 + ret 1.6179 + } 1.6180 +} 1.6181 +#endif // HAS_INTERPOLATEROW_AVX2 1.6182 + 1.6183 +#ifdef HAS_INTERPOLATEROW_SSSE3 1.6184 +// Bilinear filter 16x2 -> 16x1 1.6185 +__declspec(naked) __declspec(align(16)) 1.6186 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1.6187 + ptrdiff_t src_stride, int dst_width, 1.6188 + int source_y_fraction) { 1.6189 + __asm { 1.6190 + push esi 1.6191 + push edi 1.6192 + mov edi, [esp + 8 + 4] // dst_ptr 1.6193 + mov esi, [esp + 8 + 8] // src_ptr 1.6194 + mov edx, [esp + 8 + 12] // src_stride 1.6195 + mov ecx, [esp + 8 + 16] // dst_width 1.6196 + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1.6197 + sub edi, esi 1.6198 + shr eax, 1 1.6199 + // Dispatch to specialized filters if applicable. 1.6200 + cmp eax, 0 1.6201 + je xloop100 // 0 / 128. Blend 100 / 0. 1.6202 + cmp eax, 32 1.6203 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 1.6204 + cmp eax, 64 1.6205 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 1.6206 + cmp eax, 96 1.6207 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 1.6208 + 1.6209 + movd xmm0, eax // high fraction 0..127 1.6210 + neg eax 1.6211 + add eax, 128 1.6212 + movd xmm5, eax // low fraction 128..1 1.6213 + punpcklbw xmm5, xmm0 1.6214 + punpcklwd xmm5, xmm5 1.6215 + pshufd xmm5, xmm5, 0 1.6216 + 1.6217 + align 4 1.6218 + xloop: 1.6219 + movdqa xmm0, [esi] 1.6220 + movdqa xmm2, [esi + edx] 1.6221 + movdqa xmm1, xmm0 1.6222 + punpcklbw xmm0, xmm2 1.6223 + punpckhbw xmm1, xmm2 1.6224 + pmaddubsw xmm0, xmm5 1.6225 + pmaddubsw xmm1, xmm5 1.6226 + psrlw xmm0, 7 1.6227 + psrlw xmm1, 7 1.6228 + packuswb xmm0, xmm1 1.6229 + sub ecx, 16 1.6230 + movdqa [esi + edi], xmm0 1.6231 + lea esi, [esi + 16] 1.6232 + jg xloop 1.6233 + jmp xloop99 1.6234 + 1.6235 + // Blend 25 / 75. 1.6236 + align 4 1.6237 + xloop25: 1.6238 + movdqa xmm0, [esi] 1.6239 + movdqa xmm1, [esi + edx] 1.6240 + pavgb xmm0, xmm1 1.6241 + pavgb xmm0, xmm1 1.6242 + sub ecx, 16 1.6243 + movdqa [esi + edi], xmm0 1.6244 + lea esi, [esi + 16] 1.6245 + jg xloop25 1.6246 + jmp xloop99 1.6247 + 1.6248 + // Blend 50 / 50. 1.6249 + align 4 1.6250 + xloop50: 1.6251 + movdqa xmm0, [esi] 1.6252 + movdqa xmm1, [esi + edx] 1.6253 + pavgb xmm0, xmm1 1.6254 + sub ecx, 16 1.6255 + movdqa [esi + edi], xmm0 1.6256 + lea esi, [esi + 16] 1.6257 + jg xloop50 1.6258 + jmp xloop99 1.6259 + 1.6260 + // Blend 75 / 25. 1.6261 + align 4 1.6262 + xloop75: 1.6263 + movdqa xmm1, [esi] 1.6264 + movdqa xmm0, [esi + edx] 1.6265 + pavgb xmm0, xmm1 1.6266 + pavgb xmm0, xmm1 1.6267 + sub ecx, 16 1.6268 + movdqa [esi + edi], xmm0 1.6269 + lea esi, [esi + 16] 1.6270 + jg xloop75 1.6271 + jmp xloop99 1.6272 + 1.6273 + // Blend 100 / 0 - Copy row unchanged. 
1.6274 + align 4 1.6275 + xloop100: 1.6276 + movdqa xmm0, [esi] 1.6277 + sub ecx, 16 1.6278 + movdqa [esi + edi], xmm0 1.6279 + lea esi, [esi + 16] 1.6280 + jg xloop100 1.6281 + 1.6282 + xloop99: 1.6283 + pop edi 1.6284 + pop esi 1.6285 + ret 1.6286 + } 1.6287 +} 1.6288 +#endif // HAS_INTERPOLATEROW_SSSE3 1.6289 + 1.6290 +#ifdef HAS_INTERPOLATEROW_SSE2 1.6291 +// Bilinear filter 16x2 -> 16x1 1.6292 +__declspec(naked) __declspec(align(16)) 1.6293 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1.6294 + ptrdiff_t src_stride, int dst_width, 1.6295 + int source_y_fraction) { 1.6296 + __asm { 1.6297 + push esi 1.6298 + push edi 1.6299 + mov edi, [esp + 8 + 4] // dst_ptr 1.6300 + mov esi, [esp + 8 + 8] // src_ptr 1.6301 + mov edx, [esp + 8 + 12] // src_stride 1.6302 + mov ecx, [esp + 8 + 16] // dst_width 1.6303 + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1.6304 + sub edi, esi 1.6305 + // Dispatch to specialized filters if applicable. 1.6306 + cmp eax, 0 1.6307 + je xloop100 // 0 / 256. Blend 100 / 0. 1.6308 + cmp eax, 64 1.6309 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. 1.6310 + cmp eax, 128 1.6311 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 1.6312 + cmp eax, 192 1.6313 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 1.6314 + 1.6315 + movd xmm5, eax // xmm5 = y fraction 1.6316 + punpcklbw xmm5, xmm5 1.6317 + psrlw xmm5, 1 1.6318 + punpcklwd xmm5, xmm5 1.6319 + punpckldq xmm5, xmm5 1.6320 + punpcklqdq xmm5, xmm5 1.6321 + pxor xmm4, xmm4 1.6322 + 1.6323 + align 4 1.6324 + xloop: 1.6325 + movdqa xmm0, [esi] // row0 1.6326 + movdqa xmm2, [esi + edx] // row1 1.6327 + movdqa xmm1, xmm0 1.6328 + movdqa xmm3, xmm2 1.6329 + punpcklbw xmm2, xmm4 1.6330 + punpckhbw xmm3, xmm4 1.6331 + punpcklbw xmm0, xmm4 1.6332 + punpckhbw xmm1, xmm4 1.6333 + psubw xmm2, xmm0 // row1 - row0 1.6334 + psubw xmm3, xmm1 1.6335 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 1.6336 + paddw xmm3, xmm3 1.6337 + pmulhw xmm2, xmm5 // scale diff 1.6338 + pmulhw xmm3, xmm5 1.6339 + paddw xmm0, xmm2 // sum rows 1.6340 + paddw xmm1, xmm3 1.6341 + packuswb xmm0, xmm1 1.6342 + sub ecx, 16 1.6343 + movdqa [esi + edi], xmm0 1.6344 + lea esi, [esi + 16] 1.6345 + jg xloop 1.6346 + jmp xloop99 1.6347 + 1.6348 + // Blend 25 / 75. 1.6349 + align 4 1.6350 + xloop25: 1.6351 + movdqa xmm0, [esi] 1.6352 + movdqa xmm1, [esi + edx] 1.6353 + pavgb xmm0, xmm1 1.6354 + pavgb xmm0, xmm1 1.6355 + sub ecx, 16 1.6356 + movdqa [esi + edi], xmm0 1.6357 + lea esi, [esi + 16] 1.6358 + jg xloop25 1.6359 + jmp xloop99 1.6360 + 1.6361 + // Blend 50 / 50. 1.6362 + align 4 1.6363 + xloop50: 1.6364 + movdqa xmm0, [esi] 1.6365 + movdqa xmm1, [esi + edx] 1.6366 + pavgb xmm0, xmm1 1.6367 + sub ecx, 16 1.6368 + movdqa [esi + edi], xmm0 1.6369 + lea esi, [esi + 16] 1.6370 + jg xloop50 1.6371 + jmp xloop99 1.6372 + 1.6373 + // Blend 75 / 25. 1.6374 + align 4 1.6375 + xloop75: 1.6376 + movdqa xmm1, [esi] 1.6377 + movdqa xmm0, [esi + edx] 1.6378 + pavgb xmm0, xmm1 1.6379 + pavgb xmm0, xmm1 1.6380 + sub ecx, 16 1.6381 + movdqa [esi + edi], xmm0 1.6382 + lea esi, [esi + 16] 1.6383 + jg xloop75 1.6384 + jmp xloop99 1.6385 + 1.6386 + // Blend 100 / 0 - Copy row unchanged. 
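+    // A fraction of 0 selects row0 exactly, so the row is copied with no
+    // second-row read and no arithmetic.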
1.6387 + align 4 1.6388 + xloop100: 1.6389 + movdqa xmm0, [esi] 1.6390 + sub ecx, 16 1.6391 + movdqa [esi + edi], xmm0 1.6392 + lea esi, [esi + 16] 1.6393 + jg xloop100 1.6394 + 1.6395 + xloop99: 1.6396 + pop edi 1.6397 + pop esi 1.6398 + ret 1.6399 + } 1.6400 +} 1.6401 +#endif // HAS_INTERPOLATEROW_SSE2 1.6402 + 1.6403 +// Bilinear filter 16x2 -> 16x1 1.6404 +__declspec(naked) __declspec(align(16)) 1.6405 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1.6406 + ptrdiff_t src_stride, int dst_width, 1.6407 + int source_y_fraction) { 1.6408 + __asm { 1.6409 + push esi 1.6410 + push edi 1.6411 + mov edi, [esp + 8 + 4] // dst_ptr 1.6412 + mov esi, [esp + 8 + 8] // src_ptr 1.6413 + mov edx, [esp + 8 + 12] // src_stride 1.6414 + mov ecx, [esp + 8 + 16] // dst_width 1.6415 + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1.6416 + sub edi, esi 1.6417 + shr eax, 1 1.6418 + // Dispatch to specialized filters if applicable. 1.6419 + cmp eax, 0 1.6420 + je xloop100 // 0 / 128. Blend 100 / 0. 1.6421 + cmp eax, 32 1.6422 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 1.6423 + cmp eax, 64 1.6424 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 1.6425 + cmp eax, 96 1.6426 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 1.6427 + 1.6428 + movd xmm0, eax // high fraction 0..127 1.6429 + neg eax 1.6430 + add eax, 128 1.6431 + movd xmm5, eax // low fraction 128..1 1.6432 + punpcklbw xmm5, xmm0 1.6433 + punpcklwd xmm5, xmm5 1.6434 + pshufd xmm5, xmm5, 0 1.6435 + 1.6436 + align 4 1.6437 + xloop: 1.6438 + movdqu xmm0, [esi] 1.6439 + movdqu xmm2, [esi + edx] 1.6440 + movdqu xmm1, xmm0 1.6441 + punpcklbw xmm0, xmm2 1.6442 + punpckhbw xmm1, xmm2 1.6443 + pmaddubsw xmm0, xmm5 1.6444 + pmaddubsw xmm1, xmm5 1.6445 + psrlw xmm0, 7 1.6446 + psrlw xmm1, 7 1.6447 + packuswb xmm0, xmm1 1.6448 + sub ecx, 16 1.6449 + movdqu [esi + edi], xmm0 1.6450 + lea esi, [esi + 16] 1.6451 + jg xloop 1.6452 + jmp xloop99 1.6453 + 1.6454 + // Blend 25 / 75. 1.6455 + align 4 1.6456 + xloop25: 1.6457 + movdqu xmm0, [esi] 1.6458 + movdqu xmm1, [esi + edx] 1.6459 + pavgb xmm0, xmm1 1.6460 + pavgb xmm0, xmm1 1.6461 + sub ecx, 16 1.6462 + movdqu [esi + edi], xmm0 1.6463 + lea esi, [esi + 16] 1.6464 + jg xloop25 1.6465 + jmp xloop99 1.6466 + 1.6467 + // Blend 50 / 50. 1.6468 + align 4 1.6469 + xloop50: 1.6470 + movdqu xmm0, [esi] 1.6471 + movdqu xmm1, [esi + edx] 1.6472 + pavgb xmm0, xmm1 1.6473 + sub ecx, 16 1.6474 + movdqu [esi + edi], xmm0 1.6475 + lea esi, [esi + 16] 1.6476 + jg xloop50 1.6477 + jmp xloop99 1.6478 + 1.6479 + // Blend 75 / 25. 1.6480 + align 4 1.6481 + xloop75: 1.6482 + movdqu xmm1, [esi] 1.6483 + movdqu xmm0, [esi + edx] 1.6484 + pavgb xmm0, xmm1 1.6485 + pavgb xmm0, xmm1 1.6486 + sub ecx, 16 1.6487 + movdqu [esi + edi], xmm0 1.6488 + lea esi, [esi + 16] 1.6489 + jg xloop75 1.6490 + jmp xloop99 1.6491 + 1.6492 + // Blend 100 / 0 - Copy row unchanged. 
1.6493 + align 4 1.6494 + xloop100: 1.6495 + movdqu xmm0, [esi] 1.6496 + sub ecx, 16 1.6497 + movdqu [esi + edi], xmm0 1.6498 + lea esi, [esi + 16] 1.6499 + jg xloop100 1.6500 + 1.6501 + xloop99: 1.6502 + pop edi 1.6503 + pop esi 1.6504 + ret 1.6505 + } 1.6506 +} 1.6507 + 1.6508 +#ifdef HAS_INTERPOLATEROW_SSE2 1.6509 +// Bilinear filter 16x2 -> 16x1 1.6510 +__declspec(naked) __declspec(align(16)) 1.6511 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1.6512 + ptrdiff_t src_stride, int dst_width, 1.6513 + int source_y_fraction) { 1.6514 + __asm { 1.6515 + push esi 1.6516 + push edi 1.6517 + mov edi, [esp + 8 + 4] // dst_ptr 1.6518 + mov esi, [esp + 8 + 8] // src_ptr 1.6519 + mov edx, [esp + 8 + 12] // src_stride 1.6520 + mov ecx, [esp + 8 + 16] // dst_width 1.6521 + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1.6522 + sub edi, esi 1.6523 + // Dispatch to specialized filters if applicable. 1.6524 + cmp eax, 0 1.6525 + je xloop100 // 0 / 256. Blend 100 / 0. 1.6526 + cmp eax, 64 1.6527 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. 1.6528 + cmp eax, 128 1.6529 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 1.6530 + cmp eax, 192 1.6531 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 1.6532 + 1.6533 + movd xmm5, eax // xmm5 = y fraction 1.6534 + punpcklbw xmm5, xmm5 1.6535 + psrlw xmm5, 1 1.6536 + punpcklwd xmm5, xmm5 1.6537 + punpckldq xmm5, xmm5 1.6538 + punpcklqdq xmm5, xmm5 1.6539 + pxor xmm4, xmm4 1.6540 + 1.6541 + align 4 1.6542 + xloop: 1.6543 + movdqu xmm0, [esi] // row0 1.6544 + movdqu xmm2, [esi + edx] // row1 1.6545 + movdqu xmm1, xmm0 1.6546 + movdqu xmm3, xmm2 1.6547 + punpcklbw xmm2, xmm4 1.6548 + punpckhbw xmm3, xmm4 1.6549 + punpcklbw xmm0, xmm4 1.6550 + punpckhbw xmm1, xmm4 1.6551 + psubw xmm2, xmm0 // row1 - row0 1.6552 + psubw xmm3, xmm1 1.6553 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 1.6554 + paddw xmm3, xmm3 1.6555 + pmulhw xmm2, xmm5 // scale diff 1.6556 + pmulhw xmm3, xmm5 1.6557 + paddw xmm0, xmm2 // sum rows 1.6558 + paddw xmm1, xmm3 1.6559 + packuswb xmm0, xmm1 1.6560 + sub ecx, 16 1.6561 + movdqu [esi + edi], xmm0 1.6562 + lea esi, [esi + 16] 1.6563 + jg xloop 1.6564 + jmp xloop99 1.6565 + 1.6566 + // Blend 25 / 75. 1.6567 + align 4 1.6568 + xloop25: 1.6569 + movdqu xmm0, [esi] 1.6570 + movdqu xmm1, [esi + edx] 1.6571 + pavgb xmm0, xmm1 1.6572 + pavgb xmm0, xmm1 1.6573 + sub ecx, 16 1.6574 + movdqu [esi + edi], xmm0 1.6575 + lea esi, [esi + 16] 1.6576 + jg xloop25 1.6577 + jmp xloop99 1.6578 + 1.6579 + // Blend 50 / 50. 1.6580 + align 4 1.6581 + xloop50: 1.6582 + movdqu xmm0, [esi] 1.6583 + movdqu xmm1, [esi + edx] 1.6584 + pavgb xmm0, xmm1 1.6585 + sub ecx, 16 1.6586 + movdqu [esi + edi], xmm0 1.6587 + lea esi, [esi + 16] 1.6588 + jg xloop50 1.6589 + jmp xloop99 1.6590 + 1.6591 + // Blend 75 / 25. 1.6592 + align 4 1.6593 + xloop75: 1.6594 + movdqu xmm1, [esi] 1.6595 + movdqu xmm0, [esi + edx] 1.6596 + pavgb xmm0, xmm1 1.6597 + pavgb xmm0, xmm1 1.6598 + sub ecx, 16 1.6599 + movdqu [esi + edi], xmm0 1.6600 + lea esi, [esi + 16] 1.6601 + jg xloop75 1.6602 + jmp xloop99 1.6603 + 1.6604 + // Blend 100 / 0 - Copy row unchanged. 
1.6605 +    align 4
1.6606 +  xloop100:
1.6607 +    movdqu xmm0, [esi]
1.6608 +    sub ecx, 16
1.6609 +    movdqu [esi + edi], xmm0
1.6610 +    lea esi, [esi + 16]
1.6611 +    jg xloop100
1.6612 +
1.6613 +  xloop99:
1.6614 +    pop edi
1.6615 +    pop esi
1.6616 +    ret
1.6617 +  }
1.6618 +}
1.6619 +#endif // HAS_INTERPOLATEROW_SSE2
1.6620 +
1.6621 +__declspec(naked) __declspec(align(16))
1.6622 +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
1.6623 +                  uint8* dst_uv, int pix) {
1.6624 +  __asm {
1.6625 +    push edi
1.6626 +    mov eax, [esp + 4 + 4]   // src_uv
1.6627 +    mov edx, [esp + 4 + 8]   // src_uv_stride
1.6628 +    mov edi, [esp + 4 + 12]  // dst_uv
1.6629 +    mov ecx, [esp + 4 + 16]  // pix
1.6630 +    sub edi, eax
1.6631 +
1.6632 +    align 4
1.6633 +  convertloop:
1.6634 +    movdqa xmm0, [eax]
1.6635 +    pavgb xmm0, [eax + edx]
1.6636 +    sub ecx, 16
1.6637 +    movdqa [eax + edi], xmm0
1.6638 +    lea eax, [eax + 16]
1.6639 +    jg convertloop
1.6640 +    pop edi
1.6641 +    ret
1.6642 +  }
1.6643 +}
1.6644 +
1.6645 +#ifdef HAS_HALFROW_AVX2
1.6646 +__declspec(naked) __declspec(align(16))
1.6647 +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
1.6648 +                  uint8* dst_uv, int pix) {
1.6649 +  __asm {
1.6650 +    push edi
1.6651 +    mov eax, [esp + 4 + 4]   // src_uv
1.6652 +    mov edx, [esp + 4 + 8]   // src_uv_stride
1.6653 +    mov edi, [esp + 4 + 12]  // dst_uv
1.6654 +    mov ecx, [esp + 4 + 16]  // pix
1.6655 +    sub edi, eax
1.6656 +
1.6657 +    align 4
1.6658 +  convertloop:
1.6659 +    vmovdqu ymm0, [eax]
1.6660 +    vpavgb ymm0, ymm0, [eax + edx]
1.6661 +    sub ecx, 32
1.6662 +    vmovdqu [eax + edi], ymm0
1.6663 +    lea eax, [eax + 32]
1.6664 +    jg convertloop
1.6665 +
1.6666 +    pop edi
1.6667 +    vzeroupper
1.6668 +    ret
1.6669 +  }
1.6670 +}
1.6671 +#endif // HAS_HALFROW_AVX2
1.6672 +
1.6673 +__declspec(naked) __declspec(align(16))
1.6674 +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
1.6675 +                          uint32 selector, int pix) {
1.6676 +  __asm {
1.6677 +    mov eax, [esp + 4]     // src_argb
1.6678 +    mov edx, [esp + 8]     // dst_bayer
1.6679 +    movd xmm5, [esp + 12]  // selector
1.6680 +    mov ecx, [esp + 16]    // pix
1.6681 +    pshufd xmm5, xmm5, 0
1.6682 +
1.6683 +    align 4
1.6684 +  wloop:
1.6685 +    movdqa xmm0, [eax]
1.6686 +    movdqa xmm1, [eax + 16]
1.6687 +    lea eax, [eax + 32]
1.6688 +    pshufb xmm0, xmm5
1.6689 +    pshufb xmm1, xmm5
1.6690 +    punpckldq xmm0, xmm1
1.6691 +    sub ecx, 8
1.6692 +    movq qword ptr [edx], xmm0
1.6693 +    lea edx, [edx + 8]
1.6694 +    jg wloop
1.6695 +    ret
1.6696 +  }
1.6697 +}
1.6698 +
1.6699 +// Specialized ARGB to Bayer that just isolates G channel.
1.6700 +__declspec(naked) __declspec(align(16))
1.6701 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
1.6702 +                           uint32 selector, int pix) {
1.6703 +  __asm {
1.6704 +    mov eax, [esp + 4]   // src_argb
1.6705 +    mov edx, [esp + 8]   // dst_bayer
1.6706 +                         // selector
1.6707 +    mov ecx, [esp + 16]  // pix
1.6708 +    pcmpeqb xmm5, xmm5   // generate mask 0x000000ff
1.6709 +    psrld xmm5, 24
1.6710 +
1.6711 +    align 4
1.6712 +  wloop:
1.6713 +    movdqa xmm0, [eax]
1.6714 +    movdqa xmm1, [eax + 16]
1.6715 +    lea eax, [eax + 32]
1.6716 +    psrld xmm0, 8  // Move green to bottom.
1.6717 +    psrld xmm1, 8
1.6718 +    pand xmm0, xmm5
1.6719 +    pand xmm1, xmm5
1.6720 +    packssdw xmm0, xmm1
1.6721 +    packuswb xmm0, xmm1
1.6722 +    sub ecx, 8
1.6723 +    movq qword ptr [edx], xmm0
1.6724 +    lea edx, [edx + 8]
1.6725 +    jg wloop
1.6726 +    ret
1.6727 +  }
1.6728 +}
1.6729 +
1.6730 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
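+// Scalar equivalent (illustrative): within each 16 byte group of 4 pixels,
+// dst[i] = src[shuffler[i]] for i = 0..15, which is exactly what pshufb
+// computes.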
1.6731 +__declspec(naked) __declspec(align(16)) 1.6732 +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.6733 + const uint8* shuffler, int pix) { 1.6734 + __asm { 1.6735 + mov eax, [esp + 4] // src_argb 1.6736 + mov edx, [esp + 8] // dst_argb 1.6737 + mov ecx, [esp + 12] // shuffler 1.6738 + movdqa xmm5, [ecx] 1.6739 + mov ecx, [esp + 16] // pix 1.6740 + 1.6741 + align 4 1.6742 + wloop: 1.6743 + movdqa xmm0, [eax] 1.6744 + movdqa xmm1, [eax + 16] 1.6745 + lea eax, [eax + 32] 1.6746 + pshufb xmm0, xmm5 1.6747 + pshufb xmm1, xmm5 1.6748 + sub ecx, 8 1.6749 + movdqa [edx], xmm0 1.6750 + movdqa [edx + 16], xmm1 1.6751 + lea edx, [edx + 32] 1.6752 + jg wloop 1.6753 + ret 1.6754 + } 1.6755 +} 1.6756 + 1.6757 +__declspec(naked) __declspec(align(16)) 1.6758 +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.6759 + const uint8* shuffler, int pix) { 1.6760 + __asm { 1.6761 + mov eax, [esp + 4] // src_argb 1.6762 + mov edx, [esp + 8] // dst_argb 1.6763 + mov ecx, [esp + 12] // shuffler 1.6764 + movdqa xmm5, [ecx] 1.6765 + mov ecx, [esp + 16] // pix 1.6766 + 1.6767 + align 4 1.6768 + wloop: 1.6769 + movdqu xmm0, [eax] 1.6770 + movdqu xmm1, [eax + 16] 1.6771 + lea eax, [eax + 32] 1.6772 + pshufb xmm0, xmm5 1.6773 + pshufb xmm1, xmm5 1.6774 + sub ecx, 8 1.6775 + movdqu [edx], xmm0 1.6776 + movdqu [edx + 16], xmm1 1.6777 + lea edx, [edx + 32] 1.6778 + jg wloop 1.6779 + ret 1.6780 + } 1.6781 +} 1.6782 + 1.6783 +#ifdef HAS_ARGBSHUFFLEROW_AVX2 1.6784 +__declspec(naked) __declspec(align(16)) 1.6785 +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 1.6786 + const uint8* shuffler, int pix) { 1.6787 + __asm { 1.6788 + mov eax, [esp + 4] // src_argb 1.6789 + mov edx, [esp + 8] // dst_argb 1.6790 + mov ecx, [esp + 12] // shuffler 1.6791 + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 1.6792 + mov ecx, [esp + 16] // pix 1.6793 + 1.6794 + align 4 1.6795 + wloop: 1.6796 + vmovdqu ymm0, [eax] 1.6797 + vmovdqu ymm1, [eax + 32] 1.6798 + lea eax, [eax + 64] 1.6799 + vpshufb ymm0, ymm0, ymm5 1.6800 + vpshufb ymm1, ymm1, ymm5 1.6801 + sub ecx, 16 1.6802 + vmovdqu [edx], ymm0 1.6803 + vmovdqu [edx + 32], ymm1 1.6804 + lea edx, [edx + 64] 1.6805 + jg wloop 1.6806 + 1.6807 + vzeroupper 1.6808 + ret 1.6809 + } 1.6810 +} 1.6811 +#endif // HAS_ARGBSHUFFLEROW_AVX2 1.6812 + 1.6813 +__declspec(naked) __declspec(align(16)) 1.6814 +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 1.6815 + const uint8* shuffler, int pix) { 1.6816 + __asm { 1.6817 + push ebx 1.6818 + push esi 1.6819 + mov eax, [esp + 8 + 4] // src_argb 1.6820 + mov edx, [esp + 8 + 8] // dst_argb 1.6821 + mov esi, [esp + 8 + 12] // shuffler 1.6822 + mov ecx, [esp + 8 + 16] // pix 1.6823 + pxor xmm5, xmm5 1.6824 + 1.6825 + mov ebx, [esi] // shuffler 1.6826 + cmp ebx, 0x03000102 1.6827 + je shuf_3012 1.6828 + cmp ebx, 0x00010203 1.6829 + je shuf_0123 1.6830 + cmp ebx, 0x00030201 1.6831 + je shuf_0321 1.6832 + cmp ebx, 0x02010003 1.6833 + je shuf_2103 1.6834 + 1.6835 + // TODO(fbarchard): Use one source pointer and 3 offsets. 
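+    // Generic fallback for masks with no specialized loop below: shuffle
+    // one pixel at a time, one byte lookup per channel through the 4 byte
+    // mask.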
1.6836 + shuf_any1: 1.6837 + movzx ebx, byte ptr [esi] 1.6838 + movzx ebx, byte ptr [eax + ebx] 1.6839 + mov [edx], bl 1.6840 + movzx ebx, byte ptr [esi + 1] 1.6841 + movzx ebx, byte ptr [eax + ebx] 1.6842 + mov [edx + 1], bl 1.6843 + movzx ebx, byte ptr [esi + 2] 1.6844 + movzx ebx, byte ptr [eax + ebx] 1.6845 + mov [edx + 2], bl 1.6846 + movzx ebx, byte ptr [esi + 3] 1.6847 + movzx ebx, byte ptr [eax + ebx] 1.6848 + mov [edx + 3], bl 1.6849 + lea eax, [eax + 4] 1.6850 + lea edx, [edx + 4] 1.6851 + sub ecx, 1 1.6852 + jg shuf_any1 1.6853 + jmp shuf99 1.6854 + 1.6855 + align 4 1.6856 + shuf_0123: 1.6857 + movdqu xmm0, [eax] 1.6858 + lea eax, [eax + 16] 1.6859 + movdqa xmm1, xmm0 1.6860 + punpcklbw xmm0, xmm5 1.6861 + punpckhbw xmm1, xmm5 1.6862 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB 1.6863 + pshuflw xmm0, xmm0, 01Bh 1.6864 + pshufhw xmm1, xmm1, 01Bh 1.6865 + pshuflw xmm1, xmm1, 01Bh 1.6866 + packuswb xmm0, xmm1 1.6867 + sub ecx, 4 1.6868 + movdqu [edx], xmm0 1.6869 + lea edx, [edx + 16] 1.6870 + jg shuf_0123 1.6871 + jmp shuf99 1.6872 + 1.6873 + align 4 1.6874 + shuf_0321: 1.6875 + movdqu xmm0, [eax] 1.6876 + lea eax, [eax + 16] 1.6877 + movdqa xmm1, xmm0 1.6878 + punpcklbw xmm0, xmm5 1.6879 + punpckhbw xmm1, xmm5 1.6880 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB 1.6881 + pshuflw xmm0, xmm0, 039h 1.6882 + pshufhw xmm1, xmm1, 039h 1.6883 + pshuflw xmm1, xmm1, 039h 1.6884 + packuswb xmm0, xmm1 1.6885 + sub ecx, 4 1.6886 + movdqu [edx], xmm0 1.6887 + lea edx, [edx + 16] 1.6888 + jg shuf_0321 1.6889 + jmp shuf99 1.6890 + 1.6891 + align 4 1.6892 + shuf_2103: 1.6893 + movdqu xmm0, [eax] 1.6894 + lea eax, [eax + 16] 1.6895 + movdqa xmm1, xmm0 1.6896 + punpcklbw xmm0, xmm5 1.6897 + punpckhbw xmm1, xmm5 1.6898 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA 1.6899 + pshuflw xmm0, xmm0, 093h 1.6900 + pshufhw xmm1, xmm1, 093h 1.6901 + pshuflw xmm1, xmm1, 093h 1.6902 + packuswb xmm0, xmm1 1.6903 + sub ecx, 4 1.6904 + movdqu [edx], xmm0 1.6905 + lea edx, [edx + 16] 1.6906 + jg shuf_2103 1.6907 + jmp shuf99 1.6908 + 1.6909 + align 4 1.6910 + shuf_3012: 1.6911 + movdqu xmm0, [eax] 1.6912 + lea eax, [eax + 16] 1.6913 + movdqa xmm1, xmm0 1.6914 + punpcklbw xmm0, xmm5 1.6915 + punpckhbw xmm1, xmm5 1.6916 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB 1.6917 + pshuflw xmm0, xmm0, 0C6h 1.6918 + pshufhw xmm1, xmm1, 0C6h 1.6919 + pshuflw xmm1, xmm1, 0C6h 1.6920 + packuswb xmm0, xmm1 1.6921 + sub ecx, 4 1.6922 + movdqu [edx], xmm0 1.6923 + lea edx, [edx + 16] 1.6924 + jg shuf_3012 1.6925 + 1.6926 + shuf99: 1.6927 + pop esi 1.6928 + pop ebx 1.6929 + ret 1.6930 + } 1.6931 +} 1.6932 + 1.6933 +// YUY2 - Macro-pixel = 2 image pixels 1.6934 +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
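+// i.e. luma is stored at even byte offsets and each U/V pair is shared by
+// two adjacent pixels.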
1.6935 + 1.6936 +// UYVY - Macro-pixel = 2 image pixels 1.6937 +// U0Y0V0Y1 1.6938 + 1.6939 +__declspec(naked) __declspec(align(16)) 1.6940 +void I422ToYUY2Row_SSE2(const uint8* src_y, 1.6941 + const uint8* src_u, 1.6942 + const uint8* src_v, 1.6943 + uint8* dst_frame, int width) { 1.6944 + __asm { 1.6945 + push esi 1.6946 + push edi 1.6947 + mov eax, [esp + 8 + 4] // src_y 1.6948 + mov esi, [esp + 8 + 8] // src_u 1.6949 + mov edx, [esp + 8 + 12] // src_v 1.6950 + mov edi, [esp + 8 + 16] // dst_frame 1.6951 + mov ecx, [esp + 8 + 20] // width 1.6952 + sub edx, esi 1.6953 + 1.6954 + align 4 1.6955 + convertloop: 1.6956 + movq xmm2, qword ptr [esi] // U 1.6957 + movq xmm3, qword ptr [esi + edx] // V 1.6958 + lea esi, [esi + 8] 1.6959 + punpcklbw xmm2, xmm3 // UV 1.6960 + movdqu xmm0, [eax] // Y 1.6961 + lea eax, [eax + 16] 1.6962 + movdqa xmm1, xmm0 1.6963 + punpcklbw xmm0, xmm2 // YUYV 1.6964 + punpckhbw xmm1, xmm2 1.6965 + movdqu [edi], xmm0 1.6966 + movdqu [edi + 16], xmm1 1.6967 + lea edi, [edi + 32] 1.6968 + sub ecx, 16 1.6969 + jg convertloop 1.6970 + 1.6971 + pop edi 1.6972 + pop esi 1.6973 + ret 1.6974 + } 1.6975 +} 1.6976 + 1.6977 +__declspec(naked) __declspec(align(16)) 1.6978 +void I422ToUYVYRow_SSE2(const uint8* src_y, 1.6979 + const uint8* src_u, 1.6980 + const uint8* src_v, 1.6981 + uint8* dst_frame, int width) { 1.6982 + __asm { 1.6983 + push esi 1.6984 + push edi 1.6985 + mov eax, [esp + 8 + 4] // src_y 1.6986 + mov esi, [esp + 8 + 8] // src_u 1.6987 + mov edx, [esp + 8 + 12] // src_v 1.6988 + mov edi, [esp + 8 + 16] // dst_frame 1.6989 + mov ecx, [esp + 8 + 20] // width 1.6990 + sub edx, esi 1.6991 + 1.6992 + align 4 1.6993 + convertloop: 1.6994 + movq xmm2, qword ptr [esi] // U 1.6995 + movq xmm3, qword ptr [esi + edx] // V 1.6996 + lea esi, [esi + 8] 1.6997 + punpcklbw xmm2, xmm3 // UV 1.6998 + movdqu xmm0, [eax] // Y 1.6999 + movdqa xmm1, xmm2 1.7000 + lea eax, [eax + 16] 1.7001 + punpcklbw xmm1, xmm0 // UYVY 1.7002 + punpckhbw xmm2, xmm0 1.7003 + movdqu [edi], xmm1 1.7004 + movdqu [edi + 16], xmm2 1.7005 + lea edi, [edi + 32] 1.7006 + sub ecx, 16 1.7007 + jg convertloop 1.7008 + 1.7009 + pop edi 1.7010 + pop esi 1.7011 + ret 1.7012 + } 1.7013 +} 1.7014 + 1.7015 +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 1.7016 +__declspec(naked) __declspec(align(16)) 1.7017 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, 1.7018 + uint8* dst_argb, const float* poly, 1.7019 + int width) { 1.7020 + __asm { 1.7021 + push esi 1.7022 + mov eax, [esp + 4 + 4] /* src_argb */ 1.7023 + mov edx, [esp + 4 + 8] /* dst_argb */ 1.7024 + mov esi, [esp + 4 + 12] /* poly */ 1.7025 + mov ecx, [esp + 4 + 16] /* width */ 1.7026 + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 1.7027 + 1.7028 + // 2 pixel loop. 
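+    // Each channel value X is evaluated in float as
+    //   C0 + C1 * X + C2 * X * X + C3 * X * X * X
+    // using the four coefficient vectors at poly[0], poly[4], poly[8] and
+    // poly[12], then truncated and saturated back to 8 bits.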
1.7029 +    align 4
1.7030 +  convertloop:
1.7031 +//  pmovzxbd xmm0, dword ptr [eax]      // BGRA pixel
1.7032 +//  pmovzxbd xmm4, dword ptr [eax + 4]  // BGRA pixel
1.7033 +    movq xmm0, qword ptr [eax]  // BGRABGRA
1.7034 +    lea eax, [eax + 8]
1.7035 +    punpcklbw xmm0, xmm3
1.7036 +    movdqa xmm4, xmm0
1.7037 +    punpcklwd xmm0, xmm3  // pixel 0
1.7038 +    punpckhwd xmm4, xmm3  // pixel 1
1.7039 +    cvtdq2ps xmm0, xmm0   // 4 floats
1.7040 +    cvtdq2ps xmm4, xmm4
1.7041 +    movdqa xmm1, xmm0     // X
1.7042 +    movdqa xmm5, xmm4
1.7043 +    mulps xmm0, [esi + 16]  // C1 * X
1.7044 +    mulps xmm4, [esi + 16]
1.7045 +    addps xmm0, [esi]       // result = C0 + C1 * X
1.7046 +    addps xmm4, [esi]
1.7047 +    movdqa xmm2, xmm1
1.7048 +    movdqa xmm6, xmm5
1.7049 +    mulps xmm2, xmm1        // X * X
1.7050 +    mulps xmm6, xmm5
1.7051 +    mulps xmm1, xmm2        // X * X * X
1.7052 +    mulps xmm5, xmm6
1.7053 +    mulps xmm2, [esi + 32]  // C2 * X * X
1.7054 +    mulps xmm6, [esi + 32]
1.7055 +    mulps xmm1, [esi + 48]  // C3 * X * X * X
1.7056 +    mulps xmm5, [esi + 48]
1.7057 +    addps xmm0, xmm2        // result += C2 * X * X
1.7058 +    addps xmm4, xmm6
1.7059 +    addps xmm0, xmm1        // result += C3 * X * X * X
1.7060 +    addps xmm4, xmm5
1.7061 +    cvttps2dq xmm0, xmm0
1.7062 +    cvttps2dq xmm4, xmm4
1.7063 +    packuswb xmm0, xmm4
1.7064 +    packuswb xmm0, xmm0
1.7065 +    sub ecx, 2
1.7066 +    movq qword ptr [edx], xmm0
1.7067 +    lea edx, [edx + 8]
1.7068 +    jg convertloop
1.7069 +    pop esi
1.7070 +    ret
1.7071 +  }
1.7072 +}
1.7073 +#endif // HAS_ARGBPOLYNOMIALROW_SSE2
1.7074 +
1.7075 +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
1.7076 +__declspec(naked) __declspec(align(16))
1.7077 +void ARGBPolynomialRow_AVX2(const uint8* src_argb,
1.7078 +                            uint8* dst_argb, const float* poly,
1.7079 +                            int width) {
1.7080 +  __asm {
1.7081 +    mov eax, [esp + 4]   /* src_argb */
1.7082 +    mov edx, [esp + 8]   /* dst_argb */
1.7083 +    mov ecx, [esp + 12]  /* poly */
1.7084 +    vbroadcastf128 ymm4, [ecx]       // C0
1.7085 +    vbroadcastf128 ymm5, [ecx + 16]  // C1
1.7086 +    vbroadcastf128 ymm6, [ecx + 32]  // C2
1.7087 +    vbroadcastf128 ymm7, [ecx + 48]  // C3
1.7088 +    mov ecx, [esp + 16]  /* width */
1.7089 +
1.7090 +    // 2 pixel loop.
1.7091 +    align 4
1.7092 +  convertloop:
1.7093 +    vpmovzxbd ymm0, qword ptr [eax]  // 2 BGRA pixels
1.7094 +    lea eax, [eax + 8]
1.7095 +    vcvtdq2ps ymm0, ymm0             // X 8 floats
1.7096 +    vmulps ymm2, ymm0, ymm0          // X * X
1.7097 +    vmulps ymm3, ymm0, ymm7          // C3 * X
1.7098 +    vfmadd132ps ymm0, ymm4, ymm5     // result = C0 + C1 * X
1.7099 +    vfmadd231ps ymm0, ymm2, ymm6     // result += C2 * X * X
1.7100 +    vfmadd231ps ymm0, ymm2, ymm3     // result += C3 * X * X * X
1.7101 +    vcvttps2dq ymm0, ymm0
1.7102 +    vpackusdw ymm0, ymm0, ymm0       // b0g0r0a0_00000000_b0g0r0a0_00000000
1.7103 +    vpermq ymm0, ymm0, 0xd8          // b0g0r0a0_b0g0r0a0_00000000_00000000
1.7104 +    vpackuswb xmm0, xmm0, xmm0       // bgrabgra_00000000_00000000_00000000
1.7105 +    sub ecx, 2
1.7106 +    vmovq qword ptr [edx], xmm0
1.7107 +    lea edx, [edx + 8]
1.7108 +    jg convertloop
1.7109 +    vzeroupper
1.7110 +    ret
1.7111 +  }
1.7112 +}
1.7113 +#endif // HAS_ARGBPOLYNOMIALROW_AVX2
1.7114 +
1.7115 +#ifdef HAS_ARGBCOLORTABLEROW_X86
1.7116 +// Transform ARGB pixels with color table.
1.7117 +__declspec(naked) __declspec(align(16))
1.7118 +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
1.7119 +                           int width) {
1.7120 +  __asm {
1.7121 +    push esi
1.7122 +    mov eax, [esp + 4 + 4]   /* dst_argb */
1.7123 +    mov esi, [esp + 4 + 8]   /* table_argb */
1.7124 +    mov ecx, [esp + 4 + 12]  /* width */
1.7125 +
1.7126 +    // 1 pixel loop.
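+    // In-place per channel: dst[c] = table_argb[dst[c] * 4 + c], for all
+    // four channels including alpha.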
1.7127 +    align 4
1.7128 +  convertloop:
1.7129 +    movzx edx, byte ptr [eax]
1.7130 +    lea eax, [eax + 4]
1.7131 +    movzx edx, byte ptr [esi + edx * 4]
1.7132 +    mov byte ptr [eax - 4], dl
1.7133 +    movzx edx, byte ptr [eax - 4 + 1]
1.7134 +    movzx edx, byte ptr [esi + edx * 4 + 1]
1.7135 +    mov byte ptr [eax - 4 + 1], dl
1.7136 +    movzx edx, byte ptr [eax - 4 + 2]
1.7137 +    movzx edx, byte ptr [esi + edx * 4 + 2]
1.7138 +    mov byte ptr [eax - 4 + 2], dl
1.7139 +    movzx edx, byte ptr [eax - 4 + 3]
1.7140 +    movzx edx, byte ptr [esi + edx * 4 + 3]
1.7141 +    mov byte ptr [eax - 4 + 3], dl
1.7142 +    dec ecx
1.7143 +    jg convertloop
1.7144 +    pop esi
1.7145 +    ret
1.7146 +  }
1.7147 +}
1.7148 +#endif // HAS_ARGBCOLORTABLEROW_X86
1.7149 +
1.7150 +#ifdef HAS_RGBCOLORTABLEROW_X86
1.7151 +// Transform RGB pixels with color table.
1.7152 +__declspec(naked) __declspec(align(16))
1.7153 +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
1.7154 +  __asm {
1.7155 +    push esi
1.7156 +    mov eax, [esp + 4 + 4]   /* dst_argb */
1.7157 +    mov esi, [esp + 4 + 8]   /* table_argb */
1.7158 +    mov ecx, [esp + 4 + 12]  /* width */
1.7159 +
1.7160 +    // 1 pixel loop.
1.7161 +    align 4
1.7162 +  convertloop:
1.7163 +    movzx edx, byte ptr [eax]
1.7164 +    lea eax, [eax + 4]
1.7165 +    movzx edx, byte ptr [esi + edx * 4]
1.7166 +    mov byte ptr [eax - 4], dl
1.7167 +    movzx edx, byte ptr [eax - 4 + 1]
1.7168 +    movzx edx, byte ptr [esi + edx * 4 + 1]
1.7169 +    mov byte ptr [eax - 4 + 1], dl
1.7170 +    movzx edx, byte ptr [eax - 4 + 2]
1.7171 +    movzx edx, byte ptr [esi + edx * 4 + 2]
1.7172 +    mov byte ptr [eax - 4 + 2], dl
1.7173 +    dec ecx
1.7174 +    jg convertloop
1.7175 +
1.7176 +    pop esi
1.7177 +    ret
1.7178 +  }
1.7179 +}
1.7180 +#endif // HAS_RGBCOLORTABLEROW_X86
1.7181 +
1.7182 +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
1.7183 +// Transform RGB pixels with luma table.
1.7184 +__declspec(naked) __declspec(align(16))
1.7185 +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
1.7186 +                                 int width,
1.7187 +                                 const uint8* luma, uint32 lumacoeff) {
1.7188 +  __asm {
1.7189 +    push esi
1.7190 +    push edi
1.7191 +    mov eax, [esp + 8 + 4]   /* src_argb */
1.7192 +    mov edi, [esp + 8 + 8]   /* dst_argb */
1.7193 +    mov ecx, [esp + 8 + 12]  /* width */
1.7194 +    movd xmm2, dword ptr [esp + 8 + 16]  // luma table
1.7195 +    movd xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
1.7196 +    pshufd xmm2, xmm2, 0
1.7197 +    pshufd xmm3, xmm3, 0
1.7198 +    pcmpeqb xmm4, xmm4  // generate mask 0xff00ff00
1.7199 +    psllw xmm4, 8
1.7200 +    pxor xmm5, xmm5
1.7201 +
1.7202 +    // 4 pixel loop.
1.7203 +    align 4
1.7204 +  convertloop:
1.7205 +    movdqu xmm0, [eax]  // 16 bytes = 4 ARGB pixels; generate luma ptr
1.7206 +    pmaddubsw xmm0, xmm3
1.7207 +    phaddw xmm0, xmm0
1.7208 +    pand xmm0, xmm4  // mask out low bits
1.7209 +    punpcklwd xmm0, xmm5
1.7210 +    paddd xmm0, xmm2  // add table base
1.7211 +    movd esi, xmm0
1.7212 +    pshufd xmm0, xmm0, 0x39  // 00111001 to rotate right 32
1.7213 +
1.7214 +    movzx edx, byte ptr [eax]
1.7215 +    movzx edx, byte ptr [esi + edx]
1.7216 +    mov byte ptr [edi], dl
1.7217 +    movzx edx, byte ptr [eax + 1]
1.7218 +    movzx edx, byte ptr [esi + edx]
1.7219 +    mov byte ptr [edi + 1], dl
1.7220 +    movzx edx, byte ptr [eax + 2]
1.7221 +    movzx edx, byte ptr [esi + edx]
1.7222 +    mov byte ptr [edi + 2], dl
1.7223 +    movzx edx, byte ptr [eax + 3]  // copy alpha.
1.7224 + mov byte ptr [edi + 3], dl 1.7225 + 1.7226 + movd esi, xmm0 1.7227 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 1.7228 + 1.7229 + movzx edx, byte ptr [eax + 4] 1.7230 + movzx edx, byte ptr [esi + edx] 1.7231 + mov byte ptr [edi + 4], dl 1.7232 + movzx edx, byte ptr [eax + 5] 1.7233 + movzx edx, byte ptr [esi + edx] 1.7234 + mov byte ptr [edi + 5], dl 1.7235 + movzx edx, byte ptr [eax + 6] 1.7236 + movzx edx, byte ptr [esi + edx] 1.7237 + mov byte ptr [edi + 6], dl 1.7238 + movzx edx, byte ptr [eax + 7] // copy alpha. 1.7239 + mov byte ptr [edi + 7], dl 1.7240 + 1.7241 + movd esi, xmm0 1.7242 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 1.7243 + 1.7244 + movzx edx, byte ptr [eax + 8] 1.7245 + movzx edx, byte ptr [esi + edx] 1.7246 + mov byte ptr [edi + 8], dl 1.7247 + movzx edx, byte ptr [eax + 9] 1.7248 + movzx edx, byte ptr [esi + edx] 1.7249 + mov byte ptr [edi + 9], dl 1.7250 + movzx edx, byte ptr [eax + 10] 1.7251 + movzx edx, byte ptr [esi + edx] 1.7252 + mov byte ptr [edi + 10], dl 1.7253 + movzx edx, byte ptr [eax + 11] // copy alpha. 1.7254 + mov byte ptr [edi + 11], dl 1.7255 + 1.7256 + movd esi, xmm0 1.7257 + 1.7258 + movzx edx, byte ptr [eax + 12] 1.7259 + movzx edx, byte ptr [esi + edx] 1.7260 + mov byte ptr [edi + 12], dl 1.7261 + movzx edx, byte ptr [eax + 13] 1.7262 + movzx edx, byte ptr [esi + edx] 1.7263 + mov byte ptr [edi + 13], dl 1.7264 + movzx edx, byte ptr [eax + 14] 1.7265 + movzx edx, byte ptr [esi + edx] 1.7266 + mov byte ptr [edi + 14], dl 1.7267 + movzx edx, byte ptr [eax + 15] // copy alpha. 1.7268 + mov byte ptr [edi + 15], dl 1.7269 + 1.7270 + sub ecx, 4 1.7271 + lea eax, [eax + 16] 1.7272 + lea edi, [edi + 16] 1.7273 + jg convertloop 1.7274 + 1.7275 + pop edi 1.7276 + pop esi 1.7277 + ret 1.7278 + } 1.7279 +} 1.7280 +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 1.7281 + 1.7282 +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 1.7283 + 1.7284 +#ifdef __cplusplus 1.7285 +} // extern "C" 1.7286 +} // namespace libyuv 1.7287 +#endif