--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/scale_win.cc  Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1320 @@
/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filter, added before the >> 2.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8        // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5   // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0    // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2         // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0        // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
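
// Rough scalar sketch (not the libyuv C reference path) of what the SSE2
// ScaleRowDown2 variants above compute. Names and loop structure here are
// illustrative assumptions only. Note the SSE2 box path uses cascaded pavg
// (average of averages), whose rounding can differ by 1 from this direct sum.
//
//   static void ScaleRowDown2Box_Sketch(const uint8* s, ptrdiff_t stride,
//                                       uint8* d, int dst_width) {
//     const uint8* t = s + stride;
//     for (int x = 0; x < dst_width; ++x) {
//       d[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // 2x2 box
//       s += 2;
//       t += 2;
//     }
//   }
//
// The point-sampled variant instead keeps s[1] (the odd pixel), and the
// linear variant averages s[0] and s[1] of a single row.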

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none (unaligned loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8        // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none (unaligned loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5   // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0    // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none (unaligned loads and stores).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2         // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0        // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
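
// The column averaging used above works on packed bytes without unpacking:
// each 16-bit word holds an even/odd pixel pair, shifting right by 8 exposes
// the odd byte, masking with 0x00ff keeps the even byte, and pavgw rounds
// their average. Scalar sketch of one word (illustrative only):
//
//   static uint8 AverageColumnPair_Sketch(uint16 pair) {
//     uint16 even = pair & 0xff;              // pand with 0x00ff mask
//     uint16 odd = pair >> 8;                 // psrlw by 8
//     return (uint8)((even + odd + 1) >> 1);  // pavgw rounding average
//   }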

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5   // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]    // src_ptr
    mov esi, [esp + 8 + 8]    // src_stride
    mov edx, [esp + 8 + 12]   // dst_ptr
    mov ecx, [esp + 8 + 16]   // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0         // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}
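
// ScaleRowDown4Box_SSE2 approximates a 4x4 box filter with cascaded rounding
// averages (pavgb across rows, then two pavgw passes across columns). A
// direct scalar 4x4 box is shown for comparison (illustrative sketch only;
// the cascaded-average rounding can differ slightly from this exact sum):
//
//   static uint8 Box4x4_Sketch(const uint8* s, ptrdiff_t stride) {
//     int sum = 0;
//     for (int y = 0; y < 4; ++y) {
//       for (int x = 0; x < 4; ++x) {
//         sum += s[y * stride + x];
//       }
//     }
//     return (uint8)((sum + 8) >> 4);  // round and divide by 16
//   }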

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]       // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]   // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]  // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}
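
// Horizontally, the 3/4 box filters above map every 4 source pixels to 3
// destination pixels with weights (3,1), (2,2) and (1,3), implemented by the
// pshufb pairing plus pmaddubsw with kMadd01/kMadd11/kMadd21, then adding
// kRound34 and shifting right by 2. Scalar sketch of one group of 4
// (illustrative only):
//
//   static void Filter34Group_Sketch(const uint8* s, uint8* d) {
//     d[0] = (uint8)((3 * s[0] + 1 * s[1] + 2) >> 2);
//     d[1] = (uint8)((2 * s[1] + 2 * s[2] + 2) >> 2);
//     d[2] = (uint8)((1 * s[2] + 3 * s[3] + 2) >> 2);
//   }
//
// Vertically, the _1_Box variant blends the two source rows equally with one
// pavgb; the _0_Box variant below weights them roughly 3:1 via two pavgb steps.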

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]       // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]   // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]  // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]       // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}
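
// The 3/8 point sampler above picks source bytes 0, 3, 6, 8, 11, 14 from each
// 16-byte load (kShuf38a/kShuf38b), i.e. the horizontal step alternates
// 3,3,2 so that every 3 output pixels consume 8 input pixels. Scalar sketch
// (illustrative only; assumes dst_width is a multiple of 3):
//
//   static void ScaleRowDown38_Sketch(const uint8* s, uint8* d,
//                                     int dst_width) {
//     for (int x = 0; x < dst_width; x += 3) {
//       d[0] = s[0];
//       d[1] = s[3];
//       d[2] = s[6];
//       d += 3;
//       s += 8;
//     }
//   }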

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]       // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0        // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1        // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4       // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6         // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]       // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0        // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5       // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1         // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}
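
// Both 3/8 box filters above avoid a real divide: the accumulated sums are
// multiplied by the kScaleAc33 / kScaleAb2 entries (65536 / n) with pmulhuw,
// which keeps only the high 16 bits of the product. Scalar sketch of that
// trick (illustrative only):
//
//   static uint16 DivideByN_Sketch(uint16 sum, int n) {
//     // pmulhuw: (sum * (65536 / n)) >> 16, an approximation of sum / n.
//     return (uint16)((sum * (65536 / n)) >> 16);
//   }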

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]   // src_ptr
    mov edx, [esp + 16 + 8]   // src_stride
    mov edi, [esp + 16 + 12]  // dst_ptr
    mov ecx, [esp + 16 + 16]  // src_width
    mov ebx, [esp + 16 + 20]  // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]    // read 16 pixels
    lea eax, [eax + edx]  // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2    // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}
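
// ScaleAddRows_SSE2 sums a 16-pixel-wide column strip over src_height rows
// into 16-bit accumulators, saturating as paddusw does. Scalar sketch
// (illustrative only):
//
//   static void ScaleAddRows_Sketch(const uint8* src, ptrdiff_t stride,
//                                   uint16* dst, int src_width,
//                                   int src_height) {
//     for (int x = 0; x < src_width; ++x) {
//       uint32 sum = 0;
//       for (int y = 0; y < src_height; ++y) {
//         sum += src[x + y * stride];
//       }
//       dst[x] = (uint16)(sum < 65535u ? sum : 65535u);  // saturate
//     }
//   }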

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
// xor ebx, ebx
// mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
// movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]    // dst_ptr
    mov esi, [esp + 12 + 8]    // src_ptr
    mov ecx, [esp + 12 + 12]   // dst_width
    movd xmm2, [esp + 12 + 16] // x
    movd xmm3, [esp + 12 + 20] // dx
    mov eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9              // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5          // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9              // 7 bit fractions.
    pshufb xmm2, xmm5          // 0011
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // 16 bit
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}
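
// ScaleFilterCols_SSSE3 walks x in 16.16 fixed point and blends each pair of
// neighboring source pixels with a 7-bit fraction (x >> 9), inverting it with
// 0x7f for the left tap, then pmaddubsw and >> 7. Scalar sketch (illustrative
// only, mirroring the 127 - f weighting of the xor above):
//
//   static void ScaleFilterCols_Sketch(uint8* dst, const uint8* src,
//                                      int dst_width, int x, int dx) {
//     for (int j = 0; j < dst_width; ++j) {
//       int xi = x >> 16;          // integer source position
//       int f = (x >> 9) & 0x7f;   // 7 bit fraction
//       dst[j] = (uint8)((src[xi] * (127 - f) + src[xi + 1] * f) >> 7);
//       x += dx;
//     }
//   }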

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]   // dst_ptr
    mov eax, [esp + 8]   // src_ptr
    mov ecx, [esp + 12]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 pixels, keeping the odd
// ones (1, 3, 5, 7) per the 0xdd shuffle.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_argb
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2         // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0        // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}
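
// ScaleARGBRowDownEven_SSE2 point-samples whole 4-byte ARGB pixels at a fixed
// horizontal step. Scalar sketch (illustrative only):
//
//   static void ScaleARGBRowDownEven_Sketch(const uint8* src_argb,
//                                           int src_stepx,
//                                           uint8* dst_argb, int dst_width) {
//     const uint32* src = (const uint32*)src_argb;
//     uint32* dst = (uint32*)dst_argb;
//     for (int x = 0; x < dst_width; ++x) {
//       dst[x] = src[x * src_stepx];
//     }
//   }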

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]   // src_argb
    mov esi, [esp + 12 + 8]   // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]      // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2            // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0           // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88     // even pixels
    shufps xmm2, xmm1, 0xdd     // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]    // dst_argb
    mov esi, [esp + 8 + 8]    // src_argb
    mov ecx, [esp + 8 + 12]   // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx

    pshufd xmm2, xmm2, 0      // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11   // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3          // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05   // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0          // x3 x2 x1 x0
    paddd xmm3, xmm3          // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0      // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1       // get x0 integer.
    pextrw edx, xmm2, 3       // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    pextrw edx, xmm2, 7         // get x3 integer.
    paddd xmm2, xmm3            // x += dx
    punpckldq xmm0, xmm1        // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    punpckldq xmm1, xmm4        // x2 x3
    punpcklqdq xmm0, xmm1       // x0 x1 x2 x3
    sub ecx, 4                  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    punpckldq xmm0, xmm1        // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}
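
// ScaleARGBCols_SSE2 is a nearest-neighbor column scaler: x advances in 16.16
// fixed point by dx and the integer part selects the source ARGB pixel.
// Scalar sketch (illustrative only):
//
//   static void ScaleARGBCols_Sketch(uint8* dst_argb, const uint8* src_argb,
//                                    int dst_width, int x, int dx) {
//     const uint32* src = (const uint32*)src_argb;
//     uint32* dst = (uint32*)dst_argb;
//     for (int j = 0; j < dst_width; ++j) {
//       dst[j] = src[x >> 16];
//       x += dx;
//     }
//   }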

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]    // dst_argb
    mov esi, [esp + 8 + 8]    // src_argb
    mov ecx, [esp + 8 + 12]   // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6        // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1       // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2         // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0      // x0 x1
    punpckldq xmm3, xmm3      // dx dx
    paddd xmm3, xmm3          // dx * 2, dx * 2
    pextrw edx, xmm2, 3       // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2         // x0, x1 fractions.
    paddd xmm2, xmm3          // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9             // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5         // 0000000011111111
    pshufb xmm0, xmm4         // arrange pixels into pairs
    pxor xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1      // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1       // get x0 integer. next iteration.
    pextrw edx, xmm2, 3       // get x1 integer. next iteration.
    psrlw xmm0, 7             // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0       // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2                // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9             // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5         // 00000000
    pshufb xmm0, xmm4         // arrange pixels into pairs
    pxor xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2      // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0       // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]   // dst_argb
    mov eax, [esp + 8]   // src_argb
    mov ecx, [esp + 12]  // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq                 // extend num to 64 bits
    shld edx, eax, 16   // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq                 // extend num to 64 bits
    shld edx, eax, 16   // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif