1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1420 @@ 1.4 +; 1.5 +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 +; 1.7 +; Use of this source code is governed by a BSD-style license 1.8 +; that can be found in the LICENSE file in the root of the source 1.9 +; tree. An additional intellectual property rights grant can be found 1.10 +; in the file PATENTS. All contributing project authors may 1.11 +; be found in the AUTHORS file in the root of the source tree. 1.12 +; 1.13 + 1.14 +%include "third_party/x86inc/x86inc.asm" 1.15 + 1.16 +SECTION_RODATA 1.17 +pw_8: times 8 dw 8 ; rounding term: the filter paths add this (as filter_rnd) before their psraw by 4 1.18 +bilin_filter_m_sse2: times 8 dw 16 ; word table: entry k (k = 0..15) is 8 words of (16-k) followed by 8 words of k, i.e. 32 bytes per entry (indexed with filter_idx_shift 5; second half read at +16) 1.19 + times 8 dw 0 1.20 + times 8 dw 15 1.21 + times 8 dw 1 1.22 + times 8 dw 14 1.23 + times 8 dw 2 1.24 + times 8 dw 13 1.25 + times 8 dw 3 1.26 + times 8 dw 12 1.27 + times 8 dw 4 1.28 + times 8 dw 11 1.29 + times 8 dw 5 1.30 + times 8 dw 10 1.31 + times 8 dw 6 1.32 + times 8 dw 9 1.33 + times 8 dw 7 1.34 + times 16 dw 8 1.35 + times 8 dw 7 1.36 + times 8 dw 9 1.37 + times 8 dw 6 1.38 + times 8 dw 10 1.39 + times 8 dw 5 1.40 + times 8 dw 11 1.41 + times 8 dw 4 1.42 + times 8 dw 12 1.43 + times 8 dw 3 1.44 + times 8 dw 13 1.45 + times 8 dw 2 1.46 + times 8 dw 14 1.47 + times 8 dw 1 1.48 + times 8 dw 15 1.49 + 1.50 +bilin_filter_m_ssse3: times 8 db 16, 0 ; byte table: entry k is 8 interleaved (16-k, k) byte pairs consumed by pmaddubsw, i.e. 16 bytes per entry (indexed with filter_idx_shift 4) 1.51 + times 8 db 15, 1 1.52 + times 8 db 14, 2 1.53 + times 8 db 13, 3 1.54 + times 8 db 12, 4 1.55 + times 8 db 11, 5 1.56 + times 8 db 10, 6 1.57 + times 8 db 9, 7 1.58 + times 16 db 8 1.59 + times 8 db 7, 9 1.60 + times 8 db 6, 10 1.61 + times 8 db 5, 11 1.62 + times 8 db 4, 12 1.63 + times 8 db 3, 13 1.64 + times 8 db 2, 14 1.65 + times 8 db 1, 15 1.66 + 1.67 +SECTION .text 1.68 + 1.69 +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, 1.70 +; int x_offset, int y_offset, 1.71 +; const uint8_t *dst, ptrdiff_t dst_stride, 1.72 +; int height, 
unsigned int *sse); 1.73 +; 1.74 +; This function returns the SE (sum of the src - dst differences) and stores the SSE (sum of squared differences) through the given sse pointer. 1.75 + 1.76 +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse -- accumulates the word differences (src - dst) into sum and their squares (as dwords) into sse; overwrites %1 and %3 1.77 + psubw %3, %4 ; %3 = src2 - dst2 (words) 1.78 + psubw %1, %2 ; %1 = src1 - dst1 (words) 1.79 + paddw %5, %3 ; sum += differences, still 16-bit lanes 1.80 + pmaddwd %3, %3 ; square each difference, adjacent pairs summed into dwords 1.81 + paddw %5, %1 1.82 + pmaddwd %1, %1 1.83 + paddd %6, %3 ; sse += squared differences (32-bit lanes) 1.84 + paddd %6, %1 1.85 +%endmacro 1.86 + 1.87 +%macro STORE_AND_RET 0 ; horizontally reduces m6 (sum) and m7 (sse), stores SSE through the sse argument and returns the sum; relies on m5 being zero on entry (the callers keep it as their zero register) and overwrites m3-m7 and r1 1.88 +%if mmsize == 16 1.89 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit 1.90 + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. 1.91 + ; We have to sign-extend it before adding the words within the register 1.92 + ; and outputting to a dword. 1.93 + pcmpgtw m5, m6 ; mask for 0 > x 1.94 + movhlps m3, m7 ; low half of m3 = high half of the sse dwords 1.95 + punpcklwd m4, m6, m5 ; m4 = low four sum words, sign-extended via the mask 1.96 + punpckhwd m6, m5 ; sign-extend m6 word->dword 1.97 + paddd m7, m3 1.98 + paddd m6, m4 1.99 + pshufd m3, m7, 0x1 ; move dword 1 of the partial sse down to dword 0 1.100 + movhlps m4, m6 1.101 + paddd m7, m3 1.102 + paddd m6, m4 1.103 + mov r1, ssem ; r1 = unsigned int *sse 1.104 + pshufd m4, m6, 0x1 1.105 + movd [r1], m7 ; store sse 1.106 + paddd m6, m4 1.107 + movd rax, m6 ; store sum as return value 1.108 +%else ; mmsize == 8 1.109 + pshufw m4, m6, 0xe ; upper two words of the sum moved to the low positions 1.110 + pshufw m3, m7, 0xe ; upper dword of the sse moved to the low position 1.111 + paddw m6, m4 1.112 + paddd m7, m3 1.113 + pcmpgtw m5, m6 ; mask for 0 > x 1.114 + mov r1, ssem ; r1 = unsigned int *sse 1.115 + punpcklwd m6, m5 ; sign-extend m6 word->dword 1.116 + movd [r1], m7 ; store sse 1.117 + pshufw m4, m6, 0xe 1.118 + paddd m6, m4 1.119 + movd rax, m6 ; store sum as return value 1.120 +%endif 1.121 + RET 1.122 +%endmacro 1.123 + 1.124 +%macro INC_SRC_BY_SRC_STRIDE 0 ; advances srcq by one source row 1.125 +%if ARCH_X86=1 && CONFIG_PIC=1 1.126 + add srcq, src_stridemp ; 32-bit PIC: the src_stride register gets reused as a temporary in the x/y bilinear path, so the stride is taken from the src_stridemp form instead (presumably x86inc's stack/memory operand for the argument -- confirm) 1.127 +%else 1.128 + add srcq, src_strideq 1.129 +%endif 1.130 +%endmacro 1.131 + 1.132 +%macro SUBPEL_VARIANCE 1-2 0 ; W 1.133 +%if cpuflag(ssse3) 1.134 +%define bilin_filter_m bilin_filter_m_ssse3 1.135 +%define filter_idx_shift 4 1.136 +%else 1.137 +%define bilin_filter_m bilin_filter_m_sse2 1.138 +%define 
filter_idx_shift 5 1.139 +%endif 1.140 +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses 1.141 +; 11, not 13, if the registers are ordered correctly. May make a minor speed 1.142 +; difference on Win64 1.143 + 1.144 +%ifdef PIC ; 64bit PIC 1.145 + %if %2 == 1 ; avg 1.146 + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ 1.147 + x_offset, y_offset, \ 1.148 + dst, dst_stride, \ 1.149 + sec, sec_stride, height, sse 1.150 + %define sec_str sec_strideq 1.151 + %else 1.152 + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ 1.153 + y_offset, dst, dst_stride, height, sse 1.154 + %endif 1.155 + %define h heightd 1.156 + %define bilin_filter sseq 1.157 +%else 1.158 + %if ARCH_X86=1 && CONFIG_PIC=1 1.159 + %if %2 == 1 ; avg 1.160 + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ 1.161 + x_offset, y_offset, \ 1.162 + dst, dst_stride, \ 1.163 + sec, sec_stride, \ 1.164 + height, sse, g_bilin_filter, g_pw_8 1.165 + %define h dword heightm 1.166 + %define sec_str sec_stridemp 1.167 + 1.168 + ;Store bilin_filter and pw_8 location in stack 1.169 + GET_GOT eax 1.170 + add esp, 4 ; restore esp 1.171 + 1.172 + lea ecx, [GLOBAL(bilin_filter_m)] 1.173 + mov g_bilin_filterm, ecx 1.174 + 1.175 + lea ecx, [GLOBAL(pw_8)] 1.176 + mov g_pw_8m, ecx 1.177 + 1.178 + LOAD_IF_USED 0, 1 ; load eax, ecx back 1.179 + %else 1.180 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ 1.181 + y_offset, dst, dst_stride, height, sse, \ 1.182 + g_bilin_filter, g_pw_8 1.183 + %define h heightd 1.184 + 1.185 + ;Store bilin_filter and pw_8 location in stack 1.186 + GET_GOT eax 1.187 + add esp, 4 ; restore esp 1.188 + 1.189 + lea ecx, [GLOBAL(bilin_filter_m)] 1.190 + mov g_bilin_filterm, ecx 1.191 + 1.192 + lea ecx, [GLOBAL(pw_8)] 1.193 + mov g_pw_8m, ecx 1.194 + 1.195 + LOAD_IF_USED 0, 1 ; load eax, ecx back 1.196 + %endif 1.197 + %else 1.198 + %if %2 == 1 ; avg 1.199 + cglobal sub_pixel_avg_variance%1xh, 7 
+ 2 * ARCH_X86_64, \ 1.200 + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ 1.201 + x_offset, y_offset, \ 1.202 + dst, dst_stride, \ 1.203 + sec, sec_stride, \ 1.204 + height, sse 1.205 + %if ARCH_X86_64 1.206 + %define h heightd 1.207 + %define sec_str sec_strideq 1.208 + %else 1.209 + %define h dword heightm 1.210 + %define sec_str sec_stridemp 1.211 + %endif 1.212 + %else 1.213 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ 1.214 + y_offset, dst, dst_stride, height, sse 1.215 + %define h heightd 1.216 + %endif 1.217 + 1.218 + %define bilin_filter bilin_filter_m 1.219 + %endif 1.220 +%endif 1.221 + 1.222 + ASSERT %1 <= 16 ; m6 overflows if w > 16 1.223 + pxor m6, m6 ; sum 1.224 + pxor m7, m7 ; sse 1.225 + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we 1.226 + ; could perhaps use it for something more productive then 1.227 + pxor m5, m5 ; dedicated zero register 1.228 +%if %1 < 16 1.229 + sar h, 1 1.230 +%if %2 == 1 ; avg 1.231 + shl sec_str, 1 1.232 +%endif 1.233 +%endif 1.234 + 1.235 + ; FIXME(rbultje) replace by jumptable? 
1.236 + test x_offsetd, x_offsetd 1.237 + jnz .x_nonzero 1.238 + ; x_offset == 0 1.239 + test y_offsetd, y_offsetd 1.240 + jnz .x_zero_y_nonzero 1.241 + 1.242 + ; x_offset == 0 && y_offset == 0 1.243 +.x_zero_y_zero_loop: 1.244 +%if %1 == 16 1.245 + movu m0, [srcq] 1.246 + mova m1, [dstq] 1.247 +%if %2 == 1 ; avg 1.248 + pavgb m0, [secq] 1.249 + punpckhbw m3, m1, m5 1.250 + punpcklbw m1, m5 1.251 +%endif 1.252 + punpckhbw m2, m0, m5 1.253 + punpcklbw m0, m5 1.254 +%if %2 == 0 ; !avg 1.255 + punpckhbw m3, m1, m5 1.256 + punpcklbw m1, m5 1.257 +%endif 1.258 + SUM_SSE m0, m1, m2, m3, m6, m7 1.259 + 1.260 + add srcq, src_strideq 1.261 + add dstq, dst_strideq 1.262 +%else ; %1 < 16 1.263 + movh m0, [srcq] 1.264 +%if %2 == 1 ; avg 1.265 +%if mmsize == 16 1.266 + movhps m0, [srcq+src_strideq] 1.267 +%else ; mmsize == 8 1.268 + punpckldq m0, [srcq+src_strideq] 1.269 +%endif 1.270 +%else ; !avg 1.271 + movh m2, [srcq+src_strideq] 1.272 +%endif 1.273 + movh m1, [dstq] 1.274 + movh m3, [dstq+dst_strideq] 1.275 +%if %2 == 1 ; avg 1.276 + pavgb m0, [secq] 1.277 + punpcklbw m3, m5 1.278 + punpcklbw m1, m5 1.279 + punpckhbw m2, m0, m5 1.280 + punpcklbw m0, m5 1.281 +%else ; !avg 1.282 + punpcklbw m0, m5 1.283 + punpcklbw m2, m5 1.284 + punpcklbw m3, m5 1.285 + punpcklbw m1, m5 1.286 +%endif 1.287 + SUM_SSE m0, m1, m2, m3, m6, m7 1.288 + 1.289 + lea srcq, [srcq+src_strideq*2] 1.290 + lea dstq, [dstq+dst_strideq*2] 1.291 +%endif 1.292 +%if %2 == 1 ; avg 1.293 + add secq, sec_str 1.294 +%endif 1.295 + dec h 1.296 + jg .x_zero_y_zero_loop 1.297 + STORE_AND_RET 1.298 + 1.299 +.x_zero_y_nonzero: 1.300 + cmp y_offsetd, 8 1.301 + jne .x_zero_y_nonhalf 1.302 + 1.303 + ; x_offset == 0 && y_offset == 0.5 1.304 +.x_zero_y_half_loop: 1.305 +%if %1 == 16 1.306 + movu m0, [srcq] 1.307 + movu m4, [srcq+src_strideq] 1.308 + mova m1, [dstq] 1.309 + pavgb m0, m4 1.310 + punpckhbw m3, m1, m5 1.311 +%if %2 == 1 ; avg 1.312 + pavgb m0, [secq] 1.313 +%endif 1.314 + punpcklbw m1, m5 1.315 + punpckhbw 
m2, m0, m5 1.316 + punpcklbw m0, m5 1.317 + SUM_SSE m0, m1, m2, m3, m6, m7 1.318 + 1.319 + add srcq, src_strideq 1.320 + add dstq, dst_strideq 1.321 +%else ; %1 < 16 1.322 + movh m0, [srcq] 1.323 + movh m2, [srcq+src_strideq] 1.324 +%if %2 == 1 ; avg 1.325 +%if mmsize == 16 1.326 + movhps m2, [srcq+src_strideq*2] 1.327 +%else ; mmsize == 8 1.328 +%if %1 == 4 1.329 + movh m1, [srcq+src_strideq*2] 1.330 + punpckldq m2, m1 1.331 +%else 1.332 + punpckldq m2, [srcq+src_strideq*2] 1.333 +%endif 1.334 +%endif 1.335 + movh m1, [dstq] 1.336 +%if mmsize == 16 1.337 + movlhps m0, m2 1.338 +%else ; mmsize == 8 1.339 + punpckldq m0, m2 1.340 +%endif 1.341 + movh m3, [dstq+dst_strideq] 1.342 + pavgb m0, m2 1.343 + punpcklbw m1, m5 1.344 + pavgb m0, [secq] 1.345 + punpcklbw m3, m5 1.346 + punpckhbw m2, m0, m5 1.347 + punpcklbw m0, m5 1.348 +%else ; !avg 1.349 + movh m4, [srcq+src_strideq*2] 1.350 + movh m1, [dstq] 1.351 + pavgb m0, m2 1.352 + movh m3, [dstq+dst_strideq] 1.353 + pavgb m2, m4 1.354 + punpcklbw m0, m5 1.355 + punpcklbw m2, m5 1.356 + punpcklbw m3, m5 1.357 + punpcklbw m1, m5 1.358 +%endif 1.359 + SUM_SSE m0, m1, m2, m3, m6, m7 1.360 + 1.361 + lea srcq, [srcq+src_strideq*2] 1.362 + lea dstq, [dstq+dst_strideq*2] 1.363 +%endif 1.364 +%if %2 == 1 ; avg 1.365 + add secq, sec_str 1.366 +%endif 1.367 + dec h 1.368 + jg .x_zero_y_half_loop 1.369 + STORE_AND_RET 1.370 + 1.371 +.x_zero_y_nonhalf: 1.372 + ; x_offset == 0 && y_offset == bilin interpolation 1.373 +%ifdef PIC 1.374 + lea bilin_filter, [bilin_filter_m] 1.375 +%endif 1.376 + shl y_offsetd, filter_idx_shift 1.377 +%if ARCH_X86_64 && mmsize == 16 1.378 + mova m8, [bilin_filter+y_offsetq] 1.379 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.380 + mova m9, [bilin_filter+y_offsetq+16] 1.381 +%endif 1.382 + mova m10, [pw_8] 1.383 +%define filter_y_a m8 1.384 +%define filter_y_b m9 1.385 +%define filter_rnd m10 1.386 +%else ; x86-32 or mmx 1.387 +%if ARCH_X86=1 && CONFIG_PIC=1 1.388 +; 
x_offset == 0, reuse x_offset reg 1.389 +%define tempq x_offsetq 1.390 + add y_offsetq, g_bilin_filterm 1.391 +%define filter_y_a [y_offsetq] 1.392 +%define filter_y_b [y_offsetq+16] 1.393 + mov tempq, g_pw_8m 1.394 +%define filter_rnd [tempq] 1.395 +%else 1.396 + add y_offsetq, bilin_filter 1.397 +%define filter_y_a [y_offsetq] 1.398 +%define filter_y_b [y_offsetq+16] 1.399 +%define filter_rnd [pw_8] 1.400 +%endif 1.401 +%endif 1.402 + 1.403 +.x_zero_y_other_loop: 1.404 +%if %1 == 16 1.405 + movu m0, [srcq] 1.406 + movu m4, [srcq+src_strideq] 1.407 + mova m1, [dstq] 1.408 +%if cpuflag(ssse3) 1.409 + punpckhbw m2, m0, m4 1.410 + punpcklbw m0, m4 1.411 + pmaddubsw m2, filter_y_a 1.412 + pmaddubsw m0, filter_y_a 1.413 + paddw m2, filter_rnd 1.414 + paddw m0, filter_rnd 1.415 +%else 1.416 + punpckhbw m2, m0, m5 1.417 + punpckhbw m3, m4, m5 1.418 + punpcklbw m0, m5 1.419 + punpcklbw m4, m5 1.420 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can 1.421 + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of 1.422 + ; instructions is the same (5), but it is 1 mul instead of 2, so might be 1.423 + ; slightly faster because of pmullw latency. It would also cut our rodata 1.424 + ; tables in half for this function, and save 1-2 registers on x86-64. 
1.425 + pmullw m2, filter_y_a 1.426 + pmullw m3, filter_y_b 1.427 + paddw m2, filter_rnd 1.428 + pmullw m0, filter_y_a 1.429 + pmullw m4, filter_y_b 1.430 + paddw m0, filter_rnd 1.431 + paddw m2, m3 1.432 + paddw m0, m4 1.433 +%endif 1.434 + psraw m2, 4 1.435 + psraw m0, 4 1.436 +%if %2 == 1 ; avg 1.437 + ; FIXME(rbultje) pipeline 1.438 + packuswb m0, m2 1.439 + pavgb m0, [secq] 1.440 + punpckhbw m2, m0, m5 1.441 + punpcklbw m0, m5 1.442 +%endif 1.443 + punpckhbw m3, m1, m5 1.444 + punpcklbw m1, m5 1.445 + SUM_SSE m0, m1, m2, m3, m6, m7 1.446 + 1.447 + add srcq, src_strideq 1.448 + add dstq, dst_strideq 1.449 +%else ; %1 < 16 1.450 + movh m0, [srcq] 1.451 + movh m2, [srcq+src_strideq] 1.452 + movh m4, [srcq+src_strideq*2] 1.453 + movh m3, [dstq+dst_strideq] 1.454 +%if cpuflag(ssse3) 1.455 + movh m1, [dstq] 1.456 + punpcklbw m0, m2 1.457 + punpcklbw m2, m4 1.458 + pmaddubsw m0, filter_y_a 1.459 + pmaddubsw m2, filter_y_a 1.460 + punpcklbw m3, m5 1.461 + paddw m2, filter_rnd 1.462 + paddw m0, filter_rnd 1.463 +%else 1.464 + punpcklbw m0, m5 1.465 + punpcklbw m2, m5 1.466 + punpcklbw m4, m5 1.467 + pmullw m0, filter_y_a 1.468 + pmullw m1, m2, filter_y_b 1.469 + punpcklbw m3, m5 1.470 + paddw m0, filter_rnd 1.471 + pmullw m2, filter_y_a 1.472 + pmullw m4, filter_y_b 1.473 + paddw m0, m1 1.474 + paddw m2, filter_rnd 1.475 + movh m1, [dstq] 1.476 + paddw m2, m4 1.477 +%endif 1.478 + psraw m0, 4 1.479 + psraw m2, 4 1.480 +%if %2 == 1 ; avg 1.481 + ; FIXME(rbultje) pipeline 1.482 + packuswb m0, m2 1.483 + pavgb m0, [secq] 1.484 + punpckhbw m2, m0, m5 1.485 + punpcklbw m0, m5 1.486 +%endif 1.487 + punpcklbw m1, m5 1.488 + SUM_SSE m0, m1, m2, m3, m6, m7 1.489 + 1.490 + lea srcq, [srcq+src_strideq*2] 1.491 + lea dstq, [dstq+dst_strideq*2] 1.492 +%endif 1.493 +%if %2 == 1 ; avg 1.494 + add secq, sec_str 1.495 +%endif 1.496 + dec h 1.497 + jg .x_zero_y_other_loop 1.498 +%undef filter_y_a 1.499 +%undef filter_y_b 1.500 +%undef filter_rnd 1.501 + STORE_AND_RET 1.502 + 1.503 
+.x_nonzero: 1.504 + cmp x_offsetd, 8 1.505 + jne .x_nonhalf 1.506 + ; x_offset == 0.5 1.507 + test y_offsetd, y_offsetd 1.508 + jnz .x_half_y_nonzero 1.509 + 1.510 + ; x_offset == 0.5 && y_offset == 0 1.511 +.x_half_y_zero_loop: 1.512 +%if %1 == 16 1.513 + movu m0, [srcq] 1.514 + movu m4, [srcq+1] 1.515 + mova m1, [dstq] 1.516 + pavgb m0, m4 1.517 + punpckhbw m3, m1, m5 1.518 +%if %2 == 1 ; avg 1.519 + pavgb m0, [secq] 1.520 +%endif 1.521 + punpcklbw m1, m5 1.522 + punpckhbw m2, m0, m5 1.523 + punpcklbw m0, m5 1.524 + SUM_SSE m0, m1, m2, m3, m6, m7 1.525 + 1.526 + add srcq, src_strideq 1.527 + add dstq, dst_strideq 1.528 +%else ; %1 < 16 1.529 + movh m0, [srcq] 1.530 + movh m4, [srcq+1] 1.531 +%if %2 == 1 ; avg 1.532 +%if mmsize == 16 1.533 + movhps m0, [srcq+src_strideq] 1.534 + movhps m4, [srcq+src_strideq+1] 1.535 +%else ; mmsize == 8 1.536 + punpckldq m0, [srcq+src_strideq] 1.537 + punpckldq m4, [srcq+src_strideq+1] 1.538 +%endif 1.539 + movh m1, [dstq] 1.540 + movh m3, [dstq+dst_strideq] 1.541 + pavgb m0, m4 1.542 + punpcklbw m3, m5 1.543 + pavgb m0, [secq] 1.544 + punpcklbw m1, m5 1.545 + punpckhbw m2, m0, m5 1.546 + punpcklbw m0, m5 1.547 +%else ; !avg 1.548 + movh m2, [srcq+src_strideq] 1.549 + movh m1, [dstq] 1.550 + pavgb m0, m4 1.551 + movh m4, [srcq+src_strideq+1] 1.552 + movh m3, [dstq+dst_strideq] 1.553 + pavgb m2, m4 1.554 + punpcklbw m0, m5 1.555 + punpcklbw m2, m5 1.556 + punpcklbw m3, m5 1.557 + punpcklbw m1, m5 1.558 +%endif 1.559 + SUM_SSE m0, m1, m2, m3, m6, m7 1.560 + 1.561 + lea srcq, [srcq+src_strideq*2] 1.562 + lea dstq, [dstq+dst_strideq*2] 1.563 +%endif 1.564 +%if %2 == 1 ; avg 1.565 + add secq, sec_str 1.566 +%endif 1.567 + dec h 1.568 + jg .x_half_y_zero_loop 1.569 + STORE_AND_RET 1.570 + 1.571 +.x_half_y_nonzero: 1.572 + cmp y_offsetd, 8 1.573 + jne .x_half_y_nonhalf 1.574 + 1.575 + ; x_offset == 0.5 && y_offset == 0.5 1.576 +%if %1 == 16 1.577 + movu m0, [srcq] 1.578 + movu m3, [srcq+1] 1.579 + add srcq, src_strideq 1.580 + pavgb m0, 
m3 1.581 +.x_half_y_half_loop: 1.582 + movu m4, [srcq] 1.583 + movu m3, [srcq+1] 1.584 + mova m1, [dstq] 1.585 + pavgb m4, m3 1.586 + punpckhbw m3, m1, m5 1.587 + pavgb m0, m4 1.588 +%if %2 == 1 ; avg 1.589 + punpcklbw m1, m5 1.590 + pavgb m0, [secq] 1.591 + punpckhbw m2, m0, m5 1.592 + punpcklbw m0, m5 1.593 +%else 1.594 + punpckhbw m2, m0, m5 1.595 + punpcklbw m0, m5 1.596 + punpcklbw m1, m5 1.597 +%endif 1.598 + SUM_SSE m0, m1, m2, m3, m6, m7 1.599 + mova m0, m4 1.600 + 1.601 + add srcq, src_strideq 1.602 + add dstq, dst_strideq 1.603 +%else ; %1 < 16 1.604 + movh m0, [srcq] 1.605 + movh m3, [srcq+1] 1.606 + add srcq, src_strideq 1.607 + pavgb m0, m3 1.608 +.x_half_y_half_loop: 1.609 + movh m2, [srcq] 1.610 + movh m3, [srcq+1] 1.611 +%if %2 == 1 ; avg 1.612 +%if mmsize == 16 1.613 + movhps m2, [srcq+src_strideq] 1.614 + movhps m3, [srcq+src_strideq+1] 1.615 +%else 1.616 +%if %1 == 4 1.617 + movh m1, [srcq+src_strideq] 1.618 + punpckldq m2, m1 1.619 + movh m1, [srcq+src_strideq+1] 1.620 + punpckldq m3, m1 1.621 +%else 1.622 + punpckldq m2, [srcq+src_strideq] 1.623 + punpckldq m3, [srcq+src_strideq+1] 1.624 +%endif 1.625 +%endif 1.626 + pavgb m2, m3 1.627 +%if mmsize == 16 1.628 + movlhps m0, m2 1.629 + movhlps m4, m2 1.630 +%else ; mmsize == 8 1.631 + punpckldq m0, m2 1.632 + pshufw m4, m2, 0xe 1.633 +%endif 1.634 + movh m1, [dstq] 1.635 + pavgb m0, m2 1.636 + movh m3, [dstq+dst_strideq] 1.637 + pavgb m0, [secq] 1.638 + punpcklbw m3, m5 1.639 + punpcklbw m1, m5 1.640 + punpckhbw m2, m0, m5 1.641 + punpcklbw m0, m5 1.642 +%else ; !avg 1.643 + movh m4, [srcq+src_strideq] 1.644 + movh m1, [srcq+src_strideq+1] 1.645 + pavgb m2, m3 1.646 + pavgb m4, m1 1.647 + pavgb m0, m2 1.648 + pavgb m2, m4 1.649 + movh m1, [dstq] 1.650 + movh m3, [dstq+dst_strideq] 1.651 + punpcklbw m0, m5 1.652 + punpcklbw m2, m5 1.653 + punpcklbw m3, m5 1.654 + punpcklbw m1, m5 1.655 +%endif 1.656 + SUM_SSE m0, m1, m2, m3, m6, m7 1.657 + mova m0, m4 1.658 + 1.659 + lea srcq, [srcq+src_strideq*2] 
1.660 + lea dstq, [dstq+dst_strideq*2] 1.661 +%endif 1.662 +%if %2 == 1 ; avg 1.663 + add secq, sec_str 1.664 +%endif 1.665 + dec h 1.666 + jg .x_half_y_half_loop 1.667 + STORE_AND_RET 1.668 + 1.669 +.x_half_y_nonhalf: 1.670 + ; x_offset == 0.5 && y_offset == bilin interpolation 1.671 +%ifdef PIC 1.672 + lea bilin_filter, [bilin_filter_m] 1.673 +%endif 1.674 + shl y_offsetd, filter_idx_shift 1.675 +%if ARCH_X86_64 && mmsize == 16 1.676 + mova m8, [bilin_filter+y_offsetq] 1.677 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.678 + mova m9, [bilin_filter+y_offsetq+16] 1.679 +%endif 1.680 + mova m10, [pw_8] 1.681 +%define filter_y_a m8 1.682 +%define filter_y_b m9 1.683 +%define filter_rnd m10 1.684 +%else ;x86_32 1.685 +%if ARCH_X86=1 && CONFIG_PIC=1 1.686 +; x_offset == 0.5. We can reuse x_offset reg 1.687 +%define tempq x_offsetq 1.688 + add y_offsetq, g_bilin_filterm 1.689 +%define filter_y_a [y_offsetq] 1.690 +%define filter_y_b [y_offsetq+16] 1.691 + mov tempq, g_pw_8m 1.692 +%define filter_rnd [tempq] 1.693 +%else 1.694 + add y_offsetq, bilin_filter 1.695 +%define filter_y_a [y_offsetq] 1.696 +%define filter_y_b [y_offsetq+16] 1.697 +%define filter_rnd [pw_8] 1.698 +%endif 1.699 +%endif 1.700 + 1.701 +%if %1 == 16 1.702 + movu m0, [srcq] 1.703 + movu m3, [srcq+1] 1.704 + add srcq, src_strideq 1.705 + pavgb m0, m3 1.706 +.x_half_y_other_loop: 1.707 + movu m4, [srcq] 1.708 + movu m2, [srcq+1] 1.709 + mova m1, [dstq] 1.710 + pavgb m4, m2 1.711 +%if cpuflag(ssse3) 1.712 + punpckhbw m2, m0, m4 1.713 + punpcklbw m0, m4 1.714 + pmaddubsw m2, filter_y_a 1.715 + pmaddubsw m0, filter_y_a 1.716 + paddw m2, filter_rnd 1.717 + paddw m0, filter_rnd 1.718 + psraw m2, 4 1.719 +%else 1.720 + punpckhbw m2, m0, m5 1.721 + punpckhbw m3, m4, m5 1.722 + pmullw m2, filter_y_a 1.723 + pmullw m3, filter_y_b 1.724 + paddw m2, filter_rnd 1.725 + punpcklbw m0, m5 1.726 + paddw m2, m3 1.727 + punpcklbw m3, m4, m5 1.728 + pmullw m0, filter_y_a 1.729 + pmullw m3, 
filter_y_b 1.730 + paddw m0, filter_rnd 1.731 + psraw m2, 4 1.732 + paddw m0, m3 1.733 +%endif 1.734 + punpckhbw m3, m1, m5 1.735 + psraw m0, 4 1.736 +%if %2 == 1 ; avg 1.737 + ; FIXME(rbultje) pipeline 1.738 + packuswb m0, m2 1.739 + pavgb m0, [secq] 1.740 + punpckhbw m2, m0, m5 1.741 + punpcklbw m0, m5 1.742 +%endif 1.743 + punpcklbw m1, m5 1.744 + SUM_SSE m0, m1, m2, m3, m6, m7 1.745 + mova m0, m4 1.746 + 1.747 + add srcq, src_strideq 1.748 + add dstq, dst_strideq 1.749 +%else ; %1 < 16 1.750 + movh m0, [srcq] 1.751 + movh m3, [srcq+1] 1.752 + add srcq, src_strideq 1.753 + pavgb m0, m3 1.754 +%if notcpuflag(ssse3) 1.755 + punpcklbw m0, m5 1.756 +%endif 1.757 +.x_half_y_other_loop: 1.758 + movh m2, [srcq] 1.759 + movh m1, [srcq+1] 1.760 + movh m4, [srcq+src_strideq] 1.761 + movh m3, [srcq+src_strideq+1] 1.762 + pavgb m2, m1 1.763 + pavgb m4, m3 1.764 + movh m3, [dstq+dst_strideq] 1.765 +%if cpuflag(ssse3) 1.766 + movh m1, [dstq] 1.767 + punpcklbw m0, m2 1.768 + punpcklbw m2, m4 1.769 + pmaddubsw m0, filter_y_a 1.770 + pmaddubsw m2, filter_y_a 1.771 + punpcklbw m3, m5 1.772 + paddw m0, filter_rnd 1.773 + paddw m2, filter_rnd 1.774 +%else 1.775 + punpcklbw m2, m5 1.776 + punpcklbw m4, m5 1.777 + pmullw m0, filter_y_a 1.778 + pmullw m1, m2, filter_y_b 1.779 + punpcklbw m3, m5 1.780 + paddw m0, filter_rnd 1.781 + pmullw m2, filter_y_a 1.782 + paddw m0, m1 1.783 + pmullw m1, m4, filter_y_b 1.784 + paddw m2, filter_rnd 1.785 + paddw m2, m1 1.786 + movh m1, [dstq] 1.787 +%endif 1.788 + psraw m0, 4 1.789 + psraw m2, 4 1.790 +%if %2 == 1 ; avg 1.791 + ; FIXME(rbultje) pipeline 1.792 + packuswb m0, m2 1.793 + pavgb m0, [secq] 1.794 + punpckhbw m2, m0, m5 1.795 + punpcklbw m0, m5 1.796 +%endif 1.797 + punpcklbw m1, m5 1.798 + SUM_SSE m0, m1, m2, m3, m6, m7 1.799 + mova m0, m4 1.800 + 1.801 + lea srcq, [srcq+src_strideq*2] 1.802 + lea dstq, [dstq+dst_strideq*2] 1.803 +%endif 1.804 +%if %2 == 1 ; avg 1.805 + add secq, sec_str 1.806 +%endif 1.807 + dec h 1.808 + jg 
.x_half_y_other_loop 1.809 +%undef filter_y_a 1.810 +%undef filter_y_b 1.811 +%undef filter_rnd 1.812 + STORE_AND_RET 1.813 + 1.814 +.x_nonhalf: 1.815 + test y_offsetd, y_offsetd 1.816 + jnz .x_nonhalf_y_nonzero 1.817 + 1.818 + ; x_offset == bilin interpolation && y_offset == 0 1.819 +%ifdef PIC 1.820 + lea bilin_filter, [bilin_filter_m] 1.821 +%endif 1.822 + shl x_offsetd, filter_idx_shift 1.823 +%if ARCH_X86_64 && mmsize == 16 1.824 + mova m8, [bilin_filter+x_offsetq] 1.825 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.826 + mova m9, [bilin_filter+x_offsetq+16] 1.827 +%endif 1.828 + mova m10, [pw_8] 1.829 +%define filter_x_a m8 1.830 +%define filter_x_b m9 1.831 +%define filter_rnd m10 1.832 +%else ; x86-32 1.833 +%if ARCH_X86=1 && CONFIG_PIC=1 1.834 +;y_offset == 0. We can reuse y_offset reg. 1.835 +%define tempq y_offsetq 1.836 + add x_offsetq, g_bilin_filterm 1.837 +%define filter_x_a [x_offsetq] 1.838 +%define filter_x_b [x_offsetq+16] 1.839 + mov tempq, g_pw_8m 1.840 +%define filter_rnd [tempq] 1.841 +%else 1.842 + add x_offsetq, bilin_filter 1.843 +%define filter_x_a [x_offsetq] 1.844 +%define filter_x_b [x_offsetq+16] 1.845 +%define filter_rnd [pw_8] 1.846 +%endif 1.847 +%endif 1.848 + 1.849 +.x_other_y_zero_loop: 1.850 +%if %1 == 16 1.851 + movu m0, [srcq] 1.852 + movu m4, [srcq+1] 1.853 + mova m1, [dstq] 1.854 +%if cpuflag(ssse3) 1.855 + punpckhbw m2, m0, m4 1.856 + punpcklbw m0, m4 1.857 + pmaddubsw m2, filter_x_a 1.858 + pmaddubsw m0, filter_x_a 1.859 + paddw m2, filter_rnd 1.860 + paddw m0, filter_rnd 1.861 +%else 1.862 + punpckhbw m2, m0, m5 1.863 + punpckhbw m3, m4, m5 1.864 + punpcklbw m0, m5 1.865 + punpcklbw m4, m5 1.866 + pmullw m2, filter_x_a 1.867 + pmullw m3, filter_x_b 1.868 + paddw m2, filter_rnd 1.869 + pmullw m0, filter_x_a 1.870 + pmullw m4, filter_x_b 1.871 + paddw m0, filter_rnd 1.872 + paddw m2, m3 1.873 + paddw m0, m4 1.874 +%endif 1.875 + psraw m2, 4 1.876 + psraw m0, 4 1.877 +%if %2 == 1 ; avg 1.878 + 
; FIXME(rbultje) pipeline 1.879 + packuswb m0, m2 1.880 + pavgb m0, [secq] 1.881 + punpckhbw m2, m0, m5 1.882 + punpcklbw m0, m5 1.883 +%endif 1.884 + punpckhbw m3, m1, m5 1.885 + punpcklbw m1, m5 1.886 + SUM_SSE m0, m1, m2, m3, m6, m7 1.887 + 1.888 + add srcq, src_strideq 1.889 + add dstq, dst_strideq 1.890 +%else ; %1 < 16 1.891 + movh m0, [srcq] 1.892 + movh m1, [srcq+1] 1.893 + movh m2, [srcq+src_strideq] 1.894 + movh m4, [srcq+src_strideq+1] 1.895 + movh m3, [dstq+dst_strideq] 1.896 +%if cpuflag(ssse3) 1.897 + punpcklbw m0, m1 1.898 + movh m1, [dstq] 1.899 + punpcklbw m2, m4 1.900 + pmaddubsw m0, filter_x_a 1.901 + pmaddubsw m2, filter_x_a 1.902 + punpcklbw m3, m5 1.903 + paddw m0, filter_rnd 1.904 + paddw m2, filter_rnd 1.905 +%else 1.906 + punpcklbw m0, m5 1.907 + punpcklbw m1, m5 1.908 + punpcklbw m2, m5 1.909 + punpcklbw m4, m5 1.910 + pmullw m0, filter_x_a 1.911 + pmullw m1, filter_x_b 1.912 + punpcklbw m3, m5 1.913 + paddw m0, filter_rnd 1.914 + pmullw m2, filter_x_a 1.915 + pmullw m4, filter_x_b 1.916 + paddw m0, m1 1.917 + paddw m2, filter_rnd 1.918 + movh m1, [dstq] 1.919 + paddw m2, m4 1.920 +%endif 1.921 + psraw m0, 4 1.922 + psraw m2, 4 1.923 +%if %2 == 1 ; avg 1.924 + ; FIXME(rbultje) pipeline 1.925 + packuswb m0, m2 1.926 + pavgb m0, [secq] 1.927 + punpckhbw m2, m0, m5 1.928 + punpcklbw m0, m5 1.929 +%endif 1.930 + punpcklbw m1, m5 1.931 + SUM_SSE m0, m1, m2, m3, m6, m7 1.932 + 1.933 + lea srcq, [srcq+src_strideq*2] 1.934 + lea dstq, [dstq+dst_strideq*2] 1.935 +%endif 1.936 +%if %2 == 1 ; avg 1.937 + add secq, sec_str 1.938 +%endif 1.939 + dec h 1.940 + jg .x_other_y_zero_loop 1.941 +%undef filter_x_a 1.942 +%undef filter_x_b 1.943 +%undef filter_rnd 1.944 + STORE_AND_RET 1.945 + 1.946 +.x_nonhalf_y_nonzero: 1.947 + cmp y_offsetd, 8 1.948 + jne .x_nonhalf_y_nonhalf 1.949 + 1.950 + ; x_offset == bilin interpolation && y_offset == 0.5 1.951 +%ifdef PIC 1.952 + lea bilin_filter, [bilin_filter_m] 1.953 +%endif 1.954 + shl x_offsetd, filter_idx_shift 
1.955 +%if ARCH_X86_64 && mmsize == 16 1.956 + mova m8, [bilin_filter+x_offsetq] 1.957 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.958 + mova m9, [bilin_filter+x_offsetq+16] 1.959 +%endif 1.960 + mova m10, [pw_8] 1.961 +%define filter_x_a m8 1.962 +%define filter_x_b m9 1.963 +%define filter_rnd m10 1.964 +%else ; x86-32 1.965 +%if ARCH_X86=1 && CONFIG_PIC=1 1.966 +; y_offset == 0.5. We can reuse y_offset reg. 1.967 +%define tempq y_offsetq 1.968 + add x_offsetq, g_bilin_filterm 1.969 +%define filter_x_a [x_offsetq] 1.970 +%define filter_x_b [x_offsetq+16] 1.971 + mov tempq, g_pw_8m 1.972 +%define filter_rnd [tempq] 1.973 +%else 1.974 + add x_offsetq, bilin_filter 1.975 +%define filter_x_a [x_offsetq] 1.976 +%define filter_x_b [x_offsetq+16] 1.977 +%define filter_rnd [pw_8] 1.978 +%endif 1.979 +%endif 1.980 + 1.981 +%if %1 == 16 1.982 + movu m0, [srcq] 1.983 + movu m1, [srcq+1] 1.984 +%if cpuflag(ssse3) 1.985 + punpckhbw m2, m0, m1 1.986 + punpcklbw m0, m1 1.987 + pmaddubsw m2, filter_x_a 1.988 + pmaddubsw m0, filter_x_a 1.989 + paddw m2, filter_rnd 1.990 + paddw m0, filter_rnd 1.991 +%else 1.992 + punpckhbw m2, m0, m5 1.993 + punpckhbw m3, m1, m5 1.994 + punpcklbw m0, m5 1.995 + punpcklbw m1, m5 1.996 + pmullw m0, filter_x_a 1.997 + pmullw m1, filter_x_b 1.998 + paddw m0, filter_rnd 1.999 + pmullw m2, filter_x_a 1.1000 + pmullw m3, filter_x_b 1.1001 + paddw m2, filter_rnd 1.1002 + paddw m0, m1 1.1003 + paddw m2, m3 1.1004 +%endif 1.1005 + psraw m0, 4 1.1006 + psraw m2, 4 1.1007 + add srcq, src_strideq 1.1008 + packuswb m0, m2 1.1009 +.x_other_y_half_loop: 1.1010 + movu m4, [srcq] 1.1011 + movu m3, [srcq+1] 1.1012 +%if cpuflag(ssse3) 1.1013 + mova m1, [dstq] 1.1014 + punpckhbw m2, m4, m3 1.1015 + punpcklbw m4, m3 1.1016 + pmaddubsw m2, filter_x_a 1.1017 + pmaddubsw m4, filter_x_a 1.1018 + paddw m2, filter_rnd 1.1019 + paddw m4, filter_rnd 1.1020 + psraw m2, 4 1.1021 + psraw m4, 4 1.1022 + packuswb m4, m2 1.1023 + pavgb m0, m4 1.1024 
+ punpckhbw m3, m1, m5 1.1025 + punpcklbw m1, m5 1.1026 +%else 1.1027 + punpckhbw m2, m4, m5 1.1028 + punpckhbw m1, m3, m5 1.1029 + punpcklbw m4, m5 1.1030 + punpcklbw m3, m5 1.1031 + pmullw m4, filter_x_a 1.1032 + pmullw m3, filter_x_b 1.1033 + paddw m4, filter_rnd 1.1034 + pmullw m2, filter_x_a 1.1035 + pmullw m1, filter_x_b 1.1036 + paddw m2, filter_rnd 1.1037 + paddw m4, m3 1.1038 + paddw m2, m1 1.1039 + mova m1, [dstq] 1.1040 + psraw m4, 4 1.1041 + psraw m2, 4 1.1042 + punpckhbw m3, m1, m5 1.1043 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we 1.1044 + ; have a 1-register shortage to be able to store the backup of the bilin 1.1045 + ; filtered second line as words as cache for the next line. Packing into 1.1046 + ; a byte costs 1 pack and 2 unpacks, but saves a register. 1.1047 + packuswb m4, m2 1.1048 + punpcklbw m1, m5 1.1049 + pavgb m0, m4 1.1050 +%endif 1.1051 +%if %2 == 1 ; avg 1.1052 + ; FIXME(rbultje) pipeline 1.1053 + pavgb m0, [secq] 1.1054 +%endif 1.1055 + punpckhbw m2, m0, m5 1.1056 + punpcklbw m0, m5 1.1057 + SUM_SSE m0, m1, m2, m3, m6, m7 1.1058 + mova m0, m4 1.1059 + 1.1060 + add srcq, src_strideq 1.1061 + add dstq, dst_strideq 1.1062 +%else ; %1 < 16 1.1063 + movh m0, [srcq] 1.1064 + movh m1, [srcq+1] 1.1065 +%if cpuflag(ssse3) 1.1066 + punpcklbw m0, m1 1.1067 + pmaddubsw m0, filter_x_a 1.1068 + paddw m0, filter_rnd 1.1069 +%else 1.1070 + punpcklbw m0, m5 1.1071 + punpcklbw m1, m5 1.1072 + pmullw m0, filter_x_a 1.1073 + pmullw m1, filter_x_b 1.1074 + paddw m0, filter_rnd 1.1075 + paddw m0, m1 1.1076 +%endif 1.1077 + add srcq, src_strideq 1.1078 + psraw m0, 4 1.1079 +.x_other_y_half_loop: 1.1080 + movh m2, [srcq] 1.1081 + movh m1, [srcq+1] 1.1082 + movh m4, [srcq+src_strideq] 1.1083 + movh m3, [srcq+src_strideq+1] 1.1084 +%if cpuflag(ssse3) 1.1085 + punpcklbw m2, m1 1.1086 + punpcklbw m4, m3 1.1087 + pmaddubsw m2, filter_x_a 1.1088 + pmaddubsw m4, filter_x_a 1.1089 + movh m1, [dstq] 1.1090 + movh m3, [dstq+dst_strideq] 
1.1091 + paddw m2, filter_rnd 1.1092 + paddw m4, filter_rnd 1.1093 +%else 1.1094 + punpcklbw m2, m5 1.1095 + punpcklbw m1, m5 1.1096 + punpcklbw m4, m5 1.1097 + punpcklbw m3, m5 1.1098 + pmullw m2, filter_x_a 1.1099 + pmullw m1, filter_x_b 1.1100 + paddw m2, filter_rnd 1.1101 + pmullw m4, filter_x_a 1.1102 + pmullw m3, filter_x_b 1.1103 + paddw m4, filter_rnd 1.1104 + paddw m2, m1 1.1105 + movh m1, [dstq] 1.1106 + paddw m4, m3 1.1107 + movh m3, [dstq+dst_strideq] 1.1108 +%endif 1.1109 + psraw m2, 4 1.1110 + psraw m4, 4 1.1111 + pavgw m0, m2 1.1112 + pavgw m2, m4 1.1113 +%if %2 == 1 ; avg 1.1114 + ; FIXME(rbultje) pipeline - also consider going to bytes here 1.1115 + packuswb m0, m2 1.1116 + pavgb m0, [secq] 1.1117 + punpckhbw m2, m0, m5 1.1118 + punpcklbw m0, m5 1.1119 +%endif 1.1120 + punpcklbw m3, m5 1.1121 + punpcklbw m1, m5 1.1122 + SUM_SSE m0, m1, m2, m3, m6, m7 1.1123 + mova m0, m4 1.1124 + 1.1125 + lea srcq, [srcq+src_strideq*2] 1.1126 + lea dstq, [dstq+dst_strideq*2] 1.1127 +%endif 1.1128 +%if %2 == 1 ; avg 1.1129 + add secq, sec_str 1.1130 +%endif 1.1131 + dec h 1.1132 + jg .x_other_y_half_loop 1.1133 +%undef filter_x_a 1.1134 +%undef filter_x_b 1.1135 +%undef filter_rnd 1.1136 + STORE_AND_RET 1.1137 + 1.1138 +.x_nonhalf_y_nonhalf: 1.1139 +%ifdef PIC 1.1140 + lea bilin_filter, [bilin_filter_m] 1.1141 +%endif 1.1142 + shl x_offsetd, filter_idx_shift 1.1143 + shl y_offsetd, filter_idx_shift 1.1144 +%if ARCH_X86_64 && mmsize == 16 1.1145 + mova m8, [bilin_filter+x_offsetq] 1.1146 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.1147 + mova m9, [bilin_filter+x_offsetq+16] 1.1148 +%endif 1.1149 + mova m10, [bilin_filter+y_offsetq] 1.1150 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 1.1151 + mova m11, [bilin_filter+y_offsetq+16] 1.1152 +%endif 1.1153 + mova m12, [pw_8] 1.1154 +%define filter_x_a m8 1.1155 +%define filter_x_b m9 1.1156 +%define filter_y_a m10 1.1157 +%define filter_y_b m11 1.1158 +%define 
filter_rnd m12 1.1159 +%else ; x86-32 1.1160 +%if ARCH_X86=1 && CONFIG_PIC=1 1.1161 +; In this case there is NO unused register, so the src_stride register is reused 1.1162 +; as a temporary; src_stride must later be reloaded from the stack when needed. 1.1163 +%define tempq src_strideq 1.1164 + mov tempq, g_bilin_filterm 1.1165 + add x_offsetq, tempq 1.1166 + add y_offsetq, tempq 1.1167 +%define filter_x_a [x_offsetq] 1.1168 +%define filter_x_b [x_offsetq+16] 1.1169 +%define filter_y_a [y_offsetq] 1.1170 +%define filter_y_b [y_offsetq+16] 1.1171 + 1.1172 + mov tempq, g_pw_8m 1.1173 +%define filter_rnd [tempq] 1.1174 +%else 1.1175 + add x_offsetq, bilin_filter 1.1176 + add y_offsetq, bilin_filter 1.1177 +%define filter_x_a [x_offsetq] 1.1178 +%define filter_x_b [x_offsetq+16] 1.1179 +%define filter_y_a [y_offsetq] 1.1180 +%define filter_y_b [y_offsetq+16] 1.1181 +%define filter_rnd [pw_8] 1.1182 +%endif 1.1183 +%endif 1.1184 + 1.1185 + ; x_offset == bilin interpolation && y_offset == bilin interpolation 1.1186 +%if %1 == 16 1.1187 + movu m0, [srcq] 1.1188 + movu m1, [srcq+1] 1.1189 +%if cpuflag(ssse3) 1.1190 + punpckhbw m2, m0, m1 1.1191 + punpcklbw m0, m1 1.1192 + pmaddubsw m2, filter_x_a 1.1193 + pmaddubsw m0, filter_x_a 1.1194 + paddw m2, filter_rnd 1.1195 + paddw m0, filter_rnd 1.1196 +%else 1.1197 + punpckhbw m2, m0, m5 1.1198 + punpckhbw m3, m1, m5 1.1199 + punpcklbw m0, m5 1.1200 + punpcklbw m1, m5 1.1201 + pmullw m0, filter_x_a 1.1202 + pmullw m1, filter_x_b 1.1203 + paddw m0, filter_rnd 1.1204 + pmullw m2, filter_x_a 1.1205 + pmullw m3, filter_x_b 1.1206 + paddw m2, filter_rnd 1.1207 + paddw m0, m1 1.1208 + paddw m2, m3 1.1209 +%endif 1.1210 + psraw m0, 4 1.1211 + psraw m2, 4 1.1212 + 1.1213 + INC_SRC_BY_SRC_STRIDE 1.1214 + 1.1215 + packuswb m0, m2 1.1216 +.x_other_y_other_loop: 1.1217 +%if cpuflag(ssse3) 1.1218 + movu m4, [srcq] 1.1219 + movu m3, [srcq+1] 1.1220 + mova m1, [dstq] 1.1221 + punpckhbw m2, m4, m3 1.1222 + punpcklbw m4, m3 1.1223 + pmaddubsw m2, filter_x_a 1.1224 + 
pmaddubsw m4, filter_x_a 1.1225 + punpckhbw m3, m1, m5 1.1226 + paddw m2, filter_rnd 1.1227 + paddw m4, filter_rnd 1.1228 + psraw m2, 4 1.1229 + psraw m4, 4 1.1230 + packuswb m4, m2 1.1231 + punpckhbw m2, m0, m4 1.1232 + punpcklbw m0, m4 1.1233 + pmaddubsw m2, filter_y_a 1.1234 + pmaddubsw m0, filter_y_a 1.1235 + punpcklbw m1, m5 1.1236 + paddw m2, filter_rnd 1.1237 + paddw m0, filter_rnd 1.1238 + psraw m2, 4 1.1239 + psraw m0, 4 1.1240 +%else 1.1241 + movu m3, [srcq] 1.1242 + movu m4, [srcq+1] 1.1243 + punpckhbw m1, m3, m5 1.1244 + punpckhbw m2, m4, m5 1.1245 + punpcklbw m3, m5 1.1246 + punpcklbw m4, m5 1.1247 + pmullw m3, filter_x_a 1.1248 + pmullw m4, filter_x_b 1.1249 + paddw m3, filter_rnd 1.1250 + pmullw m1, filter_x_a 1.1251 + pmullw m2, filter_x_b 1.1252 + paddw m1, filter_rnd 1.1253 + paddw m3, m4 1.1254 + paddw m1, m2 1.1255 + psraw m3, 4 1.1256 + psraw m1, 4 1.1257 + packuswb m4, m3, m1 1.1258 + punpckhbw m2, m0, m5 1.1259 + punpcklbw m0, m5 1.1260 + pmullw m2, filter_y_a 1.1261 + pmullw m1, filter_y_b 1.1262 + paddw m2, filter_rnd 1.1263 + pmullw m0, filter_y_a 1.1264 + pmullw m3, filter_y_b 1.1265 + paddw m2, m1 1.1266 + mova m1, [dstq] 1.1267 + paddw m0, filter_rnd 1.1268 + psraw m2, 4 1.1269 + paddw m0, m3 1.1270 + punpckhbw m3, m1, m5 1.1271 + psraw m0, 4 1.1272 + punpcklbw m1, m5 1.1273 +%endif 1.1274 +%if %2 == 1 ; avg 1.1275 + ; FIXME(rbultje) pipeline 1.1276 + packuswb m0, m2 1.1277 + pavgb m0, [secq] 1.1278 + punpckhbw m2, m0, m5 1.1279 + punpcklbw m0, m5 1.1280 +%endif 1.1281 + SUM_SSE m0, m1, m2, m3, m6, m7 1.1282 + mova m0, m4 1.1283 + 1.1284 + INC_SRC_BY_SRC_STRIDE 1.1285 + add dstq, dst_strideq 1.1286 +%else ; %1 < 16 1.1287 + movh m0, [srcq] 1.1288 + movh m1, [srcq+1] 1.1289 +%if cpuflag(ssse3) 1.1290 + punpcklbw m0, m1 1.1291 + pmaddubsw m0, filter_x_a 1.1292 + paddw m0, filter_rnd 1.1293 +%else 1.1294 + punpcklbw m0, m5 1.1295 + punpcklbw m1, m5 1.1296 + pmullw m0, filter_x_a 1.1297 + pmullw m1, filter_x_b 1.1298 + paddw m0, filter_rnd 
1.1299 + paddw m0, m1 1.1300 +%endif 1.1301 + psraw m0, 4 1.1302 +%if cpuflag(ssse3) 1.1303 + packuswb m0, m0 1.1304 +%endif 1.1305 + 1.1306 + INC_SRC_BY_SRC_STRIDE 1.1307 + 1.1308 +.x_other_y_other_loop: 1.1309 + movh m2, [srcq] 1.1310 + movh m1, [srcq+1] 1.1311 + 1.1312 + INC_SRC_BY_SRC_STRIDE 1.1313 + movh m4, [srcq] 1.1314 + movh m3, [srcq+1] 1.1315 + 1.1316 +%if cpuflag(ssse3) 1.1317 + punpcklbw m2, m1 1.1318 + punpcklbw m4, m3 1.1319 + pmaddubsw m2, filter_x_a 1.1320 + pmaddubsw m4, filter_x_a 1.1321 + movh m3, [dstq+dst_strideq] 1.1322 + movh m1, [dstq] 1.1323 + paddw m2, filter_rnd 1.1324 + paddw m4, filter_rnd 1.1325 + psraw m2, 4 1.1326 + psraw m4, 4 1.1327 + packuswb m2, m2 1.1328 + packuswb m4, m4 1.1329 + punpcklbw m0, m2 1.1330 + punpcklbw m2, m4 1.1331 + pmaddubsw m0, filter_y_a 1.1332 + pmaddubsw m2, filter_y_a 1.1333 + punpcklbw m3, m5 1.1334 + paddw m0, filter_rnd 1.1335 + paddw m2, filter_rnd 1.1336 + psraw m0, 4 1.1337 + psraw m2, 4 1.1338 + punpcklbw m1, m5 1.1339 +%else 1.1340 + punpcklbw m2, m5 1.1341 + punpcklbw m1, m5 1.1342 + punpcklbw m4, m5 1.1343 + punpcklbw m3, m5 1.1344 + pmullw m2, filter_x_a 1.1345 + pmullw m1, filter_x_b 1.1346 + paddw m2, filter_rnd 1.1347 + pmullw m4, filter_x_a 1.1348 + pmullw m3, filter_x_b 1.1349 + paddw m4, filter_rnd 1.1350 + paddw m2, m1 1.1351 + paddw m4, m3 1.1352 + psraw m2, 4 1.1353 + psraw m4, 4 1.1354 + pmullw m0, filter_y_a 1.1355 + pmullw m3, m2, filter_y_b 1.1356 + paddw m0, filter_rnd 1.1357 + pmullw m2, filter_y_a 1.1358 + pmullw m1, m4, filter_y_b 1.1359 + paddw m2, filter_rnd 1.1360 + paddw m0, m3 1.1361 + movh m3, [dstq+dst_strideq] 1.1362 + paddw m2, m1 1.1363 + movh m1, [dstq] 1.1364 + psraw m0, 4 1.1365 + psraw m2, 4 1.1366 + punpcklbw m3, m5 1.1367 + punpcklbw m1, m5 1.1368 +%endif 1.1369 +%if %2 == 1 ; avg 1.1370 + ; FIXME(rbultje) pipeline 1.1371 + packuswb m0, m2 1.1372 + pavgb m0, [secq] 1.1373 + punpckhbw m2, m0, m5 1.1374 + punpcklbw m0, m5 1.1375 +%endif 1.1376 + SUM_SSE m0, m1, m2, 
m3, m6, m7 1.1377 + mova m0, m4 1.1378 + 1.1379 + INC_SRC_BY_SRC_STRIDE 1.1380 + lea dstq, [dstq+dst_strideq*2] 1.1381 +%endif 1.1382 +%if %2 == 1 ; avg 1.1383 + add secq, sec_str 1.1384 +%endif 1.1385 + dec h 1.1386 + jg .x_other_y_other_loop 1.1387 +%undef filter_x_a 1.1388 +%undef filter_x_b 1.1389 +%undef filter_y_a 1.1390 +%undef filter_y_b 1.1391 +%undef filter_rnd 1.1392 + STORE_AND_RET 1.1393 +%endmacro 1.1394 + 1.1395 +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical 1.1396 +; between the ssse3 and non-ssse3 versions. It may make sense to merge their 1.1397 +; code, in the sense that the ssse3 version would jump to the appropriate 1.1398 +; location in the sse/sse2 version, rather than duplicating that code in the 1.1399 +; binary. 1.1400 + 1.1401 +INIT_MMX sse 1.1402 +SUBPEL_VARIANCE 4 1.1403 +INIT_XMM sse2 1.1404 +SUBPEL_VARIANCE 8 1.1405 +SUBPEL_VARIANCE 16 1.1406 + 1.1407 +INIT_MMX ssse3 1.1408 +SUBPEL_VARIANCE 4 1.1409 +INIT_XMM ssse3 1.1410 +SUBPEL_VARIANCE 8 1.1411 +SUBPEL_VARIANCE 16 1.1412 + 1.1413 +INIT_MMX sse 1.1414 +SUBPEL_VARIANCE 4, 1 1.1415 +INIT_XMM sse2 1.1416 +SUBPEL_VARIANCE 8, 1 1.1417 +SUBPEL_VARIANCE 16, 1 1.1418 + 1.1419 +INIT_MMX ssse3 1.1420 +SUBPEL_VARIANCE 4, 1 1.1421 +INIT_XMM ssse3 1.1422 +SUBPEL_VARIANCE 8, 1 1.1423 +SUBPEL_VARIANCE 16, 1