media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Wed, 31 Dec 2014 06:09:35 +0100
changeset   0:6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
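; Each of the two tables below holds 16 bilinear filter entries; entry i
; (i = 0..15) contains the tap pair (16-i, i), so a filtered sample is
; out = ((16-i)*a + i*b + 8) >> 4. The sse2 table keeps the two taps as
; separate 8-word rows (for pmullw, 32 bytes per entry), the ssse3 table
; keeps them byte-interleaved (for pmaddubsw, 16 bytes per entry); pw_8
; above is the rounding constant.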
bilin_filter_m_sse2: times 8 dw 16
times 8 dw 0
times 8 dw 15
times 8 dw 1
times 8 dw 14
times 8 dw 2
times 8 dw 13
times 8 dw 3
times 8 dw 12
times 8 dw 4
times 8 dw 11
times 8 dw 5
times 8 dw 10
times 8 dw 6
times 8 dw 9
times 8 dw 7
times 16 dw 8
times 8 dw 7
times 8 dw 9
times 8 dw 6
times 8 dw 10
times 8 dw 5
times 8 dw 11
times 8 dw 4
times 8 dw 12
times 8 dw 3
times 8 dw 13
times 8 dw 2
times 8 dw 14
times 8 dw 1
times 8 dw 15

bilin_filter_m_ssse3: times 8 db 16, 0
times 8 db 15, 1
times 8 db 14, 2
times 8 db 13, 3
times 8 db 12, 4
times 8 db 11, 5
times 8 db 10, 6
times 8 db 9, 7
times 16 db 8
times 8 db 7, 9
times 8 db 6, 10
times 8 db 5, 11
times 8 db 4, 12
times 8 db 3, 13
times 8 db 2, 14
times 8 db 1, 15

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the memory pointed to by sse.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
psubw %3, %4
psubw %1, %2
paddw %5, %3
pmaddwd %3, %3
paddw %5, %1
pmaddwd %1, %1
paddd %6, %3
paddd %6, %1
%endmacro
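;
; SUM_SSE consumes two groups of eight zero-extended pixels at a time: it
; forms the word differences src-dst, adds them into the packed-word sum
; accumulator, and adds their squares (via pmaddwd) into the dword sse
; accumulator. Conceptually, per 8-lane group (illustrative C only; the asm
; keeps the sum as packed words and reduces it later in STORE_AND_RET):
;
;   static void sum_sse_ref(const int16_t *s, const int16_t *d,
;                           int *sum, unsigned int *sse) {
;     for (int i = 0; i < 8; i++) {
;       const int diff = s[i] - d[i];
;       *sum += diff;
;       *sse += diff * diff;
;     }
;   }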

%macro STORE_AND_RET 0
%if mmsize == 16
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
; and outputting to a dword.
pcmpgtw m5, m6 ; mask for 0 > x
movhlps m3, m7
punpcklwd m4, m6, m5
punpckhwd m6, m5 ; sign-extend m6 word->dword
paddd m7, m3
paddd m6, m4
pshufd m3, m7, 0x1
movhlps m4, m6
paddd m7, m3
paddd m6, m4
mov r1, ssem ; r1 = unsigned int *sse
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
movd rax, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
movd rax, m6 ; store sum as return value
%endif
RET
%endmacro
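;
; The pcmpgtw/punpck pairs above are a manual sign extension of the packed
; word sums before the horizontal dword adds: comparing 0 > x produces
; 0xFFFF for negative words, which, interleaved in as the high half, yields
; the correct 32-bit two's-complement value. Per word (illustrative C):
;
;   static int32_t sign_extend_w(uint16_t w) {
;     const uint32_t hi = (int16_t)w < 0 ? 0xFFFFu : 0x0000u;  /* pcmpgtw   */
;     return (int32_t)((hi << 16) | w);                        /* punpcklwd */
;   }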

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
add srcq, src_stridemp
%else
add srcq, src_strideq
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse, g_bilin_filter, g_pw_8
%define h dword heightm
%define sec_str sec_stridemp

; Store the bilin_filter and pw_8 locations on the stack
GET_GOT eax
add esp, 4 ; restore esp

lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx

lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx

LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse, \
g_bilin_filter, g_pw_8
%define h heightd

; Store the bilin_filter and pw_8 locations on the stack
GET_GOT eax
add esp, 4 ; restore esp

lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx

lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx

LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
y_offset, dst, dst_stride, height, sse
%define h heightd
%endif

%define bilin_filter bilin_filter_m
%endif
%endif

ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
sar h, 1
%if %2 == 1 ; avg
shl sec_str, 1
%endif
%endif

; FIXME(rbultje) replace by jumptable?
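;
; What follows is a 3x3 dispatch on (x_offset, y_offset): each offset is
; either 0 (no filtering in that direction), 8 (exact half-pel, done with
; pavgb) or anything else (full bilinear filter), giving nine specialized
; loops. A sketch of the case selection (illustrative C only):
;
;   enum filter_kind { NONE, HALF, BILIN };
;   static enum filter_kind kind(int offset) {
;     return offset == 0 ? NONE : (offset == 8 ? HALF : BILIN);
;   }
;   /* the nine loops below correspond to kind(x_offset) x kind(y_offset) */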
test x_offsetd, x_offsetd
jnz .x_nonzero
; x_offset == 0
test y_offsetd, y_offsetd
jnz .x_zero_y_nonzero

; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_zero_loop
STORE_AND_RET

.x_zero_y_nonzero:
cmp y_offsetd, 8
jne .x_zero_y_nonhalf

; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
movh m1, [srcq+src_strideq*2]
punpckldq m2, m1
%else
punpckldq m2, [srcq+src_strideq*2]
%endif
%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
%else ; mmsize == 8
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_half_loop
STORE_AND_RET

.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+src_strideq]
mova m1, [dstq]
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
; instructions is the same (5), but it is 1 mul instead of 2, so might be
; slightly faster because of pmullw latency. It would also cut our rodata
; tables in half for this function, and save 1-2 registers on x86-64.
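; As a quick numeric check of that identity, with num=16, x=5, in1=10,
; in2=250 and rnd=8:
;   ((16-5)*10 + 5*250 + 8) >> 4 = 1368 >> 4 = 85
;   10 + ((5*(250-10) + 8) >> 4) = 10 + (1208 >> 4) = 10 + 75 = 85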
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m4, filter_y_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m4, filter_y_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET

.x_nonzero:
cmp x_offsetd, 8
jne .x_nonhalf
; x_offset == 0.5
test y_offsetd, y_offsetd
jnz .x_half_y_nonzero

; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
punpckldq m4, [srcq+src_strideq+1]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
pavgb m2, m4
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_zero_loop
STORE_AND_RET

.x_half_y_nonzero:
cmp y_offsetd, 8
jne .x_half_y_nonhalf

; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
punpckhbw m3, m1, m5
pavgb m0, m4
%if %2 == 1 ; avg
punpcklbw m1, m5
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_half_loop:
movh m2, [srcq]
movh m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
%if %1 == 4
movh m1, [srcq+src_strideq]
punpckldq m2, m1
movh m1, [srcq+src_strideq+1]
punpckldq m3, m1
%else
punpckldq m2, [srcq+src_strideq]
punpckldq m3, [srcq+src_strideq+1]
%endif
%endif
pavgb m2, m3
%if mmsize == 16
movlhps m0, m2
movhlps m4, m2
%else ; mmsize == 8
punpckldq m0, m2
pshufw m4, m2, 0xe
%endif
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
pavgb m2, m3
pavgb m4, m1
pavgb m0, m2
pavgb m2, m4
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_half_loop
STORE_AND_RET

.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
.x_half_y_other_loop:
movu m4, [srcq]
movu m2, [srcq+1]
mova m1, [dstq]
pavgb m4, m2
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
pmullw m2, filter_y_a
pmullw m3, filter_y_b
paddw m2, filter_rnd
punpcklbw m0, m5
paddw m2, m3
punpcklbw m3, m4, m5
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
%endif
punpckhbw m3, m1, m5
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
add srcq, src_strideq
pavgb m0, m3
%if notcpuflag(ssse3)
punpcklbw m0, m5
%endif
.x_half_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
pavgb m2, m1
pavgb m4, m3
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
movh m1, [dstq]
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_y_a
pmullw m1, m2, filter_y_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_y_a
paddw m0, m1
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m2, m1
movh m1, [dstq]
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET

.x_nonhalf:
test y_offsetd, y_offsetd
jnz .x_nonhalf_y_nonzero

; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
movu m4, [srcq+1]
mova m1, [dstq]
%if cpuflag(ssse3)
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m4, m5
punpcklbw m0, m5
punpcklbw m4, m5
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
pmullw m0, filter_x_a
pmullw m4, filter_x_b
paddw m0, filter_rnd
paddw m2, m3
paddw m0, m4
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
movh m2, [srcq+src_strideq]
movh m4, [srcq+src_strideq+1]
movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
punpcklbw m0, m1
movh m1, [dstq]
punpcklbw m2, m4
pmaddubsw m0, filter_x_a
pmaddubsw m2, filter_x_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m4, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
punpcklbw m3, m5
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m4, filter_x_b
paddw m0, m1
paddw m2, filter_rnd
movh m1, [dstq]
paddw m2, m4
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET

.x_nonhalf_y_nonzero:
cmp y_offsetd, 8
jne .x_nonhalf_y_nonhalf

; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4
add srcq, src_strideq
packuswb m0, m2
.x_other_y_half_loop:
movu m4, [srcq]
movu m3, [srcq+1]
%if cpuflag(ssse3)
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
pavgb m0, m4
punpckhbw m3, m1, m5
punpcklbw m1, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
paddw m4, m3
paddw m2, m1
mova m1, [dstq]
psraw m4, 4
psraw m2, 4
punpckhbw m3, m1, m5
; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
; have a 1-register shortage to be able to store the backup of the bilin
; filtered second line as words as cache for the next line. Packing into
; a byte costs 1 pack and 2 unpacks, but saves a register.
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
pavgb m0, [secq]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

add srcq, src_strideq
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
add srcq, src_strideq
psraw m0, 4
.x_other_y_half_loop:
movh m2, [srcq]
movh m1, [srcq+1]
movh m4, [srcq+src_strideq]
movh m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
paddw m2, filter_rnd
paddw m4, filter_rnd
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
movh m1, [dstq]
paddw m4, m3
movh m3, [dstq+dst_strideq]
%endif
psraw m2, 4
psraw m4, 4
pavgw m0, m2
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
STORE_AND_RET

.x_nonhalf_y_nonhalf:
%ifdef PIC
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
%endif
mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m11, [bilin_filter+y_offsetq+16]
%endif
mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register left, so we borrow the src_stride
; register; src_stride is reloaded from the stack later, when it is needed.
%define tempq src_strideq
mov tempq, g_bilin_filterm
add x_offsetq, tempq
add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
%if cpuflag(ssse3)
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, filter_x_a
pmaddubsw m0, filter_x_a
paddw m2, filter_rnd
paddw m0, filter_rnd
%else
punpckhbw m2, m0, m5
punpckhbw m3, m1, m5
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
pmullw m2, filter_x_a
pmullw m3, filter_x_b
paddw m2, filter_rnd
paddw m0, m1
paddw m2, m3
%endif
psraw m0, 4
psraw m2, 4

INC_SRC_BY_SRC_STRIDE

packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
movu m4, [srcq]
movu m3, [srcq+1]
mova m1, [dstq]
punpckhbw m2, m4, m3
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
punpckhbw m3, m1, m5
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m4, m2
punpckhbw m2, m0, m4
punpcklbw m0, m4
pmaddubsw m2, filter_y_a
pmaddubsw m0, filter_y_a
punpcklbw m1, m5
paddw m2, filter_rnd
paddw m0, filter_rnd
psraw m2, 4
psraw m0, 4
%else
movu m3, [srcq]
movu m4, [srcq+1]
punpckhbw m1, m3, m5
punpckhbw m2, m4, m5
punpcklbw m3, m5
punpcklbw m4, m5
pmullw m3, filter_x_a
pmullw m4, filter_x_b
paddw m3, filter_rnd
pmullw m1, filter_x_a
pmullw m2, filter_x_b
paddw m1, filter_rnd
paddw m3, m4
paddw m1, m2
psraw m3, 4
psraw m1, 4
packuswb m4, m3, m1
punpckhbw m2, m0, m5
punpcklbw m0, m5
pmullw m2, filter_y_a
pmullw m1, filter_y_b
paddw m2, filter_rnd
pmullw m0, filter_y_a
pmullw m3, filter_y_b
paddw m2, m1
mova m1, [dstq]
paddw m0, filter_rnd
psraw m2, 4
paddw m0, m3
punpckhbw m3, m1, m5
psraw m0, 4
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
%if cpuflag(ssse3)
punpcklbw m0, m1
pmaddubsw m0, filter_x_a
paddw m0, filter_rnd
%else
punpcklbw m0, m5
punpcklbw m1, m5
pmullw m0, filter_x_a
pmullw m1, filter_x_b
paddw m0, filter_rnd
paddw m0, m1
%endif
psraw m0, 4
%if cpuflag(ssse3)
packuswb m0, m0
%endif

INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]

INC_SRC_BY_SRC_STRIDE
movh m4, [srcq]
movh m3, [srcq+1]

%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
pmaddubsw m2, filter_x_a
pmaddubsw m4, filter_x_a
movh m3, [dstq+dst_strideq]
movh m1, [dstq]
paddw m2, filter_rnd
paddw m4, filter_rnd
psraw m2, 4
psraw m4, 4
packuswb m2, m2
packuswb m4, m4
punpcklbw m0, m2
punpcklbw m2, m4
pmaddubsw m0, filter_y_a
pmaddubsw m2, filter_y_a
punpcklbw m3, m5
paddw m0, filter_rnd
paddw m2, filter_rnd
psraw m0, 4
psraw m2, 4
punpcklbw m1, m5
%else
punpcklbw m2, m5
punpcklbw m1, m5
punpcklbw m4, m5
punpcklbw m3, m5
pmullw m2, filter_x_a
pmullw m1, filter_x_b
paddw m2, filter_rnd
pmullw m4, filter_x_a
pmullw m3, filter_x_b
paddw m4, filter_rnd
paddw m2, m1
paddw m4, m3
psraw m2, 4
psraw m4, 4
pmullw m0, filter_y_a
pmullw m3, m2, filter_y_b
paddw m0, filter_rnd
pmullw m2, filter_y_a
pmullw m1, m4, filter_y_b
paddw m2, filter_rnd
paddw m0, m3
movh m3, [dstq+dst_strideq]
paddw m2, m1
movh m1, [dstq]
psraw m0, 4
psraw m2, 4
punpcklbw m3, m5
punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4

INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.

INIT_MMX sse
SUBPEL_VARIANCE 4
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
