media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1420 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "third_party/x86inc/x86inc.asm"
    1.15 +
    1.16 +SECTION_RODATA
    1.17 +pw_8: times  8 dw  8                   ; rounding term (used as filter_rnd) added before the >>4 in the bilinear paths
    1.18 +bilin_filter_m_sse2: times  8 dw 16    ; SSE2 table: entry x = 8 words of (16-x) followed by 8 words of x
    1.19 +                     times  8 dw  0    ; each entry is 32 bytes, matching filter_idx_shift == 5
    1.20 +                     times  8 dw 15
    1.21 +                     times  8 dw  1
    1.22 +                     times  8 dw 14
    1.23 +                     times  8 dw  2
    1.24 +                     times  8 dw 13
    1.25 +                     times  8 dw  3
    1.26 +                     times  8 dw 12
    1.27 +                     times  8 dw  4
    1.28 +                     times  8 dw 11
    1.29 +                     times  8 dw  5
    1.30 +                     times  8 dw 10
    1.31 +                     times  8 dw  6
    1.32 +                     times  8 dw  9
    1.33 +                     times  8 dw  7
    1.34 +                     times 16 dw  8    ; entry x == 8: both taps are 8, so one 16-word run covers the pair
    1.35 +                     times  8 dw  7
    1.36 +                     times  8 dw  9
    1.37 +                     times  8 dw  6
    1.38 +                     times  8 dw 10
    1.39 +                     times  8 dw  5
    1.40 +                     times  8 dw 11
    1.41 +                     times  8 dw  4
    1.42 +                     times  8 dw 12
    1.43 +                     times  8 dw  3
    1.44 +                     times  8 dw 13
    1.45 +                     times  8 dw  2
    1.46 +                     times  8 dw 14
    1.47 +                     times  8 dw  1
    1.48 +                     times  8 dw 15
    1.49 +
    1.50 +bilin_filter_m_ssse3: times  8 db 16,  0   ; SSSE3 table: entry x = 8 interleaved byte pairs (16-x, x) for pmaddubsw
    1.51 +                      times  8 db 15,  1   ; each entry is 16 bytes, matching filter_idx_shift == 4
    1.52 +                      times  8 db 14,  2
    1.53 +                      times  8 db 13,  3
    1.54 +                      times  8 db 12,  4
    1.55 +                      times  8 db 11,  5
    1.56 +                      times  8 db 10,  6
    1.57 +                      times  8 db  9,  7
    1.58 +                      times 16 db  8       ; entry x == 8: both taps are 8
    1.59 +                      times  8 db  7,  9
    1.60 +                      times  8 db  6, 10
    1.61 +                      times  8 db  5, 11
    1.62 +                      times  8 db  4, 12
    1.63 +                      times  8 db  3, 13
    1.64 +                      times  8 db  2, 14
    1.65 +                      times  8 db  1, 15
    1.66 +
    1.67 +SECTION .text
    1.68 +
    1.69 +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
    1.70 +;                               int x_offset, int y_offset,
    1.71 +;                               const uint8_t *dst, ptrdiff_t dst_stride,
    1.72 +;                               int height, unsigned int *sse);
    1.73 +;
    1.74 +; This function returns the SE and stores SSE in the given pointer.
    1.75 +
    1.76 +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse  (accumulates into sum/sse; overwrites src1 and src2)
    1.77 +  psubw                %3, %4           ; src2 = src2 - dst2 (packed words)
    1.78 +  psubw                %1, %2           ; src1 = src1 - dst1 (packed words)
    1.79 +  paddw                %5, %3           ; sum += diff2, still as packed words
    1.80 +  pmaddwd              %3, %3           ; square diff2; adjacent pairs are added into dwords
    1.81 +  paddw                %5, %1           ; sum += diff1 (must happen before diff1 is squared)
    1.82 +  pmaddwd              %1, %1           ; square diff1; adjacent pairs are added into dwords
    1.83 +  paddd                %6, %3           ; sse += squared diff2 (packed dwords)
    1.84 +  paddd                %6, %1           ; sse += squared diff1 (packed dwords)
    1.85 +%endmacro
    1.86 +
    1.87 +%macro STORE_AND_RET 0                  ; reduce m6 (word sums) and m7 (dword sse), store sse, return sum
    1.88 +%if mmsize == 16
    1.89 +  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
    1.90 +  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
    1.91 +  ; We have to sign-extend it before adding the words within the register
    1.92 +  ; and outputting to a dword.
    1.93 +  pcmpgtw              m5, m6           ; mask for 0 > x (m5 is the zero register on entry and is consumed here)
    1.94 +  movhlps              m3, m7           ; m3.lo = upper two sse dwords
    1.95 +  punpcklwd            m4, m6, m5       ; m4 = low four sums, sign-extended word->dword
   1.96 +  punpckhwd            m6, m5           ; sign-extend m6 word->dword (high four sums)
   1.97 +  paddd                m7, m3           ; sse: fold upper half onto lower half
   1.98 +  paddd                m6, m4           ; sum: eight values reduced to four dwords
   1.99 +  pshufd               m3, m7, 0x1      ; m3[0] = sse dword 1
   1.100 +  movhlps              m4, m6           ; m4.lo = upper two sum dwords
   1.101 +  paddd                m7, m3           ; m7[0] = total sse
   1.102 +  paddd                m6, m4           ; sum: four dwords reduced to two
   1.103 +  mov                  r1, ssem         ; r1 = unsigned int *sse
   1.104 +  pshufd               m4, m6, 0x1      ; m4[0] = sum dword 1
   1.105 +  movd               [r1], m7           ; store sse
   1.106 +  paddd                m6, m4           ; m6[0] = total sum
   1.107 +  movd               raxd, m6           ; store sum as return value (32-bit int: only the low dword is valid)
   1.108 +%else ; mmsize == 8
   1.109 +  pshufw               m4, m6, 0xe      ; m4.lo = upper two sum words
   1.110 +  pshufw               m3, m7, 0xe      ; m3.lo = upper sse dword
   1.111 +  paddw                m6, m4           ; sum: four words reduced to two
   1.112 +  paddd                m7, m3           ; m7[0] = total sse
   1.113 +  pcmpgtw              m5, m6           ; mask for 0 > x
   1.114 +  mov                  r1, ssem         ; r1 = unsigned int *sse
   1.115 +  punpcklwd            m6, m5           ; sign-extend m6 word->dword
   1.116 +  movd               [r1], m7           ; store sse
   1.117 +  pshufw               m4, m6, 0xe      ; m4.lo = upper sum dword
   1.118 +  paddd                m6, m4           ; m6[0] = total sum
   1.119 +  movd               raxd, m6           ; store sum as return value (32-bit int: only the low dword is valid)
   1.120 +%endif
   1.121 +  RET
   1.122 +%endmacro
   1.123 +
   1.124 +%macro INC_SRC_BY_SRC_STRIDE  0         ; advance srcq by one source row
   1.125 +%if ARCH_X86=1 && CONFIG_PIC=1          ; 32-bit PIC build
   1.126 +  add                srcq, src_stridemp ; stride comes from its memory operand here (presumably no register is kept for it in this configuration - confirm)
   1.127 +%else
   1.128 +  add                srcq, src_strideq  ; stride is available in a register
   1.129 +%endif
   1.130 +%endmacro
   1.131 +
   1.132 +%macro SUBPEL_VARIANCE 1-2 0 ; W
   1.133 +%if cpuflag(ssse3)
   1.134 +%define bilin_filter_m bilin_filter_m_ssse3
   1.135 +%define filter_idx_shift 4
   1.136 +%else
   1.137 +%define bilin_filter_m bilin_filter_m_sse2
   1.138 +%define filter_idx_shift 5
   1.139 +%endif
   1.140 +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
   1.141 +; 11, not 13, if the registers are ordered correctly. May make a minor speed
   1.142 +; difference on Win64
   1.143 +
   1.144 +%ifdef PIC    ; 64bit PIC
   1.145 +  %if %2 == 1 ; avg
   1.146 +    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
   1.147 +                                      x_offset, y_offset, \
   1.148 +                                      dst, dst_stride, \
   1.149 +                                      sec, sec_stride, height, sse
   1.150 +    %define sec_str sec_strideq
   1.151 +  %else
   1.152 +    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
   1.153 +                                  y_offset, dst, dst_stride, height, sse
   1.154 +  %endif
   1.155 +  %define h heightd
   1.156 +  %define bilin_filter sseq
   1.157 +%else
   1.158 +  %if ARCH_X86=1 && CONFIG_PIC=1
   1.159 +    %if %2 == 1 ; avg
   1.160 +      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
   1.161 +                                  x_offset, y_offset, \
   1.162 +                                  dst, dst_stride, \
   1.163 +                                  sec, sec_stride, \
   1.164 +                                  height, sse, g_bilin_filter, g_pw_8
   1.165 +      %define h dword heightm
   1.166 +      %define sec_str sec_stridemp
   1.167 +
   1.168 +      ;Store bilin_filter and pw_8 location in stack
   1.169 +      GET_GOT eax
   1.170 +      add esp, 4                ; restore esp
   1.171 +
   1.172 +      lea ecx, [GLOBAL(bilin_filter_m)]
   1.173 +      mov g_bilin_filterm, ecx
   1.174 +
   1.175 +      lea ecx, [GLOBAL(pw_8)]
   1.176 +      mov g_pw_8m, ecx
   1.177 +
   1.178 +      LOAD_IF_USED 0, 1         ; load eax, ecx back
   1.179 +    %else
   1.180 +      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
   1.181 +                                y_offset, dst, dst_stride, height, sse, \
   1.182 +                                g_bilin_filter, g_pw_8
   1.183 +      %define h heightd
   1.184 +
   1.185 +      ;Store bilin_filter and pw_8 location in stack
   1.186 +      GET_GOT eax
   1.187 +      add esp, 4                ; restore esp
   1.188 +
   1.189 +      lea ecx, [GLOBAL(bilin_filter_m)]
   1.190 +      mov g_bilin_filterm, ecx
   1.191 +
   1.192 +      lea ecx, [GLOBAL(pw_8)]
   1.193 +      mov g_pw_8m, ecx
   1.194 +
   1.195 +      LOAD_IF_USED 0, 1         ; load eax, ecx back
   1.196 +    %endif
   1.197 +  %else
   1.198 +    %if %2 == 1 ; avg
   1.199 +      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
   1.200 +                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
   1.201 +                                             x_offset, y_offset, \
   1.202 +                                             dst, dst_stride, \
   1.203 +                                             sec, sec_stride, \
   1.204 +                                             height, sse
   1.205 +      %if ARCH_X86_64
   1.206 +      %define h heightd
   1.207 +      %define sec_str sec_strideq
   1.208 +      %else
   1.209 +      %define h dword heightm
   1.210 +      %define sec_str sec_stridemp
   1.211 +      %endif
   1.212 +    %else
   1.213 +      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
   1.214 +                              y_offset, dst, dst_stride, height, sse
   1.215 +      %define h heightd
   1.216 +    %endif
   1.217 +
   1.218 +    %define bilin_filter bilin_filter_m
   1.219 +  %endif
   1.220 +%endif
   1.221 +
   1.222 +  ASSERT               %1 <= 16         ; m6 overflows if w > 16
   1.223 +  pxor                 m6, m6           ; sum
   1.224 +  pxor                 m7, m7           ; sse
   1.225 +  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
   1.226 +  ; could perhaps use it for something more productive then
   1.227 +  pxor                 m5, m5           ; dedicated zero register
   1.228 +%if %1 < 16
   1.229 +  sar                   h, 1
   1.230 +%if %2 == 1 ; avg
   1.231 +  shl             sec_str, 1
   1.232 +%endif
   1.233 +%endif
   1.234 +
   1.235 +  ; FIXME(rbultje) replace by jumptable?
   1.236 +  test          x_offsetd, x_offsetd
   1.237 +  jnz .x_nonzero
   1.238 +  ; x_offset == 0
   1.239 +  test          y_offsetd, y_offsetd
   1.240 +  jnz .x_zero_y_nonzero
   1.241 +
   1.242 +  ; x_offset == 0 && y_offset == 0
   1.243 +.x_zero_y_zero_loop:
   1.244 +%if %1 == 16
   1.245 +  movu                 m0, [srcq]
   1.246 +  mova                 m1, [dstq]
   1.247 +%if %2 == 1 ; avg
   1.248 +  pavgb                m0, [secq]
   1.249 +  punpckhbw            m3, m1, m5
   1.250 +  punpcklbw            m1, m5
   1.251 +%endif
   1.252 +  punpckhbw            m2, m0, m5
   1.253 +  punpcklbw            m0, m5
   1.254 +%if %2 == 0 ; !avg
   1.255 +  punpckhbw            m3, m1, m5
   1.256 +  punpcklbw            m1, m5
   1.257 +%endif
   1.258 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.259 +
   1.260 +  add                srcq, src_strideq
   1.261 +  add                dstq, dst_strideq
   1.262 +%else ; %1 < 16
   1.263 +  movh                 m0, [srcq]
   1.264 +%if %2 == 1 ; avg
   1.265 +%if mmsize == 16
   1.266 +  movhps               m0, [srcq+src_strideq]
   1.267 +%else ; mmsize == 8
   1.268 +  punpckldq            m0, [srcq+src_strideq]
   1.269 +%endif
   1.270 +%else ; !avg
   1.271 +  movh                 m2, [srcq+src_strideq]
   1.272 +%endif
   1.273 +  movh                 m1, [dstq]
   1.274 +  movh                 m3, [dstq+dst_strideq]
   1.275 +%if %2 == 1 ; avg
   1.276 +  pavgb                m0, [secq]
   1.277 +  punpcklbw            m3, m5
   1.278 +  punpcklbw            m1, m5
   1.279 +  punpckhbw            m2, m0, m5
   1.280 +  punpcklbw            m0, m5
   1.281 +%else ; !avg
   1.282 +  punpcklbw            m0, m5
   1.283 +  punpcklbw            m2, m5
   1.284 +  punpcklbw            m3, m5
   1.285 +  punpcklbw            m1, m5
   1.286 +%endif
   1.287 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.288 +
   1.289 +  lea                srcq, [srcq+src_strideq*2]
   1.290 +  lea                dstq, [dstq+dst_strideq*2]
   1.291 +%endif
   1.292 +%if %2 == 1 ; avg
   1.293 +  add                secq, sec_str
   1.294 +%endif
   1.295 +  dec                   h
   1.296 +  jg .x_zero_y_zero_loop
   1.297 +  STORE_AND_RET
   1.298 +
   1.299 +.x_zero_y_nonzero:
   1.300 +  cmp           y_offsetd, 8
   1.301 +  jne .x_zero_y_nonhalf
   1.302 +
   1.303 +  ; x_offset == 0 && y_offset == 0.5
   1.304 +.x_zero_y_half_loop:
   1.305 +%if %1 == 16
   1.306 +  movu                 m0, [srcq]
   1.307 +  movu                 m4, [srcq+src_strideq]
   1.308 +  mova                 m1, [dstq]
   1.309 +  pavgb                m0, m4
   1.310 +  punpckhbw            m3, m1, m5
   1.311 +%if %2 == 1 ; avg
   1.312 +  pavgb                m0, [secq]
   1.313 +%endif
   1.314 +  punpcklbw            m1, m5
   1.315 +  punpckhbw            m2, m0, m5
   1.316 +  punpcklbw            m0, m5
   1.317 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.318 +
   1.319 +  add                srcq, src_strideq
   1.320 +  add                dstq, dst_strideq
   1.321 +%else ; %1 < 16
   1.322 +  movh                 m0, [srcq]
   1.323 +  movh                 m2, [srcq+src_strideq]
   1.324 +%if %2 == 1 ; avg
   1.325 +%if mmsize == 16
   1.326 +  movhps               m2, [srcq+src_strideq*2]
   1.327 +%else ; mmsize == 8
   1.328 +%if %1 == 4
   1.329 +  movh                 m1, [srcq+src_strideq*2]
   1.330 +  punpckldq            m2, m1
   1.331 +%else
   1.332 +  punpckldq            m2, [srcq+src_strideq*2]
   1.333 +%endif
   1.334 +%endif
   1.335 +  movh                 m1, [dstq]
   1.336 +%if mmsize == 16
   1.337 +  movlhps              m0, m2
   1.338 +%else ; mmsize == 8
   1.339 +  punpckldq            m0, m2
   1.340 +%endif
   1.341 +  movh                 m3, [dstq+dst_strideq]
   1.342 +  pavgb                m0, m2
   1.343 +  punpcklbw            m1, m5
   1.344 +  pavgb                m0, [secq]
   1.345 +  punpcklbw            m3, m5
   1.346 +  punpckhbw            m2, m0, m5
   1.347 +  punpcklbw            m0, m5
   1.348 +%else ; !avg
   1.349 +  movh                 m4, [srcq+src_strideq*2]
   1.350 +  movh                 m1, [dstq]
   1.351 +  pavgb                m0, m2
   1.352 +  movh                 m3, [dstq+dst_strideq]
   1.353 +  pavgb                m2, m4
   1.354 +  punpcklbw            m0, m5
   1.355 +  punpcklbw            m2, m5
   1.356 +  punpcklbw            m3, m5
   1.357 +  punpcklbw            m1, m5
   1.358 +%endif
   1.359 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.360 +
   1.361 +  lea                srcq, [srcq+src_strideq*2]
   1.362 +  lea                dstq, [dstq+dst_strideq*2]
   1.363 +%endif
   1.364 +%if %2 == 1 ; avg
   1.365 +  add                secq, sec_str
   1.366 +%endif
   1.367 +  dec                   h
   1.368 +  jg .x_zero_y_half_loop
   1.369 +  STORE_AND_RET
   1.370 +
   1.371 +.x_zero_y_nonhalf:
   1.372 +  ; x_offset == 0 && y_offset == bilin interpolation
   1.373 +%ifdef PIC
   1.374 +  lea        bilin_filter, [bilin_filter_m]
   1.375 +%endif
   1.376 +  shl           y_offsetd, filter_idx_shift
   1.377 +%if ARCH_X86_64 && mmsize == 16
   1.378 +  mova                 m8, [bilin_filter+y_offsetq]
   1.379 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1.380 +  mova                 m9, [bilin_filter+y_offsetq+16]
   1.381 +%endif
   1.382 +  mova                m10, [pw_8]
   1.383 +%define filter_y_a m8
   1.384 +%define filter_y_b m9
   1.385 +%define filter_rnd m10
   1.386 +%else ; x86-32 or mmx
   1.387 +%if ARCH_X86=1 && CONFIG_PIC=1
   1.388 +; x_offset == 0, reuse x_offset reg
   1.389 +%define tempq x_offsetq
   1.390 +  add y_offsetq, g_bilin_filterm
   1.391 +%define filter_y_a [y_offsetq]
   1.392 +%define filter_y_b [y_offsetq+16]
   1.393 +  mov tempq, g_pw_8m
   1.394 +%define filter_rnd [tempq]
   1.395 +%else
   1.396 +  add           y_offsetq, bilin_filter
   1.397 +%define filter_y_a [y_offsetq]
   1.398 +%define filter_y_b [y_offsetq+16]
   1.399 +%define filter_rnd [pw_8]
   1.400 +%endif
   1.401 +%endif
   1.402 +
   1.403 +.x_zero_y_other_loop:
   1.404 +%if %1 == 16
   1.405 +  movu                 m0, [srcq]
   1.406 +  movu                 m4, [srcq+src_strideq]
   1.407 +  mova                 m1, [dstq]
   1.408 +%if cpuflag(ssse3)
   1.409 +  punpckhbw            m2, m0, m4
   1.410 +  punpcklbw            m0, m4
   1.411 +  pmaddubsw            m2, filter_y_a
   1.412 +  pmaddubsw            m0, filter_y_a
   1.413 +  paddw                m2, filter_rnd
   1.414 +  paddw                m0, filter_rnd
   1.415 +%else
   1.416 +  punpckhbw            m2, m0, m5
   1.417 +  punpckhbw            m3, m4, m5
   1.418 +  punpcklbw            m0, m5
   1.419 +  punpcklbw            m4, m5
   1.420 +  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
   1.421 +  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
   1.422 +  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
   1.423 +  ; slightly faster because of pmullw latency. It would also cut our rodata
   1.424 +  ; tables in half for this function, and save 1-2 registers on x86-64.
   1.425 +  pmullw               m2, filter_y_a
   1.426 +  pmullw               m3, filter_y_b
   1.427 +  paddw                m2, filter_rnd
   1.428 +  pmullw               m0, filter_y_a
   1.429 +  pmullw               m4, filter_y_b
   1.430 +  paddw                m0, filter_rnd
   1.431 +  paddw                m2, m3
   1.432 +  paddw                m0, m4
   1.433 +%endif
   1.434 +  psraw                m2, 4
   1.435 +  psraw                m0, 4
   1.436 +%if %2 == 1 ; avg
   1.437 +  ; FIXME(rbultje) pipeline
   1.438 +  packuswb             m0, m2
   1.439 +  pavgb                m0, [secq]
   1.440 +  punpckhbw            m2, m0, m5
   1.441 +  punpcklbw            m0, m5
   1.442 +%endif
   1.443 +  punpckhbw            m3, m1, m5
   1.444 +  punpcklbw            m1, m5
   1.445 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.446 +
   1.447 +  add                srcq, src_strideq
   1.448 +  add                dstq, dst_strideq
   1.449 +%else ; %1 < 16
   1.450 +  movh                 m0, [srcq]
   1.451 +  movh                 m2, [srcq+src_strideq]
   1.452 +  movh                 m4, [srcq+src_strideq*2]
   1.453 +  movh                 m3, [dstq+dst_strideq]
   1.454 +%if cpuflag(ssse3)
   1.455 +  movh                 m1, [dstq]
   1.456 +  punpcklbw            m0, m2
   1.457 +  punpcklbw            m2, m4
   1.458 +  pmaddubsw            m0, filter_y_a
   1.459 +  pmaddubsw            m2, filter_y_a
   1.460 +  punpcklbw            m3, m5
   1.461 +  paddw                m2, filter_rnd
   1.462 +  paddw                m0, filter_rnd
   1.463 +%else
   1.464 +  punpcklbw            m0, m5
   1.465 +  punpcklbw            m2, m5
   1.466 +  punpcklbw            m4, m5
   1.467 +  pmullw               m0, filter_y_a
   1.468 +  pmullw               m1, m2, filter_y_b
   1.469 +  punpcklbw            m3, m5
   1.470 +  paddw                m0, filter_rnd
   1.471 +  pmullw               m2, filter_y_a
   1.472 +  pmullw               m4, filter_y_b
   1.473 +  paddw                m0, m1
   1.474 +  paddw                m2, filter_rnd
   1.475 +  movh                 m1, [dstq]
   1.476 +  paddw                m2, m4
   1.477 +%endif
   1.478 +  psraw                m0, 4
   1.479 +  psraw                m2, 4
   1.480 +%if %2 == 1 ; avg
   1.481 +  ; FIXME(rbultje) pipeline
   1.482 +  packuswb             m0, m2
   1.483 +  pavgb                m0, [secq]
   1.484 +  punpckhbw            m2, m0, m5
   1.485 +  punpcklbw            m0, m5
   1.486 +%endif
   1.487 +  punpcklbw            m1, m5
   1.488 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.489 +
   1.490 +  lea                srcq, [srcq+src_strideq*2]
   1.491 +  lea                dstq, [dstq+dst_strideq*2]
   1.492 +%endif
   1.493 +%if %2 == 1 ; avg
   1.494 +  add                secq, sec_str
   1.495 +%endif
   1.496 +  dec                   h
   1.497 +  jg .x_zero_y_other_loop
   1.498 +%undef filter_y_a
   1.499 +%undef filter_y_b
   1.500 +%undef filter_rnd
   1.501 +  STORE_AND_RET
   1.502 +
   1.503 +.x_nonzero:
   1.504 +  cmp           x_offsetd, 8
   1.505 +  jne .x_nonhalf
   1.506 +  ; x_offset == 0.5
   1.507 +  test          y_offsetd, y_offsetd
   1.508 +  jnz .x_half_y_nonzero
   1.509 +
   1.510 +  ; x_offset == 0.5 && y_offset == 0
   1.511 +.x_half_y_zero_loop:
   1.512 +%if %1 == 16
   1.513 +  movu                 m0, [srcq]
   1.514 +  movu                 m4, [srcq+1]
   1.515 +  mova                 m1, [dstq]
   1.516 +  pavgb                m0, m4
   1.517 +  punpckhbw            m3, m1, m5
   1.518 +%if %2 == 1 ; avg
   1.519 +  pavgb                m0, [secq]
   1.520 +%endif
   1.521 +  punpcklbw            m1, m5
   1.522 +  punpckhbw            m2, m0, m5
   1.523 +  punpcklbw            m0, m5
   1.524 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.525 +
   1.526 +  add                srcq, src_strideq
   1.527 +  add                dstq, dst_strideq
   1.528 +%else ; %1 < 16
   1.529 +  movh                 m0, [srcq]
   1.530 +  movh                 m4, [srcq+1]
   1.531 +%if %2 == 1 ; avg
   1.532 +%if mmsize == 16
   1.533 +  movhps               m0, [srcq+src_strideq]
   1.534 +  movhps               m4, [srcq+src_strideq+1]
   1.535 +%else ; mmsize == 8
   1.536 +  punpckldq            m0, [srcq+src_strideq]
   1.537 +  punpckldq            m4, [srcq+src_strideq+1]
   1.538 +%endif
   1.539 +  movh                 m1, [dstq]
   1.540 +  movh                 m3, [dstq+dst_strideq]
   1.541 +  pavgb                m0, m4
   1.542 +  punpcklbw            m3, m5
   1.543 +  pavgb                m0, [secq]
   1.544 +  punpcklbw            m1, m5
   1.545 +  punpckhbw            m2, m0, m5
   1.546 +  punpcklbw            m0, m5
   1.547 +%else ; !avg
   1.548 +  movh                 m2, [srcq+src_strideq]
   1.549 +  movh                 m1, [dstq]
   1.550 +  pavgb                m0, m4
   1.551 +  movh                 m4, [srcq+src_strideq+1]
   1.552 +  movh                 m3, [dstq+dst_strideq]
   1.553 +  pavgb                m2, m4
   1.554 +  punpcklbw            m0, m5
   1.555 +  punpcklbw            m2, m5
   1.556 +  punpcklbw            m3, m5
   1.557 +  punpcklbw            m1, m5
   1.558 +%endif
   1.559 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.560 +
   1.561 +  lea                srcq, [srcq+src_strideq*2]
   1.562 +  lea                dstq, [dstq+dst_strideq*2]
   1.563 +%endif
   1.564 +%if %2 == 1 ; avg
   1.565 +  add                secq, sec_str
   1.566 +%endif
   1.567 +  dec                   h
   1.568 +  jg .x_half_y_zero_loop
   1.569 +  STORE_AND_RET
   1.570 +
   1.571 +.x_half_y_nonzero:
   1.572 +  cmp           y_offsetd, 8
   1.573 +  jne .x_half_y_nonhalf
   1.574 +
   1.575 +  ; x_offset == 0.5 && y_offset == 0.5
   1.576 +%if %1 == 16
   1.577 +  movu                 m0, [srcq]
   1.578 +  movu                 m3, [srcq+1]
   1.579 +  add                srcq, src_strideq
   1.580 +  pavgb                m0, m3
   1.581 +.x_half_y_half_loop:
   1.582 +  movu                 m4, [srcq]
   1.583 +  movu                 m3, [srcq+1]
   1.584 +  mova                 m1, [dstq]
   1.585 +  pavgb                m4, m3
   1.586 +  punpckhbw            m3, m1, m5
   1.587 +  pavgb                m0, m4
   1.588 +%if %2 == 1 ; avg
   1.589 +  punpcklbw            m1, m5
   1.590 +  pavgb                m0, [secq]
   1.591 +  punpckhbw            m2, m0, m5
   1.592 +  punpcklbw            m0, m5
   1.593 +%else
   1.594 +  punpckhbw            m2, m0, m5
   1.595 +  punpcklbw            m0, m5
   1.596 +  punpcklbw            m1, m5
   1.597 +%endif
   1.598 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.599 +  mova                 m0, m4
   1.600 +
   1.601 +  add                srcq, src_strideq
   1.602 +  add                dstq, dst_strideq
   1.603 +%else ; %1 < 16
   1.604 +  movh                 m0, [srcq]
   1.605 +  movh                 m3, [srcq+1]
   1.606 +  add                srcq, src_strideq
   1.607 +  pavgb                m0, m3
   1.608 +.x_half_y_half_loop:
   1.609 +  movh                 m2, [srcq]
   1.610 +  movh                 m3, [srcq+1]
   1.611 +%if %2 == 1 ; avg
   1.612 +%if mmsize == 16
   1.613 +  movhps               m2, [srcq+src_strideq]
   1.614 +  movhps               m3, [srcq+src_strideq+1]
   1.615 +%else
   1.616 +%if %1 == 4
   1.617 +  movh                 m1, [srcq+src_strideq]
   1.618 +  punpckldq            m2, m1
   1.619 +  movh                 m1, [srcq+src_strideq+1]
   1.620 +  punpckldq            m3, m1
   1.621 +%else
   1.622 +  punpckldq            m2, [srcq+src_strideq]
   1.623 +  punpckldq            m3, [srcq+src_strideq+1]
   1.624 +%endif
   1.625 +%endif
   1.626 +  pavgb                m2, m3
   1.627 +%if mmsize == 16
   1.628 +  movlhps              m0, m2
   1.629 +  movhlps              m4, m2
   1.630 +%else ; mmsize == 8
   1.631 +  punpckldq            m0, m2
   1.632 +  pshufw               m4, m2, 0xe
   1.633 +%endif
   1.634 +  movh                 m1, [dstq]
   1.635 +  pavgb                m0, m2
   1.636 +  movh                 m3, [dstq+dst_strideq]
   1.637 +  pavgb                m0, [secq]
   1.638 +  punpcklbw            m3, m5
   1.639 +  punpcklbw            m1, m5
   1.640 +  punpckhbw            m2, m0, m5
   1.641 +  punpcklbw            m0, m5
   1.642 +%else ; !avg
   1.643 +  movh                 m4, [srcq+src_strideq]
   1.644 +  movh                 m1, [srcq+src_strideq+1]
   1.645 +  pavgb                m2, m3
   1.646 +  pavgb                m4, m1
   1.647 +  pavgb                m0, m2
   1.648 +  pavgb                m2, m4
   1.649 +  movh                 m1, [dstq]
   1.650 +  movh                 m3, [dstq+dst_strideq]
   1.651 +  punpcklbw            m0, m5
   1.652 +  punpcklbw            m2, m5
   1.653 +  punpcklbw            m3, m5
   1.654 +  punpcklbw            m1, m5
   1.655 +%endif
   1.656 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.657 +  mova                 m0, m4
   1.658 +
   1.659 +  lea                srcq, [srcq+src_strideq*2]
   1.660 +  lea                dstq, [dstq+dst_strideq*2]
   1.661 +%endif
   1.662 +%if %2 == 1 ; avg
   1.663 +  add                secq, sec_str
   1.664 +%endif
   1.665 +  dec                   h
   1.666 +  jg .x_half_y_half_loop
   1.667 +  STORE_AND_RET
   1.668 +
   1.669 +.x_half_y_nonhalf:
   1.670 +  ; x_offset == 0.5 && y_offset == bilin interpolation
   1.671 +%ifdef PIC
   1.672 +  lea        bilin_filter, [bilin_filter_m]
   1.673 +%endif
   1.674 +  shl           y_offsetd, filter_idx_shift
   1.675 +%if ARCH_X86_64 && mmsize == 16
   1.676 +  mova                 m8, [bilin_filter+y_offsetq]
   1.677 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1.678 +  mova                 m9, [bilin_filter+y_offsetq+16]
   1.679 +%endif
   1.680 +  mova                m10, [pw_8]
   1.681 +%define filter_y_a m8
   1.682 +%define filter_y_b m9
   1.683 +%define filter_rnd m10
   1.684 +%else  ;x86_32
   1.685 +%if ARCH_X86=1 && CONFIG_PIC=1
   1.686 +; x_offset == 0.5. We can reuse x_offset reg
   1.687 +%define tempq x_offsetq
   1.688 +  add y_offsetq, g_bilin_filterm
   1.689 +%define filter_y_a [y_offsetq]
   1.690 +%define filter_y_b [y_offsetq+16]
   1.691 +  mov tempq, g_pw_8m
   1.692 +%define filter_rnd [tempq]
   1.693 +%else
   1.694 +  add           y_offsetq, bilin_filter
   1.695 +%define filter_y_a [y_offsetq]
   1.696 +%define filter_y_b [y_offsetq+16]
   1.697 +%define filter_rnd [pw_8]
   1.698 +%endif
   1.699 +%endif
   1.700 +
   1.701 +%if %1 == 16
   1.702 +  movu                 m0, [srcq]
   1.703 +  movu                 m3, [srcq+1]
   1.704 +  add                srcq, src_strideq
   1.705 +  pavgb                m0, m3
   1.706 +.x_half_y_other_loop:
   1.707 +  movu                 m4, [srcq]
   1.708 +  movu                 m2, [srcq+1]
   1.709 +  mova                 m1, [dstq]
   1.710 +  pavgb                m4, m2
   1.711 +%if cpuflag(ssse3)
   1.712 +  punpckhbw            m2, m0, m4
   1.713 +  punpcklbw            m0, m4
   1.714 +  pmaddubsw            m2, filter_y_a
   1.715 +  pmaddubsw            m0, filter_y_a
   1.716 +  paddw                m2, filter_rnd
   1.717 +  paddw                m0, filter_rnd
   1.718 +  psraw                m2, 4
   1.719 +%else
   1.720 +  punpckhbw            m2, m0, m5
   1.721 +  punpckhbw            m3, m4, m5
   1.722 +  pmullw               m2, filter_y_a
   1.723 +  pmullw               m3, filter_y_b
   1.724 +  paddw                m2, filter_rnd
   1.725 +  punpcklbw            m0, m5
   1.726 +  paddw                m2, m3
   1.727 +  punpcklbw            m3, m4, m5
   1.728 +  pmullw               m0, filter_y_a
   1.729 +  pmullw               m3, filter_y_b
   1.730 +  paddw                m0, filter_rnd
   1.731 +  psraw                m2, 4
   1.732 +  paddw                m0, m3
   1.733 +%endif
   1.734 +  punpckhbw            m3, m1, m5
   1.735 +  psraw                m0, 4
   1.736 +%if %2 == 1 ; avg
   1.737 +  ; FIXME(rbultje) pipeline
   1.738 +  packuswb             m0, m2
   1.739 +  pavgb                m0, [secq]
   1.740 +  punpckhbw            m2, m0, m5
   1.741 +  punpcklbw            m0, m5
   1.742 +%endif
   1.743 +  punpcklbw            m1, m5
   1.744 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.745 +  mova                 m0, m4
   1.746 +
   1.747 +  add                srcq, src_strideq
   1.748 +  add                dstq, dst_strideq
   1.749 +%else ; %1 < 16
   1.750 +  movh                 m0, [srcq]
   1.751 +  movh                 m3, [srcq+1]
   1.752 +  add                srcq, src_strideq
   1.753 +  pavgb                m0, m3
   1.754 +%if notcpuflag(ssse3)
   1.755 +  punpcklbw            m0, m5
   1.756 +%endif
   1.757 +.x_half_y_other_loop:
   1.758 +  movh                 m2, [srcq]
   1.759 +  movh                 m1, [srcq+1]
   1.760 +  movh                 m4, [srcq+src_strideq]
   1.761 +  movh                 m3, [srcq+src_strideq+1]
   1.762 +  pavgb                m2, m1
   1.763 +  pavgb                m4, m3
   1.764 +  movh                 m3, [dstq+dst_strideq]
   1.765 +%if cpuflag(ssse3)
   1.766 +  movh                 m1, [dstq]
   1.767 +  punpcklbw            m0, m2
   1.768 +  punpcklbw            m2, m4
   1.769 +  pmaddubsw            m0, filter_y_a
   1.770 +  pmaddubsw            m2, filter_y_a
   1.771 +  punpcklbw            m3, m5
   1.772 +  paddw                m0, filter_rnd
   1.773 +  paddw                m2, filter_rnd
   1.774 +%else
   1.775 +  punpcklbw            m2, m5
   1.776 +  punpcklbw            m4, m5
   1.777 +  pmullw               m0, filter_y_a
   1.778 +  pmullw               m1, m2, filter_y_b
   1.779 +  punpcklbw            m3, m5
   1.780 +  paddw                m0, filter_rnd
   1.781 +  pmullw               m2, filter_y_a
   1.782 +  paddw                m0, m1
   1.783 +  pmullw               m1, m4, filter_y_b
   1.784 +  paddw                m2, filter_rnd
   1.785 +  paddw                m2, m1
   1.786 +  movh                 m1, [dstq]
   1.787 +%endif
   1.788 +  psraw                m0, 4
   1.789 +  psraw                m2, 4
   1.790 +%if %2 == 1 ; avg
   1.791 +  ; FIXME(rbultje) pipeline
   1.792 +  packuswb             m0, m2
   1.793 +  pavgb                m0, [secq]
   1.794 +  punpckhbw            m2, m0, m5
   1.795 +  punpcklbw            m0, m5
   1.796 +%endif
   1.797 +  punpcklbw            m1, m5
   1.798 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.799 +  mova                 m0, m4
   1.800 +
   1.801 +  lea                srcq, [srcq+src_strideq*2]
   1.802 +  lea                dstq, [dstq+dst_strideq*2]
   1.803 +%endif
   1.804 +%if %2 == 1 ; avg
   1.805 +  add                secq, sec_str
   1.806 +%endif
   1.807 +  dec                   h
   1.808 +  jg .x_half_y_other_loop
   1.809 +%undef filter_y_a
   1.810 +%undef filter_y_b
   1.811 +%undef filter_rnd
   1.812 +  STORE_AND_RET
   1.813 +
   1.814 +.x_nonhalf:
   1.815 +  test          y_offsetd, y_offsetd
   1.816 +  jnz .x_nonhalf_y_nonzero               ; y_offset != 0 is handled in a separate section
   1.817 +  ; Horizontal-only filtering: out = (a*src[x] + b*src[x+1] + 8) >> 4; rows are not mixed.
   1.818 +  ; x_offset == bilin interpolation && y_offset == 0
   1.819 +%ifdef PIC
   1.820 +  lea        bilin_filter, [bilin_filter_m]
   1.821 +%endif
   1.822 +  shl           x_offsetd, filter_idx_shift   ; turn x_offset into an offset within the filter table
   1.823 +%if ARCH_X86_64 && mmsize == 16
   1.824 +  mova                 m8, [bilin_filter+x_offsetq]      ; first 16 bytes of the selected filter entry
   1.825 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1.826 +  mova                 m9, [bilin_filter+x_offsetq+16]   ; second 16 bytes (read only by the non-ssse3 code)
   1.827 +%endif
   1.828 +  mova                m10, [pw_8]                        ; rounding term added before the >> 4
   1.829 +%define filter_x_a m8
   1.830 +%define filter_x_b m9
   1.831 +%define filter_rnd m10
   1.832 +%else    ; x86-32
   1.833 +%if ARCH_X86=1 && CONFIG_PIC=1
   1.834 +; y_offset == 0 on this path, so the y_offset register can be reused as a temporary.
   1.835 +%define tempq y_offsetq
   1.836 +  add x_offsetq, g_bilin_filterm         ; x_offsetq now addresses the selected filter entry
   1.837 +%define filter_x_a [x_offsetq]
   1.838 +%define filter_x_b [x_offsetq+16]
   1.839 +  mov tempq, g_pw_8m                     ; NOTE(review): presumably the address of pw_8; g_pw_8m is defined outside this section
   1.840 +%define filter_rnd [tempq]
   1.841 +%else
   1.842 +  add           x_offsetq, bilin_filter  ; x_offsetq now addresses the selected filter entry
   1.843 +%define filter_x_a [x_offsetq]
   1.844 +%define filter_x_b [x_offsetq+16]
   1.845 +%define filter_rnd [pw_8]
   1.846 +%endif
   1.847 +%endif
   1.848 +  ; NOTE(review): SUM_SSE, STORE_AND_RET, h and the contents of m5/m6/m7 come from outside this section.
   1.849 +.x_other_y_zero_loop:
   1.850 +%if %1 == 16
   1.851 +  movu                 m0, [srcq]        ; src[x] for one 16-pixel row (unaligned load)
   1.852 +  movu                 m4, [srcq+1]      ; src[x+1]
   1.853 +  mova                 m1, [dstq]        ; dst row (aligned load)
   1.854 +%if cpuflag(ssse3)
   1.855 +  punpckhbw            m2, m0, m4        ; interleave src[x]/src[x+1] bytes, high half
   1.856 +  punpcklbw            m0, m4            ; interleave src[x]/src[x+1] bytes, low half
   1.857 +  pmaddubsw            m2, filter_x_a    ; multiply each byte pair by the byte pair in filter_x_a and add; NOTE(review): ssse3 table presumably holds both taps interleaved - confirm
   1.858 +  pmaddubsw            m0, filter_x_a
   1.859 +  paddw                m2, filter_rnd
   1.860 +  paddw                m0, filter_rnd
   1.861 +%else
   1.862 +  punpckhbw            m2, m0, m5        ; NOTE(review): m5 is presumably all-zero, making these unpacks byte->word zero-extension - confirm
   1.863 +  punpckhbw            m3, m4, m5
   1.864 +  punpcklbw            m0, m5
   1.865 +  punpcklbw            m4, m5
   1.866 +  pmullw               m2, filter_x_a    ; src[x] * first tap (high half)
   1.867 +  pmullw               m3, filter_x_b    ; src[x+1] * second tap (high half)
   1.868 +  paddw                m2, filter_rnd
   1.869 +  pmullw               m0, filter_x_a    ; same for the low half
   1.870 +  pmullw               m4, filter_x_b
   1.871 +  paddw                m0, filter_rnd
   1.872 +  paddw                m2, m3
   1.873 +  paddw                m0, m4
   1.874 +%endif
   1.875 +  psraw                m2, 4             ; filtered pixels as words: m0 = low half, m2 = high half
   1.876 +  psraw                m0, 4
   1.877 +%if %2 == 1 ; avg
   1.878 +  ; FIXME(rbultje) pipeline
   1.879 +  packuswb             m0, m2            ; back to bytes so pavgb can be used
   1.880 +  pavgb                m0, [secq]        ; rounded byte average with the 16 bytes at secq
   1.881 +  punpckhbw            m2, m0, m5        ; and widen again for SUM_SSE
   1.882 +  punpcklbw            m0, m5
   1.883 +%endif
   1.884 +  punpckhbw            m3, m1, m5        ; dst row: m1 = low half, m3 = high half
   1.885 +  punpcklbw            m1, m5
   1.886 +  SUM_SSE              m0, m1, m2, m3, m6, m7   ; NOTE(review): presumably accumulates the src-dst differences into m6/m7 - macro defined outside this section
   1.887 +
   1.888 +  add                srcq, src_strideq   ; one row per iteration
   1.889 +  add                dstq, dst_strideq
   1.890 +%else ; %1 < 16
   1.891 +  movh                 m0, [srcq]                 ; row 0: src[x]
   1.892 +  movh                 m1, [srcq+1]               ; row 0: src[x+1]
   1.893 +  movh                 m2, [srcq+src_strideq]     ; row 1: src[x]
   1.894 +  movh                 m4, [srcq+src_strideq+1]   ; row 1: src[x+1]
   1.895 +  movh                 m3, [dstq+dst_strideq]     ; row 1 of dst
   1.896 +%if cpuflag(ssse3)
   1.897 +  punpcklbw            m0, m1            ; row 0 src[x]/src[x+1] byte pairs
   1.898 +  movh                 m1, [dstq]        ; m1 is free again: row 0 of dst
   1.899 +  punpcklbw            m2, m4            ; row 1 src[x]/src[x+1] byte pairs
   1.900 +  pmaddubsw            m0, filter_x_a
   1.901 +  pmaddubsw            m2, filter_x_a
   1.902 +  punpcklbw            m3, m5
   1.903 +  paddw                m0, filter_rnd
   1.904 +  paddw                m2, filter_rnd
   1.905 +%else
   1.906 +  punpcklbw            m0, m5
   1.907 +  punpcklbw            m1, m5
   1.908 +  punpcklbw            m2, m5
   1.909 +  punpcklbw            m4, m5
   1.910 +  pmullw               m0, filter_x_a
   1.911 +  pmullw               m1, filter_x_b
   1.912 +  punpcklbw            m3, m5
   1.913 +  paddw                m0, filter_rnd
   1.914 +  pmullw               m2, filter_x_a
   1.915 +  pmullw               m4, filter_x_b
   1.916 +  paddw                m0, m1
   1.917 +  paddw                m2, filter_rnd
   1.918 +  movh                 m1, [dstq]        ; row 0 of dst, loaded once m1 is no longer needed
   1.919 +  paddw                m2, m4
   1.920 +%endif
   1.921 +  psraw                m0, 4             ; m0 = filtered row 0, m2 = filtered row 1 (words)
   1.922 +  psraw                m2, 4
   1.923 +%if %2 == 1 ; avg
   1.924 +  ; FIXME(rbultje) pipeline
   1.925 +  packuswb             m0, m2            ; both rows packed into one register as bytes
   1.926 +  pavgb                m0, [secq]        ; rounded byte average with the data at secq
   1.927 +  punpckhbw            m2, m0, m5        ; row 1 back to words
   1.928 +  punpcklbw            m0, m5            ; row 0 back to words
   1.929 +%endif
   1.930 +  punpcklbw            m1, m5
   1.931 +  SUM_SSE              m0, m1, m2, m3, m6, m7
   1.932 +
   1.933 +  lea                srcq, [srcq+src_strideq*2]   ; two rows per iteration
   1.934 +  lea                dstq, [dstq+dst_strideq*2]
   1.935 +%endif
   1.936 +%if %2 == 1 ; avg
   1.937 +  add                secq, sec_str
   1.938 +%endif
   1.939 +  dec                   h
   1.940 +  jg .x_other_y_zero_loop                ; repeat while the decremented counter is still > 0
   1.941 +%undef filter_x_a
   1.942 +%undef filter_x_b
   1.943 +%undef filter_rnd
   1.944 +  STORE_AND_RET
   1.945 +
   1.946 +.x_nonhalf_y_nonzero:
   1.947 +  cmp           y_offsetd, 8
   1.948 +  jne .x_nonhalf_y_nonhalf               ; any y_offset other than 8 needs the full vertical filter
   1.949 +  ; Each row is filtered horizontally, then vertically adjacent filtered rows are averaged.
   1.950 +  ; x_offset == bilin interpolation && y_offset == 0.5
   1.951 +%ifdef PIC
   1.952 +  lea        bilin_filter, [bilin_filter_m]
   1.953 +%endif
   1.954 +  shl           x_offsetd, filter_idx_shift   ; turn x_offset into an offset within the filter table
   1.955 +%if ARCH_X86_64 && mmsize == 16
   1.956 +  mova                 m8, [bilin_filter+x_offsetq]      ; first 16 bytes of the selected filter entry
   1.957 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   1.958 +  mova                 m9, [bilin_filter+x_offsetq+16]   ; second 16 bytes (read only by the non-ssse3 code)
   1.959 +%endif
   1.960 +  mova                m10, [pw_8]                        ; rounding term added before the >> 4
   1.961 +%define filter_x_a m8
   1.962 +%define filter_x_b m9
   1.963 +%define filter_rnd m10
   1.964 +%else    ; x86-32
   1.965 +%if ARCH_X86=1 && CONFIG_PIC=1
   1.966 +; y_offset == 0.5 needs no table lookup, so the y_offset register can be reused as a temporary.
   1.967 +%define tempq y_offsetq
   1.968 +  add x_offsetq, g_bilin_filterm         ; x_offsetq now addresses the selected filter entry
   1.969 +%define filter_x_a [x_offsetq]
   1.970 +%define filter_x_b [x_offsetq+16]
   1.971 +  mov tempq, g_pw_8m                     ; NOTE(review): presumably the address of pw_8; g_pw_8m is defined outside this section
   1.972 +%define filter_rnd [tempq]
   1.973 +%else
   1.974 +  add           x_offsetq, bilin_filter  ; x_offsetq now addresses the selected filter entry
   1.975 +%define filter_x_a [x_offsetq]
   1.976 +%define filter_x_b [x_offsetq+16]
   1.977 +%define filter_rnd [pw_8]
   1.978 +%endif
   1.979 +%endif
   1.980 +  ; Before the loop: filter the first source row horizontally; the loop carries it in m0.
   1.981 +%if %1 == 16
   1.982 +  movu                 m0, [srcq]        ; src[x]
   1.983 +  movu                 m1, [srcq+1]      ; src[x+1]
   1.984 +%if cpuflag(ssse3)
   1.985 +  punpckhbw            m2, m0, m1        ; interleave src[x]/src[x+1] bytes, high half
   1.986 +  punpcklbw            m0, m1            ; interleave src[x]/src[x+1] bytes, low half
   1.987 +  pmaddubsw            m2, filter_x_a    ; multiply-add each byte pair with filter_x_a; NOTE(review): ssse3 table presumably holds both taps interleaved - confirm
   1.988 +  pmaddubsw            m0, filter_x_a
   1.989 +  paddw                m2, filter_rnd
   1.990 +  paddw                m0, filter_rnd
   1.991 +%else
   1.992 +  punpckhbw            m2, m0, m5        ; NOTE(review): m5 is presumably all-zero, making these unpacks byte->word zero-extension - confirm
   1.993 +  punpckhbw            m3, m1, m5
   1.994 +  punpcklbw            m0, m5
   1.995 +  punpcklbw            m1, m5
   1.996 +  pmullw               m0, filter_x_a
   1.997 +  pmullw               m1, filter_x_b
   1.998 +  paddw                m0, filter_rnd
   1.999 +  pmullw               m2, filter_x_a
  1.1000 +  pmullw               m3, filter_x_b
  1.1001 +  paddw                m2, filter_rnd
  1.1002 +  paddw                m0, m1
  1.1003 +  paddw                m2, m3
  1.1004 +%endif
  1.1005 +  psraw                m0, 4
  1.1006 +  psraw                m2, 4
  1.1007 +  add                srcq, src_strideq
  1.1008 +  packuswb             m0, m2            ; m0 = first filtered row, packed to bytes
  1.1009 +.x_other_y_half_loop:
  1.1010 +  movu                 m4, [srcq]        ; next row: src[x]
  1.1011 +  movu                 m3, [srcq+1]      ; next row: src[x+1]
  1.1012 +%if cpuflag(ssse3)
  1.1013 +  mova                 m1, [dstq]
  1.1014 +  punpckhbw            m2, m4, m3
  1.1015 +  punpcklbw            m4, m3
  1.1016 +  pmaddubsw            m2, filter_x_a
  1.1017 +  pmaddubsw            m4, filter_x_a
  1.1018 +  paddw                m2, filter_rnd
  1.1019 +  paddw                m4, filter_rnd
  1.1020 +  psraw                m2, 4
  1.1021 +  psraw                m4, 4
  1.1022 +  packuswb             m4, m2            ; m4 = this row filtered horizontally, as bytes
  1.1023 +  pavgb                m0, m4            ; half-pel vertical step: rounded average of previous and current filtered rows
  1.1024 +  punpckhbw            m3, m1, m5        ; dst row: m1 = low half, m3 = high half
  1.1025 +  punpcklbw            m1, m5
  1.1026 +%else
  1.1027 +  punpckhbw            m2, m4, m5
  1.1028 +  punpckhbw            m1, m3, m5
  1.1029 +  punpcklbw            m4, m5
  1.1030 +  punpcklbw            m3, m5
  1.1031 +  pmullw               m4, filter_x_a
  1.1032 +  pmullw               m3, filter_x_b
  1.1033 +  paddw                m4, filter_rnd
  1.1034 +  pmullw               m2, filter_x_a
  1.1035 +  pmullw               m1, filter_x_b
  1.1036 +  paddw                m2, filter_rnd
  1.1037 +  paddw                m4, m3
  1.1038 +  paddw                m2, m1
  1.1039 +  mova                 m1, [dstq]        ; dst row, loaded once m1 is no longer needed as a temporary
  1.1040 +  psraw                m4, 4
  1.1041 +  psraw                m2, 4
  1.1042 +  punpckhbw            m3, m1, m5
  1.1043 +  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  1.1044 +  ; have a 1-register shortage to be able to store the backup of the bilin
  1.1045 +  ; filtered second line as words as cache for the next line. Packing into
  1.1046 +  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  1.1047 +  packuswb             m4, m2            ; m4 = this row filtered horizontally, as bytes
  1.1048 +  punpcklbw            m1, m5
  1.1049 +  pavgb                m0, m4            ; half-pel vertical step: rounded average of previous and current filtered rows
  1.1050 +%endif
  1.1051 +%if %2 == 1 ; avg
  1.1052 +  ; FIXME(rbultje) pipeline
  1.1053 +  pavgb                m0, [secq]        ; rounded byte average with the 16 bytes at secq
  1.1054 +%endif
  1.1055 +  punpckhbw            m2, m0, m5        ; widen the result for SUM_SSE: m0 = low half, m2 = high half
  1.1056 +  punpcklbw            m0, m5
  1.1057 +  SUM_SSE              m0, m1, m2, m3, m6, m7   ; NOTE(review): presumably accumulates the src-dst differences into m6/m7 - macro defined outside this section
  1.1058 +  mova                 m0, m4            ; current filtered row becomes the previous row of the next iteration
  1.1059 +
  1.1060 +  add                srcq, src_strideq   ; one row per iteration
  1.1061 +  add                dstq, dst_strideq
  1.1062 +%else ; %1 < 16
  1.1063 +  movh                 m0, [srcq]        ; first row: src[x]
  1.1064 +  movh                 m1, [srcq+1]      ; first row: src[x+1]
  1.1065 +%if cpuflag(ssse3)
  1.1066 +  punpcklbw            m0, m1
  1.1067 +  pmaddubsw            m0, filter_x_a
  1.1068 +  paddw                m0, filter_rnd
  1.1069 +%else
  1.1070 +  punpcklbw            m0, m5
  1.1071 +  punpcklbw            m1, m5
  1.1072 +  pmullw               m0, filter_x_a
  1.1073 +  pmullw               m1, filter_x_b
  1.1074 +  paddw                m0, filter_rnd
  1.1075 +  paddw                m0, m1
  1.1076 +%endif
  1.1077 +  add                srcq, src_strideq
  1.1078 +  psraw                m0, 4             ; m0 = first filtered row, kept as words on this path
  1.1079 +.x_other_y_half_loop:
  1.1080 +  movh                 m2, [srcq]                 ; row 1: src[x]
  1.1081 +  movh                 m1, [srcq+1]               ; row 1: src[x+1]
  1.1082 +  movh                 m4, [srcq+src_strideq]     ; row 2: src[x]
  1.1083 +  movh                 m3, [srcq+src_strideq+1]   ; row 2: src[x+1]
  1.1084 +%if cpuflag(ssse3)
  1.1085 +  punpcklbw            m2, m1
  1.1086 +  punpcklbw            m4, m3
  1.1087 +  pmaddubsw            m2, filter_x_a
  1.1088 +  pmaddubsw            m4, filter_x_a
  1.1089 +  movh                 m1, [dstq]                 ; dst row 0
  1.1090 +  movh                 m3, [dstq+dst_strideq]     ; dst row 1
  1.1091 +  paddw                m2, filter_rnd
  1.1092 +  paddw                m4, filter_rnd
  1.1093 +%else
  1.1094 +  punpcklbw            m2, m5
  1.1095 +  punpcklbw            m1, m5
  1.1096 +  punpcklbw            m4, m5
  1.1097 +  punpcklbw            m3, m5
  1.1098 +  pmullw               m2, filter_x_a
  1.1099 +  pmullw               m1, filter_x_b
  1.1100 +  paddw                m2, filter_rnd
  1.1101 +  pmullw               m4, filter_x_a
  1.1102 +  pmullw               m3, filter_x_b
  1.1103 +  paddw                m4, filter_rnd
  1.1104 +  paddw                m2, m1
  1.1105 +  movh                 m1, [dstq]                 ; dst row 0
  1.1106 +  paddw                m4, m3
  1.1107 +  movh                 m3, [dstq+dst_strideq]     ; dst row 1
  1.1108 +%endif
  1.1109 +  psraw                m2, 4             ; m2 = filtered row 1, m4 = filtered row 2 (words)
  1.1110 +  psraw                m4, 4
  1.1111 +  pavgw                m0, m2            ; output row 0 = rounded average of previous row and row 1
  1.1112 +  pavgw                m2, m4            ; output row 1 = rounded average of row 1 and row 2
  1.1113 +%if %2 == 1 ; avg
  1.1114 +  ; FIXME(rbultje) pipeline - also consider going to bytes here
  1.1115 +  packuswb             m0, m2            ; both output rows packed into one register as bytes
  1.1116 +  pavgb                m0, [secq]        ; rounded byte average with the data at secq
  1.1117 +  punpckhbw            m2, m0, m5        ; output row 1 back to words
  1.1118 +  punpcklbw            m0, m5            ; output row 0 back to words
  1.1119 +%endif
  1.1120 +  punpcklbw            m3, m5
  1.1121 +  punpcklbw            m1, m5
  1.1122 +  SUM_SSE              m0, m1, m2, m3, m6, m7
  1.1123 +  mova                 m0, m4            ; filtered row 2 becomes the previous row of the next iteration
  1.1124 +
  1.1125 +  lea                srcq, [srcq+src_strideq*2]   ; two rows per iteration
  1.1126 +  lea                dstq, [dstq+dst_strideq*2]
  1.1127 +%endif
  1.1128 +%if %2 == 1 ; avg
  1.1129 +  add                secq, sec_str
  1.1130 +%endif
  1.1131 +  dec                   h
  1.1132 +  jg .x_other_y_half_loop                ; repeat while the decremented counter is still > 0
  1.1133 +%undef filter_x_a
  1.1134 +%undef filter_x_b
  1.1135 +%undef filter_rnd
  1.1136 +  STORE_AND_RET
  1.1137 +
  1.1138 +.x_nonhalf_y_nonhalf:
  1.1139 +%ifdef PIC
  1.1140 +  lea        bilin_filter, [bilin_filter_m]
  1.1141 +%endif
  1.1142 +  shl           x_offsetd, filter_idx_shift   ; turn both offsets into offsets within the filter table
  1.1143 +  shl           y_offsetd, filter_idx_shift
  1.1144 +%if ARCH_X86_64 && mmsize == 16
  1.1145 +  mova                 m8, [bilin_filter+x_offsetq]      ; horizontal filter, first 16 bytes
  1.1146 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  1.1147 +  mova                 m9, [bilin_filter+x_offsetq+16]   ; horizontal filter, second 16 bytes (non-ssse3 only)
  1.1148 +%endif
  1.1149 +  mova                m10, [bilin_filter+y_offsetq]      ; vertical filter, first 16 bytes
  1.1150 +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  1.1151 +  mova                m11, [bilin_filter+y_offsetq+16]   ; vertical filter, second 16 bytes (non-ssse3 only)
  1.1152 +%endif
  1.1153 +  mova                m12, [pw_8]                        ; rounding term added before each >> 4
  1.1154 +%define filter_x_a m8
  1.1155 +%define filter_x_b m9
  1.1156 +%define filter_y_a m10
  1.1157 +%define filter_y_b m11
  1.1158 +%define filter_rnd m12
  1.1159 +%else   ; x86-32
  1.1160 +%if ARCH_X86=1 && CONFIG_PIC=1
  1.1161 +; No register is free on this path, so the src_stride register is reused as the
  1.1162 +; temporary. src_stride then has to be reloaded from the stack whenever it is needed.
  1.1163 +%define tempq src_strideq
  1.1164 +  mov tempq, g_bilin_filterm
  1.1165 +  add           x_offsetq, tempq         ; x_offsetq now addresses the selected horizontal filter entry
  1.1166 +  add           y_offsetq, tempq         ; y_offsetq now addresses the selected vertical filter entry
  1.1167 +%define filter_x_a [x_offsetq]
  1.1168 +%define filter_x_b [x_offsetq+16]
  1.1169 +%define filter_y_a [y_offsetq]
  1.1170 +%define filter_y_b [y_offsetq+16]
  1.1171 +
  1.1172 +  mov tempq, g_pw_8m                     ; NOTE(review): presumably the address of pw_8; g_pw_8m is defined outside this section
  1.1173 +%define filter_rnd [tempq]
  1.1174 +%else
  1.1175 +  add           x_offsetq, bilin_filter
  1.1176 +  add           y_offsetq, bilin_filter
  1.1177 +%define filter_x_a [x_offsetq]
  1.1178 +%define filter_x_b [x_offsetq+16]
  1.1179 +%define filter_y_a [y_offsetq]
  1.1180 +%define filter_y_b [y_offsetq+16]
  1.1181 +%define filter_rnd [pw_8]
  1.1182 +%endif
  1.1183 +%endif
  1.1184 +  ; Two passes per pixel: filter each row horizontally, then filter adjacent filtered rows vertically.
  1.1185 +  ; x_offset == bilin interpolation && y_offset == bilin interpolation
  1.1186 +%if %1 == 16
  1.1187 +  movu                 m0, [srcq]        ; first row: src[x]
  1.1188 +  movu                 m1, [srcq+1]      ; first row: src[x+1]
  1.1189 +%if cpuflag(ssse3)
  1.1190 +  punpckhbw            m2, m0, m1        ; interleave src[x]/src[x+1] bytes, high half
  1.1191 +  punpcklbw            m0, m1            ; interleave src[x]/src[x+1] bytes, low half
  1.1192 +  pmaddubsw            m2, filter_x_a    ; multiply-add each byte pair with filter_x_a; NOTE(review): ssse3 table presumably holds both taps interleaved - confirm
  1.1193 +  pmaddubsw            m0, filter_x_a
  1.1194 +  paddw                m2, filter_rnd
  1.1195 +  paddw                m0, filter_rnd
  1.1196 +%else
  1.1197 +  punpckhbw            m2, m0, m5        ; NOTE(review): m5 is presumably all-zero, making these unpacks byte->word zero-extension - confirm
  1.1198 +  punpckhbw            m3, m1, m5
  1.1199 +  punpcklbw            m0, m5
  1.1200 +  punpcklbw            m1, m5
  1.1201 +  pmullw               m0, filter_x_a
  1.1202 +  pmullw               m1, filter_x_b
  1.1203 +  paddw                m0, filter_rnd
  1.1204 +  pmullw               m2, filter_x_a
  1.1205 +  pmullw               m3, filter_x_b
  1.1206 +  paddw                m2, filter_rnd
  1.1207 +  paddw                m0, m1
  1.1208 +  paddw                m2, m3
  1.1209 +%endif
  1.1210 +  psraw                m0, 4
  1.1211 +  psraw                m2, 4
  1.1212 +
  1.1213 +  INC_SRC_BY_SRC_STRIDE                  ; NOTE(review): presumably advances srcq by one row, reloading the stride where its register was reused - macro defined outside this section
  1.1214 +
  1.1215 +  packuswb             m0, m2            ; m0 = first filtered row, packed to bytes
  1.1216 +.x_other_y_other_loop:
  1.1217 +%if cpuflag(ssse3)
  1.1218 +  movu                 m4, [srcq]        ; next row: src[x]
  1.1219 +  movu                 m3, [srcq+1]      ; next row: src[x+1]
  1.1220 +  mova                 m1, [dstq]
  1.1221 +  punpckhbw            m2, m4, m3
  1.1222 +  punpcklbw            m4, m3
  1.1223 +  pmaddubsw            m2, filter_x_a
  1.1224 +  pmaddubsw            m4, filter_x_a
  1.1225 +  punpckhbw            m3, m1, m5        ; dst high half -> m3
  1.1226 +  paddw                m2, filter_rnd
  1.1227 +  paddw                m4, filter_rnd
  1.1228 +  psraw                m2, 4
  1.1229 +  psraw                m4, 4
  1.1230 +  packuswb             m4, m2            ; m4 = this row filtered horizontally, as bytes
  1.1231 +  punpckhbw            m2, m0, m4        ; interleave previous/current filtered bytes, high half
  1.1232 +  punpcklbw            m0, m4            ; interleave previous/current filtered bytes, low half
  1.1233 +  pmaddubsw            m2, filter_y_a    ; vertical pass on the interleaved pairs
  1.1234 +  pmaddubsw            m0, filter_y_a
  1.1235 +  punpcklbw            m1, m5            ; dst low half stays in m1
  1.1236 +  paddw                m2, filter_rnd
  1.1237 +  paddw                m0, filter_rnd
  1.1238 +  psraw                m2, 4
  1.1239 +  psraw                m0, 4
  1.1240 +%else
  1.1241 +  movu                 m3, [srcq]        ; next row: src[x]
  1.1242 +  movu                 m4, [srcq+1]      ; next row: src[x+1]
  1.1243 +  punpckhbw            m1, m3, m5
  1.1244 +  punpckhbw            m2, m4, m5
  1.1245 +  punpcklbw            m3, m5
  1.1246 +  punpcklbw            m4, m5
  1.1247 +  pmullw               m3, filter_x_a
  1.1248 +  pmullw               m4, filter_x_b
  1.1249 +  paddw                m3, filter_rnd
  1.1250 +  pmullw               m1, filter_x_a
  1.1251 +  pmullw               m2, filter_x_b
  1.1252 +  paddw                m1, filter_rnd
  1.1253 +  paddw                m3, m4
  1.1254 +  paddw                m1, m2
  1.1255 +  psraw                m3, 4             ; current filtered row as words: m3 = low half, m1 = high half
  1.1256 +  psraw                m1, 4
  1.1257 +  packuswb             m4, m3, m1        ; byte copy of the current filtered row, carried to the next iteration
  1.1258 +  punpckhbw            m2, m0, m5        ; previous filtered row back to words: m0 = low half, m2 = high half
  1.1259 +  punpcklbw            m0, m5
  1.1260 +  pmullw               m2, filter_y_a    ; previous row * first vertical tap
  1.1261 +  pmullw               m1, filter_y_b    ; current row * second vertical tap
  1.1262 +  paddw                m2, filter_rnd
  1.1263 +  pmullw               m0, filter_y_a
  1.1264 +  pmullw               m3, filter_y_b
  1.1265 +  paddw                m2, m1
  1.1266 +  mova                 m1, [dstq]        ; dst row, loaded once m1 is no longer needed as a temporary
  1.1267 +  paddw                m0, filter_rnd
  1.1268 +  psraw                m2, 4
  1.1269 +  paddw                m0, m3
  1.1270 +  punpckhbw            m3, m1, m5
  1.1271 +  psraw                m0, 4
  1.1272 +  punpcklbw            m1, m5
  1.1273 +%endif
  1.1274 +%if %2 == 1 ; avg
  1.1275 +  ; FIXME(rbultje) pipeline
  1.1276 +  packuswb             m0, m2            ; back to bytes so pavgb can be used
  1.1277 +  pavgb                m0, [secq]        ; rounded byte average with the 16 bytes at secq
  1.1278 +  punpckhbw            m2, m0, m5        ; and widen again for SUM_SSE
  1.1279 +  punpcklbw            m0, m5
  1.1280 +%endif
  1.1281 +  SUM_SSE              m0, m1, m2, m3, m6, m7   ; NOTE(review): presumably accumulates the src-dst differences into m6/m7 - macro defined outside this section
  1.1282 +  mova                 m0, m4            ; current filtered row (bytes) becomes the previous row of the next iteration
  1.1283 +
  1.1284 +  INC_SRC_BY_SRC_STRIDE
  1.1285 +  add                dstq, dst_strideq
  1.1286 +%else ; %1 < 16
  1.1287 +  movh                 m0, [srcq]        ; first row: src[x]
  1.1288 +  movh                 m1, [srcq+1]      ; first row: src[x+1]
  1.1289 +%if cpuflag(ssse3)
  1.1290 +  punpcklbw            m0, m1
  1.1291 +  pmaddubsw            m0, filter_x_a
  1.1292 +  paddw                m0, filter_rnd
  1.1293 +%else
  1.1294 +  punpcklbw            m0, m5
  1.1295 +  punpcklbw            m1, m5
  1.1296 +  pmullw               m0, filter_x_a
  1.1297 +  pmullw               m1, filter_x_b
  1.1298 +  paddw                m0, filter_rnd
  1.1299 +  paddw                m0, m1
  1.1300 +%endif
  1.1301 +  psraw                m0, 4
  1.1302 +%if cpuflag(ssse3)
  1.1303 +  packuswb             m0, m0            ; ssse3 keeps the carried row as bytes so it can be interleaved for pmaddubsw
  1.1304 +%endif
  1.1305 +
  1.1306 +  INC_SRC_BY_SRC_STRIDE
  1.1307 +
  1.1308 +.x_other_y_other_loop:
  1.1309 +  movh                 m2, [srcq]        ; row 1: src[x]
  1.1310 +  movh                 m1, [srcq+1]      ; row 1: src[x+1]
  1.1311 +
  1.1312 +  INC_SRC_BY_SRC_STRIDE                  ; used here instead of a [srcq+src_strideq] operand
  1.1313 +  movh                 m4, [srcq]        ; row 2: src[x]
  1.1314 +  movh                 m3, [srcq+1]      ; row 2: src[x+1]
  1.1315 +
  1.1316 +%if cpuflag(ssse3)
  1.1317 +  punpcklbw            m2, m1
  1.1318 +  punpcklbw            m4, m3
  1.1319 +  pmaddubsw            m2, filter_x_a
  1.1320 +  pmaddubsw            m4, filter_x_a
  1.1321 +  movh                 m3, [dstq+dst_strideq]     ; dst row 1
  1.1322 +  movh                 m1, [dstq]                 ; dst row 0
  1.1323 +  paddw                m2, filter_rnd
  1.1324 +  paddw                m4, filter_rnd
  1.1325 +  psraw                m2, 4
  1.1326 +  psraw                m4, 4
  1.1327 +  packuswb             m2, m2            ; filtered row 1 as bytes
  1.1328 +  packuswb             m4, m4            ; filtered row 2 as bytes
  1.1329 +  punpcklbw            m0, m2            ; previous/row 1 byte pairs
  1.1330 +  punpcklbw            m2, m4            ; row 1/row 2 byte pairs
  1.1331 +  pmaddubsw            m0, filter_y_a    ; vertical pass -> output row 0
  1.1332 +  pmaddubsw            m2, filter_y_a    ; vertical pass -> output row 1
  1.1333 +  punpcklbw            m3, m5
  1.1334 +  paddw                m0, filter_rnd
  1.1335 +  paddw                m2, filter_rnd
  1.1336 +  psraw                m0, 4
  1.1337 +  psraw                m2, 4
  1.1338 +  punpcklbw            m1, m5
  1.1339 +%else
  1.1340 +  punpcklbw            m2, m5
  1.1341 +  punpcklbw            m1, m5
  1.1342 +  punpcklbw            m4, m5
  1.1343 +  punpcklbw            m3, m5
  1.1344 +  pmullw               m2, filter_x_a
  1.1345 +  pmullw               m1, filter_x_b
  1.1346 +  paddw                m2, filter_rnd
  1.1347 +  pmullw               m4, filter_x_a
  1.1348 +  pmullw               m3, filter_x_b
  1.1349 +  paddw                m4, filter_rnd
  1.1350 +  paddw                m2, m1
  1.1351 +  paddw                m4, m3
  1.1352 +  psraw                m2, 4             ; m2 = filtered row 1, m4 = filtered row 2 (words)
  1.1353 +  psraw                m4, 4
  1.1354 +  pmullw               m0, filter_y_a    ; previous row * first vertical tap
  1.1355 +  pmullw               m3, m2, filter_y_b   ; row 1 * second vertical tap
  1.1356 +  paddw                m0, filter_rnd
  1.1357 +  pmullw               m2, filter_y_a    ; row 1 * first vertical tap
  1.1358 +  pmullw               m1, m4, filter_y_b   ; row 2 * second vertical tap
  1.1359 +  paddw                m2, filter_rnd
  1.1360 +  paddw                m0, m3
  1.1361 +  movh                 m3, [dstq+dst_strideq]     ; dst row 1
  1.1362 +  paddw                m2, m1
  1.1363 +  movh                 m1, [dstq]                 ; dst row 0
  1.1364 +  psraw                m0, 4             ; m0 = output row 0, m2 = output row 1
  1.1365 +  psraw                m2, 4
  1.1366 +  punpcklbw            m3, m5
  1.1367 +  punpcklbw            m1, m5
  1.1368 +%endif
  1.1369 +%if %2 == 1 ; avg
  1.1370 +  ; FIXME(rbultje) pipeline
  1.1371 +  packuswb             m0, m2            ; both output rows packed into one register as bytes
  1.1372 +  pavgb                m0, [secq]        ; rounded byte average with the data at secq
  1.1373 +  punpckhbw            m2, m0, m5        ; output row 1 back to words
  1.1374 +  punpcklbw            m0, m5            ; output row 0 back to words
  1.1375 +%endif
  1.1376 +  SUM_SSE              m0, m1, m2, m3, m6, m7
  1.1377 +  mova                 m0, m4            ; filtered row 2 becomes the previous row of the next iteration
  1.1378 +
  1.1379 +  INC_SRC_BY_SRC_STRIDE
  1.1380 +  lea                dstq, [dstq+dst_strideq*2]   ; two rows per iteration
  1.1381 +%endif
  1.1382 +%if %2 == 1 ; avg
  1.1383 +  add                secq, sec_str
  1.1384 +%endif
  1.1385 +  dec                   h
  1.1386 +  jg .x_other_y_other_loop               ; repeat while the decremented counter is still > 0
  1.1387 +%undef filter_x_a
  1.1388 +%undef filter_x_b
  1.1389 +%undef filter_y_a
  1.1390 +%undef filter_y_b
  1.1391 +%undef filter_rnd
  1.1392 +  STORE_AND_RET
  1.1393 +%endmacro
  1.1394 +
  1.1395 +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
  1.1396 +; between the ssse3 and non-ssse3 version. It may make sense to merge their
  1.1397 +; code in the sense that the ssse3 version would jump to the appropriate
  1.1398 +; location in the sse/2 version, rather than duplicating that code in the
  1.1399 +; binary.
  1.1400 +
  1.1401 +INIT_MMX sse                             ; INIT_* macros come from x86inc.asm; they select register width and the flags cpuflag() reports
  1.1402 +SUBPEL_VARIANCE  4                       ; %1 = 4 takes the "%1 < 16" code paths; NOTE(review): %2 omitted, presumably defaults to the non-avg variant
  1.1403 +INIT_XMM sse2
  1.1404 +SUBPEL_VARIANCE  8                       ; "%1 < 16" paths, non-ssse3 (pmullw) filtering
  1.1405 +SUBPEL_VARIANCE 16                       ; "%1 == 16" paths, non-ssse3 (pmullw) filtering
  1.1406 +  ; Same three widths again with cpuflag(ssse3) set, selecting the pmaddubsw filtering code.
  1.1407 +INIT_MMX ssse3
  1.1408 +SUBPEL_VARIANCE  4
  1.1409 +INIT_XMM ssse3
  1.1410 +SUBPEL_VARIANCE  8
  1.1411 +SUBPEL_VARIANCE 16
  1.1412 +  ; Second argument 1 enables the "%2 == 1 ; avg" blocks (pavgb with [secq], secq advanced by sec_str).
  1.1413 +INIT_MMX sse
  1.1414 +SUBPEL_VARIANCE  4, 1
  1.1415 +INIT_XMM sse2
  1.1416 +SUBPEL_VARIANCE  8, 1
  1.1417 +SUBPEL_VARIANCE 16, 1
  1.1418 +  ; avg variants with cpuflag(ssse3) set.
  1.1419 +INIT_MMX ssse3
  1.1420 +SUBPEL_VARIANCE  4, 1
  1.1421 +INIT_XMM ssse3
  1.1422 +SUBPEL_VARIANCE  8, 1
  1.1423 +SUBPEL_VARIANCE 16, 1

mercurial