media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "third_party/x86inc/x86inc.asm"
    13 SECTION_RODATA
; Rounding constant for the final >>4 shift of the bilinear filter:
; out = (a*(16-k) + b*k + 8) >> 4.
    14 pw_8: times  8 dw  8
; SSE2 bilinear coefficient table: 16 entries for sub-pel offsets k=0..15.
; Each entry is two 16-byte rows of words, (16-k) then (k), so each
; coefficient pair sums to 16 (unity gain). Entry stride is 32 bytes,
; which matches filter_idx_shift == 5 used to index this table.
    15 bilin_filter_m_sse2: times  8 dw 16
    16                      times  8 dw  0
    17                      times  8 dw 15
    18                      times  8 dw  1
    19                      times  8 dw 14
    20                      times  8 dw  2
    21                      times  8 dw 13
    22                      times  8 dw  3
    23                      times  8 dw 12
    24                      times  8 dw  4
    25                      times  8 dw 11
    26                      times  8 dw  5
    27                      times  8 dw 10
    28                      times  8 dw  6
    29                      times  8 dw  9
    30                      times  8 dw  7
; k == 8 (half-pel): both coefficients are 8, stored as one 32-byte run.
    31                      times 16 dw  8
    32                      times  8 dw  7
    33                      times  8 dw  9
    34                      times  8 dw  6
    35                      times  8 dw 10
    36                      times  8 dw  5
    37                      times  8 dw 11
    38                      times  8 dw  4
    39                      times  8 dw 12
    40                      times  8 dw  3
    41                      times  8 dw 13
    42                      times  8 dw  2
    43                      times  8 dw 14
    44                      times  8 dw  1
    45                      times  8 dw 15
; SSSE3 variant: the same (16-k, k) pairs interleaved as bytes so a single
; pmaddubsw applies both taps at once. Entry stride is 16 bytes, matching
; filter_idx_shift == 4.
    47 bilin_filter_m_ssse3: times  8 db 16,  0
    48                       times  8 db 15,  1
    49                       times  8 db 14,  2
    50                       times  8 db 13,  3
    51                       times  8 db 12,  4
    52                       times  8 db 11,  5
    53                       times  8 db 10,  6
    54                       times  8 db  9,  7
; k == 8 (half-pel): 8,8 interleaved collapses to 16 identical bytes.
    55                       times 16 db  8
    56                       times  8 db  7,  9
    57                       times  8 db  6, 10
    58                       times  8 db  5, 11
    59                       times  8 db  4, 12
    60                       times  8 db  3, 13
    61                       times  8 db  2, 14
    62                       times  8 db  1, 15
    64 SECTION .text
    66 ; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
    67 ;                               int x_offset, int y_offset,
    68 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
    69 ;                               int height, unsigned int *sse);
    70 ;
    71 ; This function returns the SE and stores SSE in the given pointer.
; Accumulate sum and sum-of-squares of pixel differences for two row halves.
;   %1,%3: source words (clobbered);  %2,%4: reference/dst words
;   %5: running sum accumulator (packed words)
;   %6: running SSE accumulator (packed dwords)
; Computes d = src - dst for both halves, adds d into %5, and adds d*d
; (via pmaddwd, which pairs adjacent word products into dwords) into %6.
; Note: %1..%4 are destroyed; callers reload them each iteration.
    73 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
    74   psubw                %3, %4
    75   psubw                %1, %2
    76   paddw                %5, %3
    77   pmaddwd              %3, %3
    78   paddw                %5, %1
    79   pmaddwd              %1, %1
    80   paddd                %6, %3
    81   paddd                %6, %1
    82 %endmacro
; Epilogue shared by all variance loops: horizontally reduce the packed
; accumulators (m6 = sum of differences as signed words, m7 = SSE as dwords),
; store SSE through the sse pointer argument, and return the sum in rax.
; Relies on m5 still being the caller's dedicated zero register so that
; pcmpgtw m5, m6 yields the sign mask of m6 for word->dword sign extension.
; Clobbers m3/m4/m5, r1 and rax; ends with RET.
    84 %macro STORE_AND_RET 0
    85 %if mmsize == 16
    86   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
    87   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
    88   ; We have to sign-extend it before adding the words within the register
    89   ; and outputing to a dword.
    90   pcmpgtw              m5, m6           ; mask for 0 > x
    91   movhlps              m3, m7
    92   punpcklwd            m4, m6, m5
    93   punpckhwd            m6, m5           ; sign-extend m6 word->dword
    94   paddd                m7, m3
    95   paddd                m6, m4
    96   pshufd               m3, m7, 0x1
    97   movhlps              m4, m6
    98   paddd                m7, m3
    99   paddd                m6, m4
   100   mov                  r1, ssem         ; r1 = unsigned int *sse
   101   pshufd               m4, m6, 0x1
   102   movd               [r1], m7           ; store sse
   103   paddd                m6, m4
   104   movd                rax, m6           ; store sum as return value
   105 %else ; mmsize == 8
; MMX path: fold the upper half of each accumulator onto the lower half
; with pshufw, then sign-extend and finish the reduction in 32 bits.
   106   pshufw               m4, m6, 0xe
   107   pshufw               m3, m7, 0xe
   108   paddw                m6, m4
   109   paddd                m7, m3
   110   pcmpgtw              m5, m6           ; mask for 0 > x
   111   mov                  r1, ssem         ; r1 = unsigned int *sse
   112   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   113   movd               [r1], m7           ; store sse
   114   pshufw               m4, m6, 0xe
   115   paddd                m6, m4
   116   movd                rax, m6           ; store sum as return value
   117 %endif
   118   RET
   119 %endmacro
; Advance the source pointer by one row (srcq += src_stride).
; On x86-32 PIC builds the stride is read from its stack slot
; (src_stridemp) — presumably because registers are exhausted in that
; configuration; elsewhere the stride lives in a register (src_strideq).
   121 %macro INC_SRC_BY_SRC_STRIDE  0
   122 %if ARCH_X86=1 && CONFIG_PIC=1
   123   add                srcq, src_stridemp
   124 %else
   125   add                srcq, src_strideq
   126 %endif
   127 %endmacro
   129 %macro SUBPEL_VARIANCE 1-2 0 ; W
   130 %if cpuflag(ssse3)
   131 %define bilin_filter_m bilin_filter_m_ssse3
   132 %define filter_idx_shift 4
   133 %else
   134 %define bilin_filter_m bilin_filter_m_sse2
   135 %define filter_idx_shift 5
   136 %endif
   137 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
   138 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
   139 ; difference on Win64
   141 %ifdef PIC    ; 64bit PIC
   142   %if %2 == 1 ; avg
   143     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
   144                                       x_offset, y_offset, \
   145                                       dst, dst_stride, \
   146                                       sec, sec_stride, height, sse
   147     %define sec_str sec_strideq
   148   %else
   149     cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
   150                                   y_offset, dst, dst_stride, height, sse
   151   %endif
   152   %define h heightd
   153   %define bilin_filter sseq
   154 %else
   155   %if ARCH_X86=1 && CONFIG_PIC=1
   156     %if %2 == 1 ; avg
   157       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
   158                                   x_offset, y_offset, \
   159                                   dst, dst_stride, \
   160                                   sec, sec_stride, \
   161                                   height, sse, g_bilin_filter, g_pw_8
   162       %define h dword heightm
   163       %define sec_str sec_stridemp
   165       ;Store bilin_filter and pw_8 location in stack
   166       GET_GOT eax
   167       add esp, 4                ; restore esp
   169       lea ecx, [GLOBAL(bilin_filter_m)]
   170       mov g_bilin_filterm, ecx
   172       lea ecx, [GLOBAL(pw_8)]
   173       mov g_pw_8m, ecx
   175       LOAD_IF_USED 0, 1         ; load eax, ecx back
   176     %else
   177       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
   178                                 y_offset, dst, dst_stride, height, sse, \
   179                                 g_bilin_filter, g_pw_8
   180       %define h heightd
   182       ;Store bilin_filter and pw_8 location in stack
   183       GET_GOT eax
   184       add esp, 4                ; restore esp
   186       lea ecx, [GLOBAL(bilin_filter_m)]
   187       mov g_bilin_filterm, ecx
   189       lea ecx, [GLOBAL(pw_8)]
   190       mov g_pw_8m, ecx
   192       LOAD_IF_USED 0, 1         ; load eax, ecx back
   193     %endif
   194   %else
   195     %if %2 == 1 ; avg
   196       cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
   197                         7 + 2 * ARCH_X86_64, 13, src, src_stride, \
   198                                              x_offset, y_offset, \
   199                                              dst, dst_stride, \
   200                                              sec, sec_stride, \
   201                                              height, sse
   202       %if ARCH_X86_64
   203       %define h heightd
   204       %define sec_str sec_strideq
   205       %else
   206       %define h dword heightm
   207       %define sec_str sec_stridemp
   208       %endif
   209     %else
   210       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
   211                               y_offset, dst, dst_stride, height, sse
   212       %define h heightd
   213     %endif
   215     %define bilin_filter bilin_filter_m
   216   %endif
   217 %endif
   219   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   220   pxor                 m6, m6           ; sum
   221   pxor                 m7, m7           ; sse
   222   ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
   223   ; could perhaps use it for something more productive then
   224   pxor                 m5, m5           ; dedicated zero register
   225 %if %1 < 16
   226   sar                   h, 1
   227 %if %2 == 1 ; avg
   228   shl             sec_str, 1
   229 %endif
   230 %endif
   232   ; FIXME(rbultje) replace by jumptable?
   233   test          x_offsetd, x_offsetd
   234   jnz .x_nonzero
   235   ; x_offset == 0
   236   test          y_offsetd, y_offsetd
   237   jnz .x_zero_y_nonzero
   239   ; x_offset == 0 && y_offset == 0
   240 .x_zero_y_zero_loop:
   241 %if %1 == 16
   242   movu                 m0, [srcq]
   243   mova                 m1, [dstq]
   244 %if %2 == 1 ; avg
   245   pavgb                m0, [secq]
   246   punpckhbw            m3, m1, m5
   247   punpcklbw            m1, m5
   248 %endif
   249   punpckhbw            m2, m0, m5
   250   punpcklbw            m0, m5
   251 %if %2 == 0 ; !avg
   252   punpckhbw            m3, m1, m5
   253   punpcklbw            m1, m5
   254 %endif
   255   SUM_SSE              m0, m1, m2, m3, m6, m7
   257   add                srcq, src_strideq
   258   add                dstq, dst_strideq
   259 %else ; %1 < 16
   260   movh                 m0, [srcq]
   261 %if %2 == 1 ; avg
   262 %if mmsize == 16
   263   movhps               m0, [srcq+src_strideq]
   264 %else ; mmsize == 8
   265   punpckldq            m0, [srcq+src_strideq]
   266 %endif
   267 %else ; !avg
   268   movh                 m2, [srcq+src_strideq]
   269 %endif
   270   movh                 m1, [dstq]
   271   movh                 m3, [dstq+dst_strideq]
   272 %if %2 == 1 ; avg
   273   pavgb                m0, [secq]
   274   punpcklbw            m3, m5
   275   punpcklbw            m1, m5
   276   punpckhbw            m2, m0, m5
   277   punpcklbw            m0, m5
   278 %else ; !avg
   279   punpcklbw            m0, m5
   280   punpcklbw            m2, m5
   281   punpcklbw            m3, m5
   282   punpcklbw            m1, m5
   283 %endif
   284   SUM_SSE              m0, m1, m2, m3, m6, m7
   286   lea                srcq, [srcq+src_strideq*2]
   287   lea                dstq, [dstq+dst_strideq*2]
   288 %endif
   289 %if %2 == 1 ; avg
   290   add                secq, sec_str
   291 %endif
   292   dec                   h
   293   jg .x_zero_y_zero_loop
   294   STORE_AND_RET
   296 .x_zero_y_nonzero:
   297   cmp           y_offsetd, 8
   298   jne .x_zero_y_nonhalf
   300   ; x_offset == 0 && y_offset == 0.5
   301 .x_zero_y_half_loop:
   302 %if %1 == 16
   303   movu                 m0, [srcq]
   304   movu                 m4, [srcq+src_strideq]
   305   mova                 m1, [dstq]
   306   pavgb                m0, m4
   307   punpckhbw            m3, m1, m5
   308 %if %2 == 1 ; avg
   309   pavgb                m0, [secq]
   310 %endif
   311   punpcklbw            m1, m5
   312   punpckhbw            m2, m0, m5
   313   punpcklbw            m0, m5
   314   SUM_SSE              m0, m1, m2, m3, m6, m7
   316   add                srcq, src_strideq
   317   add                dstq, dst_strideq
   318 %else ; %1 < 16
   319   movh                 m0, [srcq]
   320   movh                 m2, [srcq+src_strideq]
   321 %if %2 == 1 ; avg
   322 %if mmsize == 16
   323   movhps               m2, [srcq+src_strideq*2]
   324 %else ; mmsize == 8
   325 %if %1 == 4
   326   movh                 m1, [srcq+src_strideq*2]
   327   punpckldq            m2, m1
   328 %else
   329   punpckldq            m2, [srcq+src_strideq*2]
   330 %endif
   331 %endif
   332   movh                 m1, [dstq]
   333 %if mmsize == 16
   334   movlhps              m0, m2
   335 %else ; mmsize == 8
   336   punpckldq            m0, m2
   337 %endif
   338   movh                 m3, [dstq+dst_strideq]
   339   pavgb                m0, m2
   340   punpcklbw            m1, m5
   341   pavgb                m0, [secq]
   342   punpcklbw            m3, m5
   343   punpckhbw            m2, m0, m5
   344   punpcklbw            m0, m5
   345 %else ; !avg
   346   movh                 m4, [srcq+src_strideq*2]
   347   movh                 m1, [dstq]
   348   pavgb                m0, m2
   349   movh                 m3, [dstq+dst_strideq]
   350   pavgb                m2, m4
   351   punpcklbw            m0, m5
   352   punpcklbw            m2, m5
   353   punpcklbw            m3, m5
   354   punpcklbw            m1, m5
   355 %endif
   356   SUM_SSE              m0, m1, m2, m3, m6, m7
   358   lea                srcq, [srcq+src_strideq*2]
   359   lea                dstq, [dstq+dst_strideq*2]
   360 %endif
   361 %if %2 == 1 ; avg
   362   add                secq, sec_str
   363 %endif
   364   dec                   h
   365   jg .x_zero_y_half_loop
   366   STORE_AND_RET
   368 .x_zero_y_nonhalf:
   369   ; x_offset == 0 && y_offset == bilin interpolation
   370 %ifdef PIC
   371   lea        bilin_filter, [bilin_filter_m]
   372 %endif
   373   shl           y_offsetd, filter_idx_shift
   374 %if ARCH_X86_64 && mmsize == 16
   375   mova                 m8, [bilin_filter+y_offsetq]
   376 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   377   mova                 m9, [bilin_filter+y_offsetq+16]
   378 %endif
   379   mova                m10, [pw_8]
   380 %define filter_y_a m8
   381 %define filter_y_b m9
   382 %define filter_rnd m10
   383 %else ; x86-32 or mmx
   384 %if ARCH_X86=1 && CONFIG_PIC=1
   385 ; x_offset == 0, reuse x_offset reg
   386 %define tempq x_offsetq
   387   add y_offsetq, g_bilin_filterm
   388 %define filter_y_a [y_offsetq]
   389 %define filter_y_b [y_offsetq+16]
   390   mov tempq, g_pw_8m
   391 %define filter_rnd [tempq]
   392 %else
   393   add           y_offsetq, bilin_filter
   394 %define filter_y_a [y_offsetq]
   395 %define filter_y_b [y_offsetq+16]
   396 %define filter_rnd [pw_8]
   397 %endif
   398 %endif
   400 .x_zero_y_other_loop:
   401 %if %1 == 16
   402   movu                 m0, [srcq]
   403   movu                 m4, [srcq+src_strideq]
   404   mova                 m1, [dstq]
   405 %if cpuflag(ssse3)
   406   punpckhbw            m2, m0, m4
   407   punpcklbw            m0, m4
   408   pmaddubsw            m2, filter_y_a
   409   pmaddubsw            m0, filter_y_a
   410   paddw                m2, filter_rnd
   411   paddw                m0, filter_rnd
   412 %else
   413   punpckhbw            m2, m0, m5
   414   punpckhbw            m3, m4, m5
   415   punpcklbw            m0, m5
   416   punpcklbw            m4, m5
   417   ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
   418   ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
   419   ; instructions is the same (5), but it is 1 mul instead of 2, so might be
   420   ; slightly faster because of pmullw latency. It would also cut our rodata
   421   ; tables in half for this function, and save 1-2 registers on x86-64.
   422   pmullw               m2, filter_y_a
   423   pmullw               m3, filter_y_b
   424   paddw                m2, filter_rnd
   425   pmullw               m0, filter_y_a
   426   pmullw               m4, filter_y_b
   427   paddw                m0, filter_rnd
   428   paddw                m2, m3
   429   paddw                m0, m4
   430 %endif
   431   psraw                m2, 4
   432   psraw                m0, 4
   433 %if %2 == 1 ; avg
   434   ; FIXME(rbultje) pipeline
   435   packuswb             m0, m2
   436   pavgb                m0, [secq]
   437   punpckhbw            m2, m0, m5
   438   punpcklbw            m0, m5
   439 %endif
   440   punpckhbw            m3, m1, m5
   441   punpcklbw            m1, m5
   442   SUM_SSE              m0, m1, m2, m3, m6, m7
   444   add                srcq, src_strideq
   445   add                dstq, dst_strideq
   446 %else ; %1 < 16
   447   movh                 m0, [srcq]
   448   movh                 m2, [srcq+src_strideq]
   449   movh                 m4, [srcq+src_strideq*2]
   450   movh                 m3, [dstq+dst_strideq]
   451 %if cpuflag(ssse3)
   452   movh                 m1, [dstq]
   453   punpcklbw            m0, m2
   454   punpcklbw            m2, m4
   455   pmaddubsw            m0, filter_y_a
   456   pmaddubsw            m2, filter_y_a
   457   punpcklbw            m3, m5
   458   paddw                m2, filter_rnd
   459   paddw                m0, filter_rnd
   460 %else
   461   punpcklbw            m0, m5
   462   punpcklbw            m2, m5
   463   punpcklbw            m4, m5
   464   pmullw               m0, filter_y_a
   465   pmullw               m1, m2, filter_y_b
   466   punpcklbw            m3, m5
   467   paddw                m0, filter_rnd
   468   pmullw               m2, filter_y_a
   469   pmullw               m4, filter_y_b
   470   paddw                m0, m1
   471   paddw                m2, filter_rnd
   472   movh                 m1, [dstq]
   473   paddw                m2, m4
   474 %endif
   475   psraw                m0, 4
   476   psraw                m2, 4
   477 %if %2 == 1 ; avg
   478   ; FIXME(rbultje) pipeline
   479   packuswb             m0, m2
   480   pavgb                m0, [secq]
   481   punpckhbw            m2, m0, m5
   482   punpcklbw            m0, m5
   483 %endif
   484   punpcklbw            m1, m5
   485   SUM_SSE              m0, m1, m2, m3, m6, m7
   487   lea                srcq, [srcq+src_strideq*2]
   488   lea                dstq, [dstq+dst_strideq*2]
   489 %endif
   490 %if %2 == 1 ; avg
   491   add                secq, sec_str
   492 %endif
   493   dec                   h
   494   jg .x_zero_y_other_loop
   495 %undef filter_y_a
   496 %undef filter_y_b
   497 %undef filter_rnd
   498   STORE_AND_RET
   500 .x_nonzero:
   501   cmp           x_offsetd, 8
   502   jne .x_nonhalf
   503   ; x_offset == 0.5
   504   test          y_offsetd, y_offsetd
   505   jnz .x_half_y_nonzero
   507   ; x_offset == 0.5 && y_offset == 0
   508 .x_half_y_zero_loop:
   509 %if %1 == 16
   510   movu                 m0, [srcq]
   511   movu                 m4, [srcq+1]
   512   mova                 m1, [dstq]
   513   pavgb                m0, m4
   514   punpckhbw            m3, m1, m5
   515 %if %2 == 1 ; avg
   516   pavgb                m0, [secq]
   517 %endif
   518   punpcklbw            m1, m5
   519   punpckhbw            m2, m0, m5
   520   punpcklbw            m0, m5
   521   SUM_SSE              m0, m1, m2, m3, m6, m7
   523   add                srcq, src_strideq
   524   add                dstq, dst_strideq
   525 %else ; %1 < 16
   526   movh                 m0, [srcq]
   527   movh                 m4, [srcq+1]
   528 %if %2 == 1 ; avg
   529 %if mmsize == 16
   530   movhps               m0, [srcq+src_strideq]
   531   movhps               m4, [srcq+src_strideq+1]
   532 %else ; mmsize == 8
   533   punpckldq            m0, [srcq+src_strideq]
   534   punpckldq            m4, [srcq+src_strideq+1]
   535 %endif
   536   movh                 m1, [dstq]
   537   movh                 m3, [dstq+dst_strideq]
   538   pavgb                m0, m4
   539   punpcklbw            m3, m5
   540   pavgb                m0, [secq]
   541   punpcklbw            m1, m5
   542   punpckhbw            m2, m0, m5
   543   punpcklbw            m0, m5
   544 %else ; !avg
   545   movh                 m2, [srcq+src_strideq]
   546   movh                 m1, [dstq]
   547   pavgb                m0, m4
   548   movh                 m4, [srcq+src_strideq+1]
   549   movh                 m3, [dstq+dst_strideq]
   550   pavgb                m2, m4
   551   punpcklbw            m0, m5
   552   punpcklbw            m2, m5
   553   punpcklbw            m3, m5
   554   punpcklbw            m1, m5
   555 %endif
   556   SUM_SSE              m0, m1, m2, m3, m6, m7
   558   lea                srcq, [srcq+src_strideq*2]
   559   lea                dstq, [dstq+dst_strideq*2]
   560 %endif
   561 %if %2 == 1 ; avg
   562   add                secq, sec_str
   563 %endif
   564   dec                   h
   565   jg .x_half_y_zero_loop
   566   STORE_AND_RET
   568 .x_half_y_nonzero:
   569   cmp           y_offsetd, 8
   570   jne .x_half_y_nonhalf
   572   ; x_offset == 0.5 && y_offset == 0.5
   573 %if %1 == 16
   574   movu                 m0, [srcq]
   575   movu                 m3, [srcq+1]
   576   add                srcq, src_strideq
   577   pavgb                m0, m3
   578 .x_half_y_half_loop:
   579   movu                 m4, [srcq]
   580   movu                 m3, [srcq+1]
   581   mova                 m1, [dstq]
   582   pavgb                m4, m3
   583   punpckhbw            m3, m1, m5
   584   pavgb                m0, m4
   585 %if %2 == 1 ; avg
   586   punpcklbw            m1, m5
   587   pavgb                m0, [secq]
   588   punpckhbw            m2, m0, m5
   589   punpcklbw            m0, m5
   590 %else
   591   punpckhbw            m2, m0, m5
   592   punpcklbw            m0, m5
   593   punpcklbw            m1, m5
   594 %endif
   595   SUM_SSE              m0, m1, m2, m3, m6, m7
   596   mova                 m0, m4
   598   add                srcq, src_strideq
   599   add                dstq, dst_strideq
   600 %else ; %1 < 16
   601   movh                 m0, [srcq]
   602   movh                 m3, [srcq+1]
   603   add                srcq, src_strideq
   604   pavgb                m0, m3
   605 .x_half_y_half_loop:
   606   movh                 m2, [srcq]
   607   movh                 m3, [srcq+1]
   608 %if %2 == 1 ; avg
   609 %if mmsize == 16
   610   movhps               m2, [srcq+src_strideq]
   611   movhps               m3, [srcq+src_strideq+1]
   612 %else
   613 %if %1 == 4
   614   movh                 m1, [srcq+src_strideq]
   615   punpckldq            m2, m1
   616   movh                 m1, [srcq+src_strideq+1]
   617   punpckldq            m3, m1
   618 %else
   619   punpckldq            m2, [srcq+src_strideq]
   620   punpckldq            m3, [srcq+src_strideq+1]
   621 %endif
   622 %endif
   623   pavgb                m2, m3
   624 %if mmsize == 16
   625   movlhps              m0, m2
   626   movhlps              m4, m2
   627 %else ; mmsize == 8
   628   punpckldq            m0, m2
   629   pshufw               m4, m2, 0xe
   630 %endif
   631   movh                 m1, [dstq]
   632   pavgb                m0, m2
   633   movh                 m3, [dstq+dst_strideq]
   634   pavgb                m0, [secq]
   635   punpcklbw            m3, m5
   636   punpcklbw            m1, m5
   637   punpckhbw            m2, m0, m5
   638   punpcklbw            m0, m5
   639 %else ; !avg
   640   movh                 m4, [srcq+src_strideq]
   641   movh                 m1, [srcq+src_strideq+1]
   642   pavgb                m2, m3
   643   pavgb                m4, m1
   644   pavgb                m0, m2
   645   pavgb                m2, m4
   646   movh                 m1, [dstq]
   647   movh                 m3, [dstq+dst_strideq]
   648   punpcklbw            m0, m5
   649   punpcklbw            m2, m5
   650   punpcklbw            m3, m5
   651   punpcklbw            m1, m5
   652 %endif
   653   SUM_SSE              m0, m1, m2, m3, m6, m7
   654   mova                 m0, m4
   656   lea                srcq, [srcq+src_strideq*2]
   657   lea                dstq, [dstq+dst_strideq*2]
   658 %endif
   659 %if %2 == 1 ; avg
   660   add                secq, sec_str
   661 %endif
   662   dec                   h
   663   jg .x_half_y_half_loop
   664   STORE_AND_RET
   666 .x_half_y_nonhalf:
   667   ; x_offset == 0.5 && y_offset == bilin interpolation
   668 %ifdef PIC
   669   lea        bilin_filter, [bilin_filter_m]
   670 %endif
   671   shl           y_offsetd, filter_idx_shift
   672 %if ARCH_X86_64 && mmsize == 16
   673   mova                 m8, [bilin_filter+y_offsetq]
   674 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   675   mova                 m9, [bilin_filter+y_offsetq+16]
   676 %endif
   677   mova                m10, [pw_8]
   678 %define filter_y_a m8
   679 %define filter_y_b m9
   680 %define filter_rnd m10
   681 %else  ;x86_32
   682 %if ARCH_X86=1 && CONFIG_PIC=1
   683 ; x_offset == 0.5. We can reuse x_offset reg
   684 %define tempq x_offsetq
   685   add y_offsetq, g_bilin_filterm
   686 %define filter_y_a [y_offsetq]
   687 %define filter_y_b [y_offsetq+16]
   688   mov tempq, g_pw_8m
   689 %define filter_rnd [tempq]
   690 %else
   691   add           y_offsetq, bilin_filter
   692 %define filter_y_a [y_offsetq]
   693 %define filter_y_b [y_offsetq+16]
   694 %define filter_rnd [pw_8]
   695 %endif
   696 %endif
   698 %if %1 == 16
   699   movu                 m0, [srcq]
   700   movu                 m3, [srcq+1]
   701   add                srcq, src_strideq
   702   pavgb                m0, m3
   703 .x_half_y_other_loop:
   704   movu                 m4, [srcq]
   705   movu                 m2, [srcq+1]
   706   mova                 m1, [dstq]
   707   pavgb                m4, m2
   708 %if cpuflag(ssse3)
   709   punpckhbw            m2, m0, m4
   710   punpcklbw            m0, m4
   711   pmaddubsw            m2, filter_y_a
   712   pmaddubsw            m0, filter_y_a
   713   paddw                m2, filter_rnd
   714   paddw                m0, filter_rnd
   715   psraw                m2, 4
   716 %else
   717   punpckhbw            m2, m0, m5
   718   punpckhbw            m3, m4, m5
   719   pmullw               m2, filter_y_a
   720   pmullw               m3, filter_y_b
   721   paddw                m2, filter_rnd
   722   punpcklbw            m0, m5
   723   paddw                m2, m3
   724   punpcklbw            m3, m4, m5
   725   pmullw               m0, filter_y_a
   726   pmullw               m3, filter_y_b
   727   paddw                m0, filter_rnd
   728   psraw                m2, 4
   729   paddw                m0, m3
   730 %endif
   731   punpckhbw            m3, m1, m5
   732   psraw                m0, 4
   733 %if %2 == 1 ; avg
   734   ; FIXME(rbultje) pipeline
   735   packuswb             m0, m2
   736   pavgb                m0, [secq]
   737   punpckhbw            m2, m0, m5
   738   punpcklbw            m0, m5
   739 %endif
   740   punpcklbw            m1, m5
   741   SUM_SSE              m0, m1, m2, m3, m6, m7
   742   mova                 m0, m4
   744   add                srcq, src_strideq
   745   add                dstq, dst_strideq
   746 %else ; %1 < 16
   747   movh                 m0, [srcq]
   748   movh                 m3, [srcq+1]
   749   add                srcq, src_strideq
   750   pavgb                m0, m3
   751 %if notcpuflag(ssse3)
   752   punpcklbw            m0, m5
   753 %endif
   754 .x_half_y_other_loop:
   755   movh                 m2, [srcq]
   756   movh                 m1, [srcq+1]
   757   movh                 m4, [srcq+src_strideq]
   758   movh                 m3, [srcq+src_strideq+1]
   759   pavgb                m2, m1
   760   pavgb                m4, m3
   761   movh                 m3, [dstq+dst_strideq]
   762 %if cpuflag(ssse3)
   763   movh                 m1, [dstq]
   764   punpcklbw            m0, m2
   765   punpcklbw            m2, m4
   766   pmaddubsw            m0, filter_y_a
   767   pmaddubsw            m2, filter_y_a
   768   punpcklbw            m3, m5
   769   paddw                m0, filter_rnd
   770   paddw                m2, filter_rnd
   771 %else
   772   punpcklbw            m2, m5
   773   punpcklbw            m4, m5
   774   pmullw               m0, filter_y_a
   775   pmullw               m1, m2, filter_y_b
   776   punpcklbw            m3, m5
   777   paddw                m0, filter_rnd
   778   pmullw               m2, filter_y_a
   779   paddw                m0, m1
   780   pmullw               m1, m4, filter_y_b
   781   paddw                m2, filter_rnd
   782   paddw                m2, m1
   783   movh                 m1, [dstq]
   784 %endif
   785   psraw                m0, 4
   786   psraw                m2, 4
   787 %if %2 == 1 ; avg
   788   ; FIXME(rbultje) pipeline
   789   packuswb             m0, m2
   790   pavgb                m0, [secq]
   791   punpckhbw            m2, m0, m5
   792   punpcklbw            m0, m5
   793 %endif
   794   punpcklbw            m1, m5
   795   SUM_SSE              m0, m1, m2, m3, m6, m7
   796   mova                 m0, m4
   798   lea                srcq, [srcq+src_strideq*2]
   799   lea                dstq, [dstq+dst_strideq*2]
   800 %endif
   801 %if %2 == 1 ; avg
   802   add                secq, sec_str
   803 %endif
   804   dec                   h
   805   jg .x_half_y_other_loop
   806 %undef filter_y_a
   807 %undef filter_y_b
   808 %undef filter_rnd
   809   STORE_AND_RET
   811 .x_nonhalf:
   812   test          y_offsetd, y_offsetd
   813   jnz .x_nonhalf_y_nonzero
   815   ; x_offset == bilin interpolation && y_offset == 0
   816 %ifdef PIC
   817   lea        bilin_filter, [bilin_filter_m]
   818 %endif
   819   shl           x_offsetd, filter_idx_shift
   820 %if ARCH_X86_64 && mmsize == 16
   821   mova                 m8, [bilin_filter+x_offsetq]
   822 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   823   mova                 m9, [bilin_filter+x_offsetq+16]
   824 %endif
   825   mova                m10, [pw_8]
   826 %define filter_x_a m8
   827 %define filter_x_b m9
   828 %define filter_rnd m10
   829 %else    ; x86-32
   830 %if ARCH_X86=1 && CONFIG_PIC=1
   831 ;y_offset == 0. We can reuse y_offset reg.
   832 %define tempq y_offsetq
   833   add x_offsetq, g_bilin_filterm
   834 %define filter_x_a [x_offsetq]
   835 %define filter_x_b [x_offsetq+16]
   836   mov tempq, g_pw_8m
   837 %define filter_rnd [tempq]
   838 %else
   839   add           x_offsetq, bilin_filter
   840 %define filter_x_a [x_offsetq]
   841 %define filter_x_b [x_offsetq+16]
   842 %define filter_rnd [pw_8]
   843 %endif
   844 %endif
; Per-row loop: horizontal bilinear filter only (y_offset == 0). Each row is
; filtered, >>4, then fed to SUM_SSE which accumulates sum/SSE vs. [dstq]
; (accumulators appear to live in m6/m7; m5 is assumed zero here, set up
; earlier in the macro -- confirm against the macro prologue).
   846 .x_other_y_zero_loop:
   847 %if %1 == 16
; 16-wide: process a whole row per iteration. [srcq] and [srcq+1] are the
; two horizontal taps.
   848   movu                 m0, [srcq]
   849   movu                 m4, [srcq+1]
   850   mova                 m1, [dstq]
   851 %if cpuflag(ssse3)
; ssse3: interleave the two taps and use one pmaddubsw per half.
   852   punpckhbw            m2, m0, m4
   853   punpcklbw            m0, m4
   854   pmaddubsw            m2, filter_x_a
   855   pmaddubsw            m0, filter_x_a
   856   paddw                m2, filter_rnd
   857   paddw                m0, filter_rnd
   858 %else
; sse2: widen bytes to words (unpack with zero in m5), then two pmullw+add
; per half: a*p0 + b*p1 + 8.
   859   punpckhbw            m2, m0, m5
   860   punpckhbw            m3, m4, m5
   861   punpcklbw            m0, m5
   862   punpcklbw            m4, m5
   863   pmullw               m2, filter_x_a
   864   pmullw               m3, filter_x_b
   865   paddw                m2, filter_rnd
   866   pmullw               m0, filter_x_a
   867   pmullw               m4, filter_x_b
   868   paddw                m0, filter_rnd
   869   paddw                m2, m3
   870   paddw                m0, m4
   871 %endif
   872   psraw                m2, 4
   873   psraw                m0, 4
; avg variant: round-trip through bytes to pavgb with the second predictor.
   874 %if %2 == 1 ; avg
   875   ; FIXME(rbultje) pipeline
   876   packuswb             m0, m2
   877   pavgb                m0, [secq]
   878   punpckhbw            m2, m0, m5
   879   punpcklbw            m0, m5
   880 %endif
   881   punpckhbw            m3, m1, m5
   882   punpcklbw            m1, m5
   883   SUM_SSE              m0, m1, m2, m3, m6, m7
   885   add                srcq, src_strideq
   886   add                dstq, dst_strideq
   887 %else ; %1 < 16
; Narrow blocks: process two rows per iteration (row in m0, row+stride in m2).
   888   movh                 m0, [srcq]
   889   movh                 m1, [srcq+1]
   890   movh                 m2, [srcq+src_strideq]
   891   movh                 m4, [srcq+src_strideq+1]
   892   movh                 m3, [dstq+dst_strideq]
   893 %if cpuflag(ssse3)
   894   punpcklbw            m0, m1
   895   movh                 m1, [dstq]
   896   punpcklbw            m2, m4
   897   pmaddubsw            m0, filter_x_a
   898   pmaddubsw            m2, filter_x_a
   899   punpcklbw            m3, m5
   900   paddw                m0, filter_rnd
   901   paddw                m2, filter_rnd
   902 %else
   903   punpcklbw            m0, m5
   904   punpcklbw            m1, m5
   905   punpcklbw            m2, m5
   906   punpcklbw            m4, m5
   907   pmullw               m0, filter_x_a
   908   pmullw               m1, filter_x_b
   909   punpcklbw            m3, m5
   910   paddw                m0, filter_rnd
   911   pmullw               m2, filter_x_a
   912   pmullw               m4, filter_x_b
   913   paddw                m0, m1
   914   paddw                m2, filter_rnd
   915   movh                 m1, [dstq]
   916   paddw                m2, m4
   917 %endif
   918   psraw                m0, 4
   919   psraw                m2, 4
   920 %if %2 == 1 ; avg
   921   ; FIXME(rbultje) pipeline
   922   packuswb             m0, m2
   923   pavgb                m0, [secq]
   924   punpckhbw            m2, m0, m5
   925   punpcklbw            m0, m5
   926 %endif
   927   punpcklbw            m1, m5
   928   SUM_SSE              m0, m1, m2, m3, m6, m7
; Two rows consumed per iteration in the narrow path.
   930   lea                srcq, [srcq+src_strideq*2]
   931   lea                dstq, [dstq+dst_strideq*2]
   932 %endif
   933 %if %2 == 1 ; avg
   934   add                secq, sec_str
   935 %endif
   936   dec                   h
   937   jg .x_other_y_zero_loop
   938 %undef filter_x_a
   939 %undef filter_x_b
   940 %undef filter_rnd
   941   STORE_AND_RET
; x filter is bilinear; decide whether y is the cheap half-pel case
; (y_offset == 8, i.e. 0.5 -> pavg) or a full bilinear y filter.
   943 .x_nonhalf_y_nonzero:
   944   cmp           y_offsetd, 8
   945   jne .x_nonhalf_y_nonhalf
   947   ; x_offset == bilin interpolation && y_offset == 0.5
   948 %ifdef PIC
   949   lea        bilin_filter, [bilin_filter_m]
   950 %endif
; Index the (16-a, a) coefficient pair for this x offset; rounding constant
; pw_8 pairs with the psraw 4 in the loop below.
   951   shl           x_offsetd, filter_idx_shift
   952 %if ARCH_X86_64 && mmsize == 16
; x86-64: keep filter taps and rounder in m8-m10 for the whole loop.
   953   mova                 m8, [bilin_filter+x_offsetq]
   954 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   955   mova                 m9, [bilin_filter+x_offsetq+16]
   956 %endif
   957   mova                m10, [pw_8]
   958 %define filter_x_a m8
   959 %define filter_x_b m9
   960 %define filter_rnd m10
   961 %else    ; x86-32
   962 %if ARCH_X86=1 && CONFIG_PIC=1
   963 ; y_offset == 0.5. We can reuse y_offset reg.
   964 %define tempq y_offsetq
   965   add x_offsetq, g_bilin_filterm
   966 %define filter_x_a [x_offsetq]
   967 %define filter_x_b [x_offsetq+16]
   968   mov tempq, g_pw_8m
   969 %define filter_rnd [tempq]
   970 %else
   971   add           x_offsetq, bilin_filter
   972 %define filter_x_a [x_offsetq]
   973 %define filter_x_b [x_offsetq+16]
   974 %define filter_rnd [pw_8]
   975 %endif
   976 %endif
; x_offset == bilin, y_offset == 0.5: x-filter each row, then average
; vertically adjacent filtered rows (pavgb/pavgw). The previous row's
; filtered result is carried across iterations in m0, so only one new row
; is filtered per output row.
   978 %if %1 == 16
; Prime the pipeline: filter row 0 into m0 (packed back to bytes below).
   979   movu                 m0, [srcq]
   980   movu                 m1, [srcq+1]
   981 %if cpuflag(ssse3)
   982   punpckhbw            m2, m0, m1
   983   punpcklbw            m0, m1
   984   pmaddubsw            m2, filter_x_a
   985   pmaddubsw            m0, filter_x_a
   986   paddw                m2, filter_rnd
   987   paddw                m0, filter_rnd
   988 %else
   989   punpckhbw            m2, m0, m5
   990   punpckhbw            m3, m1, m5
   991   punpcklbw            m0, m5
   992   punpcklbw            m1, m5
   993   pmullw               m0, filter_x_a
   994   pmullw               m1, filter_x_b
   995   paddw                m0, filter_rnd
   996   pmullw               m2, filter_x_a
   997   pmullw               m3, filter_x_b
   998   paddw                m2, filter_rnd
   999   paddw                m0, m1
  1000   paddw                m2, m3
  1001 %endif
  1002   psraw                m0, 4
  1003   psraw                m2, 4
  1004   add                srcq, src_strideq
  1005   packuswb             m0, m2
  1006 .x_other_y_half_loop:
; Filter the next row into m4, then pavgb with the carried row in m0.
  1007   movu                 m4, [srcq]
  1008   movu                 m3, [srcq+1]
  1009 %if cpuflag(ssse3)
  1010   mova                 m1, [dstq]
  1011   punpckhbw            m2, m4, m3
  1012   punpcklbw            m4, m3
  1013   pmaddubsw            m2, filter_x_a
  1014   pmaddubsw            m4, filter_x_a
  1015   paddw                m2, filter_rnd
  1016   paddw                m4, filter_rnd
  1017   psraw                m2, 4
  1018   psraw                m4, 4
  1019   packuswb             m4, m2
  1020   pavgb                m0, m4
  1021   punpckhbw            m3, m1, m5
  1022   punpcklbw            m1, m5
  1023 %else
  1024   punpckhbw            m2, m4, m5
  1025   punpckhbw            m1, m3, m5
  1026   punpcklbw            m4, m5
  1027   punpcklbw            m3, m5
  1028   pmullw               m4, filter_x_a
  1029   pmullw               m3, filter_x_b
  1030   paddw                m4, filter_rnd
  1031   pmullw               m2, filter_x_a
  1032   pmullw               m1, filter_x_b
  1033   paddw                m2, filter_rnd
  1034   paddw                m4, m3
  1035   paddw                m2, m1
  1036   mova                 m1, [dstq]
  1037   psraw                m4, 4
  1038   psraw                m2, 4
  1039   punpckhbw            m3, m1, m5
  1040   ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  1041   ; have a 1-register shortage to be able to store the backup of the bilin
  1042   ; filtered second line as words as cache for the next line. Packing into
  1043   ; a byte costs 1 pack and 2 unpacks, but saves a register.
  1044   packuswb             m4, m2
  1045   punpcklbw            m1, m5
  1046   pavgb                m0, m4
  1047 %endif
  1048 %if %2 == 1 ; avg
  1049   ; FIXME(rbultje) pipeline
  1050   pavgb                m0, [secq]
  1051 %endif
  1052   punpckhbw            m2, m0, m5
  1053   punpcklbw            m0, m5
  1054   SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry this iteration's filtered row (m4) into the next iteration's m0.
  1055   mova                 m0, m4
  1057   add                srcq, src_strideq
  1058   add                dstq, dst_strideq
  1059 %else ; %1 < 16
; Narrow path: prime m0 with the filtered first row (kept as words here).
  1060   movh                 m0, [srcq]
  1061   movh                 m1, [srcq+1]
  1062 %if cpuflag(ssse3)
  1063   punpcklbw            m0, m1
  1064   pmaddubsw            m0, filter_x_a
  1065   paddw                m0, filter_rnd
  1066 %else
  1067   punpcklbw            m0, m5
  1068   punpcklbw            m1, m5
  1069   pmullw               m0, filter_x_a
  1070   pmullw               m1, filter_x_b
  1071   paddw                m0, filter_rnd
  1072   paddw                m0, m1
  1073 %endif
  1074   add                srcq, src_strideq
  1075   psraw                m0, 4
.x_other_y_half_loop alternative follows (narrow variant, two rows/iter):
  1076 .x_other_y_half_loop:
; Filter rows N (m2) and N+1 (m4), then vertical half-pel averages:
; m0 = avg(prev, rowN), m2 = avg(rowN, rowN+1). pavgw on words keeps
; full precision for the two output rows of this iteration.
  1077   movh                 m2, [srcq]
  1078   movh                 m1, [srcq+1]
  1079   movh                 m4, [srcq+src_strideq]
  1080   movh                 m3, [srcq+src_strideq+1]
  1081 %if cpuflag(ssse3)
  1082   punpcklbw            m2, m1
  1083   punpcklbw            m4, m3
  1084   pmaddubsw            m2, filter_x_a
  1085   pmaddubsw            m4, filter_x_a
  1086   movh                 m1, [dstq]
  1087   movh                 m3, [dstq+dst_strideq]
  1088   paddw                m2, filter_rnd
  1089   paddw                m4, filter_rnd
  1090 %else
  1091   punpcklbw            m2, m5
  1092   punpcklbw            m1, m5
  1093   punpcklbw            m4, m5
  1094   punpcklbw            m3, m5
  1095   pmullw               m2, filter_x_a
  1096   pmullw               m1, filter_x_b
  1097   paddw                m2, filter_rnd
  1098   pmullw               m4, filter_x_a
  1099   pmullw               m3, filter_x_b
  1100   paddw                m4, filter_rnd
  1101   paddw                m2, m1
  1102   movh                 m1, [dstq]
  1103   paddw                m4, m3
  1104   movh                 m3, [dstq+dst_strideq]
  1105 %endif
  1106   psraw                m2, 4
  1107   psraw                m4, 4
; Order matters: m0 must consume m2 before m2 is overwritten with avg(m2,m4).
  1108   pavgw                m0, m2
  1109   pavgw                m2, m4
  1110 %if %2 == 1 ; avg
  1111   ; FIXME(rbultje) pipeline - also consider going to bytes here
  1112   packuswb             m0, m2
  1113   pavgb                m0, [secq]
  1114   punpckhbw            m2, m0, m5
  1115   punpcklbw            m0, m5
  1116 %endif
  1117   punpcklbw            m3, m5
  1118   punpcklbw            m1, m5
  1119   SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the second filtered row (m4) to seed the next iteration.
  1120   mova                 m0, m4
  1122   lea                srcq, [srcq+src_strideq*2]
  1123   lea                dstq, [dstq+dst_strideq*2]
  1124 %endif
  1125 %if %2 == 1 ; avg
  1126   add                secq, sec_str
  1127 %endif
  1128   dec                   h
  1129   jg .x_other_y_half_loop
  1130 %undef filter_x_a
  1131 %undef filter_x_b
  1132 %undef filter_rnd
  1133   STORE_AND_RET
; Full 2-D bilinear case: independent x and y filter pairs are loaded.
; This is the register-hungriest path (up to m8-m12 on x86-64).
  1135 .x_nonhalf_y_nonhalf:
  1136 %ifdef PIC
  1137   lea        bilin_filter, [bilin_filter_m]
  1138 %endif
  1139   shl           x_offsetd, filter_idx_shift
  1140   shl           y_offsetd, filter_idx_shift
  1141 %if ARCH_X86_64 && mmsize == 16
  1142   mova                 m8, [bilin_filter+x_offsetq]
  1143 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  1144   mova                 m9, [bilin_filter+x_offsetq+16]
  1145 %endif
  1146   mova                m10, [bilin_filter+y_offsetq]
  1147 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  1148   mova                m11, [bilin_filter+y_offsetq+16]
  1149 %endif
  1150   mova                m12, [pw_8]
  1151 %define filter_x_a m8
  1152 %define filter_x_b m9
  1153 %define filter_y_a m10
  1154 %define filter_y_b m11
  1155 %define filter_rnd m12
  1156 %else   ; x86-32
  1157 %if ARCH_X86=1 && CONFIG_PIC=1
  1158 ; In this case, there is NO unused register. Used src_stride register. Later,
  1159 ; src_stride has to be loaded from stack when it is needed.
  1160 %define tempq src_strideq
  1161   mov tempq, g_bilin_filterm
  1162   add           x_offsetq, tempq
  1163   add           y_offsetq, tempq
  1164 %define filter_x_a [x_offsetq]
  1165 %define filter_x_b [x_offsetq+16]
  1166 %define filter_y_a [y_offsetq]
  1167 %define filter_y_b [y_offsetq+16]
  1169   mov tempq, g_pw_8m
  1170 %define filter_rnd [tempq]
  1171 %else
  1172   add           x_offsetq, bilin_filter
  1173   add           y_offsetq, bilin_filter
  1174 %define filter_x_a [x_offsetq]
  1175 %define filter_x_b [x_offsetq+16]
  1176 %define filter_y_a [y_offsetq]
  1177 %define filter_y_b [y_offsetq+16]
  1178 %define filter_rnd [pw_8]
  1179 %endif
  1180 %endif
  1182   ; x_offset == bilin interpolation && y_offset == bilin interpolation
; Separable 2-D filter: each source row is x-filtered once and carried in m0
; across iterations, then the y filter blends the carried row with the newly
; filtered one. INC_SRC_BY_SRC_STRIDE is used instead of a plain add because
; src_strideq may be repurposed on x86-32/PIC (see setup above) -- the macro
; itself is defined outside this excerpt.
  1183 %if %1 == 16
; Prime: x-filter row 0 into m0 (packed to bytes for the ssse3 y-pass).
  1184   movu                 m0, [srcq]
  1185   movu                 m1, [srcq+1]
  1186 %if cpuflag(ssse3)
  1187   punpckhbw            m2, m0, m1
  1188   punpcklbw            m0, m1
  1189   pmaddubsw            m2, filter_x_a
  1190   pmaddubsw            m0, filter_x_a
  1191   paddw                m2, filter_rnd
  1192   paddw                m0, filter_rnd
  1193 %else
  1194   punpckhbw            m2, m0, m5
  1195   punpckhbw            m3, m1, m5
  1196   punpcklbw            m0, m5
  1197   punpcklbw            m1, m5
  1198   pmullw               m0, filter_x_a
  1199   pmullw               m1, filter_x_b
  1200   paddw                m0, filter_rnd
  1201   pmullw               m2, filter_x_a
  1202   pmullw               m3, filter_x_b
  1203   paddw                m2, filter_rnd
  1204   paddw                m0, m1
  1205   paddw                m2, m3
  1206 %endif
  1207   psraw                m0, 4
  1208   psraw                m2, 4
  1210   INC_SRC_BY_SRC_STRIDE
  1212   packuswb             m0, m2
  1213 .x_other_y_other_loop:
  1214 %if cpuflag(ssse3)
; x-filter the new row into m4 (bytes), then y-filter pairs (m0, m4) with
; pmaddubsw against the y coefficient pair.
  1215   movu                 m4, [srcq]
  1216   movu                 m3, [srcq+1]
  1217   mova                 m1, [dstq]
  1218   punpckhbw            m2, m4, m3
  1219   punpcklbw            m4, m3
  1220   pmaddubsw            m2, filter_x_a
  1221   pmaddubsw            m4, filter_x_a
  1222   punpckhbw            m3, m1, m5
  1223   paddw                m2, filter_rnd
  1224   paddw                m4, filter_rnd
  1225   psraw                m2, 4
  1226   psraw                m4, 4
  1227   packuswb             m4, m2
  1228   punpckhbw            m2, m0, m4
  1229   punpcklbw            m0, m4
  1230   pmaddubsw            m2, filter_y_a
  1231   pmaddubsw            m0, filter_y_a
  1232   punpcklbw            m1, m5
  1233   paddw                m2, filter_rnd
  1234   paddw                m0, filter_rnd
  1235   psraw                m2, 4
  1236   psraw                m0, 4
  1237 %else
; sse2: x-filter new row into m3(lo)/m1(hi) words, pack a byte copy into m4
; for the next iteration's carry, then y-filter against the carried m0.
  1238   movu                 m3, [srcq]
  1239   movu                 m4, [srcq+1]
  1240   punpckhbw            m1, m3, m5
  1241   punpckhbw            m2, m4, m5
  1242   punpcklbw            m3, m5
  1243   punpcklbw            m4, m5
  1244   pmullw               m3, filter_x_a
  1245   pmullw               m4, filter_x_b
  1246   paddw                m3, filter_rnd
  1247   pmullw               m1, filter_x_a
  1248   pmullw               m2, filter_x_b
  1249   paddw                m1, filter_rnd
  1250   paddw                m3, m4
  1251   paddw                m1, m2
  1252   psraw                m3, 4
  1253   psraw                m1, 4
  1254   packuswb             m4, m3, m1
  1255   punpckhbw            m2, m0, m5
  1256   punpcklbw            m0, m5
  1257   pmullw               m2, filter_y_a
  1258   pmullw               m1, filter_y_b
  1259   paddw                m2, filter_rnd
  1260   pmullw               m0, filter_y_a
  1261   pmullw               m3, filter_y_b
  1262   paddw                m2, m1
  1263   mova                 m1, [dstq]
  1264   paddw                m0, filter_rnd
  1265   psraw                m2, 4
  1266   paddw                m0, m3
  1267   punpckhbw            m3, m1, m5
  1268   psraw                m0, 4
  1269   punpcklbw            m1, m5
  1270 %endif
  1271 %if %2 == 1 ; avg
  1272   ; FIXME(rbultje) pipeline
  1273   packuswb             m0, m2
  1274   pavgb                m0, [secq]
  1275   punpckhbw            m2, m0, m5
  1276   punpcklbw            m0, m5
  1277 %endif
  1278   SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the x-filtered new row into m0 for the next y-filter step.
  1279   mova                 m0, m4
  1281   INC_SRC_BY_SRC_STRIDE
  1282   add                dstq, dst_strideq
  1283 %else ; %1 < 16
; Narrow path: prime m0 with the x-filtered first row.
  1284   movh                 m0, [srcq]
  1285   movh                 m1, [srcq+1]
  1286 %if cpuflag(ssse3)
  1287   punpcklbw            m0, m1
  1288   pmaddubsw            m0, filter_x_a
  1289   paddw                m0, filter_rnd
  1290 %else
  1291   punpcklbw            m0, m5
  1292   punpcklbw            m1, m5
  1293   pmullw               m0, filter_x_a
  1294   pmullw               m1, filter_x_b
  1295   paddw                m0, filter_rnd
  1296   paddw                m0, m1
  1297 %endif
  1298   psraw                m0, 4
  1299 %if cpuflag(ssse3)
  1300   packuswb             m0, m0
  1301 %endif
  1303   INC_SRC_BY_SRC_STRIDE
  1305 .x_other_y_other_loop:
; Two output rows per iteration: x-filter rows N (m2) and N+1 (m4), then
; y-blend (prev, rowN) -> m0 and (rowN, rowN+1) -> m2.
  1306   movh                 m2, [srcq]
  1307   movh                 m1, [srcq+1]
  1309   INC_SRC_BY_SRC_STRIDE
  1310   movh                 m4, [srcq]
  1311   movh                 m3, [srcq+1]
  1313 %if cpuflag(ssse3)
  1314   punpcklbw            m2, m1
  1315   punpcklbw            m4, m3
  1316   pmaddubsw            m2, filter_x_a
  1317   pmaddubsw            m4, filter_x_a
  1318   movh                 m3, [dstq+dst_strideq]
  1319   movh                 m1, [dstq]
  1320   paddw                m2, filter_rnd
  1321   paddw                m4, filter_rnd
  1322   psraw                m2, 4
  1323   psraw                m4, 4
  1324   packuswb             m2, m2
  1325   packuswb             m4, m4
  1326   punpcklbw            m0, m2
  1327   punpcklbw            m2, m4
  1328   pmaddubsw            m0, filter_y_a
  1329   pmaddubsw            m2, filter_y_a
  1330   punpcklbw            m3, m5
  1331   paddw                m0, filter_rnd
  1332   paddw                m2, filter_rnd
  1333   psraw                m0, 4
  1334   psraw                m2, 4
  1335   punpcklbw            m1, m5
  1336 %else
  1337   punpcklbw            m2, m5
  1338   punpcklbw            m1, m5
  1339   punpcklbw            m4, m5
  1340   punpcklbw            m3, m5
  1341   pmullw               m2, filter_x_a
  1342   pmullw               m1, filter_x_b
  1343   paddw                m2, filter_rnd
  1344   pmullw               m4, filter_x_a
  1345   pmullw               m3, filter_x_b
  1346   paddw                m4, filter_rnd
  1347   paddw                m2, m1
  1348   paddw                m4, m3
  1349   psraw                m2, 4
  1350   psraw                m4, 4
  1351   pmullw               m0, filter_y_a
  1352   pmullw               m3, m2, filter_y_b
  1353   paddw                m0, filter_rnd
  1354   pmullw               m2, filter_y_a
  1355   pmullw               m1, m4, filter_y_b
  1356   paddw                m2, filter_rnd
  1357   paddw                m0, m3
  1358   movh                 m3, [dstq+dst_strideq]
  1359   paddw                m2, m1
  1360   movh                 m1, [dstq]
  1361   psraw                m0, 4
  1362   psraw                m2, 4
  1363   punpcklbw            m3, m5
  1364   punpcklbw            m1, m5
  1365 %endif
  1366 %if %2 == 1 ; avg
  1367   ; FIXME(rbultje) pipeline
  1368   packuswb             m0, m2
  1369   pavgb                m0, [secq]
  1370   punpckhbw            m2, m0, m5
  1371   punpcklbw            m0, m5
  1372 %endif
  1373   SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the second new filtered row (m4) to seed the next iteration.
  1374   mova                 m0, m4
  1376   INC_SRC_BY_SRC_STRIDE
  1377   lea                dstq, [dstq+dst_strideq*2]
  1378 %endif
  1379 %if %2 == 1 ; avg
  1380   add                secq, sec_str
  1381 %endif
  1382   dec                   h
  1383   jg .x_other_y_other_loop
  1384 %undef filter_x_a
  1385 %undef filter_x_b
  1386 %undef filter_y_a
  1387 %undef filter_y_b
  1388 %undef filter_rnd
  1389   STORE_AND_RET
  1390 %endmacro
  1392 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
  1393 ; between the ssse3 and non-ssse3 version. It may make sense to merge their
  1394 ; code in the sense that the ssse3 version would jump to the appropriate
  1395 ; location in the sse/2 version, rather than duplicating that code in the
  1396 ; binary.
; Instantiate the kernels for each ISA level. Per the macro body above:
;   SUBPEL_VARIANCE %1 [, %2]  --  %1 = block width (4/8/16),
;   %2 = 1 selects the second-prediction averaging ("avg") variants.
; 4-wide blocks use MMX registers; 8/16-wide use XMM (sse2/ssse3).
  1398 INIT_MMX sse
  1399 SUBPEL_VARIANCE  4
  1400 INIT_XMM sse2
  1401 SUBPEL_VARIANCE  8
  1402 SUBPEL_VARIANCE 16
  1404 INIT_MMX ssse3
  1405 SUBPEL_VARIANCE  4
  1406 INIT_XMM ssse3
  1407 SUBPEL_VARIANCE  8
  1408 SUBPEL_VARIANCE 16
; avg variants (extra sec/sec_stride arguments consumed via [secq]/sec_str).
  1410 INIT_MMX sse
  1411 SUBPEL_VARIANCE  4, 1
  1412 INIT_XMM sse2
  1413 SUBPEL_VARIANCE  8, 1
  1414 SUBPEL_VARIANCE 16, 1
  1416 INIT_MMX ssse3
  1417 SUBPEL_VARIANCE  4, 1
  1418 INIT_XMM ssse3
  1419 SUBPEL_VARIANCE  8, 1
  1420 SUBPEL_VARIANCE 16, 1

mercurial