media/libvpx/vp8/common/x86/variance_impl_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %define xmm_filter_shift            7
    16 ;unsigned int vp8_get_mb_ss_sse2
    17 ;(
    18 ;    short *src_ptr
    19 ;)
       ;
       ; Returns the sum of squares of the 256 signed 16-bit values that start
       ; at src_ptr (8 loop passes, 64 bytes = 32 words per pass).
       ;
       ; src_ptr must be 16-byte aligned: every load below is a movdqa.
       ; The total is kept as four 32-bit partial sums in xmm4 and folded into
       ; the low lane at the end; only the low 32 bits moved to rax are the
       ; result (the prototype returns unsigned int).
       ; rsi, rdi and the 16 bytes reserved on the stack are saved/allocated by
       ; the prolog but are not referenced by the body.
    20 global sym(vp8_get_mb_ss_sse2) PRIVATE
    21 sym(vp8_get_mb_ss_sse2):
    22     push        rbp
    23     mov         rbp, rsp
    24     SHADOW_ARGS_TO_STACK 1
    25     GET_GOT     rbx
    26     push rsi
    27     push rdi
    28     sub         rsp, 16
    29     ; end prolog
    32         mov         rax, arg(0) ;[src_ptr]
    33         mov         rcx, 8                  ; passes remaining, always > 0 at .NEXTROW
    34         pxor        xmm4, xmm4              ; xmm4 = 4 x 32-bit partial sums
    36 .NEXTROW:
    37         movdqa      xmm0, [rax]             ; 4 x 8 words for this pass
    38         movdqa      xmm1, [rax+16]
    39         movdqa      xmm2, [rax+32]
    40         movdqa      xmm3, [rax+48]
    41         pmaddwd     xmm0, xmm0              ; each dword = w[2i]^2 + w[2i+1]^2
    42         pmaddwd     xmm1, xmm1
    43         pmaddwd     xmm2, xmm2
    44         pmaddwd     xmm3, xmm3
    46         paddd       xmm0, xmm1
    47         paddd       xmm2, xmm3
    48         paddd       xmm4, xmm0
    49         paddd       xmm4, xmm2
    51         add         rax, 0x40               ; advance 64 bytes
    52         dec         rcx
       ; dec updates ZF but leaves CF untouched, so the loop must branch on ZF
       ; alone; a CF-sensitive condition here would test the carry of the
       ; pointer add above instead of the counter.
    53         jnz         .NEXTROW
       ; fold the four partial sums into lane 0
    55         movdqa      xmm3,xmm4
    56         psrldq      xmm4,8
    57         paddd       xmm4,xmm3               ; lanes 0,1 = l0+l2, l1+l3
    58         movdqa      xmm3,xmm4
    59         psrldq      xmm4,4
    60         paddd       xmm4,xmm3               ; lane 0 = l0+l1+l2+l3
    61         movq        rax,xmm4                ; result is the low 32 bits
    64     ; begin epilog
    65     add rsp, 16
    66     pop rdi
    67     pop rsi
    68     RESTORE_GOT
    69     UNSHADOW_ARGS
    70     pop         rbp
    71     ret
    74 ;unsigned int vp8_get16x16var_sse2
    75 ;(
    76 ;    unsigned char   *  src_ptr,
    77 ;    int             source_stride,
    78 ;    unsigned char   *  ref_ptr,
    79 ;    int             recon_stride,
    80 ;    unsigned int    *  SSE,
    81 ;    int             *  Sum
    82 ;)
       ;
       ; Walks 16 rows of 16 bytes from src_ptr and ref_ptr. For every byte
       ; position it forms diff = src - ref (bytes zero-extended to words) and
       ; accumulates
       ;   *Sum = sum of diff         (stored as a 32-bit value)
       ;   *SSE = sum of diff * diff  (stored as a 32-bit value)
       ;
       ; Register roles in the loop:
       ;   rsi / rdi  current src / ref row      rax / rdx  src / ref stride
       ;   rcx        rows remaining             xmm0       zero (for unpacking)
       ;   xmm7       8 x int16 sums of diffs    xmm6       4 x int32 sums of squares
       ; Each int16 lane of xmm7 receives 32 differences in [-255, 255], so it
       ; cannot overflow.
       ;
       ; NOTE(review): the prototype says unsigned int, but nothing is computed
       ; into rax; at the epilog rax still holds the Sum pointer loaded from
       ; arg(5). Confirm callers ignore the return value.
    83 global sym(vp8_get16x16var_sse2) PRIVATE
    84 sym(vp8_get16x16var_sse2):
    85     push        rbp
    86     mov         rbp, rsp
    87     SHADOW_ARGS_TO_STACK 6
    88     SAVE_XMM 7
    89     push rbx                                ; rbx is used as a scratch pointer for the prefetches
    90     push rsi
    91     push rdi
    92     ; end prolog
    94         mov         rsi,            arg(0) ;[src_ptr]
    95         mov         rdi,            arg(2) ;[ref_ptr]
    97         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
    98         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
   100         ; Prefetch data: rows 0..7 of the source block
   101         lea             rcx,    [rax+rax*2]         ; rcx = 3 * source_stride
   102         prefetcht0      [rsi]
   103         prefetcht0      [rsi+rax]
   104         prefetcht0      [rsi+rax*2]
   105         prefetcht0      [rsi+rcx]
   106         lea             rbx,    [rsi+rax*4]         ; rbx = source row 4
   107         prefetcht0      [rbx]
   108         prefetcht0      [rbx+rax]
   109         prefetcht0      [rbx+rax*2]
   110         prefetcht0      [rbx+rcx]
               ; same for rows 0..7 of the reference block
   112         lea             rcx,    [rdx+rdx*2]         ; rcx = 3 * recon_stride
   113         prefetcht0      [rdi]
   114         prefetcht0      [rdi+rdx]
   115         prefetcht0      [rdi+rdx*2]
   116         prefetcht0      [rdi+rcx]
   117         lea             rbx,    [rdi+rdx*4]         ; rbx = reference row 4
   118         prefetcht0      [rbx]
   119         prefetcht0      [rbx+rdx]
   120         prefetcht0      [rbx+rdx*2]
   121         prefetcht0      [rbx+rcx]
   123         pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
   124         pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
   126         pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
   127         mov         rcx,            16                          ; 16 rows
   129 .var16loop:
   130         movdqu      xmm1,           XMMWORD PTR [rsi]           ; 16 source bytes (unaligned load)
   131         movdqu      xmm2,           XMMWORD PTR [rdi]           ; 16 reference bytes (unaligned load)
   133         prefetcht0      [rsi+rax*8]                             ; hint for the row 8 strides ahead
   134         prefetcht0      [rdi+rdx*8]
   136         movdqa      xmm3,           xmm1
   137         movdqa      xmm4,           xmm2
   140         punpcklbw   xmm1,           xmm0                        ; src bytes 0..7  -> words
   141         punpckhbw   xmm3,           xmm0                        ; src bytes 8..15 -> words
   143         punpcklbw   xmm2,           xmm0                        ; ref bytes 0..7  -> words
   144         punpckhbw   xmm4,           xmm0                        ; ref bytes 8..15 -> words
   147         psubw       xmm1,           xmm2                        ; diffs for columns 0..7
   148         psubw       xmm3,           xmm4                        ; diffs for columns 8..15
   150         paddw       xmm7,           xmm1                        ; accumulate diffs before they are squared
   151         pmaddwd     xmm1,           xmm1                        ; pairwise diff^2 sums as dwords
   153         paddw       xmm7,           xmm3
   154         pmaddwd     xmm3,           xmm3
   156         paddd       xmm6,           xmm1
   157         paddd       xmm6,           xmm3
   159         add         rsi,            rax                         ; next source row
   160         add         rdi,            rdx                         ; next reference row
   162         sub         rcx,            1
   163         jnz         .var16loop
               ; --- horizontal reduction ---
               ; xmm1 takes over the four SSE dwords so xmm6 can be reused.
   166         movdqa      xmm1,           xmm6
   167         pxor        xmm6,           xmm6
   169         pxor        xmm5,           xmm5
               ; Place each 16-bit sum in the upper half of a dword, then shift
               ; right arithmetically by 16: this sign-extends it to 32 bits.
   170         punpcklwd   xmm6,           xmm7
   172         punpckhwd   xmm5,           xmm7
   173         psrad       xmm5,           16
   175         psrad       xmm6,           16
   176         paddd       xmm6,           xmm5                        ; xmm6 = 4 x int32 partial sums
               ; Interleave with zero so dwords 0,1 / 2,3 land in lanes 0 and 2,
               ; add the halves, then bring lane 2 down to lane 0 and add again.
   178         movdqa      xmm2,           xmm1
   179         punpckldq   xmm1,           xmm0
   181         punpckhdq   xmm2,           xmm0
   182         movdqa      xmm7,           xmm6
   184         paddd       xmm1,           xmm2
   185         punpckldq   xmm6,           xmm0
   187         punpckhdq   xmm7,           xmm0
   188         paddd       xmm6,           xmm7
   190         movdqa      xmm2,           xmm1
   191         movdqa      xmm7,           xmm6
   193         psrldq      xmm1,           8
   194         psrldq      xmm6,           8
   196         paddd       xmm7,           xmm6                        ; low dword of xmm7 = total sum
   197         paddd       xmm1,           xmm2                        ; low dword of xmm1 = total sse
   199         mov         rax,            arg(5) ;[Sum]
   200         mov         rdi,            arg(4) ;[SSE]
   202         movd DWORD PTR [rax],       xmm7
   203         movd DWORD PTR [rdi],       xmm1
   206     ; begin epilog
   207     pop rdi
   208     pop rsi
   209     pop rbx
   210     RESTORE_XMM
   211     UNSHADOW_ARGS
   212     pop         rbp
   213     ret
   218 ;unsigned int vp8_get8x8var_sse2
   219 ;(
   220 ;    unsigned char   *  src_ptr,
   221 ;    int             source_stride,
   222 ;    unsigned char   *  ref_ptr,
   223 ;    int             recon_stride,
   224 ;    unsigned int    *  SSE,
   225 ;    int             *  Sum
   226 ;)
       ;
       ; Fully unrolled 8x8 version: for 8 rows of 8 bytes it forms
       ; diff = src - ref (bytes zero-extended to words) and stores
       ;   *Sum = sum of diff          *SSE = sum of diff * diff
       ;
       ; Register roles:
       ;   rsi / rdi  src / ref row pointer (advanced two rows at a time)
       ;   rax / rdx  src / ref stride      xmm0  zero (for unpacking)
       ;   xmm7       8 x int16 sums of diffs
       ;   xmm1       4 x int32 sums of squares
       ; The subtraction is the saturating psubsw; the operands are
       ; zero-extended bytes, so the difference stays within [-255, 255] and
       ; saturation never triggers.
       ;
       ; GET_GOT and the 16 bytes reserved on the stack are set up by the
       ; prolog, but the body contains no GLOBAL() reference and no access to
       ; that stack space.
       ;
       ; NOTE(review): the prototype says unsigned int, but nothing is computed
       ; into rax; at the epilog rax still holds the Sum pointer loaded from
       ; arg(5). Confirm callers ignore the return value.
   227 global sym(vp8_get8x8var_sse2) PRIVATE
   228 sym(vp8_get8x8var_sse2):
   229     push        rbp
   230     mov         rbp, rsp
   231     SHADOW_ARGS_TO_STACK 6
   232     SAVE_XMM 7
   233     GET_GOT     rbx
   234     push rsi
   235     push rdi
   236     sub         rsp, 16
   237     ; end prolog
   239         mov         rsi,            arg(0) ;[src_ptr]
   240         mov         rdi,            arg(2) ;[ref_ptr]
   242         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
   243         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
   245         pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
   246         pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
               ; row 0 -- its squares seed the SSE accumulator xmm1
   248         movq        xmm1,           QWORD PTR [rsi]
   249         movq        xmm2,           QWORD PTR [rdi]
   251         punpcklbw   xmm1,           xmm0
   252         punpcklbw   xmm2,           xmm0
   254         psubsw      xmm1,           xmm2
   255         paddw       xmm7,           xmm1
   257         pmaddwd     xmm1,           xmm1
               ; row 1
   259         movq        xmm2,           QWORD PTR[rsi + rax]
   260         movq        xmm3,           QWORD PTR[rdi + rdx]
   262         punpcklbw   xmm2,           xmm0
   263         punpcklbw   xmm3,           xmm0
   265         psubsw      xmm2,           xmm3
   266         paddw       xmm7,           xmm2
   268         pmaddwd     xmm2,           xmm2
   269         paddd       xmm1,           xmm2
               ; row 2
   272         movq        xmm2,           QWORD PTR[rsi + rax * 2]
   273         movq        xmm3,           QWORD PTR[rdi + rdx * 2]
   275         punpcklbw   xmm2,           xmm0
   276         punpcklbw   xmm3,           xmm0
   278         psubsw      xmm2,           xmm3
   279         paddw       xmm7,           xmm2
   281         pmaddwd     xmm2,           xmm2
   282         paddd       xmm1,           xmm2
               ; rsi/rdi now point at row 2, so +stride is row 3 and +2*stride is row 4
   285         lea         rsi,            [rsi + rax * 2]
   286         lea         rdi,            [rdi + rdx * 2]
   287         movq        xmm2,           QWORD PTR[rsi + rax]
   288         movq        xmm3,           QWORD PTR[rdi + rdx]
   290         punpcklbw   xmm2,           xmm0
   291         punpcklbw   xmm3,           xmm0
   293         psubsw      xmm2,           xmm3
   294         paddw       xmm7,           xmm2
   296         pmaddwd     xmm2,           xmm2
   297         paddd       xmm1,           xmm2
               ; row 4
   299         movq        xmm2,           QWORD PTR[rsi + rax *2]
   300         movq        xmm3,           QWORD PTR[rdi + rdx *2]
   302         punpcklbw   xmm2,           xmm0
   303         punpcklbw   xmm3,           xmm0
   305         psubsw      xmm2,           xmm3
   306         paddw       xmm7,           xmm2
   308         pmaddwd     xmm2,           xmm2
   309         paddd       xmm1,           xmm2
               ; rsi/rdi now point at row 4: rows 5 and 6 follow
   312         lea         rsi,            [rsi + rax * 2]
   313         lea         rdi,            [rdi + rdx * 2]
   316         movq        xmm2,           QWORD PTR[rsi + rax]
   317         movq        xmm3,           QWORD PTR[rdi + rdx]
   319         punpcklbw   xmm2,           xmm0
   320         punpcklbw   xmm3,           xmm0
   322         psubsw      xmm2,           xmm3
   323         paddw       xmm7,           xmm2
   325         pmaddwd     xmm2,           xmm2
   326         paddd       xmm1,           xmm2
               ; row 6
   328         movq        xmm2,           QWORD PTR[rsi + rax *2]
   329         movq        xmm3,           QWORD PTR[rdi + rdx *2]
   331         punpcklbw   xmm2,           xmm0
   332         punpcklbw   xmm3,           xmm0
   334         psubsw      xmm2,           xmm3
   335         paddw       xmm7,           xmm2
   337         pmaddwd     xmm2,           xmm2
   338         paddd       xmm1,           xmm2
               ; rsi/rdi now point at row 6: row 7 is the last one
   341         lea         rsi,            [rsi + rax * 2]
   342         lea         rdi,            [rdi + rdx * 2]
   344         movq        xmm2,           QWORD PTR[rsi + rax]
   345         movq        xmm3,           QWORD PTR[rdi + rdx]
   347         punpcklbw   xmm2,           xmm0
   348         punpcklbw   xmm3,           xmm0
   350         psubsw      xmm2,           xmm3
   351         paddw       xmm7,           xmm2
   353         pmaddwd     xmm2,           xmm2
   354         paddd       xmm1,           xmm2
               ; --- horizontal reduction ---
               ; The eight word sums are spread out by interleaving with zero and
               ; added with paddw, i.e. modulo 2^16. Only the low word of the final
               ; value matters; it is sign-extended by the movsx below. The total
               ; is 64 differences in [-255, 255], which fits in a signed word.
               ; The four SSE dwords in xmm1 are folded with paddd in parallel.
   357         movdqa      xmm6,           xmm7
   358         punpcklwd   xmm6,           xmm0
   360         punpckhwd   xmm7,           xmm0
   361         movdqa      xmm2,           xmm1
   363         paddw       xmm6,           xmm7
   364         punpckldq   xmm1,           xmm0
   366         punpckhdq   xmm2,           xmm0
   367         movdqa      xmm7,           xmm6
   369         paddd       xmm1,           xmm2
   370         punpckldq   xmm6,           xmm0
   372         punpckhdq   xmm7,           xmm0
   373         paddw       xmm6,           xmm7
   375         movdqa      xmm2,           xmm1
   376         movdqa      xmm7,           xmm6
   378         psrldq      xmm1,           8
   379         psrldq      xmm6,           8
   381         paddw       xmm7,           xmm6                        ; low word of xmm7 = total sum (mod 2^16)
   382         paddd       xmm1,           xmm2                        ; low dword of xmm1 = total sse
   384         mov         rax,            arg(5) ;[Sum]
   385         mov         rdi,            arg(4) ;[SSE]
   387         movq        rdx,            xmm7
   388         movsx       rcx,            dx                          ; sign-extend the 16-bit sum
   390         mov  dword ptr [rax],       ecx
   391         movd DWORD PTR [rdi],       xmm1
   393     ; begin epilog
   394     add rsp, 16
   395     pop rdi
   396     pop rsi
   397     RESTORE_GOT
   398     RESTORE_XMM
   399     UNSHADOW_ARGS
   400     pop         rbp
   401     ret
   403 ;void vp8_filter_block2d_bil_var_sse2
   404 ;(
   405 ;    unsigned char *ref_ptr,
   406 ;    int ref_pixels_per_line,
   407 ;    unsigned char *src_ptr,
   408 ;    int src_pixels_per_line,
   409 ;    unsigned int Height,
   410 ;    int  xoffset,
   411 ;    int  yoffset,
   412 ;    int *sum,
   413 ;    unsigned int *sumsquared
   414 ;
   415 ;)
       ;
       ; Filters an 8-pixel-wide, Height-row column of ref_ptr with the 2-tap
       ; filters selected by xoffset (horizontal) and yoffset (vertical) from
       ; vp8_bilinear_filters_sse2, subtracts the matching src_ptr rows and
       ; writes
       ;   *sum        = sum of differences
       ;   *sumsquared = sum of squared differences
       ;
       ; Each filter entry is 32 bytes (offset << 5): two 16-byte tap vectors
       ; at [entry] and [entry+16]. Every filter pass adds the rounding vector
       ; xmm_bi_rd and shifts right by xmm_filter_shift.
       ;
       ; Four code paths, chosen from the offsets:
       ;   xoffset != 0, yoffset != 0   both passes (main loop)
       ;   xoffset == 0, yoffset != 0   vertical pass only    (_sp_only)
       ;   xoffset != 0, yoffset == 0   horizontal pass only  (_fp_only)
       ;   xoffset == 0, yoffset == 0   plain difference      (_full_pixel)
       ;
       ; Height must be at least 1: every loop tests its counter after the
       ; first row has been processed. The paths with a vertical pass read
       ; Height + 1 reference rows; the paths with a horizontal pass read 9
       ; bytes per reference row.
       ;
       ; Accumulators live across all paths: xmm6 = 8 x int16 sums of
       ; differences, xmm7 = 4 x int32 sums of squares, xmm4 = rounding.
       ; The final reduction uses XMM registers only, so the routine leaves
       ; the MMX/x87 state untouched and needs no emms.
   416 global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
   417 sym(vp8_filter_block2d_bil_var_sse2):
   418     push        rbp
   419     mov         rbp, rsp
   420     SHADOW_ARGS_TO_STACK 9
   421     SAVE_XMM 7
   422     GET_GOT     rbx
   423     push rsi
   424     push rdi
   425     push rbx
   426     ; end prolog
   428         pxor            xmm6,           xmm6                 ; sum-of-differences accumulator
   429         pxor            xmm7,           xmm7                 ; sum-of-squares accumulator
   431         lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
   432         movdqa          xmm4,           XMMWORD PTR [rsi]
   434         lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
   435         movsxd          rax,            dword ptr arg(5)     ; xoffset
   437         test            rax,            rax                  ; skip first_pass filter if xoffset=0
   438         je              filter_block2d_bil_var_sse2_sp_only
   440         shl             rax,            5                    ; point to filter coeff with xoffset
   441         lea             rax,            [rax + rcx]          ; HFilter
   443         movsxd          rdx,            dword ptr arg(6)     ; yoffset
   445         test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
   446         je              filter_block2d_bil_var_sse2_fp_only
   448         shl             rdx,            5
   449         lea             rdx,            [rdx + rcx]          ; VFilter
   451         mov             rsi,            arg(0)               ;ref_ptr
   452         mov             rdi,            arg(2)               ;src_ptr
   453         movsxd          rcx,            dword ptr arg(4)     ;Height
               ; horizontal pass on reference row 0; the result is carried in xmm5
   455         pxor            xmm0,           xmm0                 ; zero for unpacking
   456         movq            xmm1,           QWORD PTR [rsi]      ; pixels x .. x+7
   457         movq            xmm3,           QWORD PTR [rsi+1]    ; pixels x+1 .. x+8
   459         punpcklbw       xmm1,           xmm0
   460         pmullw          xmm1,           [rax]                ; * first horizontal tap
   461         punpcklbw       xmm3,           xmm0
   462         pmullw          xmm3,           [rax+16]             ; * second horizontal tap
   464         paddw           xmm1,           xmm3
   465         paddw           xmm1,           xmm4                 ; + rounding
   466         psraw           xmm1,           xmm_filter_shift
   467         movdqa          xmm5,           xmm1                 ; xmm5 = filtered previous row
   469         movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
   470         lea             rsi,            [rsi + rbx]
   471 %if ABI_IS_32BIT=0
   472         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
   473 %endif
   475 filter_block2d_bil_var_sse2_loop:
               ; horizontal pass on the next reference row
   476         movq            xmm1,           QWORD PTR [rsi]
   477         movq            xmm3,           QWORD PTR [rsi+1]
   479         punpcklbw       xmm1,           xmm0
   480         pmullw          xmm1,           [rax]
   481         punpcklbw       xmm3,           xmm0
   482         pmullw          xmm3,           [rax+16]
   484         paddw           xmm1,           xmm3
   485         paddw           xmm1,           xmm4
   486         psraw           xmm1,           xmm_filter_shift
               ; vertical pass between the previous and the current filtered row
   488         movdqa          xmm3,           xmm5                 ; previous filtered row
   489         movdqa          xmm5,           xmm1                 ; current one becomes "previous"
   491         pmullw          xmm3,           [rdx]                ; * first vertical tap
   492         pmullw          xmm1,           [rdx+16]             ; * second vertical tap
   493         paddw           xmm1,           xmm3
   494         paddw           xmm1,           xmm4
   495         psraw           xmm1,           xmm_filter_shift
               ; difference against the source row and accumulation
   497         movq            xmm3,           QWORD PTR [rdi]
   498         punpcklbw       xmm3,           xmm0
   500         psubw           xmm1,           xmm3
   501         paddw           xmm6,           xmm1
   503         pmaddwd         xmm1,           xmm1
   504         paddd           xmm7,           xmm1
   506         lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
   507 %if ABI_IS_32BIT
   508         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
   509 %else
   510         lea             rdi,            [rdi + r9]
   511 %endif
   513         sub             rcx,            1
   514         jnz             filter_block2d_bil_var_sse2_loop
   516         jmp             filter_block2d_bil_variance
   518 filter_block2d_bil_var_sse2_sp_only:
               ; xoffset == 0: rcx still holds the filter table base
   519         movsxd          rdx,            dword ptr arg(6)     ; yoffset
   521         test            rdx,            rdx                  ; skip all if both xoffset=0 and yoffset=0
   522         je              filter_block2d_bil_var_sse2_full_pixel
   524         shl             rdx,            5
   525         lea             rdx,            [rdx + rcx]          ; VFilter
   527         mov             rsi,            arg(0)               ;ref_ptr
   528         mov             rdi,            arg(2)               ;src_ptr
   529         movsxd          rcx,            dword ptr arg(4)     ;Height
   530         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
   532         pxor            xmm0,           xmm0                 ; zero for unpacking
   533         movq            xmm1,           QWORD PTR [rsi]      ; reference row 0
   534         punpcklbw       xmm1,           xmm0
   536         movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
   537         lea             rsi,            [rsi + rax]
   539 filter_block2d_bil_sp_only_loop:
   540         movq            xmm3,           QWORD PTR [rsi]      ; next reference row
   541         punpcklbw       xmm3,           xmm0
   542         movdqa          xmm5,           xmm3                 ; keep it unfiltered for the next pass
   544         pmullw          xmm1,           [rdx]                ; previous row * first vertical tap
   545         pmullw          xmm3,           [rdx+16]             ; current row * second vertical tap
   546         paddw           xmm1,           xmm3
   547         paddw           xmm1,           xmm4
   548         psraw           xmm1,           xmm_filter_shift
   550         movq            xmm3,           QWORD PTR [rdi]
   551         punpcklbw       xmm3,           xmm0
   553         psubw           xmm1,           xmm3
   554         paddw           xmm6,           xmm1
   556         pmaddwd         xmm1,           xmm1
   557         paddd           xmm7,           xmm1
   559         movdqa          xmm1,           xmm5                 ; current row becomes "previous"
   560         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
   561         lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
   563         sub             rcx,            1
   564         jnz             filter_block2d_bil_sp_only_loop
   566         jmp             filter_block2d_bil_variance
   568 filter_block2d_bil_var_sse2_full_pixel:
               ; both offsets are zero: plain reference - source difference
   569         mov             rsi,            arg(0)               ;ref_ptr
   570         mov             rdi,            arg(2)               ;src_ptr
   571         movsxd          rcx,            dword ptr arg(4)     ;Height
   572         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
   573         movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
   574         pxor            xmm0,           xmm0                 ; zero for unpacking
   576 filter_block2d_bil_full_pixel_loop:
   577         movq            xmm1,           QWORD PTR [rsi]
   578         punpcklbw       xmm1,           xmm0
   580         movq            xmm2,           QWORD PTR [rdi]
   581         punpcklbw       xmm2,           xmm0
   583         psubw           xmm1,           xmm2
   584         paddw           xmm6,           xmm1
   586         pmaddwd         xmm1,           xmm1
   587         paddd           xmm7,           xmm1
   589         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
   590         lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
   592         sub             rcx,            1
   593         jnz             filter_block2d_bil_full_pixel_loop
   595         jmp             filter_block2d_bil_variance
   597 filter_block2d_bil_var_sse2_fp_only:
               ; yoffset == 0: rax already points at the horizontal filter
   598         mov             rsi,            arg(0)               ;ref_ptr
   599         mov             rdi,            arg(2)               ;src_ptr
   600         movsxd          rcx,            dword ptr arg(4)     ;Height
   601         movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
   603         pxor            xmm0,           xmm0                 ; zero for unpacking
   604         movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
   606 filter_block2d_bil_fp_only_loop:
   607         movq            xmm1,           QWORD PTR [rsi]
   608         movq            xmm3,           QWORD PTR [rsi+1]
   610         punpcklbw       xmm1,           xmm0
   611         pmullw          xmm1,           [rax]
   612         punpcklbw       xmm3,           xmm0
   613         pmullw          xmm3,           [rax+16]
   615         paddw           xmm1,           xmm3
   616         paddw           xmm1,           xmm4
   617         psraw           xmm1,           xmm_filter_shift
   619         movq            xmm3,           QWORD PTR [rdi]
   620         punpcklbw       xmm3,           xmm0
   622         psubw           xmm1,           xmm3
   623         paddw           xmm6,           xmm1
   625         pmaddwd         xmm1,           xmm1
   626         paddd           xmm7,           xmm1
   627         lea             rsi,            [rsi + rdx]          ;ref_pixels_per_line
   628         lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
   630         sub             rcx,            1
   631         jnz             filter_block2d_bil_fp_only_loop
               ; falls through into the shared reduction
   635 filter_block2d_bil_variance:
               ; --- sum of differences: 8 x int16 in xmm6 -> one int32 ---
               movdqa          xmm1,           xmm6
               psrldq          xmm1,           8
               paddw           xmm6,           xmm1                 ; words 0..3 = w[i] + w[i+4]
               pxor            xmm2,           xmm2
               punpcklwd       xmm2,           xmm6                 ; each word moved to the top half of a dword
               movdqa          xmm1,           xmm2
               psrldq          xmm1,           8
               paddd           xmm2,           xmm1                 ; dwords 0,1 = d0+d2, d1+d3
               movdqa          xmm1,           xmm2
               psrldq          xmm1,           4
               paddd           xmm2,           xmm1                 ; dword 0 = (16-bit total) << 16
               psrad           xmm2,           16                   ; sign-extend the 16-bit total
               ; --- sum of squares: 4 x int32 in xmm7 -> one int32 ---
               movdqa          xmm3,           xmm7
               psrldq          xmm3,           8
               paddd           xmm7,           xmm3                 ; dwords 0,1 = e0+e2, e1+e3
               movdqa          xmm3,           xmm7
               psrldq          xmm3,           4
               paddd           xmm7,           xmm3                 ; dword 0 = total
   666         mov             rsi,            arg(7) ; sum
   667         mov             rdi,            arg(8) ; sumsquared
   669         movd            [rsi],          xmm2   ; xsum
   670         movd            [rdi],          xmm7   ; xxsum
   672     ; begin epilog
   673     pop rbx
   674     pop rdi
   675     pop rsi
   676     RESTORE_GOT
   677     RESTORE_XMM
   678     UNSHADOW_ARGS
   679     pop         rbp
   680     ret
   683 ;void vp8_half_horiz_vert_variance8x_h_sse2
   684 ;(
   685 ;    unsigned char *ref_ptr,
   686 ;    int ref_pixels_per_line,
   687 ;    unsigned char *src_ptr,
   688 ;    int src_pixels_per_line,
   689 ;    unsigned int Height,
   690 ;    int *sum,
   691 ;    unsigned int *sumsquared
   692 ;)
       ;
       ; For an 8-pixel-wide, Height-row block: averages each reference pixel
       ; with its right neighbour (pavgb), averages that result with the same
       ; average taken one row below (pavgb again), subtracts the src_ptr row
       ; and writes
       ;   *sum        = sum of differences
       ;   *sumsquared = sum of squared differences
       ;
       ; Height must be at least 1 (the counter is tested after the first
       ; row). Height + 1 reference rows of 9 bytes each are read.
       ;
       ; Register roles:
       ;   rsi / rdi  reference / source row     rax / rdx  their strides
       ;   rcx        rows remaining             xmm0       zero (for unpacking)
       ;   xmm5       horizontal average of the previous reference row
       ;   xmm6       8 x int16 sums of differences
       ;   xmm7       4 x int32 sums of squares
       ; Strides are kept in rax/rdx on every ABI, and the final reduction uses
       ; XMM registers only, so the routine leaves the MMX/x87 state untouched
       ; and needs no emms.
   693 global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
   694 sym(vp8_half_horiz_vert_variance8x_h_sse2):
   695     push        rbp
   696     mov         rbp, rsp
   697     SHADOW_ARGS_TO_STACK 7
   698     SAVE_XMM 7
   699     GET_GOT     rbx
   700     push rsi
   701     push rdi
   702     ; end prolog
   709         pxor            xmm6,           xmm6                ;  error accumulator
   710         pxor            xmm7,           xmm7                ;  sse accumulator
   711         mov             rsi,            arg(0) ;ref_ptr
   713         mov             rdi,            arg(2) ;src_ptr
   714         movsxd          rcx,            dword ptr arg(4) ;Height
   715         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
               movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line
   717         pxor            xmm0,           xmm0                ;  zero for unpacking
   719         movq            xmm5,           QWORD PTR [rsi]     ;  reference row 0, pixels x .. x+7
   720         movq            xmm3,           QWORD PTR [rsi+1]   ;  reference row 0, pixels x+1 .. x+8
   721         pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average of row 0
               add             rsi,            rax                 ;  next reference row
   729 vp8_half_horiz_vert_variance8x_h_1:
   731         movq            xmm1,           QWORD PTR [rsi]
   732         movq            xmm2,           QWORD PTR [rsi+1]
   733         pavgb           xmm1,           xmm2                ;  xmm1 = horizontal average of row i+1
   735         pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of rows i and i+1
   736         punpcklbw       xmm5,           xmm0                ;  widen to words
   738         movq            xmm3,           QWORD PTR [rdi]     ;  8 source pixels
   739         punpcklbw       xmm3,           xmm0                ;  widen to words
   741         psubw           xmm5,           xmm3                ;  difference
   742         paddw           xmm6,           xmm5                ;  accumulate differences
   743         pmaddwd         xmm5,           xmm5                ;  pairwise squared sums
   744         paddd           xmm7,           xmm5                ;  accumulate squares
   746         movdqa          xmm5,           xmm1                ;  row i+1 average is reused as row i next time
               add             rsi,            rax                 ;  next reference row
               add             rdi,            rdx                 ;  next source row
   756         sub             rcx,            1
   757         jnz             vp8_half_horiz_vert_variance8x_h_1
               ; --- sum of differences: 8 x int16 in xmm6 -> one int32 ---
               movdqa          xmm1,           xmm6
               psrldq          xmm1,           8
               paddw           xmm6,           xmm1                ;  words 0..3 = w[i] + w[i+4]
               pxor            xmm2,           xmm2
               punpcklwd       xmm2,           xmm6                ;  each word moved to the top half of a dword
               movdqa          xmm1,           xmm2
               psrldq          xmm1,           8
               paddd           xmm2,           xmm1                ;  dwords 0,1 = d0+d2, d1+d3
               movdqa          xmm1,           xmm2
               psrldq          xmm1,           4
               paddd           xmm2,           xmm1                ;  dword 0 = (16-bit total) << 16
               psrad           xmm2,           16                  ;  sign-extend the 16-bit total
               ; --- sum of squares: 4 x int32 in xmm7 -> one int32 ---
               movdqa          xmm3,           xmm7
               psrldq          xmm3,           8
               paddd           xmm7,           xmm3                ;  dwords 0,1 = e0+e2, e1+e3
               movdqa          xmm3,           xmm7
               psrldq          xmm3,           4
               paddd           xmm7,           xmm3                ;  dword 0 = total
   789         mov             rsi,            arg(5) ; sum
   790         mov             rdi,            arg(6) ; sumsquared
   792         movd            [rsi],          xmm2
   793         movd            [rdi],          xmm7
   796     ; begin epilog
   797     pop rdi
   798     pop rsi
   799     RESTORE_GOT
   800     RESTORE_XMM
   801     UNSHADOW_ARGS
   802     pop         rbp
   803     ret
   805 ;void vp8_half_horiz_vert_variance16x_h_sse2
   806 ;(
   807 ;    unsigned char *ref_ptr,
   808 ;    int ref_pixels_per_line,
   809 ;    unsigned char *src_ptr,
   810 ;    int src_pixels_per_line,
   811 ;    unsigned int Height,
   812 ;    int *sum,
   813 ;    unsigned int *sumsquared
   814 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall: averages every ref
       ; pixel with its right-hand neighbour (pavgb), averages that result with
       ; the same result for the row below, and subtracts the src pixel from it.
       ; The sum of those differences is stored to *sum and the sum of their
       ; squares to *sumsquared.
       ;
       ; Reads 17 bytes from each of Height+1 ref rows and 16 bytes from each of
       ; Height src rows.  The row counter is only tested after a row has been
       ; processed, so Height == 0 is not handled.  Height is loaded with movsxd
       ; although it is declared unsigned.
       ;
       ; Register use: rsi = ref row, rdi = src row, rax = ref stride,
       ;               rdx = src stride, rcx = rows remaining, xmm0 = zero,
       ;               xmm5 = horizontal average of the previous ref row,
       ;               xmm6 = 8 word sums of differences,
       ;               xmm7 = 4 dword sums of squared differences.
   815 global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
   816 sym(vp8_half_horiz_vert_variance16x_h_sse2):
   817     push        rbp
   818     mov         rbp, rsp
   819     SHADOW_ARGS_TO_STACK 7
   820     SAVE_XMM 7
   821     GET_GOT     rbx
   822     push rsi
   823     push rdi
   824     ; end prolog
   826         pxor            xmm6,           xmm6                ;  sum-of-differences accumulator (words)
   827         pxor            xmm7,           xmm7                ;  sum-of-squares accumulator (dwords)
   828         mov             rsi,            arg(0) ;ref_ptr
   830         mov             rdi,            arg(2) ;src_ptr
   831         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
   832         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
   833         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
   835         pxor            xmm0,           xmm0                ;  xmm0 = 0, must stay zero until the final reduction
   837         movdqu          xmm5,           XMMWORD PTR [rsi]
   838         movdqu          xmm3,           XMMWORD PTR [rsi+1]
   839         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of ref row 0
   841         lea             rsi,            [rsi + rax]
   843 vp8_half_horiz_vert_variance16x_h_1:
   844         movdqu          xmm1,           XMMWORD PTR [rsi]     ;  ref row i+1, pixels 0..15
   845         movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  ref row i+1, pixels 1..16
   846         pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal average of ref row i+1
   848         pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of rows i and i+1
   850         movdqa          xmm4,           xmm5
   851         punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 widened to words
   852         punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 widened to words
   854         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
   855         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
   856         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
   858         movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9..d15
   859         punpcklbw       xmm3,           xmm0
   860         psubw           xmm4,           xmm3                ;  xmm4 -= xmm3
   862         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
   863         paddw           xmm6,           xmm4
   864         pmaddwd         xmm5,           xmm5                ;  square the differences, adjacent pairs summed into dwords
   865         pmaddwd         xmm4,           xmm4
   866         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
   867         paddd           xmm7,           xmm4
   869         movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
   871         lea             rsi,            [rsi + rax]
   872         lea             rdi,            [rdi + rdx]
   874         sub             rcx,            1                   ;  one row done
   875         jnz             vp8_half_horiz_vert_variance16x_h_1     ;  loop while rows remain
               ; Reduce the accumulators to one value each.  xmm0 is still zero,
               ; so the interleave below places every 16-bit sum in the upper
               ; half of a dword and psrad 16 sign-extends it.
   877         pxor        xmm1,           xmm1
   878         pxor        xmm5,           xmm5
   880         punpcklwd   xmm0,           xmm6
   881         punpckhwd   xmm1,           xmm6
   882         psrad       xmm0,           16
   883         psrad       xmm1,           16
   884         paddd       xmm0,           xmm1                     ; xmm0 = 4 dword partial sums
   885         movdqa      xmm1,           xmm0
               ; Interleaving with zero (xmm5) spreads the four dwords over two
               ; registers; after the add, dwords 0 and 2 hold pairwise sums.
   887         movdqa      xmm6,           xmm7
   888         punpckldq   xmm6,           xmm5
   889         punpckhdq   xmm7,           xmm5
   890         paddd       xmm6,           xmm7
   892         punpckldq   xmm0,           xmm5
   893         punpckhdq   xmm1,           xmm5
   894         paddd       xmm0,           xmm1
               ; Fold the upper 8 bytes onto the lower: dword 0 becomes the total.
   896         movdqa      xmm7,           xmm6
   897         movdqa      xmm1,           xmm0
   899         psrldq      xmm7,           8
   900         psrldq      xmm1,           8
   902         paddd       xmm6,           xmm7
   903         paddd       xmm0,           xmm1
   905         mov         rsi,            arg(5) ;[Sum]
   906         mov         rdi,            arg(6) ;[SSE]
   908         movd        [rsi],       xmm0                        ; *sum = sum of differences
   909         movd        [rdi],       xmm6                        ; *sumsquared = sum of squared differences
   911     ; begin epilog
   912     pop rdi
   913     pop rsi
   914     RESTORE_GOT
   915     RESTORE_XMM
   916     UNSHADOW_ARGS
   917     pop         rbp
   918     ret
   921 ;void vp8_half_vert_variance8x_h_sse2
   922 ;(
   923 ;    unsigned char *ref_ptr,
   924 ;    int ref_pixels_per_line,
   925 ;    unsigned char *src_ptr,
   926 ;    int src_pixels_per_line,
   927 ;    unsigned int Height,
   928 ;    int *sum,
   929 ;    unsigned int *sumsquared
   930 ;)
       ;
       ; For a block 8 pixels wide and Height rows tall: averages every ref
       ; pixel with the pixel directly below it (pavgb) and subtracts the src
       ; pixel from the result.  The sum of those differences is stored to *sum
       ; and the sum of their squares to *sumsquared.
       ;
       ; Reads 8 bytes from each of Height+1 ref rows and 8 bytes from each of
       ; Height src rows.  The row counter is only tested after a row has been
       ; processed, so Height == 0 is not handled.  Height is loaded with movsxd
       ; although it is declared unsigned.
       ;
       ; NOTE(review): the final reduction writes mm2, mm3, mm4, mm6 and mm7 and
       ; no emms is executed before ret; presumably callers are expected to
       ; reset the MMX/x87 state themselves -- confirm.
   931 global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
   932 sym(vp8_half_vert_variance8x_h_sse2):
   933     push        rbp
   934     mov         rbp, rsp
   935     SHADOW_ARGS_TO_STACK 7
   936     SAVE_XMM 7
   937     GET_GOT     rbx
   938     push rsi
   939     push rdi
   940     ; end prolog
           ; When not building for the 32-bit ABI, keep both strides in r8/r9 for the pointer updates.
   942 %if ABI_IS_32BIT=0
   943     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
   944     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
   945 %endif
   947         pxor            xmm6,           xmm6                ;  sum-of-differences accumulator (words)
   948         pxor            xmm7,           xmm7                ;  sum-of-squares accumulator (dwords)
   949         mov             rsi,            arg(0) ;ref_ptr
   951         mov             rdi,            arg(2) ;src_ptr
   952         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
   953         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line, used to address the row below
   955         pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
   956 vp8_half_vert_variance8x_h_1:
   957         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = 8 ref pixels of the current row
   958         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = the 8 ref pixels directly below
   960         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
   961         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
   963         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
   964         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
   966         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
   967         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
   968         pmaddwd         xmm5,           xmm5                ;  square the differences, adjacent pairs summed into dwords
   969         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
   971 %if ABI_IS_32BIT
   972         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
   973         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
   974 %else
   975         add             rsi, r8
   976         add             rdi, r9
   977 %endif
   979         sub             rcx,            1                   ;  one row done
   980         jnz             vp8_half_vert_variance8x_h_1          ;  loop while rows remain
               ; Reduce the accumulators in MMX registers: fold the upper 8 bytes
               ; of each xmm accumulator onto the lower 8.
   982         movdq2q         mm6,            xmm6                ;  mm6 = low 4 word sums
   983         movdq2q         mm7,            xmm7                ;  mm7 = low 2 dword square sums
   985         psrldq          xmm6,           8
   986         psrldq          xmm7,           8
   988         movdq2q         mm2,            xmm6
   989         movdq2q         mm3,            xmm7
   991         paddw           mm6,            mm2
   992         paddd           mm7,            mm3
   994         pxor            mm3,            mm3                 ;  mm3 = 0
   995         pxor            mm2,            mm2                 ;  mm2 = 0
   997         punpcklwd       mm2,            mm6                 ;  word sums 0,1 placed in the upper half of each dword
   998         punpckhwd       mm3,            mm6                 ;  word sums 2,3 placed in the upper half of each dword
  1000         paddd           mm2,            mm3                 ;
  1001         movq            mm6,            mm2                 ;
  1003         psrlq           mm6,            32                  ;  bring the upper dword down
  1004         paddd           mm2,            mm6                 ;  low dword = total of the 4 word sums, still shifted left 16
  1006         psrad           mm2,            16                  ;  arithmetic shift back: sign-extended sum
  1007         movq            mm4,            mm7                 ;
  1009         psrlq           mm4,            32                  ;
  1010         paddd           mm4,            mm7                 ;  low dword = total sum of squares
  1012         mov             rsi,            arg(5) ; sum
  1013         mov             rdi,            arg(6) ; sumsquared
  1015         movd            [rsi],          mm2                 ;  *sum = sum of differences
  1016         movd            [rdi],          mm4                 ;  *sumsquared = sum of squared differences
  1019     ; begin epilog
  1020     pop rdi
  1021     pop rsi
  1022     RESTORE_GOT
  1023     RESTORE_XMM
  1024     UNSHADOW_ARGS
  1025     pop         rbp
  1026     ret
  1028 ;void vp8_half_vert_variance16x_h_sse2
  1029 ;(
  1030 ;    unsigned char *ref_ptr,
  1031 ;    int ref_pixels_per_line,
  1032 ;    unsigned char *src_ptr,
  1033 ;    int src_pixels_per_line,
  1034 ;    unsigned int Height,
  1035 ;    int *sum,
  1036 ;    unsigned int *sumsquared
  1037 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall: averages every ref
       ; pixel with the pixel directly below it (pavgb) and subtracts the src
       ; pixel from the result.  The sum of those differences is stored to *sum
       ; and the sum of their squares to *sumsquared.
       ;
       ; Reads 16 bytes from each of Height+1 ref rows and 16 bytes from each of
       ; Height src rows.  The row counter is only tested after a row has been
       ; processed, so Height == 0 is not handled.  Height is loaded with movsxd
       ; although it is declared unsigned.
       ;
       ; Register use: rsi = ref row, rdi = src row, rax = ref stride,
       ;               rdx = src stride, rcx = rows remaining, xmm0 = zero,
       ;               xmm5 = previous ref row, xmm6 = 8 word sums of
       ;               differences, xmm7 = 4 dword sums of squared differences.
  1038 global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
  1039 sym(vp8_half_vert_variance16x_h_sse2):
  1040     push        rbp
  1041     mov         rbp, rsp
  1042     SHADOW_ARGS_TO_STACK 7
  1043     SAVE_XMM 7
  1044     GET_GOT     rbx
  1045     push rsi
  1046     push rdi
  1047     ; end prolog
  1049         pxor            xmm6,           xmm6                ;  sum-of-differences accumulator (words)
  1050         pxor            xmm7,           xmm7                ;  sum-of-squares accumulator (dwords)
  1051         mov             rsi,            arg(0)              ;ref_ptr
  1053         mov             rdi,            arg(2)              ;src_ptr
  1054         movsxd          rcx,            dword ptr arg(4)    ;Height (loop counter)
  1055         movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
  1056         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
  1058         movdqu          xmm5,           XMMWORD PTR [rsi]   ;  xmm5 = ref row 0
  1059         lea             rsi,            [rsi + rax          ]
  1060         pxor            xmm0,           xmm0                ;  xmm0 = 0, must stay zero until the final reduction
  1062 vp8_half_vert_variance16x_h_1:
  1063         movdqu          xmm3,           XMMWORD PTR [rsi]   ;  xmm3 = ref row i+1
  1065         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical average of rows i and i+1
  1066         movdqa          xmm4,           xmm5
  1067         punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 widened to words
  1068         punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 widened to words
  1070         movq            xmm2,           QWORD PTR [rdi]     ;  xmm2 = d0,d1..d7
  1071         punpcklbw       xmm2,           xmm0
  1072         psubw           xmm5,           xmm2                ;  xmm5 -= xmm2
  1073         movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
  1074         punpcklbw       xmm2,           xmm0
  1075         psubw           xmm4,           xmm2                ;  xmm4 -= xmm2
  1077         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
  1078         paddw           xmm6,           xmm4
  1079         pmaddwd         xmm5,           xmm5                ;  square the differences, adjacent pairs summed into dwords
  1080         pmaddwd         xmm4,           xmm4
  1081         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
  1082         paddd           xmm7,           xmm4
  1084         movdqa          xmm5,           xmm3                ;  keep this ref row as the upper row of the next iteration
  1086         lea             rsi,            [rsi + rax]
  1087         lea             rdi,            [rdi + rdx]
  1089         sub             rcx,            1                   ;  one row done
  1090         jnz             vp8_half_vert_variance16x_h_1       ;  loop while rows remain
               ; Reduce the accumulators to one value each.  xmm0 is still zero,
               ; so the interleave below places every 16-bit sum in the upper
               ; half of a dword and psrad 16 sign-extends it.
  1092         pxor        xmm1,           xmm1
  1093         pxor        xmm5,           xmm5
  1095         punpcklwd   xmm0,           xmm6
  1096         punpckhwd   xmm1,           xmm6
  1097         psrad       xmm0,           16
  1098         psrad       xmm1,           16
  1099         paddd       xmm0,           xmm1                     ; xmm0 = 4 dword partial sums
  1100         movdqa      xmm1,           xmm0
               ; Interleaving with zero (xmm5) spreads the four dwords over two
               ; registers; after the add, dwords 0 and 2 hold pairwise sums.
  1102         movdqa      xmm6,           xmm7
  1103         punpckldq   xmm6,           xmm5
  1104         punpckhdq   xmm7,           xmm5
  1105         paddd       xmm6,           xmm7
  1107         punpckldq   xmm0,           xmm5
  1108         punpckhdq   xmm1,           xmm5
  1109         paddd       xmm0,           xmm1
               ; Fold the upper 8 bytes onto the lower: dword 0 becomes the total.
  1111         movdqa      xmm7,           xmm6
  1112         movdqa      xmm1,           xmm0
  1114         psrldq      xmm7,           8
  1115         psrldq      xmm1,           8
  1117         paddd       xmm6,           xmm7
  1118         paddd       xmm0,           xmm1
  1120         mov         rsi,            arg(5) ;[Sum]
  1121         mov         rdi,            arg(6) ;[SSE]
  1123         movd        [rsi],       xmm0                        ; *sum = sum of differences
  1124         movd        [rdi],       xmm6                        ; *sumsquared = sum of squared differences
  1126     ; begin epilog
  1127     pop rdi
  1128     pop rsi
  1129     RESTORE_GOT
  1130     RESTORE_XMM
  1131     UNSHADOW_ARGS
  1132     pop         rbp
  1133     ret
  1136 ;void vp8_half_horiz_variance8x_h_sse2
  1137 ;(
  1138 ;    unsigned char *ref_ptr,
  1139 ;    int ref_pixels_per_line,
  1140 ;    unsigned char *src_ptr,
  1141 ;    int src_pixels_per_line,
  1142 ;    unsigned int Height,
  1143 ;    int *sum,
  1144 ;    unsigned int *sumsquared
  1145 ;)
       ;
       ; For a block 8 pixels wide and Height rows tall: averages every ref
       ; pixel with its right-hand neighbour (pavgb) and subtracts the src pixel
       ; from the result.  The sum of those differences is stored to *sum and
       ; the sum of their squares to *sumsquared.
       ;
       ; Reads 9 bytes from each of Height ref rows and 8 bytes from each of
       ; Height src rows.  The row counter is only tested after a row has been
       ; processed, so Height == 0 is not handled.  Height is loaded with movsxd
       ; although it is declared unsigned.
       ;
       ; NOTE(review): the final reduction writes mm2, mm3, mm4, mm6 and mm7 and
       ; no emms is executed before ret; presumably callers are expected to
       ; reset the MMX/x87 state themselves -- confirm.
  1146 global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
  1147 sym(vp8_half_horiz_variance8x_h_sse2):
  1148     push        rbp
  1149     mov         rbp, rsp
  1150     SHADOW_ARGS_TO_STACK 7
  1151     SAVE_XMM 7
  1152     GET_GOT     rbx
  1153     push rsi
  1154     push rdi
  1155     ; end prolog
           ; When not building for the 32-bit ABI, keep both strides in r8/r9 for the pointer updates.
  1157 %if ABI_IS_32BIT=0
  1158     movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
  1159     movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
  1160 %endif
  1162         pxor            xmm6,           xmm6                ;  sum-of-differences accumulator (words)
  1163         pxor            xmm7,           xmm7                ;  sum-of-squares accumulator (dwords)
  1164         mov             rsi,            arg(0) ;ref_ptr
  1166         mov             rdi,            arg(2) ;src_ptr
  1167         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
  1169         pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
  1170 vp8_half_horiz_variance8x_h_1:
  1171         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
  1172         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
  1174         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
  1175         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
  1177         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
  1178         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
  1180         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
  1181         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
  1182         pmaddwd         xmm5,           xmm5                ;  square the differences, adjacent pairs summed into dwords
  1183         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
  1185 %if ABI_IS_32BIT
  1186         add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
  1187         add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
  1188 %else
  1189         add             rsi, r8
  1190         add             rdi, r9
  1191 %endif
  1192         sub             rcx,            1                   ;  one row done
  1193         jnz             vp8_half_horiz_variance8x_h_1        ;  loop while rows remain
               ; Reduce the accumulators in MMX registers: fold the upper 8 bytes
               ; of each xmm accumulator onto the lower 8.
  1195         movdq2q         mm6,            xmm6                ;  mm6 = low 4 word sums
  1196         movdq2q         mm7,            xmm7                ;  mm7 = low 2 dword square sums
  1198         psrldq          xmm6,           8
  1199         psrldq          xmm7,           8
  1201         movdq2q         mm2,            xmm6
  1202         movdq2q         mm3,            xmm7
  1204         paddw           mm6,            mm2
  1205         paddd           mm7,            mm3
  1207         pxor            mm3,            mm3                 ;  mm3 = 0
  1208         pxor            mm2,            mm2                 ;  mm2 = 0
  1210         punpcklwd       mm2,            mm6                 ;  word sums 0,1 placed in the upper half of each dword
  1211         punpckhwd       mm3,            mm6                 ;  word sums 2,3 placed in the upper half of each dword
  1213         paddd           mm2,            mm3                 ;
  1214         movq            mm6,            mm2                 ;
  1216         psrlq           mm6,            32                  ;  bring the upper dword down
  1217         paddd           mm2,            mm6                 ;  low dword = total of the 4 word sums, still shifted left 16
  1219         psrad           mm2,            16                  ;  arithmetic shift back: sign-extended sum
  1220         movq            mm4,            mm7                 ;
  1222         psrlq           mm4,            32                  ;
  1223         paddd           mm4,            mm7                 ;  low dword = total sum of squares
  1225         mov             rsi,            arg(5) ; sum
  1226         mov             rdi,            arg(6) ; sumsquared
  1228         movd            [rsi],          mm2                 ;  *sum = sum of differences
  1229         movd            [rdi],          mm4                 ;  *sumsquared = sum of squared differences
  1232     ; begin epilog
  1233     pop rdi
  1234     pop rsi
  1235     RESTORE_GOT
  1236     RESTORE_XMM
  1237     UNSHADOW_ARGS
  1238     pop         rbp
  1239     ret
  1241 ;void vp8_half_horiz_variance16x_h_sse2
  1242 ;(
  1243 ;    unsigned char *ref_ptr,
  1244 ;    int ref_pixels_per_line,
  1245 ;    unsigned char *src_ptr,
  1246 ;    int src_pixels_per_line,
  1247 ;    unsigned int Height,
  1248 ;    int *sum,
  1249 ;    unsigned int *sumsquared
  1250 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall: averages every ref
       ; pixel with its right-hand neighbour (pavgb) and subtracts the src pixel
       ; from the result.  The sum of those differences is stored to *sum and
       ; the sum of their squares to *sumsquared.
       ;
       ; Reads 17 bytes from each of Height ref rows and 16 bytes from each of
       ; Height src rows.  The row counter is only tested after a row has been
       ; processed, so Height == 0 is not handled.  Height is loaded with movsxd
       ; although it is declared unsigned.
       ;
       ; Register use: rsi = ref row, rdi = src row, rax = ref stride,
       ;               rdx = src stride, rcx = rows remaining, xmm0 = zero,
       ;               xmm6 = 8 word sums of differences,
       ;               xmm7 = 4 dword sums of squared differences.
  1251 global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
  1252 sym(vp8_half_horiz_variance16x_h_sse2):
  1253     push        rbp
  1254     mov         rbp, rsp
  1255     SHADOW_ARGS_TO_STACK 7
  1256     SAVE_XMM 7
  1257     GET_GOT     rbx
  1258     push rsi
  1259     push rdi
  1260     ; end prolog
  1262         pxor            xmm6,           xmm6                ;  sum-of-differences accumulator (words)
  1263         pxor            xmm7,           xmm7                ;  sum-of-squares accumulator (dwords)
  1264         mov             rsi,            arg(0) ;ref_ptr
  1266         mov             rdi,            arg(2) ;src_ptr
  1267         movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
  1268         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
  1269         movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
  1271         pxor            xmm0,           xmm0                ;  xmm0 = 0, must stay zero until the final reduction
  1273 vp8_half_horiz_variance16x_h_1:
  1274         movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
  1275         movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
  1277         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
  1278         movdqa          xmm1,           xmm5
  1279         punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 widened to words
  1280         punpckhbw       xmm1,           xmm0                ;  xmm1 = pixels 8..15 widened to words
  1282         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
  1283         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
  1284         movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
  1285         punpcklbw       xmm2,           xmm0
  1287         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
  1288         psubw           xmm1,           xmm2                ;  xmm1 -= xmm2
  1289         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
  1290         paddw           xmm6,           xmm1
  1291         pmaddwd         xmm5,           xmm5                ;  square the differences, adjacent pairs summed into dwords
  1292         pmaddwd         xmm1,           xmm1
  1293         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
  1294         paddd           xmm7,           xmm1
  1296         lea             rsi,            [rsi + rax]
  1297         lea             rdi,            [rdi + rdx]
  1299         sub             rcx,            1                   ;  one row done
  1300         jnz             vp8_half_horiz_variance16x_h_1        ;  loop while rows remain
               ; Reduce the accumulators to one value each.  xmm0 is still zero,
               ; so the interleave below places every 16-bit sum in the upper
               ; half of a dword and psrad 16 sign-extends it.
  1302         pxor        xmm1,           xmm1
  1303         pxor        xmm5,           xmm5
  1305         punpcklwd   xmm0,           xmm6
  1306         punpckhwd   xmm1,           xmm6
  1307         psrad       xmm0,           16
  1308         psrad       xmm1,           16
  1309         paddd       xmm0,           xmm1                     ; xmm0 = 4 dword partial sums
  1310         movdqa      xmm1,           xmm0
               ; Interleaving with zero (xmm5) spreads the four dwords over two
               ; registers; after the add, dwords 0 and 2 hold pairwise sums.
  1312         movdqa      xmm6,           xmm7
  1313         punpckldq   xmm6,           xmm5
  1314         punpckhdq   xmm7,           xmm5
  1315         paddd       xmm6,           xmm7
  1317         punpckldq   xmm0,           xmm5
  1318         punpckhdq   xmm1,           xmm5
  1319         paddd       xmm0,           xmm1
               ; Fold the upper 8 bytes onto the lower: dword 0 becomes the total.
  1321         movdqa      xmm7,           xmm6
  1322         movdqa      xmm1,           xmm0
  1324         psrldq      xmm7,           8
  1325         psrldq      xmm1,           8
  1327         paddd       xmm6,           xmm7
  1328         paddd       xmm0,           xmm1
  1330         mov         rsi,            arg(5) ;[Sum]
  1331         mov         rdi,            arg(6) ;[SSE]
  1333         movd        [rsi],       xmm0                        ; *sum = sum of differences
  1334         movd        [rdi],       xmm6                        ; *sumsquared = sum of squared differences
  1336     ; begin epilog
  1337     pop rdi
  1338     pop rsi
  1339     RESTORE_GOT
  1340     RESTORE_XMM
  1341     UNSHADOW_ARGS
  1342     pop         rbp
  1343     ret
  1345 SECTION_RODATA
       ; Read-only constant tables, each aligned to 16 bytes.
       ;
       ; xmm_bi_rd: eight words of 64, i.e. half of 1 << xmm_filter_shift
       ; (xmm_filter_shift is 7).  NOTE(review): presumably the rounding term
       ; added before shifting right by xmm_filter_shift; the code that uses it
       ; is not in this part of the file -- confirm.
  1346 ;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
  1347 align 16
  1348 xmm_bi_rd:
  1349     times 8 dw 64
       ; vp8_bilinear_filters_sse2: 8 rows of 16 words (32 bytes per row).  Each
       ; row holds one weight repeated 8 times followed by a second weight
       ; repeated 8 times.  The first weight steps from 128 down to 16 in units
       ; of 16, and the two weights of a row always sum to 128.
       ; NOTE(review): presumably row k holds the two filter taps for a
       ; sub-pixel offset of k/8 -- confirm against the code that indexes it.
  1350 align 16
  1351 vp8_bilinear_filters_sse2:
  1352     dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
  1353     dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
  1354     dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
  1355     dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
  1356     dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
  1357     dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
  1358     dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
  1359     dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112

mercurial