media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    ;-----------------------------------------------------------------------
    ; unsigned int vp9_get_mb_ss_sse2(short *src_ptr)
    ;
    ; Returns the sum of the squares of the 256 consecutive 16-bit values
    ; that start at src_ptr (8 iterations x 64 bytes).
    ;
    ; In:    arg(0) = src_ptr. The data is read with movdqa, so src_ptr
    ;        must be 16-byte aligned.
    ; Out:   low 32 bits of rax = sum of squares. The declared return type
    ;        is unsigned int; anything above bit 31 is not part of the
    ;        result.
    ; Uses:  rax, rcx, xmm0-xmm4, flags.
    ; The prologue/epilogue macros come from vpx_ports/x86_abi_support.asm.
    ;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        mov         rax, arg(0)             ; rax = src_ptr
        mov         rcx, 8                  ; 8 passes of 32 words each
        pxor        xmm4, xmm4              ; xmm4 = four 32-bit partial sums

.NEXTROW:
        movdqa      xmm0, [rax]             ; 4 x 8 words per pass
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]

        ; pmaddwd x, x squares every word and adds adjacent pairs,
        ; producing four dword sums of squares per register.
        pmaddwd     xmm0, xmm0
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40               ; advance 64 bytes
        dec         rcx
        ; dec updates ZF but never CF, so the loop must branch on ZF alone.
        ; (The previous "ja" also tested a stale CF left over from the add.)
        jnz         .NEXTROW

        ; Horizontal add of the four dwords in xmm4.
        movdqa      xmm3, xmm4
        psrldq      xmm4, 8
        paddd       xmm4, xmm3              ; lanes 0,1 = d0+d2, d1+d3
        movdqa      xmm3, xmm4
        psrldq      xmm4, 4
        paddd       xmm4, xmm3              ; lane 0 = d0+d1+d2+d3
        movq        rax, xmm4               ; result is the low dword

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
    ;-----------------------------------------------------------------------
    ; unsigned int vp9_get16x16var_sse2(
    ;     unsigned char *src_ptr, int source_stride,
    ;     unsigned char *ref_ptr, int recon_stride,
    ;     unsigned int  *SSE,     int *Sum)
    ;
    ; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
    ;     *Sum = sum of (src - ref)
    ;     *SSE = sum of (src - ref)^2
    ;
    ; Register roles inside the row loop:
    ;     rsi / rdi = current src / ref row     rax / rdx = strides
    ;     rcx       = rows remaining            xmm0      = zero
    ;     xmm7      = eight 16-bit difference accumulators
    ;     xmm6      = four 32-bit squared-difference accumulators
    ;
    ; NOTE(review): rax still holds the Sum pointer at ret, so no meaningful
    ; value is returned despite the unsigned int prototype - confirm that
    ; callers ignore the return value.
    ;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        movsxd      rax, DWORD PTR arg(1)       ; rax = source_stride
        movsxd      rdx, DWORD PTR arg(3)       ; rdx = recon_stride
        mov         rsi, arg(0)                 ; rsi = src_ptr
        mov         rdi, arg(2)                 ; rdi = ref_ptr

        ; Prefetch the first eight rows of the source block.
        lea         rcx, [rax+rax*2]            ; rcx = 3 * source_stride
        lea         rbx, [rsi+rax*4]            ; rbx = row 4 of src
        prefetcht0  [rsi]
        prefetcht0  [rsi+rax]
        prefetcht0  [rsi+rax*2]
        prefetcht0  [rsi+rcx]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax*2]
        prefetcht0  [rbx+rcx]

        ; Prefetch the first eight rows of the reference block.
        lea         rcx, [rdx+rdx*2]            ; rcx = 3 * recon_stride
        lea         rbx, [rdi+rdx*4]            ; rbx = row 4 of ref
        prefetcht0  [rdi]
        prefetcht0  [rdi+rdx]
        prefetcht0  [rdi+rdx*2]
        prefetcht0  [rdi+rcx]
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx*2]
        prefetcht0  [rbx+rcx]

        pxor        xmm0, xmm0                  ; zero, used to widen bytes
        pxor        xmm6, xmm6                  ; squared-difference accumulator
        pxor        xmm7, xmm7                  ; difference accumulator
        mov         rcx, 16                     ; row counter

.row_loop:
        prefetcht0  [rsi+rax*8]                 ; stay eight rows ahead
        prefetcht0  [rdi+rdx*8]

        movdqu      xmm1, XMMWORD PTR [rsi]     ; 16 source bytes
        movdqu      xmm2, XMMWORD PTR [rdi]     ; 16 reference bytes
        movdqa      xmm3, xmm1
        movdqa      xmm4, xmm2

        ; Widen bytes to words: xmm1/xmm2 take columns 0-7,
        ; xmm3/xmm4 take columns 8-15.
        punpcklbw   xmm1, xmm0
        punpcklbw   xmm2, xmm0
        punpckhbw   xmm3, xmm0
        punpckhbw   xmm4, xmm0

        psubw       xmm1, xmm2                  ; differences, columns 0-7
        psubw       xmm3, xmm4                  ; differences, columns 8-15

        ; The differences must be added to xmm7 before pmaddwd
        ; overwrites them with their squares.
        paddw       xmm7, xmm1
        paddw       xmm7, xmm3
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm3, xmm3
        paddd       xmm6, xmm1
        paddd       xmm6, xmm3

        add         rsi, rax                    ; next source row
        add         rdi, rdx                    ; next reference row
        sub         rcx, 1
        jnz         .row_loop

        ; ---- SSE: fold the four dwords of xmm6 into the low dword of xmm1
        movdqa      xmm1, xmm6
        movdqa      xmm2, xmm6
        punpckldq   xmm1, xmm0                  ; [s0, 0, s1, 0]
        punpckhdq   xmm2, xmm0                  ; [s2, 0, s3, 0]
        paddd       xmm1, xmm2                  ; [s0+s2, 0, s1+s3, 0]
        movdqa      xmm2, xmm1
        psrldq      xmm1, 8
        paddd       xmm1, xmm2                  ; low dword = s0+s1+s2+s3

        ; ---- Sum: sign-extend the eight words of xmm7, then fold them.
        ; Each word is placed in the upper half of a dword and shifted back
        ; down arithmetically, which sign-extends it.
        pxor        xmm6, xmm6
        pxor        xmm5, xmm5
        punpcklwd   xmm6, xmm7                  ; words 0-3
        punpckhwd   xmm5, xmm7                  ; words 4-7
        psrad       xmm6, 16
        psrad       xmm5, 16
        paddd       xmm6, xmm5                  ; four dword partial sums
        movdqa      xmm7, xmm6
        punpckldq   xmm6, xmm0
        punpckhdq   xmm7, xmm0
        paddd       xmm6, xmm7
        movdqa      xmm7, xmm6
        psrldq      xmm6, 8
        paddd       xmm7, xmm6                  ; low dword = total difference

        mov         rax, arg(5)                 ; rax = Sum
        mov         rdi, arg(4)                 ; rdi = SSE
        movd DWORD PTR [rax], xmm7
        movd DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
   ;-----------------------------------------------------------------------
   ; unsigned int vp9_get8x8var_sse2(
   ;     unsigned char *src_ptr, int source_stride,
   ;     unsigned char *ref_ptr, int recon_stride,
   ;     unsigned int  *SSE,     int *Sum)
   ;
   ; Processes 8 rows of 8 bytes from src_ptr and ref_ptr, fully unrolled,
   ; and stores
   ;     *Sum = sum of (src - ref)
   ;     *SSE = sum of (src - ref)^2
   ;
   ; Register roles:
   ;     rsi / rdi = src / ref row base      rax / rdx = strides
   ;     xmm0 = zero    xmm7 = eight 16-bit difference accumulators
   ;     xmm1 = four 32-bit squared-difference accumulators
   ;-----------------------------------------------------------------------

; One row step for vp9_get8x8var_sse2.
;   %1 = memory operand of the source row, %2 = memory operand of the ref row
; Widens both rows to words, adds the (saturating) differences to xmm7 and
; their squares to xmm1. Clobbers xmm2 and xmm3; expects xmm0 == 0.
%macro GET8X8VAR_ACCUM_ROW 2
        movq        xmm2, QWORD PTR %1
        movq        xmm3, QWORD PTR %2
        punpcklbw   xmm2, xmm0
        punpcklbw   xmm3, xmm0
        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2
%endmacro

global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi, arg(0)                 ; rsi = src_ptr
        mov         rdi, arg(2)                 ; rdi = ref_ptr
        movsxd      rax, DWORD PTR arg(1)       ; rax = source_stride
        movsxd      rdx, DWORD PTR arg(3)       ; rdx = recon_stride

        pxor        xmm0, xmm0                  ; zero, used to widen bytes
        pxor        xmm7, xmm7                  ; difference accumulator

        ; Row 0 seeds xmm1 directly instead of adding into it.
        movq        xmm1, QWORD PTR [rsi]
        movq        xmm2, QWORD PTR [rdi]
        punpcklbw   xmm1, xmm0
        punpcklbw   xmm2, xmm0
        psubsw      xmm1, xmm2
        paddw       xmm7, xmm1
        pmaddwd     xmm1, xmm1

        ; rows 1 and 2
        GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
        GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; step both bases down two rows, then rows 3 and 4
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]
        GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
        GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; rows 5 and 6
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]
        GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
        GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

        ; row 7
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]
        GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]

        ; ---- SSE: fold the four dwords of xmm1 into its low dword
        movdqa      xmm2, xmm1
        punpckldq   xmm1, xmm0                  ; [s0, 0, s1, 0]
        punpckhdq   xmm2, xmm0                  ; [s2, 0, s3, 0]
        paddd       xmm1, xmm2
        movdqa      xmm2, xmm1
        psrldq      xmm1, 8
        paddd       xmm1, xmm2                  ; low dword = s0+s1+s2+s3

        ; ---- Sum: fold the eight words of xmm7 with 16-bit adds.
        ; The total ends up in the low word of xmm7 and is sign-extended
        ; by the movsx below.
        movdqa      xmm6, xmm7
        punpcklwd   xmm6, xmm0                  ; words 0-3, one per dword
        punpckhwd   xmm7, xmm0                  ; words 4-7, one per dword
        paddw       xmm6, xmm7
        movdqa      xmm7, xmm6
        punpckldq   xmm6, xmm0
        punpckhdq   xmm7, xmm0
        paddw       xmm6, xmm7
        movdqa      xmm7, xmm6
        psrldq      xmm6, 8
        paddw       xmm7, xmm6

        mov         rax, arg(5)                 ; rax = Sum
        mov         rdi, arg(4)                 ; rdi = SSE

        movq        rdx, xmm7
        movsx       rcx, dx                     ; sign-extend the 16-bit total

        mov  dword ptr [rax], ecx
        movd DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
   ;-----------------------------------------------------------------------
   ; void vp9_half_horiz_vert_variance8x_h_sse2(
   ;     unsigned char *ref_ptr, int ref_pixels_per_line,
   ;     unsigned char *src_ptr, int src_pixels_per_line,
   ;     unsigned int Height, int *sum, unsigned int *sumsquared)
   ;
   ; For an 8-pixel-wide block of Height rows, averages each reference
   ; pixel with its right neighbour (pavgb), averages that row with the
   ; same result from the row below, subtracts the src pixels and stores
   ;     *sum        = sum of the differences
   ;     *sumsquared = sum of the squared differences
   ;
   ; Reads 9 bytes from each of Height + 1 reference rows.
   ; Height must be non-zero: the loop decrements before it tests.
   ;
   ; The final reduction uses SSE2 registers only. The earlier version
   ; moved the accumulators into MMX registers and returned without EMMS,
   ; leaving the x87/MMX state dirty for the caller. *sum is now added up
   ; in 32 bits; the stored value equals the old one whenever the old
   ; 16-bit total did not wrap.
   ;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
        pxor            xmm7,           xmm7                ;  squared-difference accumulator (4 dwords)
        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes

        ; Horizontal average of reference row 0, kept in xmm5.
        movq            xmm5,           QWORD PTR [rsi]     ;  ref[0..7]
        movq            xmm3,           QWORD PTR [rsi+1]   ;  ref[1..8]
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(ref[x], ref[x+1])

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next reference row
%else
        add             rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

        movq            xmm1,           QWORD PTR [rsi]     ;  ref[0..7] of the row below
        movq            xmm2,           QWORD PTR [rsi+1]   ;  ref[1..8] of the row below
        pavgb           xmm1,           xmm2                ;  xmm1 = horizontal average of the row below

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two rows
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  src[0..7]
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        movdqa          xmm5,           xmm1                ;  this row's average is reused for the next row

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next reference row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_horiz_vert_variance8x_h_1

        ; ---- sum: sign-extend the eight words of xmm6 and add them.
        ; Each word is placed in the upper half of a dword and shifted back
        ; down arithmetically, which sign-extends it.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  words 0-3
        punpckhwd       xmm2,           xmm6                ;  words 4-7
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  four dword partial sums
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total difference

        ; ---- sumsquared: add the four dwords of xmm7.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total squared difference

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
   ;-----------------------------------------------------------------------
   ; void vp9_half_vert_variance8x_h_sse2(
   ;     unsigned char *ref_ptr, int ref_pixels_per_line,
   ;     unsigned char *src_ptr, int src_pixels_per_line,
   ;     unsigned int Height, int *sum, unsigned int *sumsquared)
   ;
   ; For an 8-pixel-wide block of Height rows, averages each reference
   ; pixel with the pixel directly below it (pavgb), subtracts the src
   ; pixels and stores
   ;     *sum        = sum of the differences
   ;     *sumsquared = sum of the squared differences
   ;
   ; Reads 8 bytes from each of Height + 1 reference rows.
   ; Height must be non-zero: the loop decrements before it tests.
   ;
   ; The final reduction uses SSE2 registers only. The earlier version
   ; moved the accumulators into MMX registers and returned without EMMS,
   ; leaving the x87/MMX state dirty for the caller. *sum is now added up
   ; in 32 bits; the stored value equals the old one whenever the old
   ; 16-bit total did not wrap.
   ;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
        pxor            xmm7,           xmm7                ;  squared-difference accumulator (4 dwords)
        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line, offset of the row below

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes

.half_vert_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  ref[0..7], current row
        movq            xmm3,           QWORD PTR [rsi+rax] ;  ref[0..7], row below

        pavgb           xmm5,           xmm3                ;  xmm5 = vertical average
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  src[0..7]
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next reference row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_vert_variance8x_h_1

        ; ---- sum: sign-extend the eight words of xmm6 and add them.
        ; Each word is placed in the upper half of a dword and shifted back
        ; down arithmetically, which sign-extends it.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  words 0-3
        punpckhwd       xmm2,           xmm6                ;  words 4-7
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  four dword partial sums
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total difference

        ; ---- sumsquared: add the four dwords of xmm7.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total squared difference

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
   ;-----------------------------------------------------------------------
   ; void vp9_half_horiz_variance8x_h_sse2(
   ;     unsigned char *ref_ptr, int ref_pixels_per_line,
   ;     unsigned char *src_ptr, int src_pixels_per_line,
   ;     unsigned int Height, int *sum, unsigned int *sumsquared)
   ;
   ; For an 8-pixel-wide block of Height rows, averages each reference
   ; pixel with its right neighbour (pavgb), subtracts the src pixels and
   ; stores
   ;     *sum        = sum of the differences
   ;     *sumsquared = sum of the squared differences
   ;
   ; Reads 9 bytes from each of Height reference rows.
   ; Height must be non-zero: the loop decrements before it tests.
   ;
   ; The final reduction uses SSE2 registers only. The earlier version
   ; moved the accumulators into MMX registers and returned without EMMS,
   ; leaving the x87/MMX state dirty for the caller. *sum is now added up
   ; in 32 bits; the stored value equals the old one whenever the old
   ; 16-bit total did not wrap.
   ;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  difference accumulator (8 words)
        pxor            xmm7,           xmm7                ;  squared-difference accumulator (4 dwords)
        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes

.half_horiz_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  ref[0..7]
        movq            xmm3,           QWORD PTR [rsi+1]   ;  ref[1..8]

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(ref[x], ref[x+1])
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  src[0..7]
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next reference row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif
        sub             rcx,            1
        jnz             .half_horiz_variance8x_h_1

        ; ---- sum: sign-extend the eight words of xmm6 and add them.
        ; Each word is placed in the upper half of a dword and shifted back
        ; down arithmetically, which sign-extends it.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  words 0-3
        punpckhwd       xmm2,           xmm6                ;  words 4-7
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  four dword partial sums
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total difference

        ; ---- sumsquared: add the four dwords of xmm7.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total squared difference

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

mercurial