media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "vpx_ports/x86_abi_support.asm"
    13 ;void vp9_half_horiz_vert_variance16x_h_sse2
    14 ;(
    15 ;    unsigned char *ref_ptr,
    16 ;    int ref_pixels_per_line,
    17 ;    unsigned char *src_ptr,
    18 ;    int src_pixels_per_line,
    19 ;    unsigned int Height,
    20 ;    int *sum,
    21 ;    unsigned int *sumsquared
    22 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall, builds a prediction
       ; from ref_ptr by averaging every byte with its right-hand neighbour
       ; (pavgb of [rsi] and [rsi+1]) and then averaging that result with the
       ; same horizontal average of the row below. The prediction is compared
       ; with the rows at src_ptr; the sum of (prediction - src) is stored to
       ; *sum and the sum of the squared differences to *sumsquared.
       ;
       ; Register use inside the loop:
       ;   rsi = current ref row      rax = ref stride (sign-extended)
       ;   rdi = current src row      rdx = src stride (sign-extended)
       ;   rcx = rows remaining
       ;   xmm0 = zero                xmm5 = previous row's horizontal average
       ;   xmm6 = 8 x 16-bit running sums of differences
       ;   xmm7 = 4 x 32-bit running sums of squared differences
       ;
       ; Notes:
       ;   - 17 bytes are read from each of Height + 1 reference rows.
       ;   - The row counter is only tested after the loop body has run, so
       ;     Height must be non-zero.
       ;   - Each 16-bit lane of xmm6 receives two differences of magnitude
       ;     <= 255 per row (2 * 64 * 255 = 32640), so paddw can wrap once
       ;     Height exceeds 64.
       ;   - arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and the matching
       ;     epilog macros come from the included x86_abi_support.asm, which is
       ;     not shown here; presumably they expose the 7 arguments as
       ;     arg(0)..arg(6) and preserve registers as the ABI requires -
       ;     confirm against that file.
    23 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
    24 sym(vp9_half_horiz_vert_variance16x_h_sse2):
    25     push        rbp
    26     mov         rbp, rsp
    27     SHADOW_ARGS_TO_STACK 7
    28     SAVE_XMM 7
    29     GET_GOT     rbx
    30     push rsi
    31     push rdi
    32     ; end prolog
    34         pxor            xmm6,           xmm6                ;  xmm6 = 0: running sum of differences (16-bit lanes)
    35         pxor            xmm7,           xmm7                ;  xmm7 = 0: running sum of squared differences (32-bit lanes)
    36         mov             rsi,            arg(0) ; rsi = ref_ptr
    38         mov             rdi,            arg(2) ; rdi = src_ptr
    39         movsxd          rcx,            dword ptr arg(4) ; rcx = Height (row counter)
    40         movsxd          rax,            dword ptr arg(1) ; rax = ref_pixels_per_line
    41         movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
    43         pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
    45         movdqu          xmm5,           XMMWORD PTR [rsi]   ;  ref row 0, bytes 0..15
    46         movdqu          xmm3,           XMMWORD PTR [rsi+1] ;  ref row 0, bytes 1..16
    47         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of row 0
    49         lea             rsi,            [rsi + rax]         ;  rsi -> next reference row
    51 .half_horiz_vert_variance16x_h_1:
    52         movdqu          xmm1,           XMMWORD PTR [rsi]     ;  ref row i+1, bytes 0..15
    53         movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  ref row i+1, bytes 1..16
    54         pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal average of row i+1
    56         pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two horizontal averages
    58         movdqa          xmm4,           xmm5                ;  copy so both halves can be widened
    59         punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 prediction bytes as words
    60         punpckhbw       xmm4,           xmm0                ;  xmm4 = high 8 prediction bytes as words
    62         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
    63         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    64         psubw           xmm5,           xmm3                ;  xmm5 = prediction - src, columns 0..7
    66         movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9..d15
    67         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    68         psubw           xmm4,           xmm3                ;  xmm4 = prediction - src, columns 8..15
    70         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
    71         paddw           xmm6,           xmm4
    72         pmaddwd         xmm5,           xmm5                ;  square each word, adding adjacent pairs into dwords
    73         pmaddwd         xmm4,           xmm4
    74         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
    75         paddd           xmm7,           xmm4
    77         movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
    79         lea             rsi,            [rsi + rax]         ;  advance ref by one row
    80         lea             rdi,            [rdi + rdx]         ;  advance src by one row
    82         sub             rcx,            1                   ;  one row done
    83         jnz             .half_horiz_vert_variance16x_h_1    ;  repeat until rcx reaches 0
    85         pxor        xmm1,           xmm1
    86         pxor        xmm5,           xmm5                ; xmm5 = 0 for the dword unpacks below
    88         punpcklwd   xmm0,           xmm6                ; xmm0 is still 0: sum words 0..3 land in the high half of each dword
    89         punpckhwd   xmm1,           xmm6                ; sum words 4..7 likewise
    90         psrad       xmm0,           16                  ; arithmetic shift sign-extends each 16-bit sum to 32 bits
    91         psrad       xmm1,           16
    92         paddd       xmm0,           xmm1                ; xmm0 = 4 partial 32-bit sums
    93         movdqa      xmm1,           xmm0
    95         movdqa      xmm6,           xmm7
    96         punpckldq   xmm6,           xmm5                ; xmm6 = q0, 0, q1, 0 (q = squared-difference partials)
    97         punpckhdq   xmm7,           xmm5                ; xmm7 = q2, 0, q3, 0
    98         paddd       xmm6,           xmm7                ; xmm6 = q0+q2, 0, q1+q3, 0
   100         punpckldq   xmm0,           xmm5                ; same folding for the sum partials
   101         punpckhdq   xmm1,           xmm5
   102         paddd       xmm0,           xmm1
   104         movdqa      xmm7,           xmm6
   105         movdqa      xmm1,           xmm0
   107         psrldq      xmm7,           8                   ; bring the upper 64 bits down
   108         psrldq      xmm1,           8
   110         paddd       xmm6,           xmm7                ; low dword of xmm6 = total of squared differences
   111         paddd       xmm0,           xmm1                ; low dword of xmm0 = total of differences
   113         mov         rsi,            arg(5) ;[Sum]
   114         mov         rdi,            arg(6) ;[SSE]
   116         movd        [rsi],       xmm0                   ; *sum = low dword of xmm0
   117         movd        [rdi],       xmm6                   ; *sumsquared = low dword of xmm6
   119     ; begin epilog
   120     pop rdi
   121     pop rsi
   122     RESTORE_GOT
   123     RESTORE_XMM
   124     UNSHADOW_ARGS
   125     pop         rbp
   126     ret
   128 ;void vp9_half_vert_variance16x_h_sse2
   129 ;(
   130 ;    unsigned char *ref_ptr,
   131 ;    int ref_pixels_per_line,
   132 ;    unsigned char *src_ptr,
   133 ;    int src_pixels_per_line,
   134 ;    unsigned int Height,
   135 ;    int *sum,
   136 ;    unsigned int *sumsquared
   137 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall, builds a prediction
       ; from ref_ptr by averaging every reference row with the row below it
       ; (pavgb), compares it with the rows at src_ptr, and stores the sum of
       ; (prediction - src) to *sum and the sum of the squared differences to
       ; *sumsquared.
       ;
       ; Register use inside the loop:
       ;   rsi = current ref row      rax = ref stride (sign-extended)
       ;   rdi = current src row      rdx = src stride (sign-extended)
       ;   rcx = rows remaining
       ;   xmm0 = zero                xmm5 = previous reference row
       ;   xmm6 = 8 x 16-bit running sums of differences
       ;   xmm7 = 4 x 32-bit running sums of squared differences
       ;
       ; Notes:
       ;   - 16 bytes are read from each of Height + 1 reference rows.
       ;   - The row counter is only tested after the loop body has run, so
       ;     Height must be non-zero.
       ;   - Each 16-bit lane of xmm6 receives two differences of magnitude
       ;     <= 255 per row (2 * 64 * 255 = 32640), so paddw can wrap once
       ;     Height exceeds 64.
       ;   - arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and the matching
       ;     epilog macros come from the included x86_abi_support.asm, which is
       ;     not shown here; presumably they expose the 7 arguments as
       ;     arg(0)..arg(6) and preserve registers as the ABI requires -
       ;     confirm against that file.
   138 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
   139 sym(vp9_half_vert_variance16x_h_sse2):
   140     push        rbp
   141     mov         rbp, rsp
   142     SHADOW_ARGS_TO_STACK 7
   143     SAVE_XMM 7
   144     GET_GOT     rbx
   145     push rsi
   146     push rdi
   147     ; end prolog
   149         pxor            xmm6,           xmm6                ;  xmm6 = 0: running sum of differences (16-bit lanes)
   150         pxor            xmm7,           xmm7                ;  xmm7 = 0: running sum of squared differences (32-bit lanes)
   151         mov             rsi,            arg(0)              ; rsi = ref_ptr
   153         mov             rdi,            arg(2)              ; rdi = src_ptr
   154         movsxd          rcx,            dword ptr arg(4)    ; rcx = Height (row counter)
   155         movsxd          rax,            dword ptr arg(1)    ; rax = ref_pixels_per_line
   156         movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
   158         movdqu          xmm5,           XMMWORD PTR [rsi]   ; xmm5 = ref row 0
   159         lea             rsi,            [rsi + rax          ] ; rsi -> ref row 1
   160         pxor            xmm0,           xmm0                ; xmm0 = 0, used to widen bytes to words
   162 .half_vert_variance16x_h_1:
   163         movdqu          xmm3,           XMMWORD PTR [rsi]   ; xmm3 = ref row i+1
   165         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical average of rows i and i+1
   166         movdqa          xmm4,           xmm5                ; copy so both halves can be widened
   167         punpcklbw       xmm5,           xmm0                ; xmm5 = low 8 prediction bytes as words
   168         punpckhbw       xmm4,           xmm0                ; xmm4 = high 8 prediction bytes as words
   170         movq            xmm2,           QWORD PTR [rdi]     ; xmm2 = src bytes 0..7
   171         punpcklbw       xmm2,           xmm0                ; widen to words
   172         psubw           xmm5,           xmm2                ; xmm5 = prediction - src, columns 0..7
   173         movq            xmm2,           QWORD PTR [rdi+8]   ; xmm2 = src bytes 8..15
   174         punpcklbw       xmm2,           xmm0                ; widen to words
   175         psubw           xmm4,           xmm2                ; xmm4 = prediction - src, columns 8..15
   177         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
   178         paddw           xmm6,           xmm4
   179         pmaddwd         xmm5,           xmm5                ;  square each word, adding adjacent pairs into dwords
   180         pmaddwd         xmm4,           xmm4
   181         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
   182         paddd           xmm7,           xmm4
   184         movdqa          xmm5,           xmm3                ; current row becomes the upper row of the next pair
   186         lea             rsi,            [rsi + rax]         ; advance ref by one row
   187         lea             rdi,            [rdi + rdx]         ; advance src by one row
   189         sub             rcx,            1                   ; one row done
   190         jnz             .half_vert_variance16x_h_1          ; repeat until rcx reaches 0
   192         pxor        xmm1,           xmm1
   193         pxor        xmm5,           xmm5                ; xmm5 = 0 for the dword unpacks below
   195         punpcklwd   xmm0,           xmm6                ; xmm0 is still 0: sum words 0..3 land in the high half of each dword
   196         punpckhwd   xmm1,           xmm6                ; sum words 4..7 likewise
   197         psrad       xmm0,           16                  ; arithmetic shift sign-extends each 16-bit sum to 32 bits
   198         psrad       xmm1,           16
   199         paddd       xmm0,           xmm1                ; xmm0 = 4 partial 32-bit sums
   200         movdqa      xmm1,           xmm0
   202         movdqa      xmm6,           xmm7
   203         punpckldq   xmm6,           xmm5                ; xmm6 = q0, 0, q1, 0 (q = squared-difference partials)
   204         punpckhdq   xmm7,           xmm5                ; xmm7 = q2, 0, q3, 0
   205         paddd       xmm6,           xmm7                ; xmm6 = q0+q2, 0, q1+q3, 0
   207         punpckldq   xmm0,           xmm5                ; same folding for the sum partials
   208         punpckhdq   xmm1,           xmm5
   209         paddd       xmm0,           xmm1
   211         movdqa      xmm7,           xmm6
   212         movdqa      xmm1,           xmm0
   214         psrldq      xmm7,           8                   ; bring the upper 64 bits down
   215         psrldq      xmm1,           8
   217         paddd       xmm6,           xmm7                ; low dword of xmm6 = total of squared differences
   218         paddd       xmm0,           xmm1                ; low dword of xmm0 = total of differences
   220         mov         rsi,            arg(5) ;[Sum]
   221         mov         rdi,            arg(6) ;[SSE]
   223         movd        [rsi],       xmm0                   ; *sum = low dword of xmm0
   224         movd        [rdi],       xmm6                   ; *sumsquared = low dword of xmm6
   226     ; begin epilog
   227     pop rdi
   228     pop rsi
   229     RESTORE_GOT
   230     RESTORE_XMM
   231     UNSHADOW_ARGS
   232     pop         rbp
   233     ret
   235 ;void vp9_half_horiz_variance16x_h_sse2
   236 ;(
   237 ;    unsigned char *ref_ptr,
   238 ;    int ref_pixels_per_line,
   239 ;    unsigned char *src_ptr,
   240 ;    int src_pixels_per_line,
   241 ;    unsigned int Height,
   242 ;    int *sum,
   243 ;    unsigned int *sumsquared
   244 ;)
       ;
       ; For a block 16 pixels wide and Height rows tall, builds a prediction
       ; from ref_ptr by averaging every byte with its right-hand neighbour
       ; (pavgb of [rsi] and [rsi+1]), compares it with the rows at src_ptr,
       ; and stores the sum of (prediction - src) to *sum and the sum of the
       ; squared differences to *sumsquared.
       ;
       ; Register use inside the loop:
       ;   rsi = current ref row      rax = ref stride (sign-extended)
       ;   rdi = current src row      rdx = src stride (sign-extended)
       ;   rcx = rows remaining       xmm0 = zero
       ;   xmm6 = 8 x 16-bit running sums of differences
       ;   xmm7 = 4 x 32-bit running sums of squared differences
       ;
       ; Notes:
       ;   - 17 bytes are read from each of Height reference rows.
       ;   - The row counter is only tested after the loop body has run, so
       ;     Height must be non-zero.
       ;   - Each 16-bit lane of xmm6 receives two differences of magnitude
       ;     <= 255 per row (2 * 64 * 255 = 32640), so paddw can wrap once
       ;     Height exceeds 64.
       ;   - arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and the matching
       ;     epilog macros come from the included x86_abi_support.asm, which is
       ;     not shown here; presumably they expose the 7 arguments as
       ;     arg(0)..arg(6) and preserve registers as the ABI requires -
       ;     confirm against that file.
   245 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
   246 sym(vp9_half_horiz_variance16x_h_sse2):
   247     push        rbp
   248     mov         rbp, rsp
   249     SHADOW_ARGS_TO_STACK 7
   250     SAVE_XMM 7
   251     GET_GOT     rbx
   252     push rsi
   253     push rdi
   254     ; end prolog
   256         pxor            xmm6,           xmm6                ;  xmm6 = 0: running sum of differences (16-bit lanes)
   257         pxor            xmm7,           xmm7                ;  xmm7 = 0: running sum of squared differences (32-bit lanes)
   258         mov             rsi,            arg(0) ; rsi = ref_ptr
   260         mov             rdi,            arg(2) ; rdi = src_ptr
   261         movsxd          rcx,            dword ptr arg(4) ; rcx = Height (row counter)
   262         movsxd          rax,            dword ptr arg(1) ; rax = ref_pixels_per_line
   263         movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
   265         pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
   267 .half_horiz_variance16x_h_1:
   268         movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
   269         movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
   271         pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average
   272         movdqa          xmm1,           xmm5                ;  copy so both halves can be widened
   273         punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
   274         punpckhbw       xmm1,           xmm0                ;  xmm1 = high 8 prediction bytes as words
   276         movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
   277         punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
   278         movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
   279         punpcklbw       xmm2,           xmm0                ;  xmm2 = words of above
   281         psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
   282         psubw           xmm1,           xmm2                ;  xmm1 -= xmm2
   283         paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
   284         paddw           xmm6,           xmm1
   285         pmaddwd         xmm5,           xmm5                ;  square each word, adding adjacent pairs into dwords
   286         pmaddwd         xmm1,           xmm1
   287         paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
   288         paddd           xmm7,           xmm1
   290         lea             rsi,            [rsi + rax]         ;  advance ref by one row
   291         lea             rdi,            [rdi + rdx]         ;  advance src by one row
   293         sub             rcx,            1                   ;  one row done
   294         jnz             .half_horiz_variance16x_h_1         ;  repeat until rcx reaches 0
   296         pxor        xmm1,           xmm1
   297         pxor        xmm5,           xmm5                ; xmm5 = 0 for the dword unpacks below
   299         punpcklwd   xmm0,           xmm6                ; xmm0 is still 0: sum words 0..3 land in the high half of each dword
   300         punpckhwd   xmm1,           xmm6                ; sum words 4..7 likewise
   301         psrad       xmm0,           16                  ; arithmetic shift sign-extends each 16-bit sum to 32 bits
   302         psrad       xmm1,           16
   303         paddd       xmm0,           xmm1                ; xmm0 = 4 partial 32-bit sums
   304         movdqa      xmm1,           xmm0
   306         movdqa      xmm6,           xmm7
   307         punpckldq   xmm6,           xmm5                ; xmm6 = q0, 0, q1, 0 (q = squared-difference partials)
   308         punpckhdq   xmm7,           xmm5                ; xmm7 = q2, 0, q3, 0
   309         paddd       xmm6,           xmm7                ; xmm6 = q0+q2, 0, q1+q3, 0
   311         punpckldq   xmm0,           xmm5                ; same folding for the sum partials
   312         punpckhdq   xmm1,           xmm5
   313         paddd       xmm0,           xmm1
   315         movdqa      xmm7,           xmm6
   316         movdqa      xmm1,           xmm0
   318         psrldq      xmm7,           8                   ; bring the upper 64 bits down
   319         psrldq      xmm1,           8
   321         paddd       xmm6,           xmm7                ; low dword of xmm6 = total of squared differences
   322         paddd       xmm0,           xmm1                ; low dword of xmm0 = total of differences
   324         mov         rsi,            arg(5) ;[Sum]
   325         mov         rdi,            arg(6) ;[SSE]
   327         movd        [rsi],       xmm0                   ; *sum = low dword of xmm0
   328         movd        [rdi],       xmm6                   ; *sumsquared = low dword of xmm6
   330     ; begin epilog
   331     pop rdi
   332     pop rsi
   333     RESTORE_GOT
   334     RESTORE_XMM
   335     UNSHADOW_ARGS
   336     pop         rbp
   337     ret

mercurial