media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,337 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "vpx_ports/x86_abi_support.asm"
    1.15 +
     1.16 +;void vp9_half_horiz_vert_variance16x_h_sse2 -- sum and sum of squares of (H+V half-pel averaged ref - src), 16 columns wide
     1.17 +;(
     1.18 +;    unsigned char *ref_ptr,          block that gets averaged; 17 bytes are read per row, over Height+1 rows
     1.19 +;    int ref_pixels_per_line,         byte step between consecutive ref rows
     1.20 +;    unsigned char *src_ptr,          block subtracted from the averaged ref; 16 bytes are read per row
     1.21 +;    int src_pixels_per_line,         byte step between consecutive src rows
     1.22 +;    unsigned int Height,             row count; the loop decrements before testing, so it must be >= 1
     1.23 +;    int *sum,                        receives the 32-bit sum of the differences
     1.24 +;    unsigned int *sumsquared         receives the 32-bit sum of the squared differences
     1.25 +;)
     1.26 +global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
     1.27 +sym(vp9_half_horiz_vert_variance16x_h_sse2):
     1.28 +    push        rbp
     1.29 +    mov         rbp, rsp
     1.30 +    SHADOW_ARGS_TO_STACK 7    ; macro from x86_abi_support.asm; presumably makes the 7 args reachable via arg(n) -- confirm
     1.31 +    SAVE_XMM 7    ; macro from x86_abi_support.asm; paired with RESTORE_XMM in the epilog
     1.32 +    GET_GOT     rbx    ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
     1.33 +    push rsi
     1.34 +    push rdi
     1.35 +    ; end prolog
     1.36 +
     1.37 +        pxor            xmm6,           xmm6                ;  sum accumulator: eight 16-bit lanes
     1.38 +        pxor            xmm7,           xmm7                ;  sum-of-squares accumulator: four 32-bit lanes
     1.39 +        mov             rsi,            arg(0) ; rsi = ref_ptr
     1.40 +
     1.41 +        mov             rdi,            arg(2) ; rdi = src_ptr
     1.42 +        movsxd          rcx,            dword ptr arg(4) ; rcx = Height, used as the row counter
     1.43 +        movsxd          rax,            dword ptr arg(1) ; rax = ref_pixels_per_line
     1.44 +        movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
     1.45 +
     1.46 +        pxor            xmm0,           xmm0                ;  xmm0 stays zero; used to widen bytes to words
     1.47 +
     1.48 +        movdqu          xmm5,           XMMWORD PTR [rsi]
     1.49 +        movdqu          xmm3,           XMMWORD PTR [rsi+1]
     1.50 +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal half-pel of the first ref row
     1.51 +
     1.52 +        lea             rsi,            [rsi + rax]
     1.53 +
     1.54 +.half_horiz_vert_variance16x_h_1:
     1.55 +        movdqu          xmm1,           XMMWORD PTR [rsi]     ;  xmm1 = ref bytes 0..15 of the next row
     1.56 +        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  xmm2 = ref bytes 1..16 of the next row
     1.57 +        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal half-pel of the next row
     1.58 +
     1.59 +        pavgb           xmm5,           xmm1                ;  xmm5 = avg(previous row, next row): vertical half-pel
     1.60 +
     1.61 +        movdqa          xmm4,           xmm5                ;  copy so both 8-byte halves can be widened
     1.62 +        punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 bytes as words
     1.63 +        punpckhbw       xmm4,           xmm0                ;  xmm4 = high 8 bytes as words
     1.64 +
     1.65 +        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src bytes 0..7
     1.66 +        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
     1.67 +        psubw           xmm5,           xmm3                ;  xmm5 = differences for columns 0..7
     1.68 +
     1.69 +        movq            xmm3,           QWORD PTR [rdi+8]    ;  xmm3 = src bytes 8..15
     1.70 +        punpcklbw       xmm3,           xmm0
     1.71 +        psubw           xmm4,           xmm3                ;  xmm4 = differences for columns 8..15
     1.72 +
     1.73 +        paddw           xmm6,           xmm5                ;  xmm6 += differences (16-bit lanes)
     1.74 +        paddw           xmm6,           xmm4                ;  NOTE(review): two diffs of magnitude <= 255 per lane per row; fits signed 16 bits for Height <= 64
     1.75 +        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> four dwords
     1.76 +        pmaddwd         xmm4,           xmm4
     1.77 +        paddd           xmm7,           xmm5                ;  xmm7 += squared differences
     1.78 +        paddd           xmm7,           xmm4
     1.79 +
     1.80 +        movdqa          xmm5,           xmm1                ;  keep this row's horizontal average for the next iteration
     1.81 +
     1.82 +        lea             rsi,            [rsi + rax]
     1.83 +        lea             rdi,            [rdi + rdx]
     1.84 +
     1.85 +        sub             rcx,            1                   ;  one row done
     1.86 +        jnz             .half_horiz_vert_variance16x_h_1    ;  repeat until the counter reaches zero
     1.87 +
     1.88 +        pxor        xmm1,           xmm1                ;  zero for the high-half widening below
     1.89 +        pxor        xmm5,           xmm5                ;  zero for the dword interleaves below
     1.90 +
     1.91 +        punpcklwd   xmm0,           xmm6                ;  xmm0 is still zero: low 4 sums land in the upper word of each dword
     1.92 +        punpckhwd   xmm1,           xmm6                ;  same for the high 4 sums
     1.93 +        psrad       xmm0,           16                ;  arithmetic shift sign-extends each 16-bit sum to 32 bits
     1.94 +        psrad       xmm1,           16
     1.95 +        paddd       xmm0,           xmm1                ;  xmm0 = four partial 32-bit sums
     1.96 +        movdqa      xmm1,           xmm0
     1.97 +
     1.98 +        movdqa      xmm6,           xmm7
     1.99 +        punpckldq   xmm6,           xmm5                ;  spread the squared sums: [q0, 0, q1, 0]
    1.100 +        punpckhdq   xmm7,           xmm5                ;  [q2, 0, q3, 0]
    1.101 +        paddd       xmm6,           xmm7                ;  xmm6 = [q0+q2, 0, q1+q3, 0]
    1.102 +
    1.103 +        punpckldq   xmm0,           xmm5                ;  same folding for the sums
    1.104 +        punpckhdq   xmm1,           xmm5
    1.105 +        paddd       xmm0,           xmm1
    1.106 +
    1.107 +        movdqa      xmm7,           xmm6
    1.108 +        movdqa      xmm1,           xmm0
    1.109 +
    1.110 +        psrldq      xmm7,           8                ;  bring the upper 64 bits down
    1.111 +        psrldq      xmm1,           8
    1.112 +
    1.113 +        paddd       xmm6,           xmm7                ;  low dword of xmm6 = total of squared differences
    1.114 +        paddd       xmm0,           xmm1                ;  low dword of xmm0 = total of differences
    1.115 +
    1.116 +        mov         rsi,            arg(5) ; rsi = sum
    1.117 +        mov         rdi,            arg(6) ; rdi = sumsquared
    1.118 +
    1.119 +        movd        [rsi],       xmm0                ;  *sum = total of differences
    1.120 +        movd        [rdi],       xmm6                ;  *sumsquared = total of squared differences
    1.121 +
    1.122 +    ; begin epilog
    1.123 +    pop rdi
    1.124 +    pop rsi
    1.125 +    RESTORE_GOT
    1.126 +    RESTORE_XMM
    1.127 +    UNSHADOW_ARGS
    1.128 +    pop         rbp
    1.129 +    ret
   1.130 +
    1.131 +;void vp9_half_vert_variance16x_h_sse2 -- sum and sum of squares of (vertical half-pel averaged ref - src), 16 columns wide
    1.132 +;(
    1.133 +;    unsigned char *ref_ptr,          block that gets averaged; 16 bytes are read per row, over Height+1 rows
    1.134 +;    int ref_pixels_per_line,         byte step between consecutive ref rows
    1.135 +;    unsigned char *src_ptr,          block subtracted from the averaged ref; 16 bytes are read per row
    1.136 +;    int src_pixels_per_line,         byte step between consecutive src rows
    1.137 +;    unsigned int Height,             row count; the loop decrements before testing, so it must be >= 1
    1.138 +;    int *sum,                        receives the 32-bit sum of the differences
    1.139 +;    unsigned int *sumsquared         receives the 32-bit sum of the squared differences
    1.140 +;)
    1.141 +global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
    1.142 +sym(vp9_half_vert_variance16x_h_sse2):
    1.143 +    push        rbp
    1.144 +    mov         rbp, rsp
    1.145 +    SHADOW_ARGS_TO_STACK 7    ; macro from x86_abi_support.asm; presumably makes the 7 args reachable via arg(n) -- confirm
    1.146 +    SAVE_XMM 7    ; macro from x86_abi_support.asm; paired with RESTORE_XMM in the epilog
    1.147 +    GET_GOT     rbx    ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
    1.148 +    push rsi
    1.149 +    push rdi
    1.150 +    ; end prolog
    1.151 +
    1.152 +        pxor            xmm6,           xmm6                ;  sum accumulator: eight 16-bit lanes
    1.153 +        pxor            xmm7,           xmm7                ;  sum-of-squares accumulator: four 32-bit lanes
    1.154 +        mov             rsi,            arg(0)              ; rsi = ref_ptr
    1.155 +
    1.156 +        mov             rdi,            arg(2)              ; rdi = src_ptr
    1.157 +        movsxd          rcx,            dword ptr arg(4)    ; rcx = Height, used as the row counter
    1.158 +        movsxd          rax,            dword ptr arg(1)    ; rax = ref_pixels_per_line
    1.159 +        movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
    1.160 +
    1.161 +        movdqu          xmm5,           XMMWORD PTR [rsi]    ;  xmm5 = first ref row (the initial "previous row")
    1.162 +        lea             rsi,            [rsi + rax          ]    ;  step to the second ref row
    1.163 +        pxor            xmm0,           xmm0                ;  xmm0 stays zero; used to widen bytes to words
    1.164 +
    1.165 +.half_vert_variance16x_h_1:
    1.166 +        movdqu          xmm3,           XMMWORD PTR [rsi]    ;  xmm3 = current ref row
    1.167 +
    1.168 +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical half-pel of previous and current rows
    1.169 +        movdqa          xmm4,           xmm5                ;  copy so both 8-byte halves can be widened
    1.170 +        punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 bytes as words
    1.171 +        punpckhbw       xmm4,           xmm0                ;  xmm4 = high 8 bytes as words
    1.172 +
    1.173 +        movq            xmm2,           QWORD PTR [rdi]    ;  xmm2 = src bytes 0..7
    1.174 +        punpcklbw       xmm2,           xmm0
    1.175 +        psubw           xmm5,           xmm2                ;  xmm5 = differences for columns 0..7
    1.176 +        movq            xmm2,           QWORD PTR [rdi+8]    ;  xmm2 = src bytes 8..15
    1.177 +        punpcklbw       xmm2,           xmm0
    1.178 +        psubw           xmm4,           xmm2                ;  xmm4 = differences for columns 8..15
    1.179 +
    1.180 +        paddw           xmm6,           xmm5                ;  xmm6 += differences (16-bit lanes)
    1.181 +        paddw           xmm6,           xmm4                ;  NOTE(review): two diffs of magnitude <= 255 per lane per row; fits signed 16 bits for Height <= 64
    1.182 +        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> four dwords
    1.183 +        pmaddwd         xmm4,           xmm4
    1.184 +        paddd           xmm7,           xmm5                ;  xmm7 += squared differences
    1.185 +        paddd           xmm7,           xmm4
    1.186 +
    1.187 +        movdqa          xmm5,           xmm3                ;  current row becomes the previous row for the next iteration
    1.188 +
    1.189 +        lea             rsi,            [rsi + rax]
    1.190 +        lea             rdi,            [rdi + rdx]
    1.191 +
    1.192 +        sub             rcx,            1                   ;  one row done
    1.193 +        jnz             .half_vert_variance16x_h_1          ;  repeat until the counter reaches zero
    1.194 +
    1.195 +        pxor        xmm1,           xmm1                ;  zero for the high-half widening below
    1.196 +        pxor        xmm5,           xmm5                ;  zero for the dword interleaves below
    1.197 +
    1.198 +        punpcklwd   xmm0,           xmm6                ;  xmm0 is still zero: low 4 sums land in the upper word of each dword
    1.199 +        punpckhwd   xmm1,           xmm6                ;  same for the high 4 sums
    1.200 +        psrad       xmm0,           16                ;  arithmetic shift sign-extends each 16-bit sum to 32 bits
    1.201 +        psrad       xmm1,           16
    1.202 +        paddd       xmm0,           xmm1                ;  xmm0 = four partial 32-bit sums
    1.203 +        movdqa      xmm1,           xmm0
    1.204 +
    1.205 +        movdqa      xmm6,           xmm7
    1.206 +        punpckldq   xmm6,           xmm5                ;  spread the squared sums: [q0, 0, q1, 0]
    1.207 +        punpckhdq   xmm7,           xmm5                ;  [q2, 0, q3, 0]
    1.208 +        paddd       xmm6,           xmm7                ;  xmm6 = [q0+q2, 0, q1+q3, 0]
    1.209 +
    1.210 +        punpckldq   xmm0,           xmm5                ;  same folding for the sums
    1.211 +        punpckhdq   xmm1,           xmm5
    1.212 +        paddd       xmm0,           xmm1
    1.213 +
    1.214 +        movdqa      xmm7,           xmm6
    1.215 +        movdqa      xmm1,           xmm0
    1.216 +
    1.217 +        psrldq      xmm7,           8                ;  bring the upper 64 bits down
    1.218 +        psrldq      xmm1,           8
    1.219 +
    1.220 +        paddd       xmm6,           xmm7                ;  low dword of xmm6 = total of squared differences
    1.221 +        paddd       xmm0,           xmm1                ;  low dword of xmm0 = total of differences
    1.222 +
    1.223 +        mov         rsi,            arg(5) ; rsi = sum
    1.224 +        mov         rdi,            arg(6) ; rdi = sumsquared
    1.225 +
    1.226 +        movd        [rsi],       xmm0                ;  *sum = total of differences
    1.227 +        movd        [rdi],       xmm6                ;  *sumsquared = total of squared differences
    1.228 +
    1.229 +    ; begin epilog
    1.230 +    pop rdi
    1.231 +    pop rsi
    1.232 +    RESTORE_GOT
    1.233 +    RESTORE_XMM
    1.234 +    UNSHADOW_ARGS
    1.235 +    pop         rbp
    1.236 +    ret
   1.237 +
    1.238 +;void vp9_half_horiz_variance16x_h_sse2 -- sum and sum of squares of (horizontal half-pel averaged ref - src), 16 columns wide
    1.239 +;(
    1.240 +;    unsigned char *ref_ptr,          block that gets averaged; 17 bytes are read per row, over Height rows
    1.241 +;    int ref_pixels_per_line,         byte step between consecutive ref rows
    1.242 +;    unsigned char *src_ptr,          block subtracted from the averaged ref; 16 bytes are read per row
    1.243 +;    int src_pixels_per_line,         byte step between consecutive src rows
    1.244 +;    unsigned int Height,             row count; the loop decrements before testing, so it must be >= 1
    1.245 +;    int *sum,                        receives the 32-bit sum of the differences
    1.246 +;    unsigned int *sumsquared         receives the 32-bit sum of the squared differences
    1.247 +;)
    1.248 +global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
    1.249 +sym(vp9_half_horiz_variance16x_h_sse2):
    1.250 +    push        rbp
    1.251 +    mov         rbp, rsp
    1.252 +    SHADOW_ARGS_TO_STACK 7    ; macro from x86_abi_support.asm; presumably makes the 7 args reachable via arg(n) -- confirm
    1.253 +    SAVE_XMM 7    ; macro from x86_abi_support.asm; paired with RESTORE_XMM in the epilog
    1.254 +    GET_GOT     rbx    ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
    1.255 +    push rsi
    1.256 +    push rdi
    1.257 +    ; end prolog
    1.258 +
    1.259 +        pxor            xmm6,           xmm6                ;  sum accumulator: eight 16-bit lanes
    1.260 +        pxor            xmm7,           xmm7                ;  sum-of-squares accumulator: four 32-bit lanes
    1.261 +        mov             rsi,            arg(0) ; rsi = ref_ptr
    1.262 +
    1.263 +        mov             rdi,            arg(2) ; rdi = src_ptr
    1.264 +        movsxd          rcx,            dword ptr arg(4) ; rcx = Height, used as the row counter
    1.265 +        movsxd          rax,            dword ptr arg(1) ; rax = ref_pixels_per_line
    1.266 +        movsxd          rdx,            dword ptr arg(3)    ; rdx = src_pixels_per_line
    1.267 +
    1.268 +        pxor            xmm0,           xmm0                ;  xmm0 stays zero; used to widen bytes to words
    1.269 +
    1.270 +.half_horiz_variance16x_h_1:
    1.271 +        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = ref bytes 0..15
    1.272 +        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = ref bytes 1..16
    1.273 +
    1.274 +        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal half-pel
    1.275 +        movdqa          xmm1,           xmm5                ;  copy so both 8-byte halves can be widened
    1.276 +        punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 bytes as words
    1.277 +        punpckhbw       xmm1,           xmm0                ;  xmm1 = high 8 bytes as words
    1.278 +
    1.279 +        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src bytes 0..7
    1.280 +        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
    1.281 +        movq            xmm2,           QWORD PTR [rdi+8]    ;  xmm2 = src bytes 8..15
    1.282 +        punpcklbw       xmm2,           xmm0
    1.283 +
    1.284 +        psubw           xmm5,           xmm3                ;  xmm5 = differences for columns 0..7
    1.285 +        psubw           xmm1,           xmm2                ;  xmm1 = differences for columns 8..15
    1.286 +        paddw           xmm6,           xmm5                ;  xmm6 += differences (16-bit lanes)
    1.287 +        paddw           xmm6,           xmm1                ;  NOTE(review): two diffs of magnitude <= 255 per lane per row; fits signed 16 bits for Height <= 64
    1.288 +        pmaddwd         xmm5,           xmm5                ;  square each word and add adjacent pairs -> four dwords
    1.289 +        pmaddwd         xmm1,           xmm1
    1.290 +        paddd           xmm7,           xmm5                ;  xmm7 += squared differences
    1.291 +        paddd           xmm7,           xmm1
    1.292 +
    1.293 +        lea             rsi,            [rsi + rax]
    1.294 +        lea             rdi,            [rdi + rdx]
    1.295 +
    1.296 +        sub             rcx,            1                   ;  one row done
    1.297 +        jnz             .half_horiz_variance16x_h_1         ;  repeat until the counter reaches zero
    1.298 +
    1.299 +        pxor        xmm1,           xmm1                ;  zero for the high-half widening below
    1.300 +        pxor        xmm5,           xmm5                ;  zero for the dword interleaves below
    1.301 +
    1.302 +        punpcklwd   xmm0,           xmm6                ;  xmm0 is still zero: low 4 sums land in the upper word of each dword
    1.303 +        punpckhwd   xmm1,           xmm6                ;  same for the high 4 sums
    1.304 +        psrad       xmm0,           16                ;  arithmetic shift sign-extends each 16-bit sum to 32 bits
    1.305 +        psrad       xmm1,           16
    1.306 +        paddd       xmm0,           xmm1                ;  xmm0 = four partial 32-bit sums
    1.307 +        movdqa      xmm1,           xmm0
    1.308 +
    1.309 +        movdqa      xmm6,           xmm7
    1.310 +        punpckldq   xmm6,           xmm5                ;  spread the squared sums: [q0, 0, q1, 0]
    1.311 +        punpckhdq   xmm7,           xmm5                ;  [q2, 0, q3, 0]
    1.312 +        paddd       xmm6,           xmm7                ;  xmm6 = [q0+q2, 0, q1+q3, 0]
    1.313 +
    1.314 +        punpckldq   xmm0,           xmm5                ;  same folding for the sums
    1.315 +        punpckhdq   xmm1,           xmm5
    1.316 +        paddd       xmm0,           xmm1
    1.317 +
    1.318 +        movdqa      xmm7,           xmm6
    1.319 +        movdqa      xmm1,           xmm0
    1.320 +
    1.321 +        psrldq      xmm7,           8                ;  bring the upper 64 bits down
    1.322 +        psrldq      xmm1,           8
    1.323 +
    1.324 +        paddd       xmm6,           xmm7                ;  low dword of xmm6 = total of squared differences
    1.325 +        paddd       xmm0,           xmm1                ;  low dword of xmm0 = total of differences
    1.326 +
    1.327 +        mov         rsi,            arg(5) ; rsi = sum
    1.328 +        mov         rdi,            arg(6) ; rdi = sumsquared
    1.329 +
    1.330 +        movd        [rsi],       xmm0                ;  *sum = total of differences
    1.331 +        movd        [rdi],       xmm6                ;  *sumsquared = total of squared differences
    1.332 +
    1.333 +    ; begin epilog
    1.334 +    pop rdi
    1.335 +    pop rsi
    1.336 +    RESTORE_GOT
    1.337 +    RESTORE_XMM
    1.338 +    UNSHADOW_ARGS
    1.339 +    pop         rbp
    1.340 +    ret

mercurial