media/libvpx/vp8/encoder/x86/encodeopt.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/x86/encodeopt.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,386 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr) -- SSE2: returns the sum of (coeff[i] - dcoef[i])^2 over the 16 words (32 bytes) at each pointer
    1.18 +global sym(vp8_block_error_xmm) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
    1.19 +sym(vp8_block_error_xmm):
    1.20 +    push        rbp
    1.21 +    mov         rbp, rsp
    1.22 +    SHADOW_ARGS_TO_STACK 2 ; ABI helper macro (defined elsewhere); 2 matches the number of arguments read via arg() below
    1.23 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
    1.24 +    push rdi
    1.25 +    ; end prologue
    1.26 +
    1.27 +        mov         rsi,        arg(0) ; rsi = coeff_ptr
    1.28 +        mov         rdi,        arg(1) ; rdi = dcoef_ptr
    1.29 +
    1.30 +        movdqa      xmm0,       [rsi] ; coeff words 0..7 (movdqa faults unless the address is 16-byte aligned)
    1.31 +        movdqa      xmm1,       [rdi] ; dcoef words 0..7
    1.32 +
    1.33 +        movdqa      xmm2,       [rsi+16] ; coeff words 8..15
    1.34 +        movdqa      xmm3,       [rdi+16] ; dcoef words 8..15
    1.35 +
    1.36 +        psubw       xmm0,       xmm1 ; per-word difference, words 0..7 (16-bit wrapping subtract)
    1.37 +        psubw       xmm2,       xmm3 ; per-word difference, words 8..15
    1.38 +
    1.39 +        pmaddwd     xmm0,       xmm0 ; square each difference and add adjacent pairs -> 4 dword sums
    1.40 +        pmaddwd     xmm2,       xmm2 ; same for the second half
    1.41 +
    1.42 +        paddd       xmm0,       xmm2 ; xmm0 = 4 dword partial sums covering all 16 words
    1.43 +
    1.44 +        pxor        xmm5,       xmm5 ; xmm5 = 0, used to zero-extend dwords to qwords
    1.45 +        movdqa      xmm1,       xmm0
    1.46 +
    1.47 +        punpckldq   xmm0,       xmm5 ; partial sums 0,1 widened to two qwords
    1.48 +        punpckhdq   xmm1,       xmm5 ; partial sums 2,3 widened to two qwords
    1.49 +
    1.50 +        paddd       xmm0,       xmm1 ; dword-wise add: low dword of each qword = s0+s2 / s1+s3, high dwords stay 0
    1.51 +        movdqa      xmm1,       xmm0
    1.52 +
    1.53 +        psrldq      xmm0,       8 ; byte shift: bring the upper qword down to the low qword
    1.54 +        paddd       xmm0,       xmm1 ; low dword = s0+s1+s2+s3 (32-bit wrapping add)
    1.55 +
    1.56 +        movq        rax,        xmm0 ; return value: low qword of xmm0 (total in the low dword, zero above it)
    1.57 +
    1.58 +    pop rdi
    1.59 +    pop rsi
    1.60 +    ; begin epilog
    1.61 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
    1.62 +    pop         rbp
    1.63 +    ret
    1.64 +
    1.65 +;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr) -- MMX: returns the sum of (coeff[i] - dcoef[i])^2 over the 16 words (32 bytes) at each pointer
    1.66 +global sym(vp8_block_error_mmx) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
    1.67 +sym(vp8_block_error_mmx):
    1.68 +    push        rbp
    1.69 +    mov         rbp, rsp
    1.70 +    SHADOW_ARGS_TO_STACK 2 ; ABI helper macro (defined elsewhere); 2 matches the number of arguments read via arg() below
    1.71 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
    1.72 +    push rdi
    1.73 +    ; end prolog -- NOTE(review): no emms is executed in this routine; presumably callers reset MMX/x87 state, verify
    1.74 +
    1.75 +
    1.76 +        mov         rsi,        arg(0) ; rsi = coeff_ptr
    1.77 +        pxor        mm7,        mm7 ; mm7 = 0
    1.78 +
    1.79 +        mov         rdi,        arg(1) ; rdi = dcoef_ptr
    1.80 +        movq        mm3,        [rsi] ; coeff words 0..3
    1.81 +
    1.82 +        movq        mm4,        [rdi] ; dcoef words 0..3
    1.83 +        movq        mm5,        [rsi+8] ; coeff words 4..7
    1.84 +
    1.85 +        movq        mm6,        [rdi+8] ; dcoef words 4..7
    1.86 +        pxor        mm1,        mm1 ; mm1 = 0; stands in for 'movd mm1, dc' with dc fixed at 0
    1.87 +
    1.88 +        movq        mm2,        mm7 ; mm2 = 0
    1.89 +        psubw       mm5,        mm6 ; per-word difference, words 4..7 (16-bit wrapping subtract)
    1.90 +
    1.91 +        por         mm1,        mm2 ; 0 | 0: mm1 is still 0
    1.92 +        pmaddwd     mm5,        mm5 ; square and add adjacent pairs -> 2 dword sums for words 4..7
    1.93 +
    1.94 +        pcmpeqw     mm1,        mm7 ; every word compares equal to 0 -> mm1 = all ones
    1.95 +        psubw       mm3,        mm4 ; per-word difference, words 0..3
    1.96 +
    1.97 +        pand        mm1,        mm3 ; mask is all ones here, so mm1 = the unmodified differences of words 0..3
    1.98 +        pmaddwd     mm1,        mm1 ; 2 dword sums for words 0..3
    1.99 +
   1.100 +        paddd       mm1,        mm5 ; mm1 = running pair of dword partial sums (words 0..7)
   1.101 +        movq        mm3,        [rsi+16] ; coeff words 8..11
   1.102 +
   1.103 +        movq        mm4,        [rdi+16] ; dcoef words 8..11
   1.104 +        movq        mm5,        [rsi+24] ; coeff words 12..15
   1.105 +
   1.106 +        movq        mm6,        [rdi+24] ; dcoef words 12..15
   1.107 +        psubw       mm5,        mm6 ; per-word difference, words 12..15
   1.108 +
   1.109 +        pmaddwd     mm5,        mm5 ; 2 dword sums for words 12..15
   1.110 +        psubw       mm3,        mm4 ; per-word difference, words 8..11
   1.111 +
   1.112 +        pmaddwd     mm3,        mm3 ; 2 dword sums for words 8..11
   1.113 +        paddd       mm3,        mm5 ; combine the second-half sums
   1.114 +
   1.115 +        paddd       mm1,        mm3 ; mm1 = [lo, hi] dword partial sums over all 16 words
   1.116 +        movq        mm0,        mm1
   1.117 +
   1.118 +        psrlq       mm1,        32 ; mm1 = [hi, 0]
   1.119 +        paddd       mm0,        mm1 ; mm0 = [lo+hi, hi]; the total is in the low dword
   1.120 +
   1.121 +        movq        rax,        mm0 ; return value in the low 32 bits; the upper dword of mm0 still holds the 'hi' partial sum
   1.122 +
   1.123 +    pop rdi
   1.124 +    pop rsi
   1.125 +    ; begin epilog
   1.126 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
   1.127 +    pop         rbp
   1.128 +    ret
   1.129 +
   1.130 +
   1.131 +;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -- MMX: sum of squared word differences over 16 groups of 16 words; words 0..3 of each group are ANDed with a mask derived from dc
   1.132 +global sym(vp8_mbblock_error_mmx_impl) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
   1.133 +sym(vp8_mbblock_error_mmx_impl):
   1.134 +    push        rbp
   1.135 +    mov         rbp, rsp
   1.136 +    SHADOW_ARGS_TO_STACK 3 ; ABI helper macro (defined elsewhere); 3 matches the number of arguments read via arg() below
   1.137 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
   1.138 +    push rdi
   1.139 +    ; end prolog -- NOTE(review): no emms is executed in this routine; presumably callers reset MMX/x87 state, verify
   1.140 +
   1.141 +
   1.142 +        mov         rsi,        arg(0) ; rsi = coeff_ptr
   1.143 +        pxor        mm7,        mm7 ; mm7 = 0, comparison operand for the mask
   1.144 +
   1.145 +        mov         rdi,        arg(1) ; rdi = dcoef_ptr
   1.146 +        pxor        mm2,        mm2 ; mm2 = 0, accumulator of two dword partial sums
   1.147 +
   1.148 +        movd        mm1,        dword ptr arg(2) ; mm1 = dc in the low dword, upper dword zeroed by movd
   1.149 +        por         mm1,        mm2 ; mm2 is 0 here, so mm1 is unchanged
   1.150 +
   1.151 +        pcmpeqw     mm1,        mm7 ; mask: each word = 0xFFFF where the matching 16 bits of dc are zero, else 0; words 2,3 are always 0xFFFF
   1.152 +        mov         rcx,        16 ; loop counter: 16 iterations of 32 bytes each
   1.153 +
   1.154 +.mberror_loop_mmx: ; per iteration: consume 16 words from each pointer, add their squared differences into mm2
   1.155 +        movq        mm3,       [rsi] ; coeff words 0..3 of this group
   1.156 +        movq        mm4,       [rdi] ; dcoef words 0..3 of this group
   1.157 +
   1.158 +        movq        mm5,       [rsi+8] ; coeff words 4..7
   1.159 +        movq        mm6,       [rdi+8] ; dcoef words 4..7
   1.160 +
   1.161 +
   1.162 +        psubw       mm5,        mm6 ; per-word difference, words 4..7 (16-bit wrapping subtract)
   1.163 +        pmaddwd     mm5,        mm5 ; square and add adjacent pairs -> 2 dword sums
   1.164 +
   1.165 +        psubw       mm3,        mm4 ; per-word difference, words 0..3
   1.166 +        pand        mm3,        mm1 ; apply the dc mask: a zero mask word drops that difference from the sum
   1.167 +
   1.168 +        pmaddwd     mm3,        mm3 ; 2 dword sums for the masked words 0..3
   1.169 +        paddd       mm2,        mm5 ; accumulate words 4..7
   1.170 +
   1.171 +        paddd       mm2,        mm3 ; accumulate words 0..3
   1.172 +        movq        mm3,       [rsi+16] ; coeff words 8..11
   1.173 +
   1.174 +        movq        mm4,       [rdi+16] ; dcoef words 8..11
   1.175 +        movq        mm5,       [rsi+24] ; coeff words 12..15
   1.176 +
   1.177 +        movq        mm6,       [rdi+24] ; dcoef words 12..15
   1.178 +        psubw       mm5,        mm6 ; per-word difference, words 12..15
   1.179 +
   1.180 +        pmaddwd     mm5,        mm5 ; 2 dword sums for words 12..15
   1.181 +        psubw       mm3,        mm4 ; per-word difference, words 8..11
   1.182 +
   1.183 +        pmaddwd     mm3,        mm3 ; 2 dword sums for words 8..11
   1.184 +        paddd       mm2,        mm5 ; accumulate words 12..15
   1.185 +
   1.186 +        paddd       mm2,        mm3 ; accumulate words 8..11
   1.187 +        add         rsi,        32 ; advance coeff pointer to the next 16-word group
   1.188 +
   1.189 +        add         rdi,        32 ; advance dcoef pointer to the next 16-word group
   1.190 +        sub         rcx,        1 ; sets ZF when the last group has been processed
   1.191 +
   1.192 +        jnz         .mberror_loop_mmx
   1.193 +
   1.194 +        movq        mm0,        mm2 ; mm2 = [lo, hi] dword partial sums
   1.195 +        psrlq       mm2,        32 ; mm2 = [hi, 0]
   1.196 +
   1.197 +        paddd       mm0,        mm2 ; mm0 = [lo+hi, hi]; the total is in the low dword
   1.198 +        movq        rax,        mm0 ; return value in the low 32 bits; the upper dword of mm0 still holds the 'hi' partial sum
   1.199 +
   1.200 +    pop rdi
   1.201 +    pop rsi
   1.202 +    ; begin epilog
   1.203 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
   1.204 +    pop         rbp
   1.205 +    ret
   1.206 +
   1.207 +
   1.208 +;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -- SSE2: sum of squared word differences over 16 groups of 16 words; words 0..7 of each group are ANDed with a mask derived from dc
   1.209 +global sym(vp8_mbblock_error_xmm_impl) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
   1.210 +sym(vp8_mbblock_error_xmm_impl):
   1.211 +    push        rbp
   1.212 +    mov         rbp, rsp
   1.213 +    SHADOW_ARGS_TO_STACK 3 ; ABI helper macro (defined elsewhere); 3 matches the number of arguments read via arg() below
   1.214 +    SAVE_XMM 6 ; macro defined elsewhere; presumably preserves xmm6 (the highest register used here) where the ABI requires it -- verify
   1.215 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
   1.216 +    push rdi
   1.217 +    ; end prolog
   1.218 +
   1.219 +
   1.220 +        mov         rsi,        arg(0) ; rsi = coeff_ptr
   1.221 +        pxor        xmm6,       xmm6 ; xmm6 = 0: mask comparison operand, later the zero used for widening
   1.222 +
   1.223 +        mov         rdi,        arg(1) ; rdi = dcoef_ptr
   1.224 +        pxor        xmm4,       xmm4 ; xmm4 = 0, accumulator of four dword partial sums
   1.225 +
   1.226 +        movd        xmm5,       dword ptr arg(2) ; xmm5 = dc in the low dword, remaining bits zeroed by movd
   1.227 +        por         xmm5,       xmm4 ; xmm4 is 0 here, so xmm5 is unchanged
   1.228 +
   1.229 +        pcmpeqw     xmm5,       xmm6 ; mask: each word = 0xFFFF where the matching 16 bits of dc are zero, else 0; words 2..7 are always 0xFFFF
   1.230 +        mov         rcx,        16 ; loop counter: 16 iterations of 32 bytes each
   1.231 +
   1.232 +.mberror_loop: ; per iteration: consume 16 words from each pointer, add their squared differences into xmm4
   1.233 +        movdqa      xmm0,       [rsi] ; coeff words 0..7 of this group (movdqa faults unless the address is 16-byte aligned)
   1.234 +        movdqa      xmm1,       [rdi] ; dcoef words 0..7 of this group
   1.235 +
   1.236 +        movdqa      xmm2,       [rsi+16] ; coeff words 8..15
   1.237 +        movdqa      xmm3,       [rdi+16] ; dcoef words 8..15
   1.238 +
   1.239 +
   1.240 +        psubw       xmm2,       xmm3 ; per-word difference, words 8..15 (16-bit wrapping subtract)
   1.241 +        pmaddwd     xmm2,       xmm2 ; square and add adjacent pairs -> 4 dword sums
   1.242 +
   1.243 +        psubw       xmm0,       xmm1 ; per-word difference, words 0..7
   1.244 +        pand        xmm0,       xmm5 ; apply the dc mask: a zero mask word drops that difference from the sum
   1.245 +
   1.246 +        pmaddwd     xmm0,       xmm0 ; 4 dword sums for the masked words 0..7
   1.247 +        add         rsi,        32 ; advance coeff pointer to the next 16-word group
   1.248 +
   1.249 +        add         rdi,        32 ; advance dcoef pointer to the next 16-word group
   1.250 +
   1.251 +        sub         rcx,        1 ; sets ZF for the jnz below; the two paddd in between do not modify flags
   1.252 +        paddd       xmm4,       xmm2 ; accumulate words 8..15
   1.253 +
   1.254 +        paddd       xmm4,       xmm0 ; accumulate words 0..7
   1.255 +        jnz         .mberror_loop ; branches on ZF from 'sub rcx, 1'
   1.256 +
   1.257 +        movdqa      xmm0,       xmm4 ; copy of the 4 dword partial sums
   1.258 +        punpckldq   xmm0,       xmm6 ; partial sums 0,1 zero-extended to two qwords
   1.259 +
   1.260 +        punpckhdq   xmm4,       xmm6 ; partial sums 2,3 zero-extended to two qwords
   1.261 +        paddd       xmm0,       xmm4 ; low dword of each qword = s0+s2 / s1+s3, high dwords stay 0
   1.262 +
   1.263 +        movdqa      xmm1,       xmm0
   1.264 +        psrldq      xmm0,       8 ; byte shift: bring the upper qword down to the low qword
   1.265 +
   1.266 +        paddd       xmm0,       xmm1 ; low dword = s0+s1+s2+s3 (32-bit wrapping add)
   1.267 +        movq        rax,        xmm0 ; return value: low qword of xmm0 (total in the low dword, zero above it)
   1.268 +
   1.269 +    pop rdi
   1.270 +    pop rsi
   1.271 +    ; begin epilog
   1.272 +    RESTORE_XMM ; counterpart of SAVE_XMM in the prologue
   1.273 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
   1.274 +    pop         rbp
   1.275 +    ret
   1.276 +
   1.277 +
   1.278 +;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -- MMX: returns the sum of (s[i] - d[i])^2 over 16 iterations of 8 words (128 words, 256 bytes per pointer)
   1.279 +global sym(vp8_mbuverror_mmx_impl) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
   1.280 +sym(vp8_mbuverror_mmx_impl):
   1.281 +    push        rbp
   1.282 +    mov         rbp, rsp
   1.283 +    SHADOW_ARGS_TO_STACK 2 ; ABI helper macro (defined elsewhere); 2 matches the number of arguments read via arg() below
   1.284 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
   1.285 +    push rdi
   1.286 +    ; end prolog -- NOTE(review): no emms is executed in this routine; presumably callers reset MMX/x87 state, verify
   1.287 +
   1.288 +
   1.289 +        mov             rsi,        arg(0) ; rsi = s_ptr
   1.290 +        mov             rdi,        arg(1) ; rdi = d_ptr
   1.291 +
   1.292 +        mov             rcx,        16 ; loop counter: 16 iterations of 16 bytes each
   1.293 +        pxor            mm7,        mm7 ; mm7 = 0, accumulator of two dword partial sums
   1.294 +
   1.295 +.mbuverror_loop_mmx: ; per iteration: consume 8 words from each pointer, add their squared differences into mm7
   1.296 +
   1.297 +        movq            mm1,        [rsi] ; s words 0..3 of this step
   1.298 +        movq            mm2,        [rdi] ; d words 0..3 of this step
   1.299 +
   1.300 +        psubw           mm1,        mm2 ; per-word difference (16-bit wrapping subtract)
   1.301 +        pmaddwd         mm1,        mm1 ; square and add adjacent pairs -> 2 dword sums
   1.302 +
   1.303 +
   1.304 +        movq            mm3,        [rsi+8] ; s words 4..7
   1.305 +        movq            mm4,        [rdi+8] ; d words 4..7
   1.306 +
   1.307 +        psubw           mm3,        mm4 ; per-word difference, words 4..7
   1.308 +        pmaddwd         mm3,        mm3 ; 2 dword sums for words 4..7
   1.309 +
   1.310 +
   1.311 +        paddd           mm7,        mm1 ; accumulate words 0..3
   1.312 +        paddd           mm7,        mm3 ; accumulate words 4..7
   1.313 +
   1.314 +
   1.315 +        add             rsi,        16 ; advance s pointer by 8 words
   1.316 +        add             rdi,        16 ; advance d pointer by 8 words
   1.317 +
   1.318 +        dec             rcx ; sets ZF when the last step has been processed
   1.319 +        jnz             .mbuverror_loop_mmx
   1.320 +
   1.321 +        movq            mm0,        mm7 ; mm7 = [lo, hi] dword partial sums
   1.322 +        psrlq           mm7,        32 ; mm7 = [hi, 0]
   1.323 +
   1.324 +        paddd           mm0,        mm7 ; mm0 = [lo+hi, hi]; the total is in the low dword
   1.325 +        movq            rax,        mm0 ; return value in the low 32 bits; the upper dword of mm0 still holds the 'hi' partial sum
   1.326 +
   1.327 +    pop rdi
   1.328 +    pop rsi
   1.329 +    ; begin epilog
   1.330 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
   1.331 +    pop         rbp
   1.332 +    ret
   1.333 +
   1.334 +
   1.335 +;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -- SSE2: returns the sum of (s[i] - d[i])^2 over 16 iterations of 8 words (128 words, 256 bytes per pointer)
   1.336 +global sym(vp8_mbuverror_xmm_impl) PRIVATE ; sym()/PRIVATE are not defined in this file; presumably from the included x86_abi_support.asm
   1.337 +sym(vp8_mbuverror_xmm_impl):
   1.338 +    push        rbp
   1.339 +    mov         rbp, rsp
   1.340 +    SHADOW_ARGS_TO_STACK 2 ; ABI helper macro (defined elsewhere); 2 matches the number of arguments read via arg() below
   1.341 +    push rsi ; rsi/rdi are used as pointers below and restored before returning
   1.342 +    push rdi
   1.343 +    ; end prolog
   1.344 +
   1.345 +
   1.346 +        mov             rsi,        arg(0) ; rsi = s_ptr
   1.347 +        mov             rdi,        arg(1) ; rdi = d_ptr
   1.348 +
   1.349 +        mov             rcx,        16 ; loop counter: 16 iterations of 16 bytes each
   1.350 +        pxor            xmm3,       xmm3 ; xmm3 = 0, accumulator of four dword partial sums
   1.351 +
   1.352 +.mbuverror_loop: ; per iteration: consume 8 words from each pointer, add their squared differences into xmm3
   1.353 +
   1.354 +        movdqa          xmm1,       [rsi] ; 8 words from s (movdqa faults unless the address is 16-byte aligned)
   1.355 +        movdqa          xmm2,       [rdi] ; 8 words from d
   1.356 +
   1.357 +        psubw           xmm1,       xmm2 ; per-word difference (16-bit wrapping subtract)
   1.358 +        pmaddwd         xmm1,       xmm1 ; square and add adjacent pairs -> 4 dword sums
   1.359 +
   1.360 +        paddd           xmm3,       xmm1 ; accumulate
   1.361 +
   1.362 +        add             rsi,        16 ; advance s pointer by 8 words
   1.363 +        add             rdi,        16 ; advance d pointer by 8 words
   1.364 +
   1.365 +        dec             rcx ; sets ZF when the last step has been processed
   1.366 +        jnz             .mbuverror_loop
   1.367 +
   1.368 +        pxor        xmm0,           xmm0 ; xmm0 = 0, used to zero-extend dwords to qwords
   1.369 +        movdqa      xmm1,           xmm3 ; xmm1 = the 4 dword partial sums
   1.370 +
   1.371 +        movdqa      xmm2,           xmm1 ; second copy for the high half
   1.372 +        punpckldq   xmm1,           xmm0 ; partial sums 0,1 widened to two qwords
   1.373 +
   1.374 +        punpckhdq   xmm2,           xmm0 ; partial sums 2,3 widened to two qwords
   1.375 +        paddd       xmm1,           xmm2 ; low dword of each qword = s0+s2 / s1+s3, high dwords stay 0
   1.376 +
   1.377 +        movdqa      xmm2,           xmm1
   1.378 +
   1.379 +        psrldq      xmm1,           8 ; byte shift: bring the upper qword down to the low qword
   1.380 +        paddd       xmm1,           xmm2 ; low dword = s0+s1+s2+s3 (32-bit wrapping add)
   1.381 +
   1.382 +        movq            rax,            xmm1 ; return value: low qword of xmm1 (total in the low dword, zero above it)
   1.383 +
   1.384 +    pop rdi
   1.385 +    pop rsi
   1.386 +    ; begin epilog
   1.387 +    UNSHADOW_ARGS ; counterpart of SHADOW_ARGS_TO_STACK in the prologue
   1.388 +    pop         rbp
   1.389 +    ret

mercurial