media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,164 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
    1.18 +global sym(vp8_short_walsh4x4_sse2) PRIVATE
    1.19 +sym(vp8_short_walsh4x4_sse2):     ; reads 4 rows of 4 shorts from input, writes 16 shorts to output
    1.20 +    push        rbp
    1.21 +    mov         rbp, rsp
    1.22 +    SHADOW_ARGS_TO_STACK 3        ; macro from x86_abi_support.asm (not shown); presumably makes the 3 args reachable via arg(n)
    1.23 +    SAVE_XMM 7                    ; macro (not shown); xmm6 and xmm7 are written below, undone by RESTORE_XMM
    1.24 +    GET_GOT     rbx               ; macro (not shown); paired with RESTORE_GOT in the epilog
    1.25 +    push        rsi
    1.26 +    push        rdi
    1.27 +    ; end prolog
    1.28 +
    1.29 +    mov     rsi, arg(0)           ; input
    1.30 +    mov     rdi, arg(1)           ; output
    1.31 +    movsxd  rdx, dword ptr arg(2) ; pitch, sign-extended; added unscaled to rsi, i.e. a byte stride between rows
    1.32 +
    1.33 +    ; first for loop: butterflies within each row, all four rows handled in parallel
    1.34 +    movq    xmm0, MMWORD PTR [rsi]           ; row 0: 4 shorts (64 bits)
    1.35 +    movq    xmm1, MMWORD PTR [rsi + rdx]     ; row 1
    1.36 +    lea     rsi,  [rsi + rdx*2]              ; advance two rows
    1.37 +    movq    xmm2, MMWORD PTR [rsi]           ; row 2
    1.38 +    movq    xmm3, MMWORD PTR [rsi + rdx]     ; row 3
    1.39 +    ; transpose so that each 64-bit half holds one column position for all four rows
    1.40 +    punpcklwd xmm0,  xmm1          ; interleave words of rows 0 and 1
    1.41 +    punpcklwd xmm2,  xmm3          ; interleave words of rows 2 and 3
    1.42 +
    1.43 +    movdqa    xmm1, xmm0
    1.44 +    punpckldq xmm0, xmm2           ; ip[1] ip[0]  (high half, low half; 4 rows each)
    1.45 +    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
    1.46 +
    1.47 +    movdqa    xmm2, xmm0
    1.48 +    paddw     xmm0, xmm1           ; ip[1]+ip[3]  ip[0]+ip[2]
    1.49 +    psubw     xmm2, xmm1           ; ip[1]-ip[3]  ip[0]-ip[2]
    1.50 +
    1.51 +    psllw     xmm0, 2              ; d1  a1   (sums << 2)
    1.52 +    psllw     xmm2, 2              ; c1  b1   (differences << 2)
    1.53 +
    1.54 +    movdqa    xmm1, xmm0
    1.55 +    punpcklqdq xmm0, xmm2          ; b1  a1
    1.56 +    punpckhqdq xmm1, xmm2          ; c1  d1
    1.57 +    ; build xmm7 = (a1 != 0) as 0/1 per word in the low half, 0 in the high half
    1.58 +    pxor      xmm6, xmm6           ; NOTE(review): the movq below should already clear the upper quadword, so this looks redundant -- confirm
    1.59 +    movq      xmm6, xmm0           ; xmm6 = a1 in the low 4 words, zero above
    1.60 +    pxor      xmm7, xmm7
    1.61 +    pcmpeqw   xmm7, xmm6           ; 0xffff in every word where xmm6 is zero
    1.62 +    paddw     xmm7, [GLOBAL(c1)]   ; +1 per word: 0xffff -> 0, 0 -> 1
    1.63 +
    1.64 +    movdqa    xmm2, xmm0
    1.65 +    paddw     xmm0, xmm1           ; b1+c1  a1+d1
    1.66 +    psubw     xmm2, xmm1           ; b1-c1  a1-d1
    1.67 +    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
    1.68 +
    1.69 +    ; second for loop: butterflies across rows (numbers are positions in the 4x4 intermediate, high word first)
    1.70 +    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    1.71 +    ;        14 10  6  2 15 11  7  3 (xmm2)
    1.72 +    ; after shuffle:
    1.73 +    ;        13  5  9  1 12  4  8  0 (xmm0)
    1.74 +    ;        14  6 10  2 15  7 11  3 (xmm1)
    1.75 +    pshuflw   xmm3, xmm0, 0xd8      ; 0xd8 swaps the two middle words of the low half
    1.76 +    pshufhw   xmm0, xmm3, 0xd8      ; same swap in the high half
    1.77 +    pshuflw   xmm3, xmm2, 0xd8
    1.78 +    pshufhw   xmm1, xmm3, 0xd8
    1.79 +    ; pmaddwd with c1 (+1,+1) sums each adjacent word pair into a dword; with cn1 (+1,-1) it takes first minus second
    1.80 +    movdqa    xmm2, xmm0
    1.81 +    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    1.82 +    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    1.83 +    movdqa    xmm3, xmm1
    1.84 +    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    1.85 +    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
    1.86 +    ; regroup dwords so like terms are adjacent and in ascending column order
    1.87 +    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    1.88 +    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    1.89 +    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    1.90 +    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
    1.91 +
    1.92 +    movdqa    xmm0, xmm4
    1.93 +    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    1.94 +    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    1.95 +    movdqa    xmm1, xmm6
    1.96 +    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    1.97 +    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
    1.98 +    ; a2 = a1 + d1, b2 = b1 + c1, c2 = b1 - c1, d2 = a1 - d1 (32-bit lanes)
    1.99 +    movdqa    xmm2, xmm0
   1.100 +    paddd     xmm0, xmm4            ; b21 b20 a21 a20
   1.101 +    psubd     xmm2, xmm4            ; c21 c20 d21 d20
   1.102 +    movdqa    xmm3, xmm1
   1.103 +    paddd     xmm1, xmm6            ; b23 b22 a23 a22
   1.104 +    psubd     xmm3, xmm6            ; c23 c22 d23 d22
   1.105 +    ; rounding applied to every lane: x = (x + (x < 0) + 3) >> 3
   1.106 +    pxor      xmm4, xmm4
   1.107 +    movdqa    xmm5, xmm4
   1.108 +    pcmpgtd   xmm4, xmm0            ; all-ones where 0 > lane
   1.109 +    pcmpgtd   xmm5, xmm2
   1.110 +    pand      xmm4, [GLOBAL(cd1)]   ; mask -> 1 where the lane is negative, else 0
   1.111 +    pand      xmm5, [GLOBAL(cd1)]
   1.112 +
   1.113 +    pxor      xmm6, xmm6
   1.114 +    movdqa    xmm7, xmm6
   1.115 +    pcmpgtd   xmm6, xmm1
   1.116 +    pcmpgtd   xmm7, xmm3
   1.117 +    pand      xmm6, [GLOBAL(cd1)]
   1.118 +    pand      xmm7, [GLOBAL(cd1)]
   1.119 +
   1.120 +    paddd     xmm0, xmm4            ; + (x < 0)
   1.121 +    paddd     xmm2, xmm5
   1.122 +    paddd     xmm0, [GLOBAL(cd3)]   ; + 3
   1.123 +    paddd     xmm2, [GLOBAL(cd3)]
   1.124 +    paddd     xmm1, xmm6
   1.125 +    paddd     xmm3, xmm7
   1.126 +    paddd     xmm1, [GLOBAL(cd3)]
   1.127 +    paddd     xmm3, [GLOBAL(cd3)]
   1.128 +
   1.129 +    psrad     xmm0, 3               ; arithmetic shift right by 3
   1.130 +    psrad     xmm1, 3
   1.131 +    psrad     xmm2, 3
   1.132 +    psrad     xmm3, 3
   1.133 +    movdqa    xmm4, xmm0
   1.134 +    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
   1.135 +    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
   1.136 +    movdqa    xmm5, xmm2
   1.137 +    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
   1.138 +    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
   1.139 +    ; narrow the dwords back to words with signed saturation
   1.140 +    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
   1.141 +    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
   1.142 +    ; movdqa stores: output must be 16-byte aligned
   1.143 +    movdqa  XMMWORD PTR [rdi], xmm0         ; output[0..7]  = a2, b2
   1.144 +    movdqa  XMMWORD PTR [rdi + 16], xmm2    ; output[8..15] = c2, d2
   1.145 +
   1.146 +    ; begin epilog
   1.147 +    pop rdi                         ; pops mirror the prolog pushes in reverse order
   1.148 +    pop rsi
   1.149 +    RESTORE_GOT
   1.150 +    RESTORE_XMM
   1.151 +    UNSHADOW_ARGS
   1.152 +    pop         rbp
   1.153 +    ret
   1.154 +
   1.155 +SECTION_RODATA
   1.156 +align 16
   1.157 +c1:                                 ; eight words of +1
   1.158 +    times 8 dw 1
   1.159 +align 16
   1.160 +cn1:                                ; alternating +1 / -1 words
   1.161 +    dw 1, -1, 1, -1, 1, -1, 1, -1
   1.162 +align 16
   1.163 +cd1:                                ; four dwords of 1
   1.164 +    times 4 dd 1
   1.165 +align 16
   1.166 +cd3:                                ; four dwords of 3
   1.167 +    times 4 dd 3

mercurial