;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform on 16-bit samples, fully unrolled with SSE2.
;
; In:    arg(0) input   4 rows of 4 words; rows are `pitch` BYTES apart
;                       (pitch is used directly as a byte offset). Loaded
;                       with movq, so no alignment is required.
;        arg(1) output  16 words, written with movdqa, so the pointer must
;                       be 16-byte aligned.
;        arg(2) pitch   32-bit signed, sign-extended before use.
; Out:   output[0..15]
;
; Pass 1, for each input row ip[0..3] (16-bit wrapping arithmetic):
;        a1 = (ip[0] + ip[2]) << 2        d1 = (ip[1] + ip[3]) << 2
;        b1 = (ip[0] - ip[2]) << 2        c1 = (ip[1] - ip[3]) << 2
;        t[0] = a1 + d1 + (a1 != 0)       t[1] = b1 + c1
;        t[2] = b1 - c1                   t[3] = a1 - d1
;
; Pass 2, for each column i of the 4x4 pass-1 result t (32-bit arithmetic):
;        a1 = t[i] + t[i+8]               d1 = t[i+4] + t[i+12]
;        b1 = t[i] - t[i+8]               c1 = t[i+4] - t[i+12]
;        a2 = a1 + d1   b2 = b1 + c1   c2 = b1 - c1   d2 = a1 - d1
;        each x2 += (x2 < 0);  x2 = (x2 + 3) >> 3   (arithmetic shift)
;        output[i] = a2, output[i+4] = b2, output[i+8] = c2,
;        output[i+12] = d2   (narrowed with signed saturation, packssdw)
;
; Registers: rsi/rdi are pushed and popped here; rdx and xmm0-xmm7 are
; overwritten. Argument access, XMM save/restore and GOT setup go through
; the SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT macros from
; x86_abi_support.asm.
; NOTE(review): those macro bodies are not visible in this file - confirm
; per-ABI behaviour (callee-saved xmm6/xmm7, rbx as GOT base) there.
;
; Lane comments below list lanes from most significant to least significant.
;-----------------------------------------------------------------------------
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output
    movsxd      rdx, dword ptr arg(2)       ; pitch (bytes between rows)

    ; ---- first pass: all four rows at once ---------------------------------
    movq        xmm0, MMWORD PTR [rsi]      ; row 0
    movq        xmm1, MMWORD PTR [rsi + rdx] ; row 1
    lea         rsi,  [rsi + rdx*2]
    movq        xmm2, MMWORD PTR [rsi]      ; row 2
    movq        xmm3, MMWORD PTR [rsi + rdx] ; row 3

    ; transpose so each qword holds one input column across the four rows
    punpcklwd   xmm0, xmm1
    punpcklwd   xmm2, xmm3

    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ; ip[1] ip[0]
    punpckhdq   xmm1, xmm2                  ; ip[3] ip[2]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1                  ; ip[1]+ip[3]  ip[0]+ip[2]
    psubw       xmm2, xmm1                  ; ip[1]-ip[3]  ip[0]-ip[2]

    psllw       xmm0, 2                     ; d1  a1
    psllw       xmm2, 2                     ; c1  b1

    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm2                  ; b1  a1
    punpckhqdq  xmm1, xmm2                  ; c1  d1

    ; Build the (a1 != 0) correction term in xmm7.
    ; The register form of movq copies the low qword and zeroes the high
    ; qword, so xmm6 needs no separate clearing beforehand.
    movq        xmm6, xmm0                  ; 0   a1
    pxor        xmm7, xmm7
    pcmpeqw     xmm7, xmm6                  ; -1 where the word is zero
    paddw       xmm7, [GLOBAL(c1)]          ; 0   (a1 != 0)

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1                  ; b1+c1  a1+d1
    psubw       xmm2, xmm1                  ; b1-c1  a1-d1
    paddw       xmm0, xmm7                  ; b1+c1  a1+d1+(a1!=0)

    ; ---- second pass: all four columns at once -----------------------------
    ; numbers are indices into the 4x4 pass-1 result
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    pshuflw     xmm3, xmm0, 0xd8
    pshufhw     xmm0, xmm3, 0xd8
    pshuflw     xmm3, xmm2, 0xd8
    pshufhw     xmm1, xmm3, 0xd8

    ; pmaddwd against (1, 1) sums each word pair, against (1, -1) takes the
    ; difference, widening the results to 32 bits
    movdqa      xmm2, xmm0
    pmaddwd     xmm0, [GLOBAL(c1)]          ; d11 a11 d10 a10
    pmaddwd     xmm2, [GLOBAL(cn1)]         ; c11 b11 c10 b10
    movdqa      xmm3, xmm1
    pmaddwd     xmm1, [GLOBAL(c1)]          ; d12 a12 d13 a13
    pmaddwd     xmm3, [GLOBAL(cn1)]         ; c12 b12 c13 b13

    ; group like terms; 0x72 also puts columns 2 and 3 back in ascending order
    pshufd      xmm4, xmm0, 0xd8            ; d11 d10 a11 a10
    pshufd      xmm5, xmm2, 0xd8            ; c11 c10 b11 b10
    pshufd      xmm6, xmm1, 0x72            ; d13 d12 a13 a12
    pshufd      xmm7, xmm3, 0x72            ; c13 c12 b13 b12

    movdqa      xmm0, xmm4
    punpcklqdq  xmm0, xmm5                  ; b11 b10 a11 a10
    punpckhqdq  xmm4, xmm5                  ; c11 c10 d11 d10
    movdqa      xmm1, xmm6
    punpcklqdq  xmm1, xmm7                  ; b13 b12 a13 a12
    punpckhqdq  xmm6, xmm7                  ; c13 c12 d13 d12

    movdqa      xmm2, xmm0
    paddd       xmm0, xmm4                  ; b21 b20 a21 a20
    psubd       xmm2, xmm4                  ; c21 c20 d21 d20
    movdqa      xmm3, xmm1
    paddd       xmm1, xmm6                  ; b23 b22 a23 a22
    psubd       xmm3, xmm6                  ; c23 c22 d23 d22

    ; x2 += (x2 < 0): compare 0 > x2, reduce the all-ones mask to 1
    pxor        xmm4, xmm4
    movdqa      xmm5, xmm4
    pcmpgtd     xmm4, xmm0
    pcmpgtd     xmm5, xmm2
    pand        xmm4, [GLOBAL(cd1)]
    pand        xmm5, [GLOBAL(cd1)]

    pxor        xmm6, xmm6
    movdqa      xmm7, xmm6
    pcmpgtd     xmm6, xmm1
    pcmpgtd     xmm7, xmm3
    pand        xmm6, [GLOBAL(cd1)]
    pand        xmm7, [GLOBAL(cd1)]

    ; x2 = (x2 + (x2 < 0) + 3) >> 3
    paddd       xmm0, xmm4
    paddd       xmm2, xmm5
    paddd       xmm0, [GLOBAL(cd3)]
    paddd       xmm2, [GLOBAL(cd3)]
    paddd       xmm1, xmm6
    paddd       xmm3, xmm7
    paddd       xmm1, [GLOBAL(cd3)]
    paddd       xmm3, [GLOBAL(cd3)]

    psrad       xmm0, 3
    psrad       xmm1, 3
    psrad       xmm2, 3
    psrad       xmm3, 3

    ; gather each output row into one register
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm1                  ; a23 a22 a21 a20
    punpckhqdq  xmm4, xmm1                  ; b23 b22 b21 b20
    movdqa      xmm5, xmm2
    punpckhqdq  xmm2, xmm3                  ; c23 c22 c21 c20
    punpcklqdq  xmm5, xmm3                  ; d23 d22 d21 d20

    ; narrow to 16 bits with signed saturation
    packssdw    xmm0, xmm4                  ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw    xmm2, xmm5                  ; d23 d22 d21 d20 c23 c22 c21 c20

    movdqa      XMMWORD PTR [rdi], xmm0     ; output[0..7]
    movdqa      XMMWORD PTR [rdi + 16], xmm2 ; output[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Each constant is 16-byte aligned because it is used as a direct memory
; operand of a non-VEX SSE2 instruction.
align 16
c1:                                         ; eight words of +1
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:                                        ; word pairs (+1, -1)
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:                                        ; four dwords of 1
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:                                        ; four dwords of 3 (rounding bias)
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003