;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File: media/libvpx/vp9/common/x86/vp9_copy_sse2.asm
; Syntax: NASM/Yasm (Intel operand order), built on the x86inc.asm macros.

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; convolve_fn <op>                                   op = copy | avg
;
; Emits one function, declared through cglobal as convolve_<op>, with the
; named arguments
;     src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h
;
;   copy : dst[row][0..w) = src[row][0..w)
;   avg  : dst[row][0..w) = pavgb(src[row][0..w), dst[row][0..w))
;          (per-byte rounded average of source and existing destination)
;
; fx, fxs, fy and fys are part of the argument list but are never read here.
;
; Width dispatch: w == 4, 8, 16 or 32 select a dedicated loop; every other
; value falls through to the 64-byte-wide loop.
;
; Rows per iteration / loop-exit condition (the counter must reach exactly
; zero, because each loop ends in jnz):
;   64-wide : 1 row,  h must be >= 1
;   32-wide : 2 rows, h must be a positive multiple of 2
;   16/8/4  : 4 rows, h must be a positive multiple of 4
;
; Register roles:
;   r4d      w during dispatch, afterwards the number of rows remaining
;   r5q      3 * src_stride              (16/8/4-wide paths)
;   r6q      3 * dst_stride              (16/8/4-wide paths)
;   m0-m3    source rows (XMM in the 64/32/16 paths, MMX in the 8/4 paths)
;   m4-m7    destination rows, 4-wide avg path only (MMX)
;
; Alignment: src is always read with unaligned loads (movu / movh).  In the
; XMM paths dst is written with mova and used as a pavgb memory operand, both
; of which need 16-byte-aligned addresses.
;
; NOTE(review): the 8- and 4-wide paths use MMX registers and return without
; emms; presumably callers are expected to reset the x87/MMX state
; themselves - confirm against the call sites.
;-----------------------------------------------------------------------------
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  mov                    r4d, dword wm
  cmp                    r4d, 4
  je .w4
  cmp                    r4d, 8
  je .w8
  cmp                    r4d, 16
  je .w16
  cmp                    r4d, 32
  je .w32

  ; ---- default: 64 bytes per row, one row per iteration ----
  mov                    r4d, dword hm          ; r4d = rows remaining
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+16]
  pavgb                   m2, [dstq+32]
  pavgb                   m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET

  ; ---- 32 bytes per row, two rows per iteration ----
.w32:
  mov                    r4d, dword hm          ; r4d = rows remaining
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq            +16]
  pavgb                   m2, [dstq+dst_strideq]
  pavgb                   m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

  ; ---- 16 bytes per row, four rows per iteration ----
.w16:
  mov                    r4d, dword hm          ; r4d = rows remaining
  lea                    r5q, [src_strideq*3]   ; offset of 4th source row
  lea                    r6q, [dst_strideq*3]   ; offset of 4th dest row
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

; From here on m0-m7 name MMX registers; the narrow paths move 8 or 4 bytes
; per row.
INIT_MMX sse
  ; ---- 8 bytes per row, four rows per iteration ----
.w8:
  mov                    r4d, dword hm          ; r4d = rows remaining
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

  ; ---- 4 bytes per row, four rows per iteration ----
.w4:
  mov                    r4d, dword hm          ; r4d = rows remaining
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  ; Fetch the destination rows with 4-byte loads before averaging.  A pavgb
  ; with an MMX register and a memory operand reads 8 bytes, i.e. 4 bytes
  ; beyond the right edge of a 4-wide block; only the low 4 bytes are
  ; stored below, so the register form yields the same output.
  movh                    m4, [dstq]
  movh                    m5, [dstq+dst_strideq]
  movh                    m6, [dstq+dst_strideq*2]
  movh                    m7, [dstq+r6q]
  pavgb                   m0, m4
  pavgb                   m1, m5
  pavgb                   m2, m6
  pavgb                   m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endmacro

; Instantiate the plain-copy and the averaging variant.
convolve_fn copy
convolve_fn avg