--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,231 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_4x2x4 5-6 0
+  movd                  m0, [srcq +%2]
+%if %1 == 1
+  movd                  m6, [ref1q+%3]
+  movd                  m4, [ref2q+%3]
+  movd                  m7, [ref3q+%3]
+  movd                  m5, [ref4q+%3]
+  punpckldq             m0, [srcq +%4]
+  punpckldq             m6, [ref1q+%5]
+  punpckldq             m4, [ref2q+%5]
+  punpckldq             m7, [ref3q+%5]
+  punpckldq             m5, [ref4q+%5]
+  psadbw                m6, m0
+  psadbw                m4, m0
+  psadbw                m7, m0
+  psadbw                m5, m0
+  punpckldq             m6, m4
+  punpckldq             m7, m5
+%else
+  movd                  m1, [ref1q+%3]
+  movd                  m2, [ref2q+%3]
+  movd                  m3, [ref3q+%3]
+  movd                  m4, [ref4q+%3]
+  punpckldq             m0, [srcq +%4]
+  punpckldq             m1, [ref1q+%5]
+  punpckldq             m2, [ref2q+%5]
+  punpckldq             m3, [ref3q+%5]
+  punpckldq             m4, [ref4q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  psadbw                m4, m0
+  punpckldq             m1, m2
+  punpckldq             m3, m4
+  paddd                 m6, m1
+  paddd                 m7, m3
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+  movh                  m0, [srcq +%2]
+%if %1 == 1
+  movh                  m4, [ref1q+%3]
+  movh                  m5, [ref2q+%3]
+  movh                  m6, [ref3q+%3]
+  movh                  m7, [ref4q+%3]
+  movhps                m0, [srcq +%4]
+  movhps                m4, [ref1q+%5]
+  movhps                m5, [ref2q+%5]
+  movhps                m6, [ref3q+%5]
+  movhps                m7, [ref4q+%5]
+  psadbw                m4, m0
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movh                  m1, [ref1q+%3]
+  movh                  m2, [ref2q+%3]
+  movh                  m3, [ref3q+%3]
+  movhps                m0, [srcq +%4]
+  movhps                m1, [ref1q+%5]
+  movhps                m2, [ref2q+%5]
+  movhps                m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1
+  movh                  m1, [ref4q+%3]
+  movhps                m1, [ref4q+%5]
+  paddd                 m5, m2
+  paddd                 m6, m3
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_16x2x4 5-6 0
+  ; 1st 16 px
+  mova                  m0, [srcq +%2]
+%if %1 == 1
+  movu                  m4, [ref1q+%3]
+  movu                  m5, [ref2q+%3]
+  movu                  m6, [ref3q+%3]
+  movu                  m7, [ref4q+%3]
+  psadbw                m4, m0
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movu                  m1, [ref1q+%3]
+  movu                  m2, [ref2q+%3]
+  movu                  m3, [ref3q+%3]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1
+  movu                  m1, [ref4q+%3]
+  paddd                 m5, m2
+  paddd                 m6, m3
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endif
+
+  ; 2nd 16 px
+  mova                  m0, [srcq +%4]
+  movu                  m1, [ref1q+%5]
+  movu                  m2, [ref2q+%5]
+  movu                  m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1
+  movu                  m1, [ref4q+%5]
+  paddd                 m5, m2
+  paddd                 m6, m3
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         unsigned int res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 or 4x4
+%macro SADNXN4D 2
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1]
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0]
+
+  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
+%if mmsize == 16
+  pslldq                m5, 4
+  pslldq                m7, 4
+  por                   m4, m5
+  por                   m6, m7
+  mova                  m5, m4
+  mova                  m7, m6
+  punpcklqdq            m4, m6
+  punpckhqdq            m5, m7
+  movifnidn             r4, r4mp
+  paddd                 m4, m5
+  movu                [r4], m4
+  RET
+%else
+  movifnidn             r4, r4mp
+  movq               [r4+0], m6
+  movq               [r4+8], m7
+  RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16,  8
+SADNXN4D  8, 16
+SADNXN4D  8,  8
+SADNXN4D  8,  4
+
+INIT_MMX sse
+SADNXN4D  4,  8
+SADNXN4D  4,  4
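
For context, a minimal caller sketch in C, assuming the program links against the object built from this file. The prototype is taken verbatim from the comment above SADNXN4D; everything else here (src_plane, ref_plane, refs, sad, the plane sizes, strides, and candidate offsets) is illustrative, not part of the patch. One constraint grounded in the code above: the 16-pixel-and-wider variants load the source with mova, so the source pointer and stride must keep each source row 16-byte aligned, while the four reference pointers may be unaligned (movu). Compile as C11 for _Alignas.

#include <stdint.h>
#include <stdio.h>

/* Prototype as documented in the comment above SADNXN4D;
 * sad16x16x4d is one instantiation of that macro. */
void vp9_sad16x16x4d_sse2(uint8_t *src, int src_stride,
                          uint8_t *ref[4], int ref_stride,
                          unsigned int res[4]);

int main(void) {
  /* Illustrative planes; a real encoder passes pointers into frame
   * buffers. The source must be 16-byte aligned (mova loads), and a
   * stride of 64 keeps every row aligned. */
  static _Alignas(16) uint8_t src_plane[64 * 64];
  static uint8_t ref_plane[66 * 66];
  unsigned int sad[4];

  /* Four candidate positions, e.g. neighbouring offsets probed in one
   * motion-search step; all four share a single reference stride. */
  uint8_t *refs[4] = {
    ref_plane + 0 * 66 + 0,
    ref_plane + 0 * 66 + 1,
    ref_plane + 1 * 66 + 0,
    ref_plane + 1 * 66 + 1,
  };

  /* One call computes the SAD of the 16x16 source block against all
   * four candidates, writing the four sums into sad[0..3]. */
  vp9_sad16x16x4d_sse2(src_plane, 64, refs, 66, sad);

  for (int i = 0; i < 4; ++i)
    printf("candidate %d: SAD = %u\n", i, sad[i]);
  return 0;
}

The 4-wide design choice is visible in the macro structure: each PROCESS_Nx2x4 step folds two rows of one source block against the same rows of four reference blocks, so the source data loaded into m0 is reused across four psadbw operations instead of being reloaded once per candidate as four separate sadNxN calls would do.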