--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,267 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea          src_stride3q, [src_strideq*3]
+  lea          ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+  SAD_FN 64, %1, 5, %2
+  mov          n_rowsd, %1
+  pxor         m0, m0
+.loop:
+  movu         m1, [refq]
+  movu         m2, [refq+16]
+  movu         m3, [refq+32]
+  movu         m4, [refq+48]
+%if %2 == 1
+  pavgb        m1, [second_predq+mmsize*0]
+  pavgb        m2, [second_predq+mmsize*1]
+  pavgb        m3, [second_predq+mmsize*2]
+  pavgb        m4, [second_predq+mmsize*3]
+  lea          second_predq, [second_predq+mmsize*4]
+%endif
+  psadbw       m1, [srcq]
+  psadbw       m2, [srcq+16]
+  psadbw       m3, [srcq+32]
+  psadbw       m4, [srcq+48]
+  paddd        m1, m2
+  paddd        m3, m4
+  add          refq, ref_strideq
+  paddd        m0, m1
+  add          srcq, src_strideq
+  paddd        m0, m3
+  dec          n_rowsd
+  jg .loop
+
+  movhlps      m1, m0
+  paddd        m0, m1
+  movd         eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+
+; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+  SAD_FN 32, %1, 5, %2
+  mov          n_rowsd, %1/2
+  pxor         m0, m0
+.loop:
+  movu         m1, [refq]
+  movu         m2, [refq+16]
+  movu         m3, [refq+ref_strideq]
+  movu         m4, [refq+ref_strideq+16]
+%if %2 == 1
+  pavgb        m1, [second_predq+mmsize*0]
+  pavgb        m2, [second_predq+mmsize*1]
+  pavgb        m3, [second_predq+mmsize*2]
+  pavgb        m4, [second_predq+mmsize*3]
+  lea          second_predq, [second_predq+mmsize*4]
+%endif
+  psadbw       m1, [srcq]
+  psadbw       m2, [srcq+16]
+  psadbw       m3, [srcq+src_strideq]
+  psadbw       m4, [srcq+src_strideq+16]
+  paddd        m1, m2
+  paddd        m3, m4
+  lea          refq, [refq+ref_strideq*2]
+  paddd        m0, m1
+  lea          srcq, [srcq+src_strideq*2]
+  paddd        m0, m3
+  dec          n_rowsd
+  jg .loop
+
+  movhlps      m1, m0
+  paddd        m0, m1
+  movd         eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+
+; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+  SAD_FN 16, %1, 7, %2
+  mov          n_rowsd, %1/4
+  pxor         m0, m0
+
+.loop:
+  movu         m1, [refq]
+  movu         m2, [refq+ref_strideq]
+  movu         m3, [refq+ref_strideq*2]
+  movu         m4, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb        m1, [second_predq+mmsize*0]
+  pavgb        m2, [second_predq+mmsize*1]
+  pavgb        m3, [second_predq+mmsize*2]
+  pavgb        m4, [second_predq+mmsize*3]
+  lea          second_predq, [second_predq+mmsize*4]
+%endif
+  psadbw       m1, [srcq]
+  psadbw       m2, [srcq+src_strideq]
+  psadbw       m3, [srcq+src_strideq*2]
+  psadbw       m4, [srcq+src_stride3q]
+  paddd        m1, m2
+  paddd        m3, m4
+  lea          refq, [refq+ref_strideq*4]
+  paddd        m0, m1
+  lea          srcq, [srcq+src_strideq*4]
+  paddd        m0, m3
+  dec          n_rowsd
+  jg .loop
+
+  movhlps      m1, m0
+  paddd        m0, m1
+  movd         eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+
+; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+  SAD_FN 8, %1, 7, %2
+  mov          n_rowsd, %1/4
+  pxor         m0, m0
+
+.loop:
+  movh         m1, [refq]
+  movhps       m1, [refq+ref_strideq]
+  movh         m2, [refq+ref_strideq*2]
+  movhps       m2, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb        m1, [second_predq+mmsize*0]
+  pavgb        m2, [second_predq+mmsize*1]
+  lea          second_predq, [second_predq+mmsize*2]
+%endif
+  movh         m3, [srcq]
+  movhps       m3, [srcq+src_strideq]
+  movh         m4, [srcq+src_strideq*2]
+  movhps       m4, [srcq+src_stride3q]
+  psadbw       m1, m3
+  psadbw       m2, m4
+  lea          refq, [refq+ref_strideq*4]
+  paddd        m0, m1
+  lea          srcq, [srcq+src_strideq*4]
+  paddd        m0, m2
+  dec          n_rowsd
+  jg .loop
+
+  movhlps      m1, m0
+  paddd        m0, m1
+  movd         eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+
+; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
+;                                  uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+  SAD_FN 4, %1, 7, %2
+  mov          n_rowsd, %1/4
+  pxor         m0, m0
+
+.loop:
+  movd         m1, [refq]
+  movd         m2, [refq+ref_strideq]
+  movd         m3, [refq+ref_strideq*2]
+  movd         m4, [refq+ref_stride3q]
+  punpckldq    m1, m2
+  punpckldq    m3, m4
+%if %2 == 1
+  pavgb        m1, [second_predq+mmsize*0]
+  pavgb        m3, [second_predq+mmsize*1]
+  lea          second_predq, [second_predq+mmsize*2]
+%endif
+  movd         m2, [srcq]
+  movd         m5, [srcq+src_strideq]
+  movd         m4, [srcq+src_strideq*2]
+  movd         m6, [srcq+src_stride3q]
+  punpckldq    m2, m5
+  punpckldq    m4, m6
+  psadbw       m1, m2
+  psadbw       m3, m4
+  lea          refq, [refq+ref_strideq*4]
+  paddd        m0, m1
+  lea          srcq, [srcq+src_strideq*4]
+  paddd        m0, m3
+  dec          n_rowsd
+  jg .loop
+
+  movd         eax, m0
+  RET
+%endmacro
+
+INIT_MMX sse
+SAD4XN 8 ; sad4x8_sse
+SAD4XN 4 ; sad4x4_sse
+SAD4XN 8, 1 ; sad4x8_avg_sse
+SAD4XN 4, 1 ; sad4x4_avg_sse
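For reference (not part of the patch), the arithmetic these psadbw/pavgb kernels vectorize can be sketched in plain C roughly as follows. The helper names sad_mxn_c and sad_mxn_avg_c are illustrative only, and second_pred is assumed to be a contiguous width-by-height block, as in libvpx's compound-average prediction:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences between a width x height source block
     * and a reference block, each read with its own row stride. */
    static unsigned int sad_mxn_c(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int width, int height) {
      unsigned int sad = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x)
          sad += abs(src[x] - ref[x]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }

    /* The _avg variants first replace each reference byte with the rounded
     * average of ref and second_pred, matching pavgb's (a + b + 1) >> 1,
     * then accumulate the same SAD against src. */
    static unsigned int sad_mxn_avg_c(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      const uint8_t *second_pred,
                                      int width, int height) {
      unsigned int sad = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          const int avg = (ref[x] + second_pred[x] + 1) >> 1;
          sad += abs(src[x] - avg);
        }
        src += src_stride;
        ref += ref_stride;
        second_pred += width;  /* assumed contiguous, no stride */
      }
      return sad;
    }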