;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------
; ACCUM_DWORDS_64 acc, src, tmp, zero
;   %1  accumulator holding two 64-bit lanes; updated in place
;   %2  four 32-bit values to add; clobbered
;   %3  scratch register; clobbered
;   %4  register whose bits are all zero; preserved
;
; Interleaving %2 with zero turns each of its dwords into an unsigned
; qword, so all four can be added to the 64-bit lanes of %1 without
; losing the top bit.
;-----------------------------------------------------------------------
%macro ACCUM_DWORDS_64 4
    punpckldq   %3, %2, %4              ; tmp = low two dwords, widened
    punpckhdq   %2, %4                  ; src = high two dwords, widened
    paddq       %1, %3
    paddq       %1, %2
%endmacro

;-----------------------------------------------------------------------
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Returns the sum over the block of (dqcoeff[i] - coeff[i])^2 and
; stores the sum of coeff[i]^2 to *ssz.
;
; Every pass of the loop reads 2*mmsize bytes (mmsize int16 values) from
; each array using mova, and the body runs once before the counter is
; tested.  block_size is therefore expected to be a positive multiple of
; mmsize, and both pointers aligned as mova requires.
; NOTE(review): confirm these preconditions against the callers.
;
; Only the first three arguments are loaded by cglobal; ssz is reached
; through sszq on x86-64 and fetched with sszm on x86-32.
;
; Register roles:
;   m0, m1   coeff values, then their pairwise sums of squares
;   m2, m3   dqcoeff values, then differences, then sums of squares
;   m4       64-bit accumulator for the squared error (return value)
;   m5       64-bit accumulator for the squared coeff values (*ssz)
;   m6       all zero inside the loop (widening helper)
;   m7       scratch
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
    pxor        m6, m6                  ; constant zero for the unpacks
    pxor        m4, m4                  ; squared-error total
    pxor        m5, m5                  ; squared-coeff total

    ; Point both bases at the end of their arrays and count a negative
    ; element index up towards zero, so one add drives the loop exit.
    lea         uqcq, [uqcq+sizeq*2]
    lea         dqcq, [dqcq+sizeq*2]
    neg         sizeq

.loop:
    mova        m0, [uqcq+sizeq*2]
    mova        m1, [uqcq+sizeq*2+mmsize]
    mova        m2, [dqcq+sizeq*2]
    mova        m3, [dqcq+sizeq*2+mmsize]

    psubw       m2, m0                  ; m2 = dqcoeff - coeff
    psubw       m3, m1

    ; Each error is expected to be at most 15 bits plus sign, so a
    ; square needs 30 bits and the sum of two adjacent squares produced
    ; by pmaddwd fits in 31 bits, leaving the sign bit unused.
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m0, m0
    pmaddwd     m1, m1

    ; Fold the 32-bit partial sums into the 64-bit totals.
    ACCUM_DWORDS_64 m4, m2, m7, m6
    ACCUM_DWORDS_64 m4, m3, m7, m6
    ACCUM_DWORDS_64 m5, m0, m7, m6
    ACCUM_DWORDS_64 m5, m1, m7, m6

    add         sizeq, mmsize           ; mmsize elements handled per pass
    jl          .loop

    ; Horizontal reduction: add the upper 64-bit lane of each total onto
    ; its lower lane.  m6 no longer needs to be zero from here on.
    movhlps     m6, m4
    movhlps     m7, m5
    paddq       m4, m6
    paddq       m5, m7

%if ARCH_X86_64
    movq        rax, m4                 ; return value
    movq        [sszq], m5              ; *ssz
%else
    mov         eax, sszm               ; ssz pointer comes from the stack
    pshufd      m6, m4, 0x1             ; dword 0 of m6 = bits 63:32 of the error sum
    movq        [eax], m5               ; *ssz
    movd        eax, m4                 ; low half of the return value
    movd        edx, m6                 ; high half of the return value
%endif
    RET