;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
  lea     uqcq, [uqcq+sizeq*2]
  lea     dqcq, [dqcq+sizeq*2]
  neg    sizeq
.loop:
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  ; accumulate in 64bit
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m1
  punpckhdq m2, m5
  paddq     m6, m7
  punpckldq m7, m3, m5
  paddq     m6, m2
  punpckhdq m3, m5
  paddq     m6, m7
  paddq     m6, m3
  add    sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  movhlps   m7, m6
  paddq     m4, m5
  paddq     m6, m7
%if ARCH_X86_64
  movq    rax, m4
  movq [sszq], m6
%else
  mov     eax, sszm
  pshufd   m5, m4, 0x1
  movq  [eax], m6
  movd    eax, m4
  movd    edx, m5
%endif
  RET
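
; For reference, the kernel above is roughly equivalent to the scalar C sketch
; below (illustrative only; the function and parameter names simply mirror the
; prototype comment above and are not guaranteed to match a symbol elsewhere in
; the tree). It returns the sum of squared differences between the original and
; dequantized coefficients and writes the sum of squared original coefficients
; through ssz, matching the two 64-bit accumulators (m4 and m6) used above.
;
;   int64_t block_error(const int16_t *coeff, const int16_t *dqcoeff,
;                       intptr_t block_size, int64_t *ssz) {
;     int64_t error = 0, sqcoeff = 0;
;     for (intptr_t i = 0; i < block_size; i++) {
;       const int diff = coeff[i] - dqcoeff[i];
;       error   += (int64_t)diff * diff;          // squared quantization error
;       sqcoeff += (int64_t)coeff[i] * coeff[i];  // squared source coefficient
;     }
;     *ssz = sqcoeff;   // sum of squared (unquantized) coefficients
;     return error;     // sum of squared errors
;   }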