;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

; x86inc.asm is the macro layer this file is written against: the cglobal,
; INIT_XMM and RET macros and the mN / mova / mmsize names used below are
; not defined in this file and come from this include.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum over the block of (dqcoeff[i] - coeff[i])^2 and stores the
; sum of coeff[i]^2 to *ssz.
;
; Named arguments: uqc = coeff, dqc = dqcoeff, size = block_size (a count of
; int16 elements, hence the *2 scaling in every address), ssz = output pointer.
; cglobal is asked for 3 loaded arguments, 3 GPRs and 8 vector registers
; (m0-m7); ssz is reached through sszq on x86-64 and through its stack slot
; (sszm) on x86-32.
;
; Register roles:
;   m0, m1  dqcoeff vectors, then differences, then squared differences
;   m2, m3  coeff vectors, then squared coefficients
;   m4      error accumulator (two 64-bit lanes)
;   m5      all-zero register used for zero-extension inside the loop
;   m6      ssz accumulator (two 64-bit lanes)
;   m7      scratch
;
; Preconditions implied by the code:
;   - The loop is bottom-tested and consumes mmsize coefficients per pass
;     (two full vector registers from each array; 16 with INIT_XMM), so
;     block_size must be a positive multiple of that count.
;   - Both arrays are read with mova (aligned loads), so coeff and dqcoeff
;     must be mmsize-aligned.
;   - psubw wraps at 16 bits; see the range note inside the loop.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
    pxor      m4, m4                    ; sse accumulator
    pxor      m6, m6                    ; ssz accumulator
    pxor      m5, m5                    ; dedicated zero register

    ; Point both arrays at their end and index them with a negative element
    ; count that climbs towards zero, so the add at the bottom of the loop
    ; both advances the position and sets the flags for the exit test.
    lea       uqcq, [uqcq+sizeq*2]
    lea       dqcq, [dqcq+sizeq*2]
    neg       sizeq

.loop:
    mova      m2, [uqcq+sizeq*2]
    mova      m0, [dqcq+sizeq*2]
    mova      m3, [uqcq+sizeq*2+mmsize]
    mova      m1, [dqcq+sizeq*2+mmsize]
    psubw     m0, m2                    ; m0 = dqcoeff - coeff (16-bit lanes)
    psubw     m1, m3
    ; individual errors are max. 15bit+sign, so squares are 30bit, and
    ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
    pmaddwd   m0, m0                    ; square lanes, add adjacent pairs
    pmaddwd   m1, m1
    pmaddwd   m2, m2                    ; same for the raw coefficients
    pmaddwd   m3, m3
    ; accumulate in 64bit: interleaving each 32-bit sum with the zero
    ; register zero-extends it into a 64-bit lane before the paddq.
    ; The unpack/add pairs are interleaved on purpose; keep the order.
    punpckldq m7, m0, m5
    punpckhdq m0, m5
    paddq     m4, m7
    punpckldq m7, m1, m5
    paddq     m4, m0
    punpckhdq m1, m5
    paddq     m4, m7
    punpckldq m7, m2, m5
    paddq     m4, m1
    punpckhdq m2, m5
    paddq     m6, m7
    punpckldq m7, m3, m5
    paddq     m6, m2
    punpckhdq m3, m5
    paddq     m6, m7
    paddq     m6, m3
    add       sizeq, mmsize             ; mmsize int16 elements were consumed
    jl .loop                            ; index is negative until the end

    ; accumulate horizontally and store in return value
    ; (movhlps copies the high 64-bit lane into the low lane of the scratch
    ; register; after the paddq the low lane holds the full total)
    movhlps   m5, m4
    movhlps   m7, m6
    paddq     m4, m5
    paddq     m6, m7
%if ARCH_X86_64
    movq      rax, m4                   ; return value
    movq      [sszq], m6                ; *ssz
%else
    mov       eax, sszm                 ; ssz pointer from its stack slot
    pshufd    m5, m4, 0x1               ; bits 32..63 of the total -> lane 0
    movq      [eax], m6                 ; *ssz; must precede the eax overwrite
    movd      eax, m4                   ; low 32 bits of the return value
    movd      edx, m5                   ; high 32 bits of the return value
%endif
    RET