;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

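; The PROCESS_WxHx4 macros below each consume two rows of a WxH block and
; accumulate sums of absolute differences against four reference blocks at
; once. psadbw produces per-qword partial sums, which are kept in running
; accumulators; the first=1 variant initializes the accumulators and the
; first=0 variant adds to them. In the 4-wide (MMX) case, two 4-byte rows
; are packed into a single qword (movd + punpckldq) so one psadbw covers
; both rows; the totals end up as two dwords each in m6 (ref1/ref2) and
; m7 (ref3/ref4).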
; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                   srcq, [srcq +src_strideq*2]
  lea                   ref1q, [ref1q+ref_strideq*2]
  lea                   ref2q, [ref2q+ref_strideq*2]
  lea                   ref3q, [ref3q+ref_strideq*2]
  lea                   ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

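; 8-wide variant: movh/movhps pack the two 8-byte rows into one XMM register,
; so a single psadbw per reference covers both rows, leaving two qword
; partial sums. Each reference gets a full accumulator: m4..m7 for ref1..ref4.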
; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                   srcq, [srcq +src_strideq*2]
  lea                   ref1q, [ref1q+ref_strideq*2]
  lea                   ref2q, [ref2q+ref_strideq*2]
  lea                   ref3q, [ref3q+ref_strideq*2]
  lea                   ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

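; 16-wide variant: source rows are loaded aligned (mova), reference rows
; unaligned (movu), and each row needs its own round of psadbw. Loads, SADs
; and additions are interleaved, and the pointer advance is hoisted between
; them, to give the out-of-order core independent work.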
; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                   srcq, [srcq +src_strideq*2]
  lea                   ref1q, [ref1q+ref_strideq*2]
  lea                   ref2q, [ref2q+ref_strideq*2]
  lea                   ref3q, [ref3q+ref_strideq*2]
  lea                   ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

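; 32-wide variant: a 32-byte row is processed as two 16-byte halves, so one
; PROCESS_16x2x4 invocation covers a single row (its "two rows" being the
; halves at +0 and +16); the second invocation handles the next row and
; carries the advance flag.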
; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

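; 64-wide variant: the same decomposition one level up; each 64-byte row is
; processed as two 32-byte halves.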
; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
; 8x8 or 8x4; the 4x8 and 4x4 variants are built as vp9_sadNxNx4d_sse on
; MMX registers.
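; For reference, each function computes the following (illustrative C, not
; part of the build; W and H are the block width and height):
;
;   for (i = 0; i < 4; i++) {
;     unsigned int sad = 0;
;     for (y = 0; y < H; y++)
;       for (x = 0; x < W; x++)
;         sad += abs(src[y * src_stride + x] - ref[i][y * ref_stride + x]);
;     res[i] = sad;
;   }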
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
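; On UNIX64 the res pointer arrives in a register (fifth argument); on other
; ABIs it is reloaded from the stack at the end via r4mp.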
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
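; ref1q holds the caller's uint8_t *ref[4] array pointer; fetch the four
; frame pointers, loading ref1q last since that overwrites the array pointer.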
  mov                   ref2q, [ref1q+gprsize*1]
  mov                   ref3q, [ref1q+gprsize*2]
  mov                   ref4q, [ref1q+gprsize*3]
  mov                   ref1q, [ref1q+gprsize*0]

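; The first invocation initializes the accumulators (first=1), the %rep block
; covers the middle row pairs, and the last invocation skips the pointer
; advance: 1 + (H-4)/2 + 1 = H/2 iterations of two rows each.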
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

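; Reduce to the four final SADs. Each XMM accumulator holds two dword partial
; sums (in dwords 0 and 2): the shift/OR pairs interleave ref1/ref2 and
; ref3/ref4, punpcklqdq/punpckhqdq gather the low and high partials, and
; paddd yields res[0..3] for one unaligned store. In the MMX (4-wide) case
; m6 and m7 already hold two dword totals each and are stored directly.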
%if mmsize == 16
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                  [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  movq                  [r4+0], m6
  movq                  [r4+8], m7
  RET
%endif
%endmacro

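; SSE2 (XMM) instantiations for all block sizes 8 pixels wide and above.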
INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4

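; The 4-wide blocks fit in MMX registers; psadbw on MMX operands was
; introduced with SSE, hence INIT_MMX sse.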
INIT_MMX sse
SADNXN4D  4,  8
SADNXN4D  4,  4