|
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;-----------------------------------------------------------------------
; LOAD_ROW_16X8 acc, src, ref
;
; Helper for PROCESS_16X2X8: fetch one row.
;   xmm0 = 16 source bytes at [src]      (movdqa: [src] must be 16-byte aligned)
;   acc  = reference bytes  0..15 at [ref]
;   xmm3 = reference bytes  8..23 at [ref]
; Clobb: xmm2 (staging for reference bytes 16..23)
;-----------------------------------------------------------------------
%macro LOAD_ROW_16X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3            ; acc  = ref[0..7]  : ref[8..15]
        punpcklqdq      xmm3,       xmm2            ; xmm3 = ref[8..15] : ref[16..23]
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_16X8 acc
;
; Helper for PROCESS_16X2X8: turn the registers prepared by LOAD_ROW_16X8
; into eight 16-bit row SADs (reference offsets +0..+7) left in acc.
;
; mpsadbw imm8: bits 1:0 pick the 4-byte group of the second operand,
; bit 2 picks whether the sliding windows in the first operand start at
; byte 0 or byte 4.  0x0 = group 0 / windows from byte 0,
; 0x5 = group 1 / windows from byte 4; added together they give an
; 8-byte-wide SAD at each of the eight offsets.
; Clobb: xmm0, xmm2, xmm3, xmm4
;-----------------------------------------------------------------------
%macro SAD_ROW_16X8 1
        movdqa          xmm2,       %1              ; second copy: mpsadbw overwrites its destination
        mpsadbw         %1,         xmm0,  0x0      ; src[0..3]   vs ref windows
        mpsadbw         xmm2,       xmm0,  0x5      ; src[4..7]   vs ref windows

        psrldq          xmm0,       8               ; bring src[8..15] down to the low half

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0      ; src[8..11]  vs ref windows
        mpsadbw         xmm4,       xmm0,  0x5      ; src[12..15] vs ref windows

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;
; Handles two consecutive rows of a 16-byte-wide block and keeps, in the
; eight 16-bit lanes of xmm1, the running SAD of the source against the
; reference at byte offsets +0..+7.
;
;   first != 0 : xmm1 is initialised from the first row (no prior total)
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (16-byte aligned rows), rax = source stride
;        rdi = reference row (24 bytes read per row), rdx = reference stride
; Out:   xmm1 = updated totals; rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        ; First row of the block: accumulate straight into xmm1.
        LOAD_ROW_16X8   xmm1, rsi, rdi
        SAD_ROW_16X8    xmm1
%else
        ; Later rows: compute in xmm5, then fold into the running total.
        LOAD_ROW_16X8   xmm5, rsi, rdi
        SAD_ROW_16X8    xmm5

        paddw           xmm1,       xmm5
%endif
        ; Second row of the pair, one stride further down.
        LOAD_ROW_16X8   xmm5, rsi+rax, rdi+rdx

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD_ROW_16X8    xmm5

        paddw           xmm1,       xmm5
%endmacro
|
85 |
|
;-----------------------------------------------------------------------
; LOAD_ROW_8X8 acc, src, ref
;
; Helper for PROCESS_8X2X8: fetch one row.
;   xmm0 = 8 source bytes at [src] (upper half zeroed by movq)
;   acc  = reference bytes 0..15 at [ref]
; Clobb: xmm3 (staging for reference bytes 8..15)
;-----------------------------------------------------------------------
%macro LOAD_ROW_8X8 3
        movq            xmm0,       MMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; acc = ref[0..7] : ref[8..15]
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_8X8 acc
;
; Helper for PROCESS_8X2X8: eight 16-bit SADs of the 8 source bytes in
; xmm0 against the reference windows starting at bytes 0..7 of acc.
; mpsadbw 0x0 covers source bytes 0..3, 0x5 covers source bytes 4..7
; with the windows shifted by four bytes; their sum is the 8-byte SAD.
; Clobb: xmm2
;-----------------------------------------------------------------------
%macro SAD_ROW_8X8 1
        movdqa          xmm2,       %1              ; second copy: mpsadbw overwrites its destination
        mpsadbw         %1,         xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;
; Handles two consecutive rows of an 8-byte-wide block and keeps, in the
; eight 16-bit lanes of xmm1, the running SAD of the source against the
; reference at byte offsets +0..+7.
;
;   first != 0 : xmm1 is initialised from the first row (no prior total)
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row,    rax = source stride
;        rdi = reference row (16 bytes read per row), rdx = reference stride
; Out:   xmm1 = updated totals; rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        ; First row of the block: accumulate straight into xmm1.
        LOAD_ROW_8X8    xmm1, rsi, rdi
        SAD_ROW_8X8     xmm1
%else
        ; Later rows: compute in xmm5, then fold into the running total.
        LOAD_ROW_8X8    xmm5, rsi, rdi
        SAD_ROW_8X8     xmm5

        paddw           xmm1,       xmm5
%endif
        ; Second row of the pair, one stride further down.
        LOAD_ROW_8X8    xmm5, rsi+rax, rdi+rdx

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD_ROW_8X8     xmm5

        paddw           xmm1,       xmm5
%endmacro
|
125 |
|
;-----------------------------------------------------------------------
; LOAD_ROW_4X8 acc, src, ref
;
; Helper for PROCESS_4X2X8: fetch one row.
;   xmm0 = 4 source bytes at [src] (remaining bytes zeroed by movd)
;   acc  = reference bytes 0..15 at [ref]
; Clobb: xmm3 (staging for reference bytes 8..15)
;-----------------------------------------------------------------------
%macro LOAD_ROW_4X8 3
        movd            xmm0,       [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; acc = ref[0..7] : ref[8..15]
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;
; Handles two consecutive rows of a 4-byte-wide block and keeps, in the
; eight 16-bit lanes of xmm1, the running SAD of the source against the
; reference at byte offsets +0..+7.  A single mpsadbw with selector 0x0
; (source bytes 0..3, windows starting at reference byte 0) already
; yields the full 4-byte row SAD for all eight offsets.
;
;   first != 0 : xmm1 is initialised from the first row (no prior total)
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row,    rax = source stride
;        rdi = reference row (16 bytes read per row), rdx = reference stride
; Out:   xmm1 = updated totals; rsi, rdi advanced by two rows
; Clobb: xmm0, xmm3, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        ; First row of the block: accumulate straight into xmm1.
        LOAD_ROW_4X8    xmm1, rsi, rdi

        mpsadbw         xmm1,       xmm0,  0x0
%else
        ; Later rows: compute in xmm5, then fold into the running total.
        LOAD_ROW_4X8    xmm5, rsi, rdi

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endif
        ; Second row of the pair, one stride further down.
        LOAD_ROW_4X8    xmm5, rsi+rax, rdi+rdx

        ; Both rows are loaded; step the pointers to the next row pair.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endmacro
|
156 |
|
;-----------------------------------------------------------------------
; WRITE_AS_INTS
;
; Zero-extends the eight 16-bit totals in xmm1 to 32 bits each and stores
; them, in lane order, to the buffer passed as argument 4.
;
; In:    xmm1   = eight 16-bit SAD totals
;        arg(4) = destination; 32 bytes are written with movdqa, so the
;                 buffer must be 16-byte aligned
; Clobb: rdi, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
%macro WRITE_AS_INTS 0
        pxor            xmm0,       xmm0            ; zeros to interleave as the high words
        movdqa          xmm2,       xmm1            ; copy: the low unpack overwrites xmm1

        mov             rdi,        arg(4)          ; Results

        punpcklwd       xmm1,       xmm0            ; totals 0..3 -> four dwords
        punpckhwd       xmm2,       xmm0            ; totals 4..7 -> four dwords

        movdqa          [rdi],      xmm1
        movdqa          [rdi + 16], xmm2
%endmacro
|
167 |
|
;-----------------------------------------------------------------------
; void vp9_sad16x16x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned short      *sad_array);
;
; SAD of a 16x16 source block against the reference at eight byte
; offsets, ref_ptr+0 .. ref_ptr+7.  PROCESS_16X2X8 is expanded eight
; times (two rows each) and WRITE_AS_INTS stores the eight totals.
;
; Source rows are loaded with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned.
;
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes) with
; aligned stores, so sad_array has to be a 16-byte-aligned buffer of at
; least 32 bytes.  The `unsigned short` element type in the prototype
; above does not match that width -- confirm against the C declaration.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK (both come
; from x86_abi_support.asm).  rsi and rdi are saved and restored;
; rax, rdx and xmm0-xmm5 are overwritten.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
        ; ---- prolog ----
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi

        ; Source pointer and stride (stride sign-extended from int).
        mov             rsi,        arg(0)           ;src_ptr
        movsxd          rax,        dword ptr arg(1) ;src_stride

        ; Reference pointer and stride.
        mov             rdi,        arg(2)           ;ref_ptr
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 16 rows = 8 row pairs; only the first pair initialises xmm1.
        PROCESS_16X2X8 1
%rep 7
        PROCESS_16X2X8 0
%endrep

        WRITE_AS_INTS

        ; ---- epilog ----
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
206 |
|
207 |
|
;-----------------------------------------------------------------------
; void vp9_sad16x8x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned short      *sad_array);
;
; SAD of a 16-wide, 8-row source block against the reference at eight
; byte offsets, ref_ptr+0 .. ref_ptr+7.  PROCESS_16X2X8 is expanded four
; times (two rows each) and WRITE_AS_INTS stores the eight totals.
;
; Source rows are loaded with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned.
;
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes) with
; aligned stores, so sad_array has to be a 16-byte-aligned buffer of at
; least 32 bytes.  The `unsigned short` element type in the prototype
; above does not match that width -- confirm against the C declaration.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK (both come
; from x86_abi_support.asm).  rsi and rdi are saved and restored;
; rax, rdx and xmm0-xmm5 are overwritten.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
        ; ---- prolog ----
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi

        ; Source pointer and stride (stride sign-extended from int).
        mov             rsi,        arg(0)           ;src_ptr
        movsxd          rax,        dword ptr arg(1) ;src_stride

        ; Reference pointer and stride.
        mov             rdi,        arg(2)           ;ref_ptr
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 8 rows = 4 row pairs; only the first pair initialises xmm1.
        PROCESS_16X2X8 1
%rep 3
        PROCESS_16X2X8 0
%endrep

        WRITE_AS_INTS

        ; ---- epilog ----
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
243 |
|
244 |
|
;-----------------------------------------------------------------------
; void vp9_sad8x8x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned short      *sad_array);
;
; SAD of an 8x8 source block against the reference at eight byte
; offsets, ref_ptr+0 .. ref_ptr+7.  PROCESS_8X2X8 is expanded four times
; (two rows each) and WRITE_AS_INTS stores the eight totals.
;
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes) with
; aligned stores, so sad_array has to be a 16-byte-aligned buffer of at
; least 32 bytes.  The `unsigned short` element type in the prototype
; above does not match that width -- confirm against the C declaration.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK (both come
; from x86_abi_support.asm).  rsi and rdi are saved and restored;
; rax, rdx, xmm0-xmm3 and xmm5 are overwritten.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
        ; ---- prolog ----
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi

        ; Source pointer and stride (stride sign-extended from int).
        mov             rsi,        arg(0)           ;src_ptr
        movsxd          rax,        dword ptr arg(1) ;src_stride

        ; Reference pointer and stride.
        mov             rdi,        arg(2)           ;ref_ptr
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 8 rows = 4 row pairs; only the first pair initialises xmm1.
        PROCESS_8X2X8 1
%rep 3
        PROCESS_8X2X8 0
%endrep

        WRITE_AS_INTS

        ; ---- epilog ----
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
280 |
|
281 |
|
;-----------------------------------------------------------------------
; void vp9_sad8x16x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned short      *sad_array);
;
; SAD of an 8-wide, 16-row source block against the reference at eight
; byte offsets, ref_ptr+0 .. ref_ptr+7.  PROCESS_8X2X8 is expanded eight
; times (two rows each) and WRITE_AS_INTS stores the eight totals.
;
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes) with
; aligned stores, so sad_array has to be a 16-byte-aligned buffer of at
; least 32 bytes.  The `unsigned short` element type in the prototype
; above does not match that width -- confirm against the C declaration.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK (both come
; from x86_abi_support.asm).  rsi and rdi are saved and restored;
; rax, rdx, xmm0-xmm3 and xmm5 are overwritten.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
        ; ---- prolog ----
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi

        ; Source pointer and stride (stride sign-extended from int).
        mov             rsi,        arg(0)           ;src_ptr
        movsxd          rax,        dword ptr arg(1) ;src_stride

        ; Reference pointer and stride.
        mov             rdi,        arg(2)           ;ref_ptr
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 16 rows = 8 row pairs; only the first pair initialises xmm1.
        PROCESS_8X2X8 1
%rep 7
        PROCESS_8X2X8 0
%endrep

        WRITE_AS_INTS

        ; ---- epilog ----
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
321 |
|
322 |
|
;-----------------------------------------------------------------------
; void vp9_sad4x4x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned short      *sad_array);
;
; SAD of a 4x4 source block against the reference at eight byte offsets,
; ref_ptr+0 .. ref_ptr+7.  PROCESS_4X2X8 is expanded twice (two rows
; each) and WRITE_AS_INTS stores the eight totals.
;
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes) with
; aligned stores, so sad_array has to be a 16-byte-aligned buffer of at
; least 32 bytes.  The `unsigned short` element type in the prototype
; above does not match that width -- confirm against the C declaration.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK (both come
; from x86_abi_support.asm).  rsi and rdi are saved and restored;
; rax, rdx, xmm0-xmm3 and xmm5 are overwritten.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
        ; ---- prolog ----
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi

        ; Source pointer and stride (stride sign-extended from int).
        mov             rsi,        arg(0)           ;src_ptr
        movsxd          rax,        dword ptr arg(1) ;src_stride

        ; Reference pointer and stride.
        mov             rdi,        arg(2)           ;ref_ptr
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 4 rows = 2 row pairs; only the first pair initialises xmm1.
        PROCESS_4X2X8 1
        PROCESS_4X2X8 0

        WRITE_AS_INTS

        ; ---- epilog ----
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
|
356 |
|
357 |
|
358 |
|
359 |