;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; SAD_ROW_16X8 dst, src_addr, ref_addr        (helper for PROCESS_16X2X8)
;
; For the 16-pixel source row at [src_addr], computes the sum of absolute
; differences against the reference bytes at [ref_addr + k .. + k + 15]
; for k = 0..7.  The eight 16-bit sums are left in dst (word k holds the
; sum for offset k).
;
; Reads:    16 bytes at src_addr (movdqa, so it must be 16-byte aligned)
;           24 bytes at ref_addr (three movq loads, no alignment needed)
; Clobbers: xmm0, xmm2, xmm3, xmm4 and dst
;-----------------------------------------------------------------------
%macro SAD_ROW_16X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]     ; src[0..15]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3                 ; dst  = ref[0..15]
        punpcklqdq      xmm3,       xmm2                 ; xmm3 = ref[8..23]

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0           ; src[0..3]   vs ref[k..k+3]
        mpsadbw         xmm2,       xmm0,  0x5           ; src[4..7]   vs ref[k+4..k+7]

        psrldq          xmm0,       8                    ; src[8..15] -> low 8 bytes

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0           ; src[8..11]  vs ref[k+8..k+11]
        mpsadbw         xmm4,       xmm0,  0x5           ; src[12..15] vs ref[k+12..k+15]

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;
; Processes two consecutive 16-pixel rows and adds their eight per-offset
; SADs (reference offsets 0..7) into the eight words of xmm1.
;
;   first != 0 : xmm1 is overwritten with the first row's sums, so no
;                prior initialisation of xmm1 is needed
;   first == 0 : both rows are accumulated onto the existing xmm1
;
; In:       rsi = source row, rax = source stride
;           rdi = reference row, rdx = reference stride
; Out:      xmm1 updated; rsi and rdi advanced by two rows
; Clobbers: xmm0, xmm2, xmm3, xmm4, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        SAD_ROW_16X8    xmm1,       rsi,        rdi
%else
        SAD_ROW_16X8    xmm5,       rsi,        rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_16X8    xmm5,       rsi+rax,    rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_8X8 dst, src_addr, ref_addr          (helper for PROCESS_8X2X8)
;
; For the 8-pixel source row at [src_addr], computes the sum of absolute
; differences against the reference bytes at [ref_addr + k .. + k + 7]
; for k = 0..7.  The eight 16-bit sums are left in dst (word k holds the
; sum for offset k).
;
; Reads:    8 bytes at src_addr and 16 bytes at ref_addr (movq loads,
;           no alignment requirement)
; Clobbers: xmm0, xmm2, xmm3 and dst
;-----------------------------------------------------------------------
%macro SAD_ROW_8X8 3
        movq            xmm0,       MMWORD PTR [%2]      ; src[0..7]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3                 ; dst = ref[0..15]

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0           ; src[0..3] vs ref[k..k+3]
        mpsadbw         xmm2,       xmm0,  0x5           ; src[4..7] vs ref[k+4..k+7]
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;
; Processes two consecutive 8-pixel rows and adds their eight per-offset
; SADs (reference offsets 0..7) into the eight words of xmm1.
;
;   first != 0 : xmm1 is overwritten with the first row's sums
;   first == 0 : both rows are accumulated onto the existing xmm1
;
; In:       rsi = source row, rax = source stride
;           rdi = reference row, rdx = reference stride
; Out:      xmm1 updated; rsi and rdi advanced by two rows
; Clobbers: xmm0, xmm2, xmm3, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        SAD_ROW_8X8     xmm1,       rsi,        rdi
%else
        SAD_ROW_8X8     xmm5,       rsi,        rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_8X8     xmm5,       rsi+rax,    rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_4X8 dst, src_addr, ref_addr          (helper for PROCESS_4X2X8)
;
; For the 4-pixel source row at [src_addr], computes the sum of absolute
; differences against the reference bytes at [ref_addr + k .. + k + 3]
; for k = 0..7.  The eight 16-bit sums are left in dst (word k holds the
; sum for offset k).
;
; Reads:    4 bytes at src_addr (movd) and 16 bytes at ref_addr (two movq
;           loads); no alignment requirement
; Clobbers: xmm0, xmm3 and dst
;-----------------------------------------------------------------------
%macro SAD_ROW_4X8 3
        movd            xmm0,       [%2]                 ; src[0..3]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3                 ; dst = ref[0..15]

        mpsadbw         %1,         xmm0,  0x0           ; src[0..3] vs ref[k..k+3]
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;
; Processes two consecutive 4-pixel rows and adds their eight per-offset
; SADs (reference offsets 0..7) into the eight words of xmm1.
;
;   first != 0 : xmm1 is overwritten with the first row's sums
;   first == 0 : both rows are accumulated onto the existing xmm1
;
; In:       rsi = source row, rax = source stride
;           rdi = reference row, rdx = reference stride
; Out:      xmm1 updated; rsi and rdi advanced by two rows
; Clobbers: xmm0, xmm3, xmm5
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        SAD_ROW_4X8     xmm1,       rsi,        rdi
%else
        SAD_ROW_4X8     xmm5,       rsi,        rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_4X8     xmm5,       rsi+rax,    rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro


;-----------------------------------------------------------------------
;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; SAD of the 16x16 block at src_ptr against the eight 16x16 reference
; blocks starting at ref_ptr + 0 .. ref_ptr + 7; the eight 16-bit sums
; are written to sad_array[0..7].
;
; Alignment: sad_array is written with movdqa and must be 16-byte
;            aligned.  PROCESS_16X2X8 loads each source row with movdqa,
;            so src_ptr and src_stride must keep every row 16-byte
;            aligned.  The reference may be unaligned.
; Registers: rsi = src, rdi = ref, rax = src_stride, rdx = ref_stride,
;            xmm1 = running sums; xmm0 and xmm2-xmm5 are scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x8_sse4) PRIVATE
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    ; 16 rows, two per invocation; the first invocation initialises xmm1
    PROCESS_16X2X8 1
%rep 7
    PROCESS_16X2X8 0
%endrep

    mov             rdi,        arg(4)           ;Results
    movdqa          XMMWORD PTR [rdi],    xmm1   ; eight 16-bit SADs

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; SAD of the 16x8 block (16 wide, 8 rows) at src_ptr against the eight
; reference blocks starting at ref_ptr + 0 .. ref_ptr + 7; the eight
; 16-bit sums are written to sad_array[0..7].
;
; Alignment: sad_array is written with movdqa and must be 16-byte
;            aligned.  PROCESS_16X2X8 loads each source row with movdqa,
;            so src_ptr and src_stride must keep every row 16-byte
;            aligned.  The reference may be unaligned.
; Registers: rsi = src, rdi = ref, rax = src_stride, rdx = ref_stride,
;            xmm1 = running sums; xmm0 and xmm2-xmm5 are scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x8_sse4) PRIVATE
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    ; 8 rows, two per invocation; the first invocation initialises xmm1
    PROCESS_16X2X8 1
%rep 3
    PROCESS_16X2X8 0
%endrep

    mov             rdi,        arg(4)           ;Results
    movdqa          XMMWORD PTR [rdi],    xmm1   ; eight 16-bit SADs

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; SAD of the 8x8 block at src_ptr against the eight 8x8 reference blocks
; starting at ref_ptr + 0 .. ref_ptr + 7; the eight 16-bit sums are
; written to sad_array[0..7].
;
; Alignment: sad_array is written with movdqa and must be 16-byte
;            aligned.  Source and reference rows are loaded with movq
;            by PROCESS_8X2X8 and may be unaligned.
; Registers: rsi = src, rdi = ref, rax = src_stride, rdx = ref_stride,
;            xmm1 = running sums; xmm0, xmm2, xmm3, xmm5 are scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x8_sse4) PRIVATE
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    ; 8 rows, two per invocation; the first invocation initialises xmm1
    PROCESS_8X2X8 1
%rep 3
    PROCESS_8X2X8 0
%endrep

    mov             rdi,        arg(4)           ;Results
    movdqa          XMMWORD PTR [rdi],    xmm1   ; eight 16-bit SADs

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; SAD of the 8x16 block (8 wide, 16 rows) at src_ptr against the eight
; reference blocks starting at ref_ptr + 0 .. ref_ptr + 7; the eight
; 16-bit sums are written to sad_array[0..7].
;
; Alignment: sad_array is written with movdqa and must be 16-byte
;            aligned.  Source and reference rows are loaded with movq
;            by PROCESS_8X2X8 and may be unaligned.
; Registers: rsi = src, rdi = ref, rax = src_stride, rdx = ref_stride,
;            xmm1 = running sums; xmm0, xmm2, xmm3, xmm5 are scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x8_sse4) PRIVATE
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    ; 16 rows, two per invocation; the first invocation initialises xmm1
    PROCESS_8X2X8 1
%rep 7
    PROCESS_8X2X8 0
%endrep

    mov             rdi,        arg(4)           ;Results
    movdqa          XMMWORD PTR [rdi],    xmm1   ; eight 16-bit SADs

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; SAD of the 4x4 block at src_ptr against the eight 4x4 reference blocks
; starting at ref_ptr + 0 .. ref_ptr + 7; the eight 16-bit sums are
; written to sad_array[0..7].
;
; Alignment: sad_array is written with movdqa and must be 16-byte
;            aligned.  Source rows are loaded with movd and reference
;            rows with movq by PROCESS_4X2X8; both may be unaligned.
; Registers: rsi = src, rdi = ref, rax = src_stride, rdx = ref_stride,
;            xmm1 = running sums; xmm0, xmm3, xmm5 are scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x8_sse4) PRIVATE
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    ; 4 rows, two per invocation; the first invocation initialises xmm1
    PROCESS_4X2X8 1
    PROCESS_4X2X8 0

    mov             rdi,        arg(4)           ;Results
    movdqa          XMMWORD PTR [rdi],    xmm1   ; eight 16-bit SADs

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret