|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;-----------------------------------------------------------------------------
; unsigned int vp8_sad16x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between the 16x16 block at src_ptr and the
; 16x16 block at ref_ptr. Two rows are consumed per loop iteration; the
; running total lives in xmm6 and the final value is returned in rax.
;
; Register roles inside the loop:
;   rsi = current src row      rax = src_stride
;   rdi = current ref row      rdx = ref_stride
;   rcx = src_ptr + 16 * src_stride (loop end sentinel)
;-----------------------------------------------------------------------------
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; ref_ptr
    movsxd      rdx, dword ptr arg(3)           ; ref_stride

    pxor        xmm6, xmm6                      ; running SAD = 0

    lea         rcx, [rsi+rax*8]
    lea         rcx, [rcx+rax*8]                ; rcx = src_ptr + 16 rows

.sad16x16_row_pair:
    ; ---- first row of the pair ----
    ; The two 8-byte halves are interleaved into one 16-byte register. The
    ; same shuffle is applied to src and ref, so bytes still pair up 1:1.
    movq        xmm0, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+8]
    movq        xmm1, QWORD PTR [rdi]
    movq        xmm3, QWORD PTR [rdi+8]

    punpcklbw   xmm0, xmm2
    punpcklbw   xmm1, xmm3
    psadbw      xmm0, xmm1                      ; one partial sum per 64-bit half
    paddw       xmm6, xmm0

    ; ---- second row of the pair ----
    movq        xmm4, QWORD PTR [rsi+rax]
    movq        xmm2, QWORD PTR [rsi+rax+8]
    movq        xmm5, QWORD PTR [rdi+rdx]
    movq        xmm3, QWORD PTR [rdi+rdx+8]

    punpcklbw   xmm4, xmm2
    punpcklbw   xmm5, xmm3
    psadbw      xmm4, xmm5
    paddw       xmm6, xmm4

    ; step both pointers down two rows
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rdx*2]

    cmp         rsi, rcx
    jne         .sad16x16_row_pair

    ; Fold the two 64-bit partial sums of xmm6 into a single value.
    movq        xmm0, xmm6                      ; low half (upper half zeroed)
    psrldq      xmm6, 8                         ; high half moved down
    paddw       xmm0, xmm6
    movq        rax, xmm0                       ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
86 |
|
;-----------------------------------------------------------------------------
; unsigned int vp8_sad8x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute byte differences over an 8-wide, 16-row block, accumulated
; in mm7 two rows per iteration.
;
; Before each row pair the low 32 bits of the running total are compared
; (unsigned) with max_sad. Once the total is above max_sad, the routine
; returns that partial total immediately.
;
; Uses MMX registers and does not execute emms.
;-----------------------------------------------------------------------------
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rbx, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; ref_ptr
    movsxd      rdx, dword ptr arg(3)           ; ref_stride

    pxor        mm7, mm7                        ; running SAD = 0

    lea         rcx, [rsi+rbx*8]
    lea         rcx, [rcx+rbx*8]                ; rcx = src_ptr + 16 rows

.sad8x16_row_pair:
    ; Early out: rax already holds the partial total when the branch is taken.
    movq        rax, mm7
    cmp         eax, arg(4)                     ; max_sad
    ja          .sad8x16_done

    ; first row of the pair
    movq        mm0, QWORD PTR [rsi]
    movq        mm1, QWORD PTR [rdi]
    psadbw      mm0, mm1
    paddw       mm7, mm0

    ; second row of the pair
    movq        mm2, QWORD PTR [rsi+rbx]
    movq        mm3, QWORD PTR [rdi+rdx]
    psadbw      mm2, mm3
    paddw       mm7, mm2

    ; step both pointers down two rows
    lea         rsi, [rsi+rbx*2]
    lea         rdi, [rdi+rdx*2]

    cmp         rsi, rcx
    jne         .sad8x16_row_pair

    movq        rax, mm7                        ; full-block result

.sad8x16_done:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
149 |
|
150 |
|
;-----------------------------------------------------------------------------
; unsigned int vp8_sad8x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute byte differences over an 8-wide, 8-row block, accumulated
; in mm7 one row per iteration.
;
; The fifth argument is read by the code (arg(4)), so it is part of the
; prototype. Before each row the low 32 bits of the running total are
; compared (unsigned) with max_sad; once the total is above it, the routine
; returns that partial total immediately.
;
; Uses MMX registers and does not execute emms.
;-----------------------------------------------------------------------------
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; ref_ptr

    movsxd      rbx, dword ptr arg(1)           ; src_stride
    movsxd      rdx, dword ptr arg(3)           ; ref_stride

    lea         rcx, [rsi+rbx*8]                ; rcx = src_ptr + 8 rows
    pxor        mm7, mm7                        ; running SAD = 0

.sad8x8_row:
    ; Early out: rax already holds the partial total when the branch is taken.
    movq        rax, mm7
    cmp         eax, arg(4)                     ; max_sad
    ja          .sad8x8_done

    movq        mm0, QWORD PTR [rsi]
    movq        mm1, QWORD PTR [rdi]

    psadbw      mm0, mm1
    paddw       mm7, mm0

    ; Step both pointers one row. The flags written here are dead because
    ; the cmp below overwrites them.
    add         rsi, rbx
    add         rdi, rdx

    cmp         rsi, rcx
    jne         .sad8x8_row

    movq        rax, mm7                        ; full-block result

.sad8x8_done:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
203 |
|
;-----------------------------------------------------------------------------
; unsigned int vp8_sad4x4_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences over a 4-wide, 4-row block. Straight-line
; code: each pair of 4-byte rows is interleaved into one 8-byte MMX register
; (the same shuffle for src and ref), so one psadbw covers two rows.
; The result is returned in rax.
;
; Uses MMX registers and does not execute emms.
;-----------------------------------------------------------------------------
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; ref_ptr
    movsxd      rdx, dword ptr arg(3)           ; ref_stride

    ; ---- rows 0 and 1 ----
    movd        mm0, DWORD PTR [rsi]
    movd        mm2, DWORD PTR [rsi+rax]
    movd        mm1, DWORD PTR [rdi]
    movd        mm3, DWORD PTR [rdi+rdx]

    punpcklbw   mm0, mm2                        ; src rows 0/1 interleaved
    punpcklbw   mm1, mm3                        ; ref rows 0/1 interleaved
    psadbw      mm0, mm1

    ; advance both pointers to row 2
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rdx*2]

    ; ---- rows 2 and 3 ----
    movd        mm4, DWORD PTR [rsi]
    movd        mm6, DWORD PTR [rsi+rax]
    movd        mm5, DWORD PTR [rdi]
    movd        mm7, DWORD PTR [rdi+rdx]

    punpcklbw   mm4, mm6                        ; src rows 2/3 interleaved
    punpcklbw   mm5, mm7                        ; ref rows 2/3 interleaved
    psadbw      mm4, mm5

    paddw       mm0, mm4                        ; total of all four rows
    movq        rax, mm0                        ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
257 |
|
258 |
|
;-----------------------------------------------------------------------------
; unsigned int vp8_sad16x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute byte differences over a 16-wide, 8-row block, accumulated
; in mm7 two rows per iteration. Each row is handled as two 8-byte halves.
;
; The fifth argument is read by the code (arg(4)), so it is part of the
; prototype. Before each row pair the low 32 bits of the running total are
; compared (unsigned) with max_sad; once the total is above it, the routine
; returns that partial total immediately.
;
; Uses MMX registers and does not execute emms.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; ref_ptr

    movsxd      rbx, dword ptr arg(1)           ; src_stride
    movsxd      rdx, dword ptr arg(3)           ; ref_stride

    lea         rcx, [rsi+rbx*8]                ; rcx = src_ptr + 8 rows
    pxor        mm7, mm7                        ; running SAD = 0

.sad16x8_row_pair:
    ; Early out: rax already holds the partial total when the branch is taken.
    movq        rax, mm7
    cmp         eax, arg(4)                     ; max_sad
    ja          .sad16x8_done

    ; ---- first row of the pair: left half in mm0/mm1, right in mm2/mm3 ----
    movq        mm0, QWORD PTR [rsi]
    movq        mm2, QWORD PTR [rsi+8]
    movq        mm1, QWORD PTR [rdi]
    movq        mm3, QWORD PTR [rdi+8]

    psadbw      mm0, mm1
    psadbw      mm2, mm3
    paddw       mm0, mm2                        ; SAD of the first row
    paddw       mm7, mm0

    ; ---- second row of the pair: left half in mm4/mm5, right in mm1/mm3 ----
    movq        mm4, QWORD PTR [rsi+rbx]
    movq        mm5, QWORD PTR [rdi+rdx]
    movq        mm1, QWORD PTR [rsi+rbx+8]
    movq        mm3, QWORD PTR [rdi+rdx+8]

    psadbw      mm4, mm5
    psadbw      mm1, mm3
    paddw       mm4, mm1                        ; SAD of the second row
    paddw       mm7, mm4

    ; step both pointers down two rows
    lea         rsi, [rsi+rbx*2]
    lea         rdi, [rdi+rdx*2]

    cmp         rsi, rcx
    jne         .sad16x8_row_pair

    movq        rax, mm7                        ; full-block result

.sad16x8_done:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
331 |
|
;-----------------------------------------------------------------------------
; void vp8_copy32xn_sse2(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *dst_ptr,
;     int            dst_stride,
;     int            height);
;
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
;
; Source rows are read with unaligned loads (movdqu). Destination rows are
; written with aligned stores (movdqa), so every destination row address
; must be 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one at a
; time. The four-row loop is skipped when height < 4, and nothing is copied
; when height <= 0. Without those guards a height of 1..3 would copy four
; rows and then run the single-row loop on a negative counter.
;
; Register roles:
;   rsi = src cursor    rax = src_stride
;   rdi = dst cursor    rdx = dst_stride
;   rcx = rows remaining (signed)
;-----------------------------------------------------------------------------
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; dst_ptr

    movsxd      rax, dword ptr arg(1)           ; src_stride
    movsxd      rdx, dword ptr arg(3)           ; dst_stride
    movsxd      rcx, dword ptr arg(4)           ; height

    ; Fewer than four rows requested: go straight to the tail handling.
    cmp         rcx, 4
    jl          .copy32xn_tail_check

.copy32xn_rows_x4:
    ; load four source rows (2 x 16 bytes each)
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + 16]
    movdqu      xmm2, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax + 16]

    lea         rsi, [rsi+rax*2]

    movdqu      xmm4, XMMWORD PTR [rsi]
    movdqu      xmm5, XMMWORD PTR [rsi + 16]
    movdqu      xmm6, XMMWORD PTR [rsi + rax]
    movdqu      xmm7, XMMWORD PTR [rsi + rax + 16]

    lea         rsi, [rsi+rax*2]

    ; store four destination rows
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    movdqa      XMMWORD PTR [rdi + rdx], xmm2
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

    lea         rdi, [rdi+rdx*2]

    movdqa      XMMWORD PTR [rdi], xmm4
    movdqa      XMMWORD PTR [rdi + 16], xmm5
    movdqa      XMMWORD PTR [rdi + rdx], xmm6
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

    lea         rdi, [rdi+rdx*2]

    sub         rcx, 4
    cmp         rcx, 4
    jge         .copy32xn_rows_x4

.copy32xn_tail_check:
    ; rcx is 0..3 after the x4 loop, or the original height when it was < 4.
    ; The signed test also rejects zero and negative heights.
    test        rcx, rcx
    jle         .copy32xn_done

.copy32xn_row:
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + 16]
    lea         rsi, [rsi+rax]

    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    lea         rdi, [rdi+rdx]

    sub         rcx, 1
    jne         .copy32xn_row

.copy32xn_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret