|
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
|
10 |
|
11 |
|
%include "vpx_ports/x86_abi_support.asm"
|
13 |
|
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;-----------------------------------------------------------------------
; Sum of squares over a 16x16 block of 16-bit values (256 shorts).
; In:   arg(0) = src_ptr, 16-byte aligned (movdqa loads require it)
; Out:  eax    = sum(src_ptr[i]^2) for i in [0, 256)
; Note: fits in 32 bits; the caller reads only the low dword of rax.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations x 32 shorts = 256
    pxor        xmm4, xmm4              ; running dword sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, pair-sum to dwords
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40               ; advance 32 shorts
    dec         rcx
    jnz         .NEXTROW                ; was "ja": that relied on CF being
                                        ; left clear by the add above, since
                                        ; dec does not write CF; jnz expresses
                                        ; the intent without the hidden flag
                                        ; dependence

    ; Horizontal reduction of the four dword sums in xmm4.
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; total in eax (upper dword is junk,
                                        ; ignored by the unsigned-int caller)

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
70 |
|
71 |
|
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char *  src_ptr,
;    int              source_stride,
;    unsigned char *  ref_ptr,
;    int              recon_stride,
;    unsigned int  *  SSE,
;    int           *  Sum
;)
;-----------------------------------------------------------------------
; 16x16 variance kernel.  Over 16 rows of 16 bytes accumulates:
;   *Sum = sum(src - ref)      (signed)
;   *SSE = sum((src - ref)^2)
; Unaligned loads (movdqu) are used, so neither pointer needs alignment.
; Word accumulation of diffs cannot overflow: each of the 8 word lanes
; sums 32 values in [-255, 255] (max magnitude 8160 < 2^15).
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx                     ; callee-saved; used as prefetch base
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch the first 8 rows of both blocks.
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi] ; 16 src bytes
    movdqu      xmm2, XMMWORD PTR [rdi] ; 16 ref bytes

    prefetcht0  [rsi+rax*8]             ; stay 8 rows ahead of the loads
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0              ; low 8 src bytes  -> words
    punpckhbw   xmm3, xmm0              ; high 8 src bytes -> words

    punpcklbw   xmm2, xmm0              ; low 8 ref bytes  -> words
    punpckhbw   xmm4, xmm0              ; high 8 ref bytes -> words

    psubw       xmm1, xmm2              ; src - ref, low half
    psubw       xmm3, xmm4              ; src - ref, high half

    paddw       xmm7, xmm1              ; accumulate diffs (8 word lanes)
    pmaddwd     xmm1, xmm1              ; square, pair-sum to dwords

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1              ; accumulate sse (4 dword lanes)
    paddd       xmm6, xmm3

    add         rsi, rax                ; next src row
    add         rdi, rdx                ; next ref row

    sub         rcx, 1
    jnz         .var16loop

    ; Reduction.  xmm7 = 8 signed word sums of diffs; xmm6 = 4 dword sse
    ; sums.  The words are sign-extended by interleaving them into the
    ; HIGH half of each dword (punpck*wd with the word in the high part)
    ; and then arithmetic-shifting right by 16.
    movdqa      xmm1, xmm6
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; low 4 words -> high halves of dwords

    punpckhwd   xmm5, xmm7              ; high 4 words likewise
    psrad       xmm5, 16                ; arithmetic shift = sign-extend

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; 4 dword diff sums

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0              ; spread sse dwords for pairwise add

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0              ; spread diff dwords likewise

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8                 ; fold upper qword onto lower
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; total diff sum in low dword
    paddd       xmm1, xmm2              ; total sse in low dword

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
212 |
|
213 |
|
214 |
|
215 |
|
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char *  src_ptr,
;    int              source_stride,
;    unsigned char *  ref_ptr,
;    int              recon_stride,
;    unsigned int  *  SSE,
;    int           *  Sum
;)
;-----------------------------------------------------------------------
; 8x8 variance kernel, fully unrolled over the 8 rows:
;   *Sum = sum(src - ref)      (signed)
;   *SSE = sum((src - ref)^2)
; Diffs are accumulated as words in xmm7; squared diffs (pmaddwd) as
; dwords in xmm1.  psubsw cannot actually saturate here: both operands
; are zero-extended bytes, so diffs lie in [-255, 255].
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; Row 0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0              ; src bytes -> words
    punpcklbw   xmm2, xmm0              ; ref bytes -> words

    psubsw      xmm1, xmm2              ; src - ref
    paddw       xmm7, xmm1              ; accumulate diff

    pmaddwd     xmm1, xmm1              ; xmm1 starts the sse accumulator

    ; Row 1
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Row 2
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Advance base pointers by 2 rows; rows 3 and 4 follow.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR [rsi + rax]         ; row 3
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR [rsi + rax *2]      ; row 4
    movq        xmm3, QWORD PTR [rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Advance again; rows 5 and 6.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]         ; row 5
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR [rsi + rax *2]      ; row 6
    movq        xmm3, QWORD PTR [rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Advance once more; row 7.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]         ; row 7
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Reduction.  xmm7 = 8 word diff sums, xmm1 = 4 dword sse sums.
    ; The word lanes are zero-extended into dwords and then added with
    ; paddw: each low word accumulates modulo 2^16, which is recovered
    ; exactly by the final movsx because |sum| <= 8*8*255 = 16320 < 2^15.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0              ; low 4 words -> zero-extended dwords

    punpckhwd   xmm7, xmm0              ; high 4 words likewise
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7              ; modular word-wise add of diff sums
    punpckldq   xmm1, xmm0              ; spread sse dwords for pairwise add

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8                 ; fold upper qword onto lower
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2              ; total sse in low dword

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit diff total

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
400 |
|
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;-----------------------------------------------------------------------
; Variance helper for the half-pel horizontal AND vertical offset case,
; 8 pixels wide, Height rows.  The prediction for each pixel is the
; pavgb of the horizontally-averaged current row and the horizontally-
; averaged next row.  Writes the signed diff total to *sum and the sum
; of squared diffs to *sumsquared.
; Fix: added EMMS before the epilog — the reduction below uses MMX
; registers (movdq2q / paddd mm*), and the SysV AMD64 ABI requires the
; x87 state to be clean at function exit; without EMMS any later x87
; code in the caller would see a full register stack.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; diff accumulator (words)
    pxor        xmm7, xmm7              ; sse accumulator (dwords)
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    ; (removed dead "movsxd rax, dword ptr arg(1)": rax was never read;
    ;  both ABI paths use arg(1)/r8 directly)

    pxor        xmm0, xmm0              ; zero for unpack

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s8
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
    pavgb       xmm5, xmm3              ; horizontal average of line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; horizontal average of line i+1

    pavgb       xmm5, xmm1              ; vertical average of the two lines
    punpcklbw   xmm5, xmm0              ; widen prediction to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d8
    punpcklbw   xmm3, xmm0              ; widen source to words

    psubw       xmm5, xmm3              ; diff = pred - src
    paddw       xmm6, xmm5              ; accumulate diffs
    pmaddwd     xmm5, xmm5              ; square, pair-sum to dwords
    paddd       xmm7, xmm5              ; accumulate squared diffs

    movdqa      xmm5, xmm1              ; keep this row's average for next pass

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; Reduce via MMX: fold the two qword halves, then sum the word lanes
    ; (sign-extended through the punpck/psrad-16 trick) and dword lanes.
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                ; words -> high halves of dwords
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; sign-extended diff total
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; sse total in low dword

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    emms                                ; clear MMX/x87 state before return

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
522 |
|
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;-----------------------------------------------------------------------
; Variance helper for the half-pel vertical offset case, 8 pixels wide,
; Height rows: the prediction is pavgb of each pixel and the pixel one
; row below.  Writes the signed diff total to *sum and the sum of
; squared diffs to *sumsquared.
; Fix: added EMMS before the epilog — the reduction uses MMX registers
; and the SysV AMD64 ABI requires clean x87 state at function exit.
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; diff accumulator (words)
    pxor        xmm7, xmm7              ; sse accumulator (dwords)
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line (next-row offset)

    pxor        xmm0, xmm0              ; zero for unpack
.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]     ; current row: s0..s7
    movq        xmm3, QWORD PTR [rsi+rax] ; row below:   t0..t7

    pavgb       xmm5, xmm3              ; vertical average
    punpcklbw   xmm5, xmm0              ; widen prediction to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0..d7
    punpcklbw   xmm3, xmm0              ; widen source to words

    psubw       xmm5, xmm3              ; diff = pred - src
    paddw       xmm6, xmm5              ; accumulate diffs
    pmaddwd     xmm5, xmm5              ; square, pair-sum to dwords
    paddd       xmm7, xmm5              ; accumulate squared diffs

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    ; Reduce via MMX: fold qword halves, then word lanes (sign-extended
    ; through punpck/psrad-16) and dword lanes.
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                ; words -> high halves of dwords
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; sign-extended diff total
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; sse total in low dword

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    emms                                ; clear MMX/x87 state before return

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
629 |
|
630 |
|
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;-----------------------------------------------------------------------
; Variance helper for the half-pel horizontal offset case, 8 pixels
; wide, Height rows: the prediction is pavgb of each pixel and its
; right neighbour.  Writes the signed diff total to *sum and the sum
; of squared diffs to *sumsquared.
; Fix: added EMMS before the epilog — the reduction uses MMX registers
; and the SysV AMD64 ABI requires clean x87 state at function exit.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; diff accumulator (words)
    pxor        xmm7, xmm7              ; sse accumulator (dwords)
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero for unpack
.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s8
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9

    pavgb       xmm5, xmm3              ; horizontal average
    punpcklbw   xmm5, xmm0              ; widen prediction to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0..d7
    punpcklbw   xmm3, xmm0              ; widen source to words

    psubw       xmm5, xmm3              ; diff = pred - src
    paddw       xmm6, xmm5              ; accumulate diffs
    pmaddwd     xmm5, xmm5              ; square, pair-sum to dwords
    paddd       xmm7, xmm5              ; accumulate squared diffs

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    ; Reduce via MMX: fold qword halves, then word lanes (sign-extended
    ; through punpck/psrad-16) and dword lanes.
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                ; words -> high halves of dwords
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; sign-extended diff total
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; sse total in low dword

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    emms                                ; clear MMX/x87 state before return

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret