media/libvpx/vp8/common/x86/variance_impl_mmx.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:f5ecab1e4682
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Sum of squares of 256 consecutive 16-bit values starting at src_ptr.
; The loop runs 16 times and consumes 32 bytes (16 words) per pass.
; mm4 carries two dword partial sums that are added together at the end.
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; 8-byte scratch slot for mm4
    ; end prolog

    mov         rax, arg(0)                 ;src_ptr
    mov         rcx, 16                     ; passes remaining
    pxor        mm4, mm4                    ; clear both partial sums

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]

    ; pmaddwd x, x squares each word and adds adjacent products into dwords
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; dec updates ZF but leaves CF untouched, so branch on ZF only;
    ; the former "ja" also depended on the carry left by the add above.
    jnz         .NEXTROW

    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64
65
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8-byte-wide, 8-row block computes
;   *Sum = sum of (src - ref)        (accumulated in four word lanes, mm5)
;   *SSE = sum of (src - ref)^2      (accumulated in two dword lanes, mm7)
; and returns 0.
;
; Register roles:
;   rax = src row pointer      rcx = source_stride (sign-extended)
;   rbx = ref row pointer      rdx = recon_stride  (sign-extended)
;   mm6 = zero, used to widen bytes to words
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; scratch: [rsp]=SSE lanes, [rsp+8]=sum lanes
    ; end prolog

    pxor        mm5, mm5                    ; running sum of differences
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; running sum of squares

    mov         rax, arg(0)                 ;src_ptr
    mov         rbx, arg(2)                 ;ref_ptr
    movsxd      rcx, dword ptr arg(1)       ;source_stride
    movsxd      rdx, dword ptr arg(3)       ;recon_stride

    ; One copy of the row body per row of the block.
%rep 8
    movq        mm0, [rax]                  ; 8 src bytes
    movq        mm1, [rbx]                  ; 8 ref bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                    ; src bytes 0-3 -> words
    punpcklbw   mm1, mm6                    ; ref bytes 0-3 -> words
    punpckhbw   mm2, mm6                    ; src bytes 4-7 -> words
    punpckhbw   mm3, mm6                    ; ref bytes 4-7 -> words

    psubsw      mm0, mm1                    ; src - ref, low half
    psubsw      mm2, mm3                    ; src - ref, high half

    paddw       mm5, mm0
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                    ; squared differences, pair-summed
    pmaddwd     mm2, mm2
    paddd       mm7, mm0
    paddd       mm7, mm2

    add         rax, rcx                    ; next src row
    add         rbx, rdx                    ; next ref row
%endrep

    ; Spill both accumulators and reduce their lanes with scalar code.
    movq        QWORD PTR [rsp+8], mm5
    movq        QWORD PTR [rsp], mm7

    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                    ;XSum

    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum

    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310
311
312
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4-byte-wide, 4-row block computes
;   *Sum = sum of (src - ref)        (four word lanes in mm5)
;   *SSE = sum of (src - ref)^2      (two dword lanes in mm7)
; and returns 0.
;
; Each row is fetched with movd so exactly the 4 bytes that are used are
; read; an 8-byte movq would touch 4 bytes beyond every row.
;
; Register roles:
;   rax = src row pointer      rcx = source_stride (sign-extended)
;   rbx = ref row pointer      rdx = recon_stride  (sign-extended)
;   mm6 = zero, used to widen bytes to words
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; scratch: [rsp]=SSE lanes, [rsp+8]=sum lanes
    ; end prolog

    pxor        mm5, mm5                    ; running sum of differences
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; running sum of squares

    mov         rax, arg(0)                 ;src_ptr
    mov         rbx, arg(2)                 ;ref_ptr
    movsxd      rcx, dword ptr arg(1)       ;source_stride
    movsxd      rdx, dword ptr arg(3)       ;recon_stride

    ; One copy of the row body per row of the block.  The pointer advance
    ; after the last row is unused but harmless (no load follows it).
%rep 4
    movd        mm0, [rax]                  ; 4 src bytes
    movd        mm1, [rbx]                  ; 4 ref bytes
    punpcklbw   mm0, mm6                    ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                    ; src - ref
    paddw       mm5, mm0
    pmaddwd     mm0, mm0                    ; squared differences, pair-summed
    paddd       mm7, mm0
    add         rax, rcx                    ; next src row
    add         rbx, rdx                    ; next ref row
%endrep

    ; Spill both accumulators and reduce their lanes with scalar code.
    movq        QWORD PTR [rsp+8], mm5
    movq        QWORD PTR [rsp], mm7

    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                    ;XSum

    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum

    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
425
426
427
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of squared (src - ref) differences over a 4-byte-wide,
; 4-row block.  The total ends up in the low dword of the return register.
;
; Register roles:
;   rax = src row pointer      rcx = source_stride (sign-extended)
;   rbx = ref row pointer      rdx = recon_stride  (sign-extended)
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword partial sums of squares
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; running sum of squares

    mov         rax, arg(0)                 ;src_ptr
    mov         rbx, arg(2)                 ;ref_ptr
    movsxd      rcx, dword ptr arg(1)       ;source_stride
    movsxd      rdx, dword ptr arg(3)       ;recon_stride

    ; One copy of the row body per row of the block.
%rep 4
    movd        mm0, [rax]                  ; 4 src bytes
    movd        mm1, [rbx]                  ; 4 ref bytes
    punpcklbw   mm0, mm6                    ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                    ; src - ref
    pmaddwd     mm0, mm0                    ; squared differences, pair-summed
    paddd       mm7, mm0
    add         rax, rcx                    ; next src row
    add         rbx, rdx                    ; next ref row
%endrep

    ; Fold the high dword lane onto the low one and hand the result back.
    movq        mm0, mm7
    psrlq       mm7, 32
    paddd       mm0, mm7
    movq        rax, mm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
511
; Right shift applied after each filter pass (rounding term is mmx_bi_rd).
%define mmx_filter_shift            7

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters a 4-wide block of ref_ptr with two-tap horizontal and vertical
; filters, subtracts 4 rows of src_ptr from the result and writes the sum
; of the differences to *sum and the sum of their squares to *sumsquared.
;
; Each filter supplies two groups of four words: taps at [p] and [p+8].
;
; Register roles:
;   rax = HFilter   rdx = VFilter   rsi = ref row   rdi = src row
;   rcx = rows left
;   mm0 = zero      mm5 = previous horizontally filtered row
;   mm6 = word-lane sum of differences
;   mm7 = dword-lane sum of squared differences
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6
    pxor        mm7, mm7
    pxor        mm0, mm0

    mov         rax, arg(4)                 ;HFilter
    mov         rdx, arg(5)                 ;VFilter
    mov         rsi, arg(0)                 ;ref_ptr
    mov         rdi, arg(2)                 ;src_ptr
    mov         rcx, 4

    ; Horizontal pass on the first ref row; it seeds mm5 for the loop.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]
    punpcklbw   mm1, mm0
    punpcklbw   mm3, mm0
    pmullw      mm1, [rax]
    pmullw      mm3, [rax+8]
    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift
    movq        mm5, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    add         rsi, r8
%endif

.bil4x4_row:
    ; Horizontal pass on the current ref row -> mm1.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]
    punpcklbw   mm1, mm0
    punpcklbw   mm3, mm0
    pmullw      mm1, [rax]
    pmullw      mm3, [rax+8]
    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    ; Vertical pass: previous row (mm5) with tap 0, current row with tap 1.
    movq        mm3, mm5
    movq        mm5, mm1                    ; current row becomes "previous"
    pmullw      mm3, [rdx]
    pmullw      mm1, [rdx+8]
    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    ; Difference against the source row, then accumulate.
    movd        mm3, [rdi]
    punpcklbw   mm3, mm0
    psubw       mm1, mm3
    paddw       mm6, mm1
    pmaddwd     mm1, mm1
    paddd       mm7, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .bil4x4_row

    ; Reduce mm6: place each word in the top half of a dword, add the
    ; four dwords, then arithmetic-shift right by 16 to sign-extend.
    pxor        mm2, mm2
    pxor        mm3, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6
    paddd       mm2, mm3
    movq        mm6, mm2
    psrlq       mm6, 32
    paddd       mm2, mm6
    psrad       mm2, 16

    ; Reduce mm7: add its two dword lanes.
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(6)                 ;sum
    mov         rsi, arg(7)                 ;sumsquared
    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
654
655
656
657
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters an 8-wide block of ref_ptr with two-tap horizontal and vertical
; filters, subtracts Height rows of src_ptr from the result and writes the
; sum of the differences to *sum and the sum of their squares to
; *sumsquared.
;
; Each filter supplies two groups of four words: taps at [p] and [p+8].
; Every 8-byte row is processed as a low half (bytes 0-3) and a high half
; (bytes 4-7), each widened to four words.
;
; Register roles:
;   rax = HFilter   rdx = VFilter   rsi = ref row   rdi = src row
;   rcx = rows left (Height)
;   mm0 = zero      mm5 = previous horizontally filtered row, packed to bytes
;   mm6 = word-lane sum of differences
;   mm7 = dword-lane sum of squared differences
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6
    pxor        mm7, mm7
    pxor        mm0, mm0

    mov         rax, arg(5)                 ;HFilter
    mov         rdx, arg(6)                 ;VFilter
    mov         rsi, arg(0)                 ;ref_ptr
    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height

    ; Horizontal pass on the first ref row; it seeds mm5 for the loop.
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]
    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0
    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]
    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    movq        mm5, mm1
    packuswb    mm5, mm2                    ; keep the row as 8 bytes

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    add         rsi, r8
%endif

.bil_row:
    ; Horizontal pass on the current ref row -> mm1 (low), mm2 (high).
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]
    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0
    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]
    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; Unpack the previous row, then store the current one in its place.
    movq        mm3, mm5
    movq        mm4, mm5
    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    movq        mm5, mm1
    packuswb    mm5, mm2

    ; Vertical pass: previous row with tap 0, current row with tap 1.
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]
    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; Difference against the source row, then accumulate.
    movq        mm3, [rdi]
    movq        mm4, mm3
    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3
    psubw       mm2, mm4

    paddw       mm6, mm1
    paddw       mm6, mm2

    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    paddd       mm7, mm1
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .bil_row

    ; Reduce mm6: place each word in the top half of a dword, add the
    ; four dwords, then arithmetic-shift right by 16 to sign-extend.
    pxor        mm2, mm2
    pxor        mm3, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6
    paddd       mm2, mm3
    movq        mm6, mm2
    psrlq       mm6, 32
    paddd       mm2, mm6
    psrad       mm2, 16

    ; Reduce mm7: add its two dword lanes.
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(7)                 ;sum
    mov         rsi, arg(8)                 ;sumsquared
    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
845
846
SECTION_RODATA
; Rounding term added before the mmx_filter_shift right shift.
; C equivalent: short mmx_bi_rd[4] = { 64, 64, 64, 64 };
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64

mercurial