media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:53768290199f
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
15 ;overflow.
16
;GET_FILTERS_4: build the constants used by the 4-pixel-wide kernels.
; Loads eight 16-bit taps from the pointer in arg(5) and stores them, each
; replicated four times and paired, into memory operands the caller must
; have %define'd (shown as low qword | high qword):
;   k0k1 = k0 x4 | k1 x4      k2k3 = k2 x4 | k3 x4
;   k5k4 = k5 x4 | k4 x4      k6k7 = k6 x4 | k7 x4
; Also stores krd (eight words of 64, added before the >>7 in APPLY_FILTER_4)
; and zero (all-zero vector used to unpack bytes to words).
; The pairing mirrors the row pairing done at the top of APPLY_FILTER_4.
; Clobbers: rdx, rcx, xmm0-xmm7.
17 %macro GET_FILTERS_4 0
18 mov rdx, arg(5) ;filter ptr
19 mov rcx, 0x0400040 ;two 16-bit words, each 0x40 (64)
20
21 movdqa xmm7, [rdx] ;load filters (aligned 16-byte load: 8 word taps)
22 pshuflw xmm0, xmm7, 0b ;k0 replicated across the low 4 words
23 pshuflw xmm1, xmm7, 01010101b ;k1
24 pshuflw xmm2, xmm7, 10101010b ;k2
25 pshuflw xmm3, xmm7, 11111111b ;k3
26 psrldq xmm7, 8 ;bring taps 4..7 down into the low qword
27 pshuflw xmm4, xmm7, 0b ;k4
28 pshuflw xmm5, xmm7, 01010101b ;k5
29 pshuflw xmm6, xmm7, 10101010b ;k6
30 pshuflw xmm7, xmm7, 11111111b ;k7
31
32 punpcklqdq xmm0, xmm1 ;k0 x4 | k1 x4
33 punpcklqdq xmm2, xmm3 ;k2 x4 | k3 x4
34 punpcklqdq xmm5, xmm4 ;k5 x4 | k4 x4 (k5 sits in the low half)
35 punpcklqdq xmm6, xmm7 ;k6 x4 | k7 x4
36
37 movdqa k0k1, xmm0 ;store the paired taps in the caller's slots
38 movdqa k2k3, xmm2
39 movdqa k5k4, xmm5
40 movdqa k6k7, xmm6
41
42 movq xmm6, rcx
43 pshufd xmm6, xmm6, 0 ;replicate the low dword: all 8 words = 64
44 movdqa krd, xmm6 ;rounding
45
46 pxor xmm7, xmm7
47 movdqa zero, xmm7 ;all-zero vector
48 %endm
49
;APPLY_FILTER_4 avg: filter one group of 4 pixels and write 4 bytes to [rdi].
; In:  low 4 bytes of xmm0..xmm7 = source pixels for taps 0..7;
;      k0k1, k2k3, k5k4, k6k7, krd and zero as stored by GET_FILTERS_4.
; %1:  nonzero = average (pavgb) the result with the 4 bytes already at
;      [rdi] before storing; 0 = store the filtered result directly.
; Two taps share each register (low 4 words | high 4 words). Only the low
; 4 words of xmm0 carry the final sum; the high words are don't-care.
; Products are accumulated with saturating adds in the order
; (0+6), (1+7), 2, 5, 3, 4, i.e. taps 3 and 4 last.
; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
50 %macro APPLY_FILTER_4 1
51 punpckldq xmm0, xmm1 ;two rows in one register: tap0 px | tap1 px
52 punpckldq xmm6, xmm7 ;tap6 px | tap7 px
53 punpckldq xmm2, xmm3 ;tap2 px | tap3 px
54 punpckldq xmm5, xmm4 ;tap5 px | tap4 px
55
56 punpcklbw xmm0, zero ;unpack to word
57 punpcklbw xmm6, zero
58 punpcklbw xmm2, zero
59 punpcklbw xmm5, zero
60
61 pmullw xmm0, k0k1 ;multiply the filter factors
62 pmullw xmm6, k6k7
63 pmullw xmm2, k2k3
64 pmullw xmm5, k5k4
65
66 paddsw xmm0, xmm6 ;sum: low = t0+t6, high = t1+t7
67 movdqa xmm1, xmm0
68 psrldq xmm1, 8 ;bring t1+t7 down to the low words
69 paddsw xmm0, xmm1 ;low = t0+t6+t1+t7
70 paddsw xmm0, xmm2 ;low += t2
71 psrldq xmm2, 8 ;xmm2 low = t3
72 paddsw xmm0, xmm5 ;low += t5
73 psrldq xmm5, 8 ;xmm5 low = t4
74 paddsw xmm0, xmm2 ;low += t3
75 paddsw xmm0, xmm5 ;low += t4
76
77 paddsw xmm0, krd ;rounding
78 psraw xmm0, 7 ;shift
79 packuswb xmm0, xmm0 ;pack to byte (unsigned saturation)
80
81 %if %1
82 movd xmm1, [rdi] ;current destination pixels
83 pavgb xmm0, xmm1 ;rounded byte average with the filtered result
84 %endif
85 movd [rdi], xmm0 ;store 4 output pixels
86 %endm
87
;GET_FILTERS: set up pointers and constants for the 8- and 16-wide kernels.
; Loads rsi = arg(0) (src_ptr) and rdi = arg(2) (output_ptr), then reads
; eight 16-bit taps from the pointer in arg(5) and stores each one replicated
; to all 8 words in the caller-%define'd memory operands k0..k7.
; Also stores krd (eight words of 64, added before the >>7 in APPLY_FILTER_8)
; and zero (all-zero vector used to unpack bytes to words).
; Clobbers: rdx, rcx, xmm0-xmm7 (rsi and rdi are outputs).
88 %macro GET_FILTERS 0
89 mov rdx, arg(5) ;filter ptr
90 mov rsi, arg(0) ;src_ptr
91 mov rdi, arg(2) ;output_ptr
92 mov rcx, 0x0400040 ;two 16-bit words, each 0x40 (64)
93
94 movdqa xmm7, [rdx] ;load filters (aligned 16-byte load: 8 word taps)
95 pshuflw xmm0, xmm7, 0b ;k0 replicated across the low 4 words
96 pshuflw xmm1, xmm7, 01010101b ;k1
97 pshuflw xmm2, xmm7, 10101010b ;k2
98 pshuflw xmm3, xmm7, 11111111b ;k3
99 pshufhw xmm4, xmm7, 0b ;k4 replicated across the high 4 words
100 pshufhw xmm5, xmm7, 01010101b ;k5
101 pshufhw xmm6, xmm7, 10101010b ;k6
102 pshufhw xmm7, xmm7, 11111111b ;k7
103
104 punpcklwd xmm0, xmm0 ;self-interleave low words: all 8 words = k0
105 punpcklwd xmm1, xmm1
106 punpcklwd xmm2, xmm2
107 punpcklwd xmm3, xmm3
108 punpckhwd xmm4, xmm4 ;self-interleave high words: all 8 words = k4
109 punpckhwd xmm5, xmm5
110 punpckhwd xmm6, xmm6
111 punpckhwd xmm7, xmm7
112
113 movdqa k0, xmm0 ;store filter factors on stack
114 movdqa k1, xmm1
115 movdqa k2, xmm2
116 movdqa k3, xmm3
117 movdqa k4, xmm4
118 movdqa k5, xmm5
119 movdqa k6, xmm6
120 movdqa k7, xmm7
121
122 movq xmm6, rcx
123 pshufd xmm6, xmm6, 0 ;replicate the low dword: all 8 words = 64
124 movdqa krd, xmm6 ;rounding
125
126 pxor xmm7, xmm7
127 movdqa zero, xmm7 ;all-zero vector
128 %endm
129
;LOAD_VERT_8 offset: load 8 bytes from each of 8 consecutive source rows.
; In:  rsi = address of row 0, rax = row pitch, rdx = 3 * rax
;      (the callers in this file set rdx with lea rdx, [rax + rax * 2]).
; %1:  byte offset added to every row address (0 or 8 in this file).
; Out: xmm0..xmm7 low qword = 8 bytes of rows 0..7 (xmmN holds row N).
; Side effect: rsi is advanced by one row (rsi += rax).
130 %macro LOAD_VERT_8 1
131 movq xmm0, [rsi + %1] ;0
132 movq xmm1, [rsi + rax + %1] ;1
133 movq xmm6, [rsi + rdx * 2 + %1] ;6
134 lea rsi, [rsi + rax] ;rsi now points at row 1; offsets below are relative to it
135 movq xmm7, [rsi + rdx * 2 + %1] ;7
136 movq xmm2, [rsi + rax + %1] ;2
137 movq xmm3, [rsi + rax * 2 + %1] ;3
138 movq xmm4, [rsi + rdx + %1] ;4
139 movq xmm5, [rsi + rax * 4 + %1] ;5
140 %endm
141
;APPLY_FILTER_8 avg, offset: filter 8 pixels and write 8 bytes to [rdi + %2].
; In:  low 8 bytes of xmm0..xmm7 = source pixels for taps 0..7;
;      k0..k7, krd and zero as stored by GET_FILTERS.
; %1:  nonzero = average (pavgb) the result with the 8 bytes already at
;      [rdi + %2] before storing; 0 = store the filtered result directly.
; %2:  byte offset from rdi for the destination (0 or 8 in this file).
; Products are accumulated with saturating adds in the order
; 0, 1, 6, 7, 2, 5, 3, 4, i.e. taps 3 and 4 last.
; Clobbers: xmm0-xmm7.
142 %macro APPLY_FILTER_8 2
143 punpcklbw xmm0, zero ;unpack the low 8 bytes of each input to words
144 punpcklbw xmm1, zero
145 punpcklbw xmm6, zero
146 punpcklbw xmm7, zero
147 punpcklbw xmm2, zero
148 punpcklbw xmm5, zero
149 punpcklbw xmm3, zero
150 punpcklbw xmm4, zero
151
152 pmullw xmm0, k0 ;multiply each input by its tap
153 pmullw xmm1, k1
154 pmullw xmm6, k6
155 pmullw xmm7, k7
156 pmullw xmm2, k2
157 pmullw xmm5, k5
158 pmullw xmm3, k3
159 pmullw xmm4, k4
160
161 paddsw xmm0, xmm1 ;saturating accumulation into xmm0
162 paddsw xmm0, xmm6
163 paddsw xmm0, xmm7
164 paddsw xmm0, xmm2
165 paddsw xmm0, xmm5
166 paddsw xmm0, xmm3 ;taps 3 and 4 are added last
167 paddsw xmm0, xmm4
168
169 paddsw xmm0, krd ;rounding
170 psraw xmm0, 7 ;shift
171 packuswb xmm0, xmm0 ;pack back to byte (unsigned saturation)
172 %if %1
173 movq xmm1, [rdi + %2] ;current destination pixels
174 pavgb xmm0, xmm1 ;rounded byte average with the filtered result
175 %endif
176 movq [rdi + %2], xmm0 ;store 8 output pixels
177 %endm
178
;Vertical 8-tap filter for a block 4 pixels wide.
;Each loop iteration reads 4 bytes from each of 8 consecutive source rows
;(src_ptr + 0..7 * src_pitch), filters them with APPLY_FILTER_4 and stores
;4 bytes at output_ptr. src_ptr then moves down one row and output_ptr
;moves by out_pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
179 ;void vp9_filter_block1d4_v8_sse2
180 ;(
181 ; unsigned char *src_ptr,
182 ; unsigned int src_pitch,
183 ; unsigned char *output_ptr,
184 ; unsigned int out_pitch,
185 ; unsigned int output_height,
186 ; short *filter
187 ;)
188 global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
189 sym(vp9_filter_block1d4_v8_sse2):
190 push rbp
191 mov rbp, rsp
192 SHADOW_ARGS_TO_STACK 6
193 SAVE_XMM 7
194 push rsi
195 push rdi
196 push rbx
197 ; end prolog
198
199 ALIGN_STACK 16, rax
200 sub rsp, 16 * 6 ;six 16-byte slots for the constants named below
201 %define k0k1 [rsp + 16 * 0]
202 %define k2k3 [rsp + 16 * 1]
203 %define k5k4 [rsp + 16 * 2]
204 %define k6k7 [rsp + 16 * 3]
205 %define krd [rsp + 16 * 4]
206 %define zero [rsp + 16 * 5]
207
;fill the slots above; this clobbers rdx and rcx, which are set again below
208 GET_FILTERS_4
209
210 mov rsi, arg(0) ;src_ptr
211 mov rdi, arg(2) ;output_ptr
212
213 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
214 movsxd rbx, DWORD PTR arg(3) ;out_pitch
215 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch
216 movsxd rcx, DWORD PTR arg(4) ;output_height
217
218 .loop:
219 movd xmm0, [rsi] ;load src: row 0
220 movd xmm1, [rsi + rax] ;1
221 movd xmm6, [rsi + rdx * 2] ;6
222 lea rsi, [rsi + rax] ;advance one row; loads below are relative to row 1
223 movd xmm7, [rsi + rdx * 2] ;7
224 movd xmm2, [rsi + rax] ;2
225 movd xmm3, [rsi + rax * 2] ;3
226 movd xmm4, [rsi + rdx] ;4
227 movd xmm5, [rsi + rax * 4] ;5
228
;filter and store 4 bytes at [rdi] (argument 0: no averaging)
229 APPLY_FILTER_4 0
230
231 lea rdi, [rdi + rbx] ;next output row
232 dec rcx
233 jnz .loop
234
235 add rsp, 16 * 6 ;release the constant slots
236 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
237 pop rbx
238 ; begin epilog
239 pop rdi
240 pop rsi
241 RESTORE_XMM
242 UNSHADOW_ARGS
243 pop rbp
244 ret
245
;Vertical 8-tap filter for a block 8 pixels wide.
;GET_FILTERS loads src_ptr into rsi and output_ptr into rdi. In each loop
;iteration LOAD_VERT_8 reads 8 bytes from each of 8 consecutive source rows
;(and advances rsi by one row), and APPLY_FILTER_8 stores 8 filtered bytes
;at [rdi]. rdi then moves by out_pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
246 ;void vp9_filter_block1d8_v8_sse2
247 ;(
248 ; unsigned char *src_ptr,
249 ; unsigned int src_pitch,
250 ; unsigned char *output_ptr,
251 ; unsigned int out_pitch,
252 ; unsigned int output_height,
253 ; short *filter
254 ;)
255 global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
256 sym(vp9_filter_block1d8_v8_sse2):
257 push rbp
258 mov rbp, rsp
259 SHADOW_ARGS_TO_STACK 6
260 SAVE_XMM 7
261 push rsi
262 push rdi
263 push rbx
264 ; end prolog
265
266 ALIGN_STACK 16, rax
267 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
268 %define k0 [rsp + 16 * 0]
269 %define k1 [rsp + 16 * 1]
270 %define k2 [rsp + 16 * 2]
271 %define k3 [rsp + 16 * 3]
272 %define k4 [rsp + 16 * 4]
273 %define k5 [rsp + 16 * 5]
274 %define k6 [rsp + 16 * 6]
275 %define k7 [rsp + 16 * 7]
276 %define krd [rsp + 16 * 8]
277 %define zero [rsp + 16 * 9]
278
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
279 GET_FILTERS
280
281 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
282 movsxd rbx, DWORD PTR arg(3) ;out_pitch
283 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch, as LOAD_VERT_8 expects
284 movsxd rcx, DWORD PTR arg(4) ;output_height
285
286 .loop:
;load rows 0..7 into xmm0..xmm7; rsi moves down one row
287 LOAD_VERT_8 0
;filter and store 8 bytes at [rdi + 0] (first argument 0: no averaging)
288 APPLY_FILTER_8 0, 0
289
290 lea rdi, [rdi + rbx] ;next output row
291 dec rcx
292 jnz .loop
293
294 add rsp, 16 * 10 ;release the constant slots
295 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
296 pop rbx
297 ; begin epilog
298 pop rdi
299 pop rsi
300 RESTORE_XMM
301 UNSHADOW_ARGS
302 pop rbp
303 ret
304
;Vertical 8-tap filter for a block 16 pixels wide, done as two 8-pixel halves.
;GET_FILTERS loads src_ptr into rsi and output_ptr into rdi. Each loop
;iteration filters columns 0..7 into [rdi] and columns 8..15 into [rdi + 8].
;LOAD_VERT_8 advances rsi by one row each time it runs, so the first advance
;is undone before the second half; after the second half rsi is one row down.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
305 ;void vp9_filter_block1d16_v8_sse2
306 ;(
307 ; unsigned char *src_ptr,
308 ; unsigned int src_pitch,
309 ; unsigned char *output_ptr,
310 ; unsigned int out_pitch,
311 ; unsigned int output_height,
312 ; short *filter
313 ;)
314 global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
315 sym(vp9_filter_block1d16_v8_sse2):
316 push rbp
317 mov rbp, rsp
318 SHADOW_ARGS_TO_STACK 6
319 SAVE_XMM 7
320 push rsi
321 push rdi
322 push rbx
323 ; end prolog
324
325 ALIGN_STACK 16, rax
326 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
327 %define k0 [rsp + 16 * 0]
328 %define k1 [rsp + 16 * 1]
329 %define k2 [rsp + 16 * 2]
330 %define k3 [rsp + 16 * 3]
331 %define k4 [rsp + 16 * 4]
332 %define k5 [rsp + 16 * 5]
333 %define k6 [rsp + 16 * 6]
334 %define k7 [rsp + 16 * 7]
335 %define krd [rsp + 16 * 8]
336 %define zero [rsp + 16 * 9]
337
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
338 GET_FILTERS
339
340 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
341 movsxd rbx, DWORD PTR arg(3) ;out_pitch
342 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch, as LOAD_VERT_8 expects
343 movsxd rcx, DWORD PTR arg(4) ;output_height
344
345 .loop:
;left half: columns 0..7
346 LOAD_VERT_8 0
347 APPLY_FILTER_8 0, 0
348 sub rsi, rax ;undo the row advance done by LOAD_VERT_8
349
;right half: columns 8..15 of the same rows; rsi ends one row down
350 LOAD_VERT_8 8
351 APPLY_FILTER_8 0, 8
352 add rdi, rbx ;next output row
353
354 dec rcx
355 jnz .loop
356
357 add rsp, 16 * 10 ;release the constant slots
358 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
359 pop rbx
360 ; begin epilog
361 pop rdi
362 pop rsi
363 RESTORE_XMM
364 UNSHADOW_ARGS
365 pop rbp
366 ret
367
;vp9_filter_block1d4_v8_avg_sse2
;Vertical 8-tap filter for a block 4 pixels wide, averaging into the output.
;Arguments as read below: arg(0) src_ptr, arg(1) source pitch,
;arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
;Each loop iteration reads 4 bytes from each of 8 consecutive source rows,
;filters them, averages the result (pavgb) with the 4 bytes already at
;output_ptr and stores it back there.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
368 global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
369 sym(vp9_filter_block1d4_v8_avg_sse2):
370 push rbp
371 mov rbp, rsp
372 SHADOW_ARGS_TO_STACK 6
373 SAVE_XMM 7
374 push rsi
375 push rdi
376 push rbx
377 ; end prolog
378
379 ALIGN_STACK 16, rax
380 sub rsp, 16 * 6 ;six 16-byte slots for the constants named below
381 %define k0k1 [rsp + 16 * 0]
382 %define k2k3 [rsp + 16 * 1]
383 %define k5k4 [rsp + 16 * 2]
384 %define k6k7 [rsp + 16 * 3]
385 %define krd [rsp + 16 * 4]
386 %define zero [rsp + 16 * 5]
387
;fill the slots above; this clobbers rdx and rcx, which are set again below
388 GET_FILTERS_4
389
390 mov rsi, arg(0) ;src_ptr
391 mov rdi, arg(2) ;output_ptr
392
393 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
394 movsxd rbx, DWORD PTR arg(3) ;out_pitch
395 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch
396 movsxd rcx, DWORD PTR arg(4) ;output_height
397
398 .loop:
399 movd xmm0, [rsi] ;load src: row 0
400 movd xmm1, [rsi + rax] ;1
401 movd xmm6, [rsi + rdx * 2] ;6
402 lea rsi, [rsi + rax] ;advance one row; loads below are relative to row 1
403 movd xmm7, [rsi + rdx * 2] ;7
404 movd xmm2, [rsi + rax] ;2
405 movd xmm3, [rsi + rax * 2] ;3
406 movd xmm4, [rsi + rdx] ;4
407 movd xmm5, [rsi + rax * 4] ;5
408
;filter, average with the existing bytes at [rdi], store 4 bytes
409 APPLY_FILTER_4 1
410
411 lea rdi, [rdi + rbx] ;next output row
412 dec rcx
413 jnz .loop
414
415 add rsp, 16 * 6 ;release the constant slots
416 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
417 pop rbx
418 ; begin epilog
419 pop rdi
420 pop rsi
421 RESTORE_XMM
422 UNSHADOW_ARGS
423 pop rbp
424 ret
425
;vp9_filter_block1d8_v8_avg_sse2
;Vertical 8-tap filter for a block 8 pixels wide, averaging into the output.
;Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
;pitch, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;arg(5) filter ptr.
;In each loop iteration LOAD_VERT_8 reads 8 bytes from each of 8 consecutive
;source rows (and advances rsi by one row), and APPLY_FILTER_8 averages the
;filtered result (pavgb) with the 8 bytes already at [rdi] and stores it.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
426 global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
427 sym(vp9_filter_block1d8_v8_avg_sse2):
428 push rbp
429 mov rbp, rsp
430 SHADOW_ARGS_TO_STACK 6
431 SAVE_XMM 7
432 push rsi
433 push rdi
434 push rbx
435 ; end prolog
436
437 ALIGN_STACK 16, rax
438 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
439 %define k0 [rsp + 16 * 0]
440 %define k1 [rsp + 16 * 1]
441 %define k2 [rsp + 16 * 2]
442 %define k3 [rsp + 16 * 3]
443 %define k4 [rsp + 16 * 4]
444 %define k5 [rsp + 16 * 5]
445 %define k6 [rsp + 16 * 6]
446 %define k7 [rsp + 16 * 7]
447 %define krd [rsp + 16 * 8]
448 %define zero [rsp + 16 * 9]
449
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
450 GET_FILTERS
451
452 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
453 movsxd rbx, DWORD PTR arg(3) ;out_pitch
454 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch, as LOAD_VERT_8 expects
455 movsxd rcx, DWORD PTR arg(4) ;output_height
456 .loop:
;load rows 0..7 into xmm0..xmm7; rsi moves down one row
457 LOAD_VERT_8 0
;filter, average with the existing bytes at [rdi + 0], store 8 bytes
458 APPLY_FILTER_8 1, 0
459
460 lea rdi, [rdi + rbx] ;next output row
461 dec rcx
462 jnz .loop
463
464 add rsp, 16 * 10 ;release the constant slots
465 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
466 pop rbx
467 ; begin epilog
468 pop rdi
469 pop rsi
470 RESTORE_XMM
471 UNSHADOW_ARGS
472 pop rbp
473 ret
474
;vp9_filter_block1d16_v8_avg_sse2
;Vertical 8-tap filter for a block 16 pixels wide, averaging into the output,
;done as two 8-pixel halves.
;Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
;pitch, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;arg(5) filter ptr.
;Each loop iteration filters columns 0..7 and 8..15 and averages each result
;(pavgb) with the bytes already at [rdi] and [rdi + 8] before storing.
;LOAD_VERT_8 advances rsi by one row each time it runs, so the first advance
;is undone before the second half; after the second half rsi is one row down.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
475 global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
476 sym(vp9_filter_block1d16_v8_avg_sse2):
477 push rbp
478 mov rbp, rsp
479 SHADOW_ARGS_TO_STACK 6
480 SAVE_XMM 7
481 push rsi
482 push rdi
483 push rbx
484 ; end prolog
485
486 ALIGN_STACK 16, rax
487 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
488 %define k0 [rsp + 16 * 0]
489 %define k1 [rsp + 16 * 1]
490 %define k2 [rsp + 16 * 2]
491 %define k3 [rsp + 16 * 3]
492 %define k4 [rsp + 16 * 4]
493 %define k5 [rsp + 16 * 5]
494 %define k6 [rsp + 16 * 6]
495 %define k7 [rsp + 16 * 7]
496 %define krd [rsp + 16 * 8]
497 %define zero [rsp + 16 * 9]
498
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
499 GET_FILTERS
500
501 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
502 movsxd rbx, DWORD PTR arg(3) ;out_pitch
503 lea rdx, [rax + rax * 2] ;rdx = 3 * source pitch, as LOAD_VERT_8 expects
504 movsxd rcx, DWORD PTR arg(4) ;output_height
505 .loop:
;left half: columns 0..7, averaged into [rdi + 0]
506 LOAD_VERT_8 0
507 APPLY_FILTER_8 1, 0
508 sub rsi, rax ;undo the row advance done by LOAD_VERT_8
509
;right half: columns 8..15 of the same rows, averaged into [rdi + 8]
510 LOAD_VERT_8 8
511 APPLY_FILTER_8 1, 8
512 add rdi, rbx ;next output row
513
514 dec rcx
515 jnz .loop
516
517 add rsp, 16 * 10 ;release the constant slots
518 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
519 pop rbx
520 ; begin epilog
521 pop rdi
522 pop rsi
523 RESTORE_XMM
524 UNSHADOW_ARGS
525 pop rbp
526 ret
527
;Horizontal 8-tap filter for a block 4 pixels wide.
;Each loop iteration loads 16 bytes starting at src_ptr - 3 and makes seven
;byte-shifted copies so that xmmN begins at source byte (src_ptr - 3 + N),
;the input for tap N. APPLY_FILTER_4 uses the low 4 bytes of each copy and
;stores 4 filtered bytes at output_ptr. Both pointers then advance by their
;pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
528 ;void vp9_filter_block1d4_h8_sse2
529 ;(
530 ; unsigned char *src_ptr,
531 ; unsigned int src_pixels_per_line,
532 ; unsigned char *output_ptr,
533 ; unsigned int output_pitch,
534 ; unsigned int output_height,
535 ; short *filter
536 ;)
537 global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
538 sym(vp9_filter_block1d4_h8_sse2):
539 push rbp
540 mov rbp, rsp
541 SHADOW_ARGS_TO_STACK 6
542 SAVE_XMM 7
543 push rsi
544 push rdi
545 ; end prolog
546
547 ALIGN_STACK 16, rax
548 sub rsp, 16 * 6 ;six 16-byte slots for the constants named below
549 %define k0k1 [rsp + 16 * 0]
550 %define k2k3 [rsp + 16 * 1]
551 %define k5k4 [rsp + 16 * 2]
552 %define k6k7 [rsp + 16 * 3]
553 %define krd [rsp + 16 * 4]
554 %define zero [rsp + 16 * 5]
555
;fill the slots above; this clobbers rdx and rcx, which are set again below
556 GET_FILTERS_4
557
558 mov rsi, arg(0) ;src_ptr
559 mov rdi, arg(2) ;output_ptr
560
561 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
562 movsxd rdx, DWORD PTR arg(3) ;out_pitch
563 movsxd rcx, DWORD PTR arg(4) ;output_height
564
565 .loop:
566 movdqu xmm0, [rsi - 3] ;load src: 16 bytes starting at src - 3 (unaligned)
567
568 movdqa xmm1, xmm0 ;one copy per remaining tap
569 movdqa xmm6, xmm0
570 movdqa xmm7, xmm0
571 movdqa xmm2, xmm0
572 movdqa xmm3, xmm0
573 movdqa xmm5, xmm0
574 movdqa xmm4, xmm0
575
576 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
577 psrldq xmm6, 6
578 psrldq xmm7, 7
579 psrldq xmm2, 2
580 psrldq xmm3, 3
581 psrldq xmm5, 5
582 psrldq xmm4, 4
583
;filter and store 4 bytes at [rdi] (argument 0: no averaging)
584 APPLY_FILTER_4 0
585
586 lea rsi, [rsi + rax] ;next source row
587 lea rdi, [rdi + rdx] ;next output row
588 dec rcx
589 jnz .loop
590
591 add rsp, 16 * 6 ;release the constant slots
592 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
593
594 ; begin epilog
595 pop rdi
596 pop rsi
597 RESTORE_XMM
598 UNSHADOW_ARGS
599 pop rbp
600 ret
601
;Horizontal 8-tap filter for a block 8 pixels wide.
;GET_FILTERS loads src_ptr into rsi and output_ptr into rdi. Each loop
;iteration loads 16 bytes starting at src_ptr - 3 and makes seven
;byte-shifted copies so that xmmN begins at source byte (src_ptr - 3 + N),
;the input for tap N. APPLY_FILTER_8 uses the low 8 bytes of each copy and
;stores 8 filtered bytes at [rdi]. Both pointers then advance by their pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
602 ;void vp9_filter_block1d8_h8_sse2
603 ;(
604 ; unsigned char *src_ptr,
605 ; unsigned int src_pixels_per_line,
606 ; unsigned char *output_ptr,
607 ; unsigned int output_pitch,
608 ; unsigned int output_height,
609 ; short *filter
610 ;)
611 global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
612 sym(vp9_filter_block1d8_h8_sse2):
613 push rbp
614 mov rbp, rsp
615 SHADOW_ARGS_TO_STACK 6
616 SAVE_XMM 7
617 push rsi
618 push rdi
619 ; end prolog
620
621 ALIGN_STACK 16, rax
622 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
623 %define k0 [rsp + 16 * 0]
624 %define k1 [rsp + 16 * 1]
625 %define k2 [rsp + 16 * 2]
626 %define k3 [rsp + 16 * 3]
627 %define k4 [rsp + 16 * 4]
628 %define k5 [rsp + 16 * 5]
629 %define k6 [rsp + 16 * 6]
630 %define k7 [rsp + 16 * 7]
631 %define krd [rsp + 16 * 8]
632 %define zero [rsp + 16 * 9]
633
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
634 GET_FILTERS
635
636 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
637 movsxd rdx, DWORD PTR arg(3) ;out_pitch
638 movsxd rcx, DWORD PTR arg(4) ;output_height
639
640 .loop:
641 movdqu xmm0, [rsi - 3] ;load src: 16 bytes starting at src - 3 (unaligned)
642
643 movdqa xmm1, xmm0 ;one copy per remaining tap
644 movdqa xmm6, xmm0
645 movdqa xmm7, xmm0
646 movdqa xmm2, xmm0
647 movdqa xmm5, xmm0
648 movdqa xmm3, xmm0
649 movdqa xmm4, xmm0
650
651 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
652 psrldq xmm6, 6
653 psrldq xmm7, 7
654 psrldq xmm2, 2
655 psrldq xmm5, 5
656 psrldq xmm3, 3
657 psrldq xmm4, 4
658
;filter and store 8 bytes at [rdi + 0] (first argument 0: no averaging)
659 APPLY_FILTER_8 0, 0
660
661 lea rsi, [rsi + rax] ;next source row
662 lea rdi, [rdi + rdx] ;next output row
663 dec rcx
664 jnz .loop
665
666 add rsp, 16 * 10 ;release the constant slots
667 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
668
669 ; begin epilog
670 pop rdi
671 pop rsi
672 RESTORE_XMM
673 UNSHADOW_ARGS
674 pop rbp
675 ret
676
;Horizontal 8-tap filter for a block 16 pixels wide, done as two 8-pixel
;halves per row.
;GET_FILTERS loads src_ptr into rsi and output_ptr into rdi. For the left
;half each iteration loads 16 bytes at src_ptr - 3; for the right half it
;loads 16 bytes at src_ptr + 5 (that is, 8 bytes further along). In both
;cases seven byte-shifted copies give xmmN the input for tap N, and
;APPLY_FILTER_8 stores 8 filtered bytes at [rdi] and [rdi + 8] respectively.
;Both pointers then advance by their pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
677 ;void vp9_filter_block1d16_h8_sse2
678 ;(
679 ; unsigned char *src_ptr,
680 ; unsigned int src_pixels_per_line,
681 ; unsigned char *output_ptr,
682 ; unsigned int output_pitch,
683 ; unsigned int output_height,
684 ; short *filter
685 ;)
686 global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
687 sym(vp9_filter_block1d16_h8_sse2):
688 push rbp
689 mov rbp, rsp
690 SHADOW_ARGS_TO_STACK 6
691 SAVE_XMM 7
692 push rsi
693 push rdi
694 ; end prolog
695
696 ALIGN_STACK 16, rax
697 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
698 %define k0 [rsp + 16 * 0]
699 %define k1 [rsp + 16 * 1]
700 %define k2 [rsp + 16 * 2]
701 %define k3 [rsp + 16 * 3]
702 %define k4 [rsp + 16 * 4]
703 %define k5 [rsp + 16 * 5]
704 %define k6 [rsp + 16 * 6]
705 %define k7 [rsp + 16 * 7]
706 %define krd [rsp + 16 * 8]
707 %define zero [rsp + 16 * 9]
708
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
709 GET_FILTERS
710
711 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
712 movsxd rdx, DWORD PTR arg(3) ;out_pitch
713 movsxd rcx, DWORD PTR arg(4) ;output_height
714
715 .loop:
716 movdqu xmm0, [rsi - 3] ;load src: left half, 16 bytes starting at src - 3
717
718 movdqa xmm1, xmm0 ;one copy per remaining tap
719 movdqa xmm6, xmm0
720 movdqa xmm7, xmm0
721 movdqa xmm2, xmm0
722 movdqa xmm5, xmm0
723 movdqa xmm3, xmm0
724 movdqa xmm4, xmm0
725
726 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
727 psrldq xmm6, 6
728 psrldq xmm7, 7
729 psrldq xmm2, 2
730 psrldq xmm5, 5
731 psrldq xmm3, 3
732 psrldq xmm4, 4
733
;filter and store output bytes 0..7 (first argument 0: no averaging)
734 APPLY_FILTER_8 0, 0
735
736 movdqu xmm0, [rsi + 5] ;load src: right half, 16 bytes starting at src + 8 - 3
737
738 movdqa xmm1, xmm0 ;one copy per remaining tap
739 movdqa xmm6, xmm0
740 movdqa xmm7, xmm0
741 movdqa xmm2, xmm0
742 movdqa xmm5, xmm0
743 movdqa xmm3, xmm0
744 movdqa xmm4, xmm0
745
746 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
747 psrldq xmm6, 6
748 psrldq xmm7, 7
749 psrldq xmm2, 2
750 psrldq xmm5, 5
751 psrldq xmm3, 3
752 psrldq xmm4, 4
753
;filter and store output bytes 8..15 at [rdi + 8]
754 APPLY_FILTER_8 0, 8
755
756 lea rsi, [rsi + rax] ;next source row
757 lea rdi, [rdi + rdx] ;next output row
758 dec rcx
759 jnz .loop
760
761 add rsp, 16 * 10 ;release the constant slots
762 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
763
764 ; begin epilog
765 pop rdi
766 pop rsi
767 RESTORE_XMM
768 UNSHADOW_ARGS
769 pop rbp
770 ret
771
;vp9_filter_block1d4_h8_avg_sse2
;Horizontal 8-tap filter for a block 4 pixels wide, averaging into the output.
;Arguments as read below: arg(0) src_ptr, arg(1) source pitch,
;arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr.
;Each loop iteration loads 16 bytes starting at src_ptr - 3 and makes seven
;byte-shifted copies so that xmmN begins at source byte (src_ptr - 3 + N),
;the input for tap N. APPLY_FILTER_4 uses the low 4 bytes of each copy,
;averages the filtered result (pavgb) with the 4 bytes already at output_ptr
;and stores it back there. Both pointers then advance by their pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
772 global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
773 sym(vp9_filter_block1d4_h8_avg_sse2):
774 push rbp
775 mov rbp, rsp
776 SHADOW_ARGS_TO_STACK 6
777 SAVE_XMM 7
778 push rsi
779 push rdi
780 ; end prolog
781
782 ALIGN_STACK 16, rax
783 sub rsp, 16 * 6 ;six 16-byte slots for the constants named below
784 %define k0k1 [rsp + 16 * 0]
785 %define k2k3 [rsp + 16 * 1]
786 %define k5k4 [rsp + 16 * 2]
787 %define k6k7 [rsp + 16 * 3]
788 %define krd [rsp + 16 * 4]
789 %define zero [rsp + 16 * 5]
790
;fill the slots above; this clobbers rdx and rcx, which are set again below
791 GET_FILTERS_4
792
793 mov rsi, arg(0) ;src_ptr
794 mov rdi, arg(2) ;output_ptr
795
796 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
797 movsxd rdx, DWORD PTR arg(3) ;out_pitch
798 movsxd rcx, DWORD PTR arg(4) ;output_height
799
800 .loop:
801 movdqu xmm0, [rsi - 3] ;load src: 16 bytes starting at src - 3 (unaligned)
802
803 movdqa xmm1, xmm0 ;one copy per remaining tap
804 movdqa xmm6, xmm0
805 movdqa xmm7, xmm0
806 movdqa xmm2, xmm0
807 movdqa xmm3, xmm0
808 movdqa xmm5, xmm0
809 movdqa xmm4, xmm0
810
811 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
812 psrldq xmm6, 6
813 psrldq xmm7, 7
814 psrldq xmm2, 2
815 psrldq xmm3, 3
816 psrldq xmm5, 5
817 psrldq xmm4, 4
818
;filter, average with the existing bytes at [rdi], store 4 bytes
819 APPLY_FILTER_4 1
820
821 lea rsi, [rsi + rax] ;next source row
822 lea rdi, [rdi + rdx] ;next output row
823 dec rcx
824 jnz .loop
825
826 add rsp, 16 * 6 ;release the constant slots
827 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
828
829 ; begin epilog
830 pop rdi
831 pop rsi
832 RESTORE_XMM
833 UNSHADOW_ARGS
834 pop rbp
835 ret
836
;vp9_filter_block1d8_h8_avg_sse2
;Horizontal 8-tap filter for a block 8 pixels wide, averaging into the output.
;Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
;pitch, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;arg(5) filter ptr.
;Each loop iteration loads 16 bytes starting at src_ptr - 3 and makes seven
;byte-shifted copies so that xmmN begins at source byte (src_ptr - 3 + N),
;the input for tap N. APPLY_FILTER_8 uses the low 8 bytes of each copy,
;averages the filtered result (pavgb) with the 8 bytes already at [rdi]
;and stores it back there. Both pointers then advance by their pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
837 global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
838 sym(vp9_filter_block1d8_h8_avg_sse2):
839 push rbp
840 mov rbp, rsp
841 SHADOW_ARGS_TO_STACK 6
842 SAVE_XMM 7
843 push rsi
844 push rdi
845 ; end prolog
846
847 ALIGN_STACK 16, rax
848 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
849 %define k0 [rsp + 16 * 0]
850 %define k1 [rsp + 16 * 1]
851 %define k2 [rsp + 16 * 2]
852 %define k3 [rsp + 16 * 3]
853 %define k4 [rsp + 16 * 4]
854 %define k5 [rsp + 16 * 5]
855 %define k6 [rsp + 16 * 6]
856 %define k7 [rsp + 16 * 7]
857 %define krd [rsp + 16 * 8]
858 %define zero [rsp + 16 * 9]
859
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
860 GET_FILTERS
861
862 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
863 movsxd rdx, DWORD PTR arg(3) ;out_pitch
864 movsxd rcx, DWORD PTR arg(4) ;output_height
865
866 .loop:
867 movdqu xmm0, [rsi - 3] ;load src: 16 bytes starting at src - 3 (unaligned)
868
869 movdqa xmm1, xmm0 ;one copy per remaining tap
870 movdqa xmm6, xmm0
871 movdqa xmm7, xmm0
872 movdqa xmm2, xmm0
873 movdqa xmm5, xmm0
874 movdqa xmm3, xmm0
875 movdqa xmm4, xmm0
876
877 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
878 psrldq xmm6, 6
879 psrldq xmm7, 7
880 psrldq xmm2, 2
881 psrldq xmm5, 5
882 psrldq xmm3, 3
883 psrldq xmm4, 4
884
;filter, average with the existing bytes at [rdi + 0], store 8 bytes
885 APPLY_FILTER_8 1, 0
886
887 lea rsi, [rsi + rax] ;next source row
888 lea rdi, [rdi + rdx] ;next output row
889 dec rcx
890 jnz .loop
891
892 add rsp, 16 * 10 ;release the constant slots
893 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
894
895 ; begin epilog
896 pop rdi
897 pop rsi
898 RESTORE_XMM
899 UNSHADOW_ARGS
900 pop rbp
901 ret
902
;vp9_filter_block1d16_h8_avg_sse2
;Horizontal 8-tap filter for a block 16 pixels wide, averaging into the
;output, done as two 8-pixel halves per row.
;Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
;pitch, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
;arg(5) filter ptr.
;For the left half each iteration loads 16 bytes at src_ptr - 3; for the
;right half it loads 16 bytes at src_ptr + 5 (that is, 8 bytes further along).
;In both cases seven byte-shifted copies give xmmN the input for tap N, and
;APPLY_FILTER_8 averages the filtered result (pavgb) with the bytes already
;at [rdi] / [rdi + 8] before storing. Both pointers then advance by their
;pitch.
;The loop tests its counter after the body, so output_height must be >= 1.
;arg(), sym() and the prolog/epilog macros are not defined in this file;
;presumably they come from vpx_ports/x86_abi_support.asm.
903 global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
904 sym(vp9_filter_block1d16_h8_avg_sse2):
905 push rbp
906 mov rbp, rsp
907 SHADOW_ARGS_TO_STACK 6
908 SAVE_XMM 7
909 push rsi
910 push rdi
911 ; end prolog
912
913 ALIGN_STACK 16, rax
914 sub rsp, 16 * 10 ;ten 16-byte slots for the constants named below
915 %define k0 [rsp + 16 * 0]
916 %define k1 [rsp + 16 * 1]
917 %define k2 [rsp + 16 * 2]
918 %define k3 [rsp + 16 * 3]
919 %define k4 [rsp + 16 * 4]
920 %define k5 [rsp + 16 * 5]
921 %define k6 [rsp + 16 * 6]
922 %define k7 [rsp + 16 * 7]
923 %define krd [rsp + 16 * 8]
924 %define zero [rsp + 16 * 9]
925
;sets rsi/rdi and fills the slots above; clobbers rdx and rcx (set below)
926 GET_FILTERS
927
928 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
929 movsxd rdx, DWORD PTR arg(3) ;out_pitch
930 movsxd rcx, DWORD PTR arg(4) ;output_height
931
932 .loop:
933 movdqu xmm0, [rsi - 3] ;load src: left half, 16 bytes starting at src - 3
934
935 movdqa xmm1, xmm0 ;one copy per remaining tap
936 movdqa xmm6, xmm0
937 movdqa xmm7, xmm0
938 movdqa xmm2, xmm0
939 movdqa xmm5, xmm0
940 movdqa xmm3, xmm0
941 movdqa xmm4, xmm0
942
943 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
944 psrldq xmm6, 6
945 psrldq xmm7, 7
946 psrldq xmm2, 2
947 psrldq xmm5, 5
948 psrldq xmm3, 3
949 psrldq xmm4, 4
950
;filter output bytes 0..7 and average them into [rdi + 0]
951 APPLY_FILTER_8 1, 0
952
953 movdqu xmm0, [rsi + 5] ;load src: right half, 16 bytes starting at src + 8 - 3
954
955 movdqa xmm1, xmm0 ;one copy per remaining tap
956 movdqa xmm6, xmm0
957 movdqa xmm7, xmm0
958 movdqa xmm2, xmm0
959 movdqa xmm5, xmm0
960 movdqa xmm3, xmm0
961 movdqa xmm4, xmm0
962
963 psrldq xmm1, 1 ;xmmN shifted right by N bytes: input for tap N
964 psrldq xmm6, 6
965 psrldq xmm7, 7
966 psrldq xmm2, 2
967 psrldq xmm5, 5
968 psrldq xmm3, 3
969 psrldq xmm4, 4
970
;filter output bytes 8..15 and average them into [rdi + 8]
971 APPLY_FILTER_8 1, 8
972
973 lea rsi, [rsi + rax] ;next source row
974 lea rdi, [rdi + rdx] ;next output row
975 dec rcx
976 jnz .loop
977
978 add rsp, 16 * 10 ;release the constant slots
979 pop rsp ;presumably restores the rsp saved by ALIGN_STACK (macro not shown)
980
981 ; begin epilog
982 pop rdi
983 pop rsi
984 RESTORE_XMM
985 UNSHADOW_ARGS
986 pop rbp
987 ret

mercurial