Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
13 extern sym(vp8_bilinear_filters_x86_8)
; Block dimension constant; not referenced in the portion of the file reviewed here.
15 %define BLOCK_HEIGHT_WIDTH 4
; NOTE(review): presumably the fixed-point scale of the filter taps
; (1 << VP8_FILTER_SHIFT); not referenced in the portion of the file reviewed here.
16 %define VP8_FILTER_WEIGHT 128
; Right-shift count applied by psraw after the rounding add in the bilinear routines.
17 %define VP8_FILTER_SHIFT 7
;/************************************************************************************
; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to the input
; pixels. Each loop iteration produces ONE row of 8 results: the 16 source bytes
; starting at src_ptr - 2 are loaded, six byte-shifted copies are widened to words and
; multiplied by the coefficient vectors at vp8_filter + 0/16/32/48/64/80, and the
; products are summed with saturating adds. The sum is rounded (+rd), shifted right by
; VP8_FILTER_SHIFT, clamped to 0..255 and stored as 8 words with an aligned 16-byte
; store. Per row, src_ptr advances by src_pixels_per_line bytes and output_ptr by
; output_width bytes. The row counter is decremented before it is tested, so
; output_height must be at least 1. pixel_step is not read by this routine.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short          *vp8_filter
;)
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rdx, arg(6)                 ; rdx = vp8_filter
    movsxd      rax, dword ptr arg(2)       ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)        ; r8  = output_width (output pitch, bytes)
%endif
    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

.h6_row:
    movq        xmm3, [rsi - 2]             ; src[-2 .. 5]
    movq        xmm1, [rsi + 6]             ; src[ 6 .. 13]

    prefetcht2  [rsi + rax - 2]             ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                  ; xmm1 = src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: words xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: words xx06 .. xx-1
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: words xx07 .. xx00
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: words xx08 .. xx01
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: words xx09 .. xx02
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: words xx0a .. xx03
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep the order tap 2,5,3,1,4,6, round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255
    punpcklbw   xmm4, xmm0                  ; back to 8 words

    movdqa      [rdi], xmm4                 ; aligned store of 8 words
    add         rsi, rax                    ; next source row

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(5)       ; output_width
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6_row                     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short          *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to the input
; pixels. Each loop iteration produces ONE row of 16 results, computed as two groups
; of 8: source bytes src[-2 .. 21] are loaded, and for each group six byte-shifted
; copies are widened to words, multiplied by the coefficient vectors at
; vp8_filter + 0/16/32/48/64/80 and summed with saturating adds. Each sum is rounded
; (+rd), shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and stored as 8 words;
; the two aligned 16-byte stores go to output_ptr and output_ptr + 16. Per row,
; src_ptr advances by src_pixels_per_line bytes and output_ptr by output_width bytes.
; output_height must be at least 1. pixel_step is not read by this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rdx, arg(6)                 ; rdx = vp8_filter
    movsxd      rax, dword ptr arg(2)       ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)        ; r8  = output_width (output pitch, bytes)
%endif

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

.h6_row16:
    movq        xmm3, [rsi - 2]             ; src[-2 .. 5]
    movq        xmm1, [rsi + 6]             ; src[ 6 .. 13]
    movq        xmm2, [rsi + 14]            ; src[14 .. 21]

    pslldq      xmm2, 8
    por         xmm2, xmm1                  ; xmm2 = src[ 6 .. 21] (second group)
    prefetcht2  [rsi + rax - 2]             ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                  ; xmm1 = src[-2 .. 13] (first group)

    ; ---- results 0..7 ----
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: words xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: words xx06 .. xx-1
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: words xx07 .. xx00
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: words xx08 .. xx01
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: words xx09 .. xx02
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: words xx0a .. xx03
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep the order tap 2,5,3,1,4,6, round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255
    punpcklbw   xmm4, xmm0                  ; back to 8 words

    movdqa      [rdi], xmm4                 ; results 0..7

    ; ---- results 8..15: same kernel on xmm2 = src[6 .. 21] ----
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1: src[6 .. 13]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: src[7 .. 14]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: src[8 .. 15]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[9 .. 16]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[10 .. 17]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[11 .. 18]
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255
    punpcklbw   xmm4, xmm0                  ; back to 8 words

    movdqa      [rdi + 16], xmm4            ; results 8..15

    add         rsi, rax                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(5)       ; output_width
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .h6_row16                   ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d8_v6_sse2
;(
;    short         *src_ptr,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   pixels_per_line,
;    unsigned int   pixel_step,
;    unsigned int   output_height,
;    unsigned int   output_width,
;    short         *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d8_v6_sse2 applies a 6 tap filter vertically to rows of 8
; words. The read pointer starts two rows (2 * pixels_per_line bytes) before src_ptr;
; each iteration loads six consecutive rows with aligned 16-byte loads, multiplies
; them by the coefficient vectors at vp8_filter + 0/16/32/48/64/80, sums them with
; saturating adds, rounds (+rd), shifts right by VP8_FILTER_SHIFT, clamps to 0..255
; and stores 8 bytes. The read pointer then sits one row further down and output_ptr
; advances by dst_pitch bytes. output_height must be at least 1. pixel_step and
; output_width are not read by this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, dword ptr arg(3)       ; rdx = pixels_per_line (row pitch, bytes)
    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rax, arg(7)                 ; rax = vp8_filter

    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi = src_ptr - 2 rows

    movsxd      rcx, dword ptr arg(5)       ; rcx = output_height (rows left)
    movdqa      xmm7, [GLOBAL(rd)]          ; xmm7 = rounding constant
    pxor        xmm0, xmm0                  ; zero, upper half for packuswb
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)        ; r8 = dst_pitch
%endif

.v6_row:
    ; Load the six rows. rsi moves down one row in the middle so that rows 4 and 6
    ; are reachable with the *2 and *4 scaled index forms.
    movdqa      xmm1, [rsi]                 ; row 1
    movdqa      xmm2, [rsi + rdx]           ; row 2
    movdqa      xmm3, [rsi + rdx * 2]       ; row 3
    movdqa      xmm5, [rsi + rdx * 4]       ; row 5
    add         rsi, rdx
    movdqa      xmm4, [rsi + rdx * 2]       ; row 4
    movdqa      xmm6, [rsi + rdx * 4]       ; row 6

    pmullw      xmm1, [rax]
    pmullw      xmm2, [rax + 16]
    pmullw      xmm3, [rax + 32]
    pmullw      xmm4, [rax + 48]
    pmullw      xmm5, [rax + 64]
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order sensitive: keep the order row 2,5,3,1,4,6, round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                  ; clamp to 0..255

    movq        [rdi], xmm2                 ; store 8 result bytes
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(2)       ; dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_row                     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_v6_sse2 applies a 6 tap filter vertically to rows of 16
; words. The read pointer starts two rows (2 * pixels_per_line bytes) before src_ptr;
; each iteration loads six consecutive rows as two aligned 16-byte halves, multiplies
; them by the coefficient vectors at vp8_filter + 0/16/32/48/64/80, sums them with
; saturating adds, rounds (+rd), shifts right by VP8_FILTER_SHIFT, clamps to 0..255
; and writes 16 bytes with an aligned store. The read pointer then sits one row
; further down and output_ptr advances by dst_pitch bytes. output_height must be at
; least 1. pixel_step and output_width are not read by this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, dword ptr arg(3)       ; rdx = pixels_per_line (row pitch, bytes)
    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rax, arg(7)                 ; rax = vp8_filter

    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi = src_ptr - 2 rows

    movsxd      rcx, dword ptr arg(5)       ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)        ; r8 = dst_pitch
%endif

.v6_row16:
    ; Rows are read in the order they are added: 2 5 3 1 4 6.
    ; xmm1 accumulates words 0..7, xmm2 accumulates words 8..15.
    movdqa      xmm1, [rsi + rdx]           ; row 2
    pmullw      xmm1, [rax + 16]
    movdqa      xmm2, [rsi + rdx + 16]
    pmullw      xmm2, [rax + 16]

    movdqa      xmm3, [rsi + rdx * 4]       ; row 5
    pmullw      xmm3, [rax + 64]
    movdqa      xmm4, [rsi + rdx * 4 + 16]
    pmullw      xmm4, [rax + 64]

    movdqa      xmm5, [rsi + rdx * 2]       ; row 3
    pmullw      xmm5, [rax + 32]
    movdqa      xmm6, [rsi + rdx * 2 + 16]
    pmullw      xmm6, [rax + 32]

    movdqa      xmm7, [rsi]                 ; row 1
    pmullw      xmm7, [rax]
    movdqa      xmm0, [rsi + 16]
    pmullw      xmm0, [rax]

    paddsw      xmm1, xmm3                  ; low half:  2 + 5 + 3 + 1
    paddsw      xmm1, xmm5
    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm4                  ; high half: 2 + 5 + 3 + 1
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm0

    add         rsi, rdx                    ; down one row; also the loop's row step

    movdqa      xmm3, [rsi + rdx * 2]       ; row 4
    pmullw      xmm3, [rax + 48]
    movdqa      xmm4, [rsi + rdx * 2 + 16]
    pmullw      xmm4, [rax + 48]

    movdqa      xmm5, [rsi + rdx * 4]       ; row 6
    pmullw      xmm5, [rax + 80]
    movdqa      xmm6, [rsi + rdx * 4 + 16]
    pmullw      xmm6, [rax + 80]

    movdqa      xmm7, [GLOBAL(rd)]          ; rounding constant (xmm7 held row data)
    pxor        xmm0, xmm0                  ; xmm0 is not read again in this iteration

    paddsw      xmm1, xmm3                  ; low half:  + 4 + 6 + round
    paddsw      xmm1, xmm5
    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm4                  ; high half: + 4 + 6 + round
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7

    psraw       xmm1, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm2                  ; clamp both halves into 16 bytes
    movdqa      [rdi], xmm1                 ; aligned store of the result row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(2)       ; dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_row16                   ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6 tap filter, 8 pixels per row, one row per loop iteration. The 16
; source bytes starting at src_ptr - 2 are loaded; six byte-shifted copies are
; widened to words, multiplied by the coefficient vectors at
; vp8_filter + 0/16/32/48/64/80 and summed with saturating adds. The sum is rounded
; (+rd), shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and written as 8 bytes.
; Per row, src_ptr advances by src_pixels_per_line and output_ptr by dst_pitch bytes.
; output_height must be at least 1.
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(2)                 ; rdi = output_ptr
    mov         rdx, arg(5)                 ; rdx = vp8_filter
    movsxd      rax, dword ptr arg(1)       ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)        ; r8  = dst_pitch
%endif
    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

.h6_only_row:
    movq        xmm3, [rsi - 2]             ; src[-2 .. 5]
    movq        xmm1, [rsi + 6]             ; src[ 6 .. 13]

    prefetcht2  [rsi + rax - 2]             ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                  ; xmm1 = src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: words xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: words xx06 .. xx-1
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: words xx07 .. xx00
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: words xx08 .. xx01
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: words xx09 .. xx02
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: words xx0a .. xx03
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep the order tap 2,5,3,1,4,6, round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255

    movq        [rdi], xmm4                 ; store 8 result bytes
    add         rsi, rax                    ; next source row

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)       ; dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6_only_row                ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6 tap filter, 16 pixels per row, one row per loop iteration, computed
; as two groups of 8 from source bytes src[-2 .. 21]. For each group six
; byte-shifted copies are widened to words, multiplied by the coefficient vectors at
; vp8_filter + 0/16/32/48/64/80 and summed with saturating adds; the sum is rounded
; (+rd), shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and written as 8 bytes
; (to output_ptr and output_ptr + 8). Per row, src_ptr advances by
; src_pixels_per_line and output_ptr by dst_pitch bytes. output_height must be at
; least 1.
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(2)                 ; rdi = output_ptr
    mov         rdx, arg(5)                 ; rdx = vp8_filter
    movsxd      rax, dword ptr arg(1)       ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)        ; r8  = dst_pitch
%endif

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

.h6_only_row16:
    movq        xmm3, [rsi - 2]             ; src[-2 .. 5]
    movq        xmm1, [rsi + 6]             ; src[ 6 .. 13]
    movq        xmm2, [rsi + 14]            ; src[14 .. 21]

    pslldq      xmm2, 8
    por         xmm2, xmm1                  ; xmm2 = src[ 6 .. 21] (second group)
    prefetcht2  [rsi + rax - 2]             ; hint: next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                  ; xmm1 = src[-2 .. 13] (first group)

    ; ---- results 0..7 ----
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: words xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: words xx06 .. xx-1
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: words xx07 .. xx00
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: words xx08 .. xx01
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: words xx09 .. xx02
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: words xx0a .. xx03
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx + 80]

    ; Saturating adds are order sensitive: keep the order tap 2,5,3,1,4,6, round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255 (lower 8 bytes)

    movq        [rdi], xmm4                 ; results 0..7

    ; ---- results 8..15: same kernel on xmm2 = src[6 .. 21] ----
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1: src[6 .. 13]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rdx]

    ; Tap 2: src[7 .. 14]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rdx + 16]

    ; Tap 3: src[8 .. 15]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx + 32]

    ; Tap 4: src[9 .. 16]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx + 48]

    ; Tap 5: src[10 .. 17]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx + 64]

    ; Tap 6: src[11 .. 18]
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm0                  ; clamp to 0..255 (higher 8 bytes)

    movq        [rdi + 8], xmm4             ; results 8..15

    add         rsi, rax                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)       ; dst_pitch
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .h6_only_row16              ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6 tap filter on byte input, 8 pixels per row. Each iteration reads 8 bytes
; from each of the six rows src_ptr + k * src_pixels_per_line (k = 0..5), widens them
; to words, multiplies by the coefficient vectors at vp8_filter + 0/16/32/48/64/80,
; sums with saturating adds, rounds (+rd), shifts right by VP8_FILTER_SHIFT, clamps
; to 0..255 and stores 8 bytes. The source pointer then moves down one row and
; output_ptr advances by dst_pitch bytes. Unlike vp8_filter_block1d8_v6_sse2 the
; source pointer is not moved back two rows first. output_height must be at least 1.
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(5)                 ; rax = vp8_filter
    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(2)                 ; rdi = output_ptr

    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line (row pitch)
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)

    movdqa      xmm7, [GLOBAL(rd)]          ; xmm7 = rounding constant
    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)        ; r8 = dst_pitch
%endif

.v6_only_row:
    ; rsi moves down one row in the middle so that rows 4 and 6 are reachable with
    ; the *2 and *4 scaled index forms.
    movq        xmm1, [rsi]                 ; row 1
    movq        xmm2, [rsi + rdx]           ; row 2
    movq        xmm3, [rsi + rdx * 2]       ; row 3
    movq        xmm5, [rsi + rdx * 4]       ; row 5
    add         rsi, rdx
    movq        xmm4, [rsi + rdx * 2]       ; row 4
    movq        xmm6, [rsi + rdx * 4]       ; row 6

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw   xmm5, xmm0
    punpcklbw   xmm6, xmm0

    pmullw      xmm1, [rax]
    pmullw      xmm2, [rax + 16]
    pmullw      xmm3, [rax + 32]
    pmullw      xmm4, [rax + 48]
    pmullw      xmm5, [rax + 64]
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order sensitive: keep the order row 2,5,3,1,4,6, round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7

    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm0                  ; clamp to 0..255

    movq        [rdi], xmm2                 ; store 8 result bytes
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)       ; dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_only_row                ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens 16 source bytes per row to 16 zero-extended words, with no filtering.
; Each iteration reads src[0 .. 15] and writes 32 bytes to output_ptr with two
; aligned 16-byte stores. Per row, src_ptr advances by src_pixels_per_line bytes and
; output_ptr by output_width bytes. output_height must be at least 1.
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr
    mov         rdi, arg(1)                 ; rdi = output_ptr

    movsxd      rax, dword ptr arg(2)       ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(3)       ; rcx = output_height (rows left)

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)        ; r8 = output_width (output pitch, bytes)
%endif

.unpack_row:
    movq        xmm1, [rsi]                 ; src[0 .. 7]
    punpcklbw   xmm1, xmm0                  ; -> 8 words
    movdqa      [rdi], xmm1

    movq        xmm3, [rsi + 8]             ; src[8 .. 15]
    punpcklbw   xmm3, xmm0                  ; -> 8 words
    movdqa      [rdi + 16], xmm3

    add         rsi, rax                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(4)       ; output_width
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .unpack_row                 ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
; 16x16 bilinear prediction. xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_x86_8 (index << 5): two vectors of 8 words at +0 and +16 that
; weight a pixel and its right (horizontal pass) or lower (vertical pass) neighbour.
; Every pass adds rd and shifts right by VP8_FILTER_SHIFT.
;   xoffset == 0                 -> vertical pass only
;   xoffset != 0, yoffset == 0   -> horizontal pass only
;   otherwise                    -> horizontal pass, repacked to bytes, then vertical
; Rows are written with aligned 16-byte stores; the loops end when the destination
; pointer reaches dst_ptr + 16 * dst_pitch.
extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    movsxd      rax, dword ptr arg(2)       ; xoffset

    test        rax, rax                    ; xoffset == 0: no horizontal pass
    jz          .vertical_only

    shl         rax, 5
    add         rax, rcx                    ; rax = HFilter

    mov         rdi, arg(4)                 ; rdi = dst_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch

    movdqa      xmm1, [rax]                 ; xmm1 = HFilter[0]
    movdqa      xmm2, [rax + 16]            ; xmm2 = HFilter[1]

    movsxd      rax, dword ptr arg(3)       ; yoffset

    test        rax, rax                    ; yoffset == 0: no vertical pass
    jz          .horizontal_only

    shl         rax, 5
    add         rax, rcx                    ; rax = VFilter

    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]        ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
%endif
    ; Horizontal pass on the first source row; the result is kept as bytes in xmm7.
    movdqu      xmm3, [rsi]                 ; pixels 0..15
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0                  ; pixels 0..7
    punpckhbw   xmm4, xmm0                  ; pixels 8..15

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]             ; pixels 1..16
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                  ; xmm7 = previous first-pass row (bytes)

    add         rsi, rdx                    ; next source row
.two_pass_row:
    ; Horizontal pass on the current source row.
    movdqu      xmm3, [rsi]                 ; pixels 0..15
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0                  ; pixels 0..7
    punpckhbw   xmm4, xmm0                  ; pixels 8..15

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]             ; pixels 1..16
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    ; Previous first-pass row weighted by VFilter[0].
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm7

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, [rax]
    pmullw      xmm6, [rax]

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                  ; current row becomes "previous" next time

    ; Current first-pass row weighted by VFilter[1], then combine.
    pmullw      xmm3, [rax + 16]
    pmullw      xmm4, [rax + 16]

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                 ; aligned store of the result row

    add         rsi, rdx                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(5)       ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .two_pass_row

    jmp         .epilog

.vertical_only:
    movsxd      rax, dword ptr arg(3)       ; yoffset
    shl         rax, 5
    add         rax, rcx                    ; rax = VFilter

    mov         rdi, arg(4)                 ; rdi = dst_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch

    movdqa      xmm1, [rax]                 ; xmm1 = VFilter[0]
    movdqa      xmm2, [rax + 16]            ; xmm2 = VFilter[1]

    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]        ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, dword ptr arg(1)       ; rax = src_pixels_per_line

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

    movdqu      xmm7, [rsi]                 ; xmm7 = first source row (bytes)

    add         rsi, rax                    ; next source row
.vertical_row:
    movdqu      xmm3, [rsi]                 ; current source row

    movdqa      xmm5, xmm7                  ; previous row, two copies
    movdqa      xmm6, xmm7

    movdqa      xmm4, xmm3
    movdqa      xmm7, xmm3                  ; current row becomes "previous" next time

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0
    punpcklbw   xmm3, xmm0                  ; pixels 0..7
    punpckhbw   xmm4, xmm0                  ; pixels 8..15

    pmullw      xmm5, xmm1
    pmullw      xmm6, xmm1
    pmullw      xmm3, xmm2
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                 ; aligned store of the result row

    add         rsi, rax                    ; next source row
    add         rdi, rdx                    ; dst_pitch
    cmp         rdi, rcx
    jne         .vertical_row

    jmp         .epilog

.horizontal_only:
    ; Entered with rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch and
    ; xmm1/xmm2 = HFilter[0]/HFilter[1].
    lea         rcx, [rdi + rdx * 8]
    lea         rcx, [rcx + rdx * 8]        ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, dword ptr arg(1)       ; rax = src_pixels_per_line
    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

.horizontal_row:
    movdqu      xmm3, [rsi]                 ; pixels 0..15
    movdqa      xmm4, xmm3

    punpcklbw   xmm3, xmm0                  ; pixels 0..7
    punpckhbw   xmm4, xmm0                  ; pixels 8..15

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1

    movdqu      xmm5, [rsi + 1]             ; pixels 1..16
    movdqa      xmm6, xmm5

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                 ; aligned store of the result row

    add         rsi, rax                    ; next source row
    add         rdi, rdx                    ; dst_pitch
    cmp         rdi, rcx
    jne         .horizontal_row

.epilog:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
; 8x8 bilinear prediction. Nine source rows are first copied (16 bytes each, with
; unaligned loads) into a 16-byte aligned 144-byte scratch buffer on the stack, then
; filtered from there. xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_x86_8 (index << 5): two vectors of 8 words at +0 and +16.
; Each row gets a horizontal pass (pixel and right neighbour, so bytes 0..8 of the
; row are used) followed by a vertical pass against the previous horizontally
; filtered row, which is carried as words in xmm7. Every pass adds rd and shifts
; right by VP8_FILTER_SHIFT. Eight rows of 8 bytes are written; the loop ends when
; the destination pointer reaches dst_ptr + 8 * dst_pitch.
;
; The scratch buffer is walked with rsi rather than by moving rsp, so rsp stays at
; the base of the buffer for the whole loop and the buffer is released explicitly.
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                    ; scratch: 9 rows * 16 bytes

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    mov         rsi, arg(0)                 ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; Read the 9 unaligned source rows up front and spill them to the aligned
    ; scratch buffer. This gives a big performance boost.
    movdqu      xmm0, [rsi]                 ; row 0
    lea         rax, [rdx + rdx * 2]        ; rax = 3 * pitch
    movdqu      xmm1, [rsi + rdx]           ; row 1
    movdqu      xmm2, [rsi + rdx * 2]       ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                 ; row 3
    movdqu      xmm4, [rsi + rdx]           ; row 4
    movdqu      xmm5, [rsi + rdx * 2]       ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                 ; row 6
    movdqu      xmm7, [rsi + rdx]           ; row 7

    movdqa      [rsp], xmm0

    movdqu      xmm0, [rsi + rdx * 2]       ; row 8 (xmm0 is free again)

    movdqa      [rsp + 16], xmm1
    movdqa      [rsp + 32], xmm2
    movdqa      [rsp + 48], xmm3
    movdqa      [rsp + 64], xmm4
    movdqa      [rsp + 80], xmm5
    movdqa      [rsp + 96], xmm6
    movdqa      [rsp + 112], xmm7
    movdqa      [rsp + 128], xmm0

    mov         rsi, rsp                    ; rsi = cursor into the scratch rows

    movsxd      rax, dword ptr arg(2)       ; xoffset
    shl         rax, 5
    add         rax, rcx                    ; rax = HFilter

    mov         rdi, arg(4)                 ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch

    movdqa      xmm1, [rax]                 ; xmm1 = HFilter[0]
    movdqa      xmm2, [rax + 16]            ; xmm2 = HFilter[1]

    movsxd      rax, dword ptr arg(3)       ; yoffset
    shl         rax, 5
    add         rax, rcx                    ; rax = VFilter

    lea         rcx, [rdi + rdx * 8]        ; rcx = dst_ptr + 8 * dst_pitch (end)

    movdqa      xmm5, [rax]                 ; xmm5 = VFilter[0]
    movdqa      xmm6, [rax + 16]            ; xmm6 = VFilter[1]

    pxor        xmm0, xmm0                  ; zero, used for byte -> word unpacking

    ; Horizontal pass on row 0; the result is kept as words in xmm7.
    movdqa      xmm3, [rsi]
    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    punpcklbw   xmm3, xmm0                  ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                  ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3                  ; xmm7 = previous first-pass row (words)
    add         rsi, 16                     ; next scratch row
.next_row8x8:
    ; Horizontal pass on the current row.
    movdqa      xmm3, [rsi]
    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    punpcklbw   xmm3, xmm0                  ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                  ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4
    pmullw      xmm7, xmm5                  ; previous row * VFilter[0]

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    movdqa      xmm4, xmm3                  ; keep the unweighted first-pass row

    ; Vertical pass: current row * VFilter[1] + previous row * VFilter[0].
    pmullw      xmm3, xmm6
    paddw       xmm3, xmm7

    movdqa      xmm7, xmm4                  ; current row becomes "previous" next time

    paddw       xmm3, [GLOBAL(rd)]          ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm0                  ; clamp to 0..255
    movq        [rdi], xmm3                 ; store 8 result bytes

    add         rsi, 16                     ; next scratch row
    add         rdi, rdx                    ; dst_pitch

    cmp         rdi, rcx
    jne         .next_row8x8

    add         rsp, 144                    ; release the scratch buffer
    pop         rsp                         ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; Rounding constant added before the right shift in every filter routine of this
; file: eight words of 0x40, loaded as one aligned 16-byte vector.
rd:
    dw 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40