|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
14 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid |
|
15 ;overflow. |
|
16 |
|
; GET_FILTERS_4
; Builds the on-stack constants consumed by APPLY_FILTER_4.
;   Reads   : arg(5) = pointer to eight 16-bit filter taps. The taps are
;             fetched with movdqa, so the pointer must be 16-byte aligned.
;   Writes  : k0k1, k2k3, k5k4, k6k7 - every tap replicated into four words,
;             two taps per 128-bit slot (first-named tap in the low half);
;             krd  - eight words of 64, the rounding term for the ">> 7";
;             zero - an all-zero vector.
;             The invoking function must have defined these six names.
;   Clobbers: rcx, rdx, xmm0-xmm7.
%macro GET_FILTERS_4 0
    mov         rcx, 0x0400040              ;two words of 64 (rounding)
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ; taps 0 and 1 -> k0k1
    pshuflw     xmm0, xmm7, 0x00            ;k0 in the four low words
    pshuflw     xmm1, xmm7, 0x55            ;k1
    punpcklqdq  xmm0, xmm1                  ;low half k0, high half k1
    movdqa      k0k1, xmm0

    ; taps 2 and 3 -> k2k3
    pshuflw     xmm2, xmm7, 0xAA            ;k2
    pshuflw     xmm3, xmm7, 0xFF            ;k3
    punpcklqdq  xmm2, xmm3
    movdqa      k2k3, xmm2

    ; bring taps 4..7 down into the low quadword
    psrldq      xmm7, 8

    ; taps 5 and 4 -> k5k4 (tap 5 sits in the low half)
    pshuflw     xmm4, xmm7, 0x00            ;k4
    pshuflw     xmm5, xmm7, 0x55            ;k5
    punpcklqdq  xmm5, xmm4
    movdqa      k5k4, xmm5

    ; taps 6 and 7 -> k6k7
    pshuflw     xmm6, xmm7, 0xAA            ;k6
    pshuflw     xmm7, xmm7, 0xFF            ;k7
    punpcklqdq  xmm6, xmm7
    movdqa      k6k7, xmm6

    ; rounding constant, replicated to every word
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    ; zero vector used for byte -> word unpacking
    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
|
49 |
|
; APPLY_FILTER_4 avg
; Filters one row of 4 pixels and writes 4 bytes to [rdi].
;   In      : xmm0..xmm7 - low dword of xmmN holds the 4 source bytes that
;             are multiplied by tap N;
;             k0k1/k2k3/k5k4/k6k7/krd/zero as set up by GET_FILTERS_4;
;             rdi = destination.
;   avg     : 0 = store the result, non-zero = pavgb the result with the
;             4 bytes already at [rdi] before storing.
;   Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
; The products are summed with saturating adds; taps 3 and 4 are added last
; (see the overflow note at the top of the file), so the order of the
; paddsw instructions must not be changed.
%macro APPLY_FILTER_4 1
    ; taps 0/1: pair the rows, widen to words, multiply
    punpckldq   xmm0, xmm1                  ;two rows in one register
    punpcklbw   xmm0, zero                  ;unpack to word
    pmullw      xmm0, k0k1                  ;multiply the filter factors

    ; taps 6/7
    punpckldq   xmm6, xmm7
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7

    ; taps 2/3
    punpckldq   xmm2, xmm3
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3

    ; taps 5/4
    punpckldq   xmm5, xmm4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4

    ; low words: tap0 + tap6, high words: tap1 + tap7; fold high onto low
    paddsw      xmm0, xmm6                  ;sum
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1

    ; add taps 2 and 5 (low halves), then taps 3 and 4 (high halves moved down)
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    psrldq      xmm2, 8
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm
|
87 |
|
; GET_FILTERS
; Common setup for the 8- and 16-pixel-wide filters.
;   Reads   : arg(0) = src_ptr, arg(2) = output_ptr,
;             arg(5) = pointer to eight 16-bit filter taps (fetched with
;             movdqa, so it must be 16-byte aligned).
;   Writes  : rsi = src_ptr, rdi = output_ptr;
;             k0..k7 - each tap replicated into all eight words;
;             krd    - eight words of 64, the rounding term for the ">> 7";
;             zero   - an all-zero vector.
;             The invoking function must have defined these ten names.
;   Clobbers: rcx, rdx, xmm0-xmm7.
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)
    mov         rdx, arg(5)                 ;filter ptr

    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ; taps 0..3 live in the low quadword: replicate, widen to 8 words, store
    pshuflw     xmm0, xmm7, 0x00            ;k0
    punpcklwd   xmm0, xmm0
    movdqa      k0, xmm0                    ;store filter factors on stack

    pshuflw     xmm1, xmm7, 0x55            ;k1
    punpcklwd   xmm1, xmm1
    movdqa      k1, xmm1

    pshuflw     xmm2, xmm7, 0xAA            ;k2
    punpcklwd   xmm2, xmm2
    movdqa      k2, xmm2

    pshuflw     xmm3, xmm7, 0xFF            ;k3
    punpcklwd   xmm3, xmm3
    movdqa      k3, xmm3

    ; taps 4..7 live in the high quadword
    pshufhw     xmm4, xmm7, 0x00            ;k4
    punpckhwd   xmm4, xmm4
    movdqa      k4, xmm4

    pshufhw     xmm5, xmm7, 0x55            ;k5
    punpckhwd   xmm5, xmm5
    movdqa      k5, xmm5

    pshufhw     xmm6, xmm7, 0xAA            ;k6
    punpckhwd   xmm6, xmm6
    movdqa      k6, xmm6

    pshufhw     xmm7, xmm7, 0xFF            ;k7 (overwrites the tap source)
    punpckhwd   xmm7, xmm7
    movdqa      k7, xmm7

    ; rounding constant, replicated to every word
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6                   ;rounding

    ; zero vector used for byte -> word unpacking
    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
|
129 |
|
; LOAD_VERT_8 offset
; Loads 8 bytes from each of eight consecutive source rows into xmm0..xmm7
; (row N -> xmmN), starting "offset" bytes into the row.
;   In      : rsi = address of row 0, rax = source pitch, rdx = 3 * pitch.
;   Out     : xmm0..xmm7 = rows 0..7; rsi is advanced by ONE row as a side
;             effect, so rsi ends up pointing at row 1.
%macro LOAD_VERT_8 1
    ; addressed relative to row 0
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6

    lea         rsi, [rsi + rax]            ;step to row 1

    ; addressed relative to row 1
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
%endm
|
141 |
|
; APPLY_FILTER_8 avg, offset
; Filters 8 pixels and writes 8 bytes to [rdi + offset].
;   In      : xmm0..xmm7 - low 8 bytes of xmmN are the source bytes that are
;             multiplied by tap N; k0..k7/krd/zero as set up by GET_FILTERS;
;             rdi = destination row.
;   avg     : 0 = store the result, non-zero = pavgb the result with the
;             8 bytes already at [rdi + offset] before storing.
;   Clobbers: xmm0-xmm7.
; The products are summed with saturating adds; taps 3 and 4 are added last
; (see the overflow note at the top of the file), so the order of the
; paddsw instructions must not be changed.
%macro APPLY_FILTER_8 2
    ; widen each input to words and scale it by its tap
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0

    punpcklbw   xmm1, zero
    pmullw      xmm1, k1

    punpcklbw   xmm6, zero
    pmullw      xmm6, k6

    punpcklbw   xmm7, zero
    pmullw      xmm7, k7

    punpcklbw   xmm2, zero
    pmullw      xmm2, k2

    punpcklbw   xmm5, zero
    pmullw      xmm5, k5

    punpcklbw   xmm3, zero
    pmullw      xmm3, k3

    punpcklbw   xmm4, zero
    pmullw      xmm4, k4

    ; accumulate: outer taps first, taps 3 and 4 last
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm
|
178 |
|
;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 4 pixels wide.
; For every output row the 4 bytes at src_ptr + N * src_pitch (N = 0..7) are
; multiplied by tap N, summed with saturating adds, rounded (+64), shifted
; right by 7 and packed to unsigned bytes; the 4 result bytes are stored at
; output_ptr. Both pointers then advance by one row of their own pitch.
; filter must be 16-byte aligned (GET_FILTERS_4 reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 6 aligned 16-byte slots for the constants built by GET_FILTERS_4
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx, rdx, xmm0-xmm7

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi, [rsi + rax]            ;advance one source row
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5

    APPLY_FILTER_4 0                        ;store 4 bytes at [rdi]

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
245 |
|
;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 8 pixels wide.
; For every output row the 8 bytes at src_ptr + N * src_pitch (N = 0..7) are
; multiplied by tap N, summed with saturating adds, rounded (+64), shifted
; right by 7 and packed to unsigned bytes; the 8 result bytes are stored at
; output_ptr. Both pointers then advance by one row of their own pitch.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    LOAD_VERT_8 0                           ;rows 0..7 -> xmm0..xmm7, rsi += pitch
    APPLY_FILTER_8 0, 0                     ;store 8 bytes at [rdi]

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
304 |
|
;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 16 pixels wide, processed as two 8-pixel halves.
; For every output row and each half, the 8 bytes at
; src_ptr + N * src_pitch (N = 0..7) are multiplied by tap N, summed with
; saturating adds, rounded (+64), shifted right by 7 and packed to unsigned
; bytes. Both pointers then advance by one row of their own pitch.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    ; left half: pixels 0..7
    LOAD_VERT_8 0                           ;rsi += pitch as a side effect
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo that step for the second half

    ; right half: pixels 8..15
    LOAD_VERT_8 8                           ;leaves rsi on the next source row
    APPLY_FILTER_8 0, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
367 |
|
;void vp9_filter_block1d4_v8_avg_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 4 pixels wide, averaging variant.
; Computes the same filtered row as vp9_filter_block1d4_v8_sse2, then
; combines it with the 4 bytes already at output_ptr using pavgb (rounded
; byte average) before storing.
; filter must be 16-byte aligned (GET_FILTERS_4 reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 6 aligned 16-byte slots for the constants built by GET_FILTERS_4
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx, rdx, xmm0-xmm7

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi, [rsi + rax]            ;advance one source row
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5

    APPLY_FILTER_4 1                        ;average with [rdi], store 4 bytes

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
425 |
|
;void vp9_filter_block1d8_v8_avg_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 8 pixels wide, averaging variant.
; Computes the same filtered row as vp9_filter_block1d8_v8_sse2, then
; combines it with the 8 bytes already at output_ptr using pavgb (rounded
; byte average) before storing.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    LOAD_VERT_8 0                           ;rows 0..7 -> xmm0..xmm7, rsi += pitch
    APPLY_FILTER_8 1, 0                     ;average with [rdi], store 8 bytes

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
474 |
|
;void vp9_filter_block1d16_v8_avg_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 16 pixels wide, averaging variant, processed as two
; 8-pixel halves. Computes the same filtered row as
; vp9_filter_block1d16_v8_sse2, then combines each half with the bytes
; already at output_ptr using pavgb (rounded byte average) before storing.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row 0     rdi = current output row
;   rax = src_pitch                rdx = 3 * src_pitch
;   rbx = out_pitch                rcx = rows remaining
global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    ; left half: pixels 0..7
    LOAD_VERT_8 0                           ;rsi += pitch as a side effect
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;undo that step for the second half

    ; right half: pixels 8..15
    LOAD_VERT_8 8                           ;leaves rsi on the next source row
    APPLY_FILTER_8 1, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
527 |
|
;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 4 pixels wide.
; For every row, output pixel x is built from source bytes x-3 .. x+4: byte
; x-3+N is multiplied by tap N, the products are summed with saturating
; adds, rounded (+64), shifted right by 7 and packed to unsigned bytes.
; Four result bytes are stored per row; both pointers then advance by one
; row of their own pitch.
; filter must be 16-byte aligned (GET_FILTERS_4 reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; NOTE(review): each row is fetched with a 16-byte movdqu from src_ptr - 3
; although only the first 11 of those bytes are consumed, so the source
; buffer must be readable up to src_ptr + 12 on every row.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 6 aligned 16-byte slots for the constants built by GET_FILTERS_4
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx, rdx, xmm0-xmm7

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 0                        ;store 4 bytes at [rdi]

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
601 |
|
;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 8 pixels wide.
; For every row, output pixel x is built from source bytes x-3 .. x+4: byte
; x-3+N is multiplied by tap N, the products are summed with saturating
; adds, rounded (+64), shifted right by 7 and packed to unsigned bytes.
; Eight result bytes are stored per row; both pointers then advance by one
; row of their own pitch.
; Each row is fetched with one 16-byte movdqu from src_ptr - 3.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0                     ;store 8 bytes at [rdi]

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
676 |
|
;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 16 pixels wide, processed as two 8-pixel halves.
; For every row, output pixel x is built from source bytes x-3 .. x+4: byte
; x-3+N is multiplied by tap N, the products are summed with saturating
; adds, rounded (+64), shifted right by 7 and packed to unsigned bytes.
; Sixteen result bytes are stored per row; both pointers then advance by
; one row of their own pitch.
; Each row is fetched with two 16-byte movdqu loads, from src_ptr - 3 and
; src_ptr + 5.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    ; ---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0                     ;store 8 bytes at [rdi]

    ; ---- right half: output pixels 8..15 (8 - 3 = 5) ----
    movdqu      xmm0, [rsi + 5]             ;load src

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 8                     ;store 8 bytes at [rdi + 8]

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
771 |
|
;void vp9_filter_block1d4_h8_avg_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 4 pixels wide, averaging variant.
; Computes the same filtered row as vp9_filter_block1d4_h8_sse2, then
; combines it with the 4 bytes already at output_ptr using pavgb (rounded
; byte average) before storing.
; filter must be 16-byte aligned (GET_FILTERS_4 reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; NOTE(review): each row is fetched with a 16-byte movdqu from src_ptr - 3
; although only the first 11 of those bytes are consumed, so the source
; buffer must be readable up to src_ptr + 12 on every row.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 6 aligned 16-byte slots for the constants built by GET_FILTERS_4
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4                           ;clobbers rcx, rdx, xmm0-xmm7

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 1                        ;average with [rdi], store 4 bytes

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
836 |
|
;void vp9_filter_block1d8_h8_avg_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 8 pixels wide, averaging variant.
; Computes the same filtered row as vp9_filter_block1d8_h8_sse2, then
; combines it with the 8 bytes already at output_ptr using pavgb (rounded
; byte average) before storing.
; Each row is fetched with one 16-byte movdqu from src_ptr - 3.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0                     ;average with [rdi], store 8 bytes

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
902 |
|
;void vp9_filter_block1d16_h8_avg_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 16 pixels wide, averaging variant, processed as
; two 8-pixel halves. Computes the same filtered row as
; vp9_filter_block1d16_h8_sse2, then combines each half with the bytes
; already at output_ptr using pavgb (rounded byte average) before storing.
; Each row is fetched with two 16-byte movdqu loads, from src_ptr - 3 and
; src_ptr + 5.
; filter must be 16-byte aligned (GET_FILTERS reads it with movdqa).
; output_height == 0 returns without touching memory.
;
; Register roles inside the loop:
;   rsi = current source row       rdi = current output row
;   rax = src_pixels_per_line      rdx = output_pitch
;   rcx = rows remaining
global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; 10 aligned 16-byte slots for the constants built by GET_FILTERS
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also sets rsi = src_ptr, rdi = output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ; The loop is "dec / jnz": entering it with a zero count would wrap the
    ; counter and run far past both buffers, so bail out first.
    test        rcx, rcx
    jz          .done

.loop:
    ; ---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]             ;load src

    ; xmmN = the row shifted down by N bytes, i.e. the input for tap N
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0                     ;average with [rdi], store 8 bytes

    ; ---- right half: output pixels 8..15 (8 - 3 = 5) ----
    movdqu      xmm0, [rsi + 5]             ;load src

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 8                     ;average with [rdi + 8], store 8 bytes

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret