|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
; Right-shift applied (with psraw) to every bilinear filter result in this
; file, after the rounding term xmm_bi_rd has been added.
%define xmm_filter_shift            7
|
15 |
|
16 |
|
;-----------------------------------------------------------------------
; void vp8_filter_block2d_bil_var_ssse3
; (
;     unsigned char *ref_ptr,
;     int            ref_pixels_per_line,
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     unsigned int   Height,
;     int            xoffset,
;     int            yoffset,
;     int           *sum,
;     unsigned int  *sumsquared
; )
;
; Purpose:
;   For each of Height rows, takes 16 pixels of ref_ptr, optionally
;   bilinear-filters them horizontally (xoffset) and/or vertically
;   (yoffset), subtracts the corresponding 16 pixels of src_ptr and
;   accumulates
;       *sum        = sum of (filtered_ref - src)
;       *sumsquared = sum of (filtered_ref - src)^2
;
; Paths (selected by the offsets):
;   xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   xoffset != 0, yoffset == 0 : horizontal pass only   (fp_only)
;   xoffset == 0, yoffset != 0 : vertical pass only     (sp_only)
;   xoffset == 0, yoffset == 0 : plain difference       (full_pixel)
;
; Note: The filter coefficient at offset=0 is 128. Since the second operand
; of pmaddubsw is treated as signed bytes, 128 cannot be represented, so the
; zero-offset cases are handled separately and never index the table.
;
; Memory read:
;   A horizontal pass reads 17 bytes per reference row ([rsi] and [rsi+1]).
;   A vertical pass reads Height+1 reference rows.
;
; Register roles inside the loops:
;   rsi  = current reference row        rdi  = current source row
;   rcx  = rows remaining               xmm6 = 8 x int16 running sum
;   rax  = horizontal filter row (or reference stride when no H pass)
;   rdx  = vertical filter row   (or a stride when no V pass)
;   r8/r9 (64-bit builds only) = reference / source stride
;   xmm7 = 4 x int32 running sum of squares
;
; Height == 0 stores 0 to both outputs without touching the pixel buffers.
;
; NOTE(review): offsets are used as (offset << 4) into an 8-row table, so
; they are presumably expected to be in 0..7 - confirm against callers.
; NOTE(review): xmm6 accumulates in 16-bit lanes; each row adds two
; differences (each within +/-255) per lane, so more than 64 rows could
; wrap. Callers presumably pass small block heights - confirm.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                 ; sum accumulator
        pxor            xmm7,           xmm7                 ; sum-of-squares accumulator

        ; The loops below are do-while style (sub/jnz); a zero row count
        ; would wrap the counter, so report zero sums straight away.
        cmp             dword ptr arg(4), 0                  ;Height
        je              .filter_block2d_bil_variance

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        jz              .filter_block2d_bil_var_ssse3_sp_only

        shl             rax,            4                    ; 16 bytes per filter row
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        jz              .filter_block2d_bil_var_ssse3_fp_only

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; Prime xmm0 with the horizontally filtered first reference row;
        ; the loop pairs it with the next filtered row for the vertical pass.
        movdqu          xmm0,           XMMWORD PTR [rsi]
        movdqu          xmm1,           XMMWORD PTR [rsi+1]
        movdqa          xmm2,           xmm0

        punpcklbw       xmm0,           xmm1                 ; interleave p[i], p[i+1]
        punpckhbw       xmm2,           xmm1
        pmaddubsw       xmm0,           [rax]
        pmaddubsw       xmm2,           [rax]

        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm0,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        packuswb        xmm0,           xmm2                 ; back to 16 bytes

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
        lea             rsi,            [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
        ; horizontal pass on the next reference row -> xmm1 (16 bytes)
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift
        packuswb        xmm1,           xmm3

        ; vertical pass between previous row (xmm0) and this row (xmm1)
        movdqa          xmm2,           xmm0
        movdqa          xmm0,           xmm1                 ; becomes "previous" next time
        movdqa          xmm3,           xmm2

        punpcklbw       xmm2,           xmm1
        punpckhbw       xmm3,           xmm1
        pmaddubsw       xmm2,           [rdx]
        pmaddubsw       xmm3,           [rdx]

        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm2,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; widen 16 source pixels to words
        movq            xmm1,           QWORD PTR [rdi]
        pxor            xmm4,           xmm4
        punpcklbw       xmm1,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        ; diff = filtered - src; accumulate diff and diff^2
        psubw           xmm2,           xmm1
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm2
        paddw           xmm6,           xmm3
        pmaddwd         xmm2,           xmm2
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm2
        paddd           xmm7,           xmm3

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rsi,            [rsi + r8]
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_var_ssse3_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0
        jz              .filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        movdqu          xmm1,           XMMWORD PTR [rsi]    ; first reference row

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

        lea             rsi,            [rsi + rax]

.filter_block2d_bil_sp_only_loop:
        ; xmm1 = previous reference row, xmm3 = current reference row
        movdqu          xmm3,           XMMWORD PTR [rsi]
        movdqa          xmm2,           xmm1
        movdqa          xmm0,           xmm3                 ; keep unfiltered row for next pass

        punpcklbw       xmm1,           xmm3
        punpckhbw       xmm2,           xmm3
        pmaddubsw       xmm1,           [rdx]
        pmaddubsw       xmm2,           [rdx]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        pxor            xmm4,           xmm4
        punpcklbw       xmm3,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        movdqa          xmm1,           xmm0
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_sp_only_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ; zero for byte->word widening

.filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0
        movq            xmm2,           QWORD PTR [rsi+8]
        punpcklbw       xmm2,           xmm0

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        movq            xmm4,           QWORD PTR [rdi+8]
        punpcklbw       xmm4,           xmm0

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm4
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
        sub             rcx,            1
        jnz             .filter_block2d_bil_full_pixel_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
        ; rax still holds HFilter from the entry code
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        movq            xmm2,           QWORD PTR [rdi]      ; 8-byte load, as in the other paths
        pxor            xmm4,           xmm4
        punpcklbw       xmm2,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm2
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm3
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm3

        lea             rsi,            [rsi + rdx]
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_fp_only_loop
        ; falls through into the reduction below

.filter_block2d_bil_variance:
        ; Horizontal reduction: xmm6 (8 x int16) -> one int32 in xmm0,
        ;                       xmm7 (4 x int32) -> one int32 in xmm6.
        pxor            xmm0,           xmm0
        pxor            xmm1,           xmm1
        pxor            xmm5,           xmm5

        punpcklwd       xmm0,           xmm6                 ; words into the high half of each dword
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16                   ; arithmetic shift = sign extension
        psrad           xmm1,           16
        paddd           xmm0,           xmm1
        movdqa          xmm1,           xmm0

        movdqa          xmm6,           xmm7
        punpckldq       xmm6,           xmm5
        punpckhdq       xmm7,           xmm5
        paddd           xmm6,           xmm7

        punpckldq       xmm0,           xmm5
        punpckhdq       xmm1,           xmm5
        paddd           xmm0,           xmm1

        movdqa          xmm7,           xmm6
        movdqa          xmm1,           xmm0

        psrldq          xmm7,           8
        psrldq          xmm1,           8

        paddd           xmm6,           xmm7
        paddd           xmm0,           xmm1

        mov             rsi,            arg(7) ;[Sum]
        mov             rdi,            arg(8) ;[SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
349 |
|
350 |
|
SECTION_RODATA

; Rounding term added to each 16-bit filter result before it is shifted
; right by xmm_filter_shift: eight words of 64.
align 16
xmm_bi_rd:
    times 8 dw 64

; Bilinear tap table: eight 16-byte rows, one per sub-pixel offset.
; Row k holds the byte pair (128 - 16*k, 16*k) repeated eight times, so it
; can be used directly as a 16-byte pmaddubsw operand against interleaved
; pixel pairs:
;   row 0: 128,  0      row 4:  64, 64
;   row 1: 112, 16      row 5:  48, 80
;   row 2:  96, 32      row 6:  32, 96
;   row 3:  80, 48      row 7:  16, 112
align 16
vp8_bilinear_filters_ssse3:
%assign second_tap 0
%rep 8
    times 8 db (128 - second_tap), second_tap
%assign second_tap second_tap + 16
%endrep