media/libvpx/vp8/common/x86/variance_impl_ssse3.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:5535b0ed6cc8
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
%include "vpx_ports/x86_abi_support.asm"

; Right-shift applied to every 16-bit bilinear filter sum in this file
; (after the rounding constant has been added).
%define xmm_filter_shift            7
15
16
;void vp8_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int            ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    unsigned int   Height,
;    int            xoffset,
;    int            yoffset,
;    int           *sum,
;    unsigned int  *sumsquared
;)
;
; Applies a 2-tap bilinear filter (horizontal tap pair selected by xoffset,
; vertical tap pair selected by yoffset) to a 16-pixel-wide block at ref_ptr,
; subtracts the co-located 16-pixel-wide block at src_ptr, and stores
;     *sum        = sum of (filtered_ref - src) over all pixels
;     *sumsquared = sum of (filtered_ref - src)^2 over all pixels
;
; xoffset / yoffset index vp8_bilinear_filters_ssse3 (eight 16-byte entries,
; so the values are expected to be 0..7).
;
; Height is the number of src rows processed. It must be at least 1: every
; loop below is bottom-tested (sub rcx, 1 / jnz), so the body always runs
; once before the count is checked.
;
; Memory read per row:
;     src : 16 bytes
;     ref : 17 bytes when xoffset != 0 (loads at [rsi] and [rsi+1]), else 16
;     ref rows touched: Height + 1 when yoffset != 0, else Height
;
; The per-pixel difference sum is accumulated in eight signed 16-bit lanes
; (two differences per lane per row, each within [-255, 255]), so it cannot
; wrap for Height <= 64.
;
;Note: The filter coefficient at offset=0 is 128. Since the second operand
;of pmaddubsw is treated as signed bytes, 128 cannot be used as a tap, so a
;zero offset is handled separately by skipping that filter pass entirely.
;
; Register roles inside the loops:
;     rsi  = current ref row           rdi  = current src row
;     rcx  = rows remaining            xmm6 = sum accumulator  (8 x int16)
;     xmm7 = SSE accumulator (4 x int32)
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7                                  ; NOTE(review): macro from x86_abi_support.asm;
                                                ; presumably preserves xmm6/xmm7 where the ABI needs it
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                      ; sum accumulator = 0
    pxor        xmm7, xmm7                      ; SSE accumulator = 0

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(5)           ; xoffset

    test        rax, rax                        ; skip first_pass filter if xoffset=0
    jz          .filter_block2d_bil_var_ssse3_sp_only

    shl         rax, 4                          ; 16 bytes per filter table entry
    lea         rax, [rax + rcx]                ; HFilter

    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; skip second_pass filter if yoffset=0
    jz          .filter_block2d_bil_var_ssse3_fp_only

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; VFilter

    ;------------------------------------------------------------------
    ; Both passes: horizontal filter on each ref row, then vertical
    ; filter between the previous and current horizontally-filtered row.
    ;------------------------------------------------------------------
    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack (loop-invariant)

    ; Horizontally filter ref row 0; result kept as 16 bytes in xmm0.
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi+1]
    movdqa      xmm2, xmm0

    punpcklbw   xmm0, xmm1                      ; interleave pixel[i], pixel[i+1] pairs
    punpckhbw   xmm2, xmm1
    pmaddubsw   xmm0, [rax]                     ; p[i]*tap0 + p[i+1]*tap1
    pmaddubsw   xmm2, [rax]

    paddw       xmm0, [GLOBAL(xmm_bi_rd)]       ; add rounding constant
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm0, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    packuswb    xmm0, xmm2                      ; back to 16 unsigned bytes

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)            ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
    lea         rsi, [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
    ; Horizontally filter the next ref row into xmm1 (16 bytes).
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift
    packuswb    xmm1, xmm3

    movdqa      xmm2, xmm0                      ; previous filtered row
    movdqa      xmm0, xmm1                      ; current row becomes "previous" for next iteration
    movdqa      xmm3, xmm2

    ; Vertical filter between previous (xmm2/xmm3) and current (xmm1) row.
    punpcklbw   xmm2, xmm1
    punpckhbw   xmm3, xmm1
    pmaddubsw   xmm2, [rdx]
    pmaddubsw   xmm3, [rdx]

    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm2, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    ; Load 16 src pixels and widen to words.
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm2, xmm1                      ; diff = filtered_ref - src
    psubw       xmm3, xmm5
    paddw       xmm6, xmm2                      ; sum  += diff
    paddw       xmm6, xmm3
    pmaddwd     xmm2, xmm2                      ; diff^2, pairwise added into dwords
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm2                      ; SSE  += diff^2
    paddd       xmm7, xmm3

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rsi, [rsi + r8]
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_var_ssse3_loop

    jmp         .filter_block2d_bil_variance

    ;------------------------------------------------------------------
    ; xoffset == 0: second (vertical) pass only, or no filtering at all.
    ; rcx still holds the filter table base here.
    ;------------------------------------------------------------------
.filter_block2d_bil_var_ssse3_sp_only:
    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; Both xoffset =0 and yoffset=0
    jz          .filter_block2d_bil_var_ssse3_full_pixel

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; VFilter

    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height
    movsxd      rax, dword ptr arg(1)           ;ref_pixels_per_line

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack (loop-invariant)

    movdqu      xmm1, XMMWORD PTR [rsi]         ; ref row 0 = initial "previous" row

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
%endif

    lea         rsi, [rsi + rax]

.filter_block2d_bil_sp_only_loop:
    movdqu      xmm3, XMMWORD PTR [rsi]         ; current ref row
    movdqa      xmm2, xmm1
    movdqa      xmm0, xmm3                      ; keep a copy: it is the next iteration's previous row

    punpcklbw   xmm1, xmm3                      ; interleave previous/current row pixels
    punpckhbw   xmm2, xmm3
    pmaddubsw   xmm1, [rdx]
    pmaddubsw   xmm2, [rdx]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm3                      ; diff = filtered_ref - src
    psubw       xmm2, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    movdqa      xmm1, xmm0                      ; previous row = current row
    lea         rsi, [rsi + rax]                ;ref_pixels_per_line

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_sp_only_loop

    jmp         .filter_block2d_bil_variance

    ;------------------------------------------------------------------
    ; xoffset == 0 and yoffset == 0: plain difference, no filtering.
    ;------------------------------------------------------------------
.filter_block2d_bil_var_ssse3_full_pixel:
    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height
    movsxd      rax, dword ptr arg(1)           ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ;src_pixels_per_line
    pxor        xmm0, xmm0                      ; constant zero for byte->word unpack

.filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0
    movq        xmm2, QWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0
    movq        xmm4, QWORD PTR [rdi+8]
    punpcklbw   xmm4, xmm0

    psubw       xmm1, xmm3                      ; diff = ref - src
    psubw       xmm2, xmm4
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    lea         rsi, [rsi + rax]                ;ref_pixels_per_line
    lea         rdi, [rdi + rdx]                ;src_pixels_per_line
    sub         rcx, 1
    jnz         .filter_block2d_bil_full_pixel_loop

    jmp         .filter_block2d_bil_variance

    ;------------------------------------------------------------------
    ; yoffset == 0: first (horizontal) pass only. rax = HFilter.
    ;------------------------------------------------------------------
.filter_block2d_bil_var_ssse3_fp_only:
    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height
    movsxd      rdx, dword ptr arg(1)           ;ref_pixels_per_line

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack (loop-invariant)

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2                      ; interleave pixel[i], pixel[i+1] pairs
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    movq        xmm2, QWORD PTR [rdi]           ; 8-byte load, same size tag as every other movq here
    punpcklbw   xmm2, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm2                      ; diff = filtered_ref - src
    psubw       xmm3, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm3
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm1
    paddd       xmm7, xmm3

    lea         rsi, [rsi + rdx]
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_fp_only_loop

    ; falls through into the reduction below

    ;------------------------------------------------------------------
    ; Horizontal reduction of the accumulators and result store.
    ;------------------------------------------------------------------
.filter_block2d_bil_variance:
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; Sign-extend the eight 16-bit sums to 32 bits: place each word in the
    ; upper half of a dword, then arithmetic-shift it back down.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1                      ; 4 dword partial sums
    movdqa      xmm1, xmm0

    ; Fold the four SSE dwords down to two (dwords 0 and 2).
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    ; Same fold for the sum.
    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7                      ; dword 0 = total SSE
    paddd       xmm0, xmm1                      ; dword 0 = total sum

    mov         rsi, arg(7)                     ;[Sum]
    mov         rdi, arg(8)                     ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
349
350
SECTION_RODATA

; Rounding constant: eight 16-bit words of 64, added to each filter sum
; before the arithmetic right shift by xmm_filter_shift.
align 16
xmm_bi_rd:
    times 8 dw 64

; Bilinear tap pairs as bytes, one 16-byte entry per offset (entry N sits at
; byte offset N * 16). Each entry repeats its (tap0, tap1) pair eight times
; so it can be used directly as the memory operand of pmaddubsw.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128,   0                         ; offset 0
    times 8 db 112,  16                         ; offset 1
    times 8 db  96,  32                         ; offset 2
    times 8 db  80,  48                         ; offset 3
    times 8 db  64,  64                         ; offset 4
    times 8 db  48,  80                         ; offset 5
    times 8 db  32,  96                         ; offset 6
    times 8 db  16, 112                         ; offset 7

mercurial