media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:d4784ffb1267
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "vpx_ports/x86_abi_support.asm"
12
13 ; Accumulates sum and sum of squares of (ref averaged over x/x+1 and row/row+1) minus src, for a 16-pixel-wide block of Height rows.
14 ;void vp9_half_horiz_vert_variance16x_h_sse2(
15 ; unsigned char *ref_ptr, - reference pixels; Height+1 rows are read, 17 bytes from each
16 ; int ref_pixels_per_line, - byte step from one reference row to the next
17 ; unsigned char *src_ptr, - source pixels; Height rows are read, 16 bytes from each
18 ; int src_pixels_per_line, - byte step from one source row to the next
19 ; unsigned int Height, - row count; the loop tests its counter after the body, so 0 is not handled
20 ; int *sum, - out: receives the sum of all differences
21 ; unsigned int *sumsquared - out: receives the sum of all squared differences
22 ;)
23 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
24 sym(vp9_half_horiz_vert_variance16x_h_sse2):
25 push rbp
26 mov rbp, rsp
27 SHADOW_ARGS_TO_STACK 7
28 SAVE_XMM 7
29 GET_GOT rbx
30 push rsi
31 push rdi
32 ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and sym() are macros not defined in this file; presumably from x86_abi_support.asm -- TODO confirm)
33 ; Register roles below: rsi = ref row, rdi = src row, rax/rdx = ref/src row step, rcx = rows left, xmm0 = zero
34 pxor xmm6, xmm6 ; xmm6 = running sum of differences, eight 16-bit lanes
35 pxor xmm7, xmm7 ; xmm7 = running sum of squared differences, four 32-bit lanes
36 mov rsi, arg(0) ; rsi = ref_ptr
37
38 mov rdi, arg(2) ; rdi = src_ptr
39 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter. NOTE(review): sign-extended although declared unsigned
40 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
41 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
42
43 pxor xmm0, xmm0 ; xmm0 = 0; interleaved with bytes below to widen them to words
44
45 movdqu xmm5, XMMWORD PTR [rsi] ; first ref row, bytes 0..15
46 movdqu xmm3, XMMWORD PTR [rsi+1] ; first ref row, bytes 1..16
47 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of the first row
48
49 lea rsi, [rsi + rax] ; rsi -> second ref row
50
51 .half_horiz_vert_variance16x_h_1:
52 movdqu xmm1, XMMWORD PTR [rsi] ; current ref row, bytes 0..15
53 movdqu xmm2, XMMWORD PTR [rsi+1] ; current ref row, bytes 1..16
54 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2): horizontal average of the current row
55
56 pavgb xmm5, xmm1 ; xmm5 = vertical average of the previous and current rows' horizontal averages
57
58 movdqa xmm4, xmm5 ; copy so the low and high 8 bytes can be widened separately
59 punpcklbw xmm5, xmm0 ; xmm5 = low 8 averaged bytes as words
60 punpckhbw xmm4, xmm0 ; xmm4 = high 8 averaged bytes as words
61
62 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
63 punpcklbw xmm3, xmm0 ; xmm3 = words of above
64 psubw xmm5, xmm3 ; xmm5 -= xmm3: differences for columns 0..7
65
66 movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9..d15
67 punpcklbw xmm3, xmm0 ; xmm3 = words of above
68 psubw xmm4, xmm3 ; xmm4 -= xmm3: differences for columns 8..15
69 ; NOTE(review): paddw wraps at 16 bits; each lane gains two differences of magnitude <= 255 per row, so lanes cannot wrap while Height <= 64 -- confirm callers
70 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
71 paddw xmm6, xmm4
72 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
73 pmaddwd xmm4, xmm4
74 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
75 paddd xmm7, xmm4
76
77 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
78
79 lea rsi, [rsi + rax] ; advance to the next ref row
80 lea rdi, [rdi + rdx] ; advance to the next src row
81
82 sub rcx, 1 ; one row done
83 jnz .half_horiz_vert_variance16x_h_1 ; repeat until the counter reaches zero
84 ; Reduce the lane accumulators to one dword each
85 pxor xmm1, xmm1 ; xmm1 = 0
86 pxor xmm5, xmm5 ; xmm5 = 0
87
88 punpcklwd xmm0, xmm6 ; xmm0 is still zero, so each low word of xmm6 lands in the upper half of a dword
89 punpckhwd xmm1, xmm6 ; same for the four high words
90 psrad xmm0, 16 ; arithmetic shift sign-extends each word to a dword
91 psrad xmm1, 16
92 paddd xmm0, xmm1 ; four dword partial sums of differences
93 movdqa xmm1, xmm0
94
95 movdqa xmm6, xmm7
96 punpckldq xmm6, xmm5 ; xmm6 = s0,0,s1,0 (squared partial sums spaced with zeros)
97 punpckhdq xmm7, xmm5 ; xmm7 = s2,0,s3,0
98 paddd xmm6, xmm7 ; xmm6 = s0+s2,0,s1+s3,0
99
100 punpckldq xmm0, xmm5 ; same folding for the difference sums
101 punpckhdq xmm1, xmm5
102 paddd xmm0, xmm1
103
104 movdqa xmm7, xmm6
105 movdqa xmm1, xmm0
106
107 psrldq xmm7, 8 ; move the upper 8 bytes down to the low half
108 psrldq xmm1, 8
109
110 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
111 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
112
113 mov rsi, arg(5) ;[Sum]
114 mov rdi, arg(6) ;[SSE]
115
116 movd [rsi], xmm0 ; *sum = low dword of xmm0
117 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
118
119 ; begin epilog
120 pop rdi
121 pop rsi
122 RESTORE_GOT
123 RESTORE_XMM
124 UNSHADOW_ARGS
125 pop rbp
126 ret
127
128 ; Accumulates sum and sum of squares of (ref averaged over row/row+1) minus src, for a 16-pixel-wide block of Height rows.
129 ;void vp9_half_vert_variance16x_h_sse2(
130 ; unsigned char *ref_ptr, - reference pixels; Height+1 rows are read, 16 bytes from each
131 ; int ref_pixels_per_line, - byte step from one reference row to the next
132 ; unsigned char *src_ptr, - source pixels; Height rows are read, 16 bytes from each
133 ; int src_pixels_per_line, - byte step from one source row to the next
134 ; unsigned int Height, - row count; the loop tests its counter after the body, so 0 is not handled
135 ; int *sum, - out: receives the sum of all differences
136 ; unsigned int *sumsquared - out: receives the sum of all squared differences
137 ;)
138 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
139 sym(vp9_half_vert_variance16x_h_sse2):
140 push rbp
141 mov rbp, rsp
142 SHADOW_ARGS_TO_STACK 7
143 SAVE_XMM 7
144 GET_GOT rbx
145 push rsi
146 push rdi
147 ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and sym() are macros not defined in this file; presumably from x86_abi_support.asm -- TODO confirm)
148 ; Register roles below: rsi = ref row, rdi = src row, rax/rdx = ref/src row step, rcx = rows left, xmm0 = zero
149 pxor xmm6, xmm6 ; xmm6 = running sum of differences, eight 16-bit lanes
150 pxor xmm7, xmm7 ; xmm7 = running sum of squared differences, four 32-bit lanes
151 mov rsi, arg(0) ; rsi = ref_ptr
152
153 mov rdi, arg(2) ; rdi = src_ptr
154 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter. NOTE(review): sign-extended although declared unsigned
155 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
156 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
157
158 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = first ref row, 16 bytes
159 lea rsi, [rsi + rax ] ; rsi -> second ref row
160 pxor xmm0, xmm0 ; xmm0 = 0; interleaved with bytes below to widen them to words
161
162 .half_vert_variance16x_h_1:
163 movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = current ref row, 16 bytes
164
165 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): vertical average of the previous and current rows
166 movdqa xmm4, xmm5 ; copy so the low and high 8 bytes can be widened separately
167 punpcklbw xmm5, xmm0 ; xmm5 = low 8 averaged bytes as words
168 punpckhbw xmm4, xmm0 ; xmm4 = high 8 averaged bytes as words
169
170 movq xmm2, QWORD PTR [rdi] ; xmm2 = src bytes 0..7
171 punpcklbw xmm2, xmm0 ; widened to words
172 psubw xmm5, xmm2 ; xmm5 = differences for columns 0..7
173 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = src bytes 8..15
174 punpcklbw xmm2, xmm0 ; widened to words
175 psubw xmm4, xmm2 ; xmm4 = differences for columns 8..15
176 ; NOTE(review): paddw wraps at 16 bits; each lane gains two differences of magnitude <= 255 per row, so lanes cannot wrap while Height <= 64 -- confirm callers
177 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
178 paddw xmm6, xmm4
179 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
180 pmaddwd xmm4, xmm4
181 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
182 paddd xmm7, xmm4
183
184 movdqa xmm5, xmm3 ; keep the current ref row as the "previous" row of the next iteration
185
186 lea rsi, [rsi + rax] ; advance to the next ref row
187 lea rdi, [rdi + rdx] ; advance to the next src row
188
189 sub rcx, 1 ; one row done
190 jnz .half_vert_variance16x_h_1 ; repeat until the counter reaches zero
191 ; Reduce the lane accumulators to one dword each
192 pxor xmm1, xmm1 ; xmm1 = 0
193 pxor xmm5, xmm5 ; xmm5 = 0
194
195 punpcklwd xmm0, xmm6 ; xmm0 is still zero, so each low word of xmm6 lands in the upper half of a dword
196 punpckhwd xmm1, xmm6 ; same for the four high words
197 psrad xmm0, 16 ; arithmetic shift sign-extends each word to a dword
198 psrad xmm1, 16
199 paddd xmm0, xmm1 ; four dword partial sums of differences
200 movdqa xmm1, xmm0
201
202 movdqa xmm6, xmm7
203 punpckldq xmm6, xmm5 ; xmm6 = s0,0,s1,0 (squared partial sums spaced with zeros)
204 punpckhdq xmm7, xmm5 ; xmm7 = s2,0,s3,0
205 paddd xmm6, xmm7 ; xmm6 = s0+s2,0,s1+s3,0
206
207 punpckldq xmm0, xmm5 ; same folding for the difference sums
208 punpckhdq xmm1, xmm5
209 paddd xmm0, xmm1
210
211 movdqa xmm7, xmm6
212 movdqa xmm1, xmm0
213
214 psrldq xmm7, 8 ; move the upper 8 bytes down to the low half
215 psrldq xmm1, 8
216
217 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
218 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
219
220 mov rsi, arg(5) ;[Sum]
221 mov rdi, arg(6) ;[SSE]
222
223 movd [rsi], xmm0 ; *sum = low dword of xmm0
224 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
225
226 ; begin epilog
227 pop rdi
228 pop rsi
229 RESTORE_GOT
230 RESTORE_XMM
231 UNSHADOW_ARGS
232 pop rbp
233 ret
234
235 ; Accumulates sum and sum of squares of (ref averaged over x/x+1) minus src, for a 16-pixel-wide block of Height rows.
236 ;void vp9_half_horiz_variance16x_h_sse2(
237 ; unsigned char *ref_ptr, - reference pixels; Height rows are read, 17 bytes from each
238 ; int ref_pixels_per_line, - byte step from one reference row to the next
239 ; unsigned char *src_ptr, - source pixels; Height rows are read, 16 bytes from each
240 ; int src_pixels_per_line, - byte step from one source row to the next
241 ; unsigned int Height, - row count; the loop tests its counter after the body, so 0 is not handled
242 ; int *sum, - out: receives the sum of all differences
243 ; unsigned int *sumsquared - out: receives the sum of all squared differences
244 ;)
245 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
246 sym(vp9_half_horiz_variance16x_h_sse2):
247 push rbp
248 mov rbp, rsp
249 SHADOW_ARGS_TO_STACK 7
250 SAVE_XMM 7
251 GET_GOT rbx
252 push rsi
253 push rdi
254 ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and sym() are macros not defined in this file; presumably from x86_abi_support.asm -- TODO confirm)
255 ; Register roles below: rsi = ref row, rdi = src row, rax/rdx = ref/src row step, rcx = rows left, xmm0 = zero
256 pxor xmm6, xmm6 ; xmm6 = running sum of differences, eight 16-bit lanes
257 pxor xmm7, xmm7 ; xmm7 = running sum of squared differences, four 32-bit lanes
258 mov rsi, arg(0) ; rsi = ref_ptr
259
260 mov rdi, arg(2) ; rdi = src_ptr
261 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter. NOTE(review): sign-extended although declared unsigned
262 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
263 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
264
265 pxor xmm0, xmm0 ; xmm0 = 0; interleaved with bytes below to widen them to words
266
267 .half_horiz_variance16x_h_1:
268 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
269 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
270
271 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of neighbouring ref bytes
272 movdqa xmm1, xmm5 ; copy so the low and high 8 bytes can be widened separately
273 punpcklbw xmm5, xmm0 ; xmm5 = words of above
274 punpckhbw xmm1, xmm0 ; xmm1 = high 8 averaged bytes as words
275
276 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
277 punpcklbw xmm3, xmm0 ; xmm3 = words of above
278 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9..d15
279 punpcklbw xmm2, xmm0 ; xmm2 = words of above
280 ; NOTE(review): paddw wraps at 16 bits; each lane gains two differences of magnitude <= 255 per row, so lanes cannot wrap while Height <= 64 -- confirm callers
281 psubw xmm5, xmm3 ; xmm5 -= xmm3: differences for columns 0..7
282 psubw xmm1, xmm2 ; xmm1 -= xmm2: differences for columns 8..15
283 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
284 paddw xmm6, xmm1
285 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
286 pmaddwd xmm1, xmm1
287 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
288 paddd xmm7, xmm1
289
290 lea rsi, [rsi + rax] ; advance to the next ref row
291 lea rdi, [rdi + rdx] ; advance to the next src row
292
293 sub rcx, 1 ; one row done
294 jnz .half_horiz_variance16x_h_1 ; repeat until the counter reaches zero
295 ; Reduce the lane accumulators to one dword each
296 pxor xmm1, xmm1 ; xmm1 = 0 (it held row differences inside the loop)
297 pxor xmm5, xmm5 ; xmm5 = 0
298
299 punpcklwd xmm0, xmm6 ; xmm0 is still zero, so each low word of xmm6 lands in the upper half of a dword
300 punpckhwd xmm1, xmm6 ; same for the four high words
301 psrad xmm0, 16 ; arithmetic shift sign-extends each word to a dword
302 psrad xmm1, 16
303 paddd xmm0, xmm1 ; four dword partial sums of differences
304 movdqa xmm1, xmm0
305
306 movdqa xmm6, xmm7
307 punpckldq xmm6, xmm5 ; xmm6 = s0,0,s1,0 (squared partial sums spaced with zeros)
308 punpckhdq xmm7, xmm5 ; xmm7 = s2,0,s3,0
309 paddd xmm6, xmm7 ; xmm6 = s0+s2,0,s1+s3,0
310
311 punpckldq xmm0, xmm5 ; same folding for the difference sums
312 punpckhdq xmm1, xmm5
313 paddd xmm0, xmm1
314
315 movdqa xmm7, xmm6
316 movdqa xmm1, xmm0
317
318 psrldq xmm7, 8 ; move the upper 8 bytes down to the low half
319 psrldq xmm1, 8
320
321 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
322 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
323
324 mov rsi, arg(5) ;[Sum]
325 mov rdi, arg(6) ;[SSE]
326
327 movd [rsi], xmm0 ; *sum = low dword of xmm0
328 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
329
330 ; begin epilog
331 pop rdi
332 pop rsi
333 RESTORE_GOT
334 RESTORE_XMM
335 UNSHADOW_ARGS
336 pop rbp
337 ret

mercurial