;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10 |
|
11 %include "vpx_ports/x86_abi_support.asm" |
|
12 |
|
;-----------------------------------------------------------------------
; void vp9_half_horiz_vert_variance16x_h_sse2
; (
;     unsigned char *ref_ptr,
;     int            ref_pixels_per_line,
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     unsigned int   Height,
;     int           *sum,
;     unsigned int  *sumsquared
; )
;
; For a block 16 pixels wide and Height rows tall, every ref pixel is
; replaced by pavgb(pavgb(r[y][x], r[y][x+1]), pavgb(r[y+1][x], r[y+1][x+1]))
; and the src pixel is subtracted from it.  The total of the differences
; is stored to *sum and the total of their squares to *sumsquared.
;
; Notes:
;   - ref is read at byte offsets 0..16 of Height + 1 consecutive rows;
;     src is read at byte offsets 0..15 of Height rows.
;   - The row counter is only tested after a row has been processed, so
;     Height must be at least 1.
;   - Differences are summed in signed 16-bit lanes, two per lane per
;     row, each in [-255, 255]; the lanes cannot wrap for Height <= 64.
;   - sym(), arg() and the prologue/epilogue macros come from
;     vpx_ports/x86_abi_support.asm.
;   - NOTE(review): nothing in this routine addresses data through the
;     register set up by GET_GOT; the macro pair is kept so the frame
;     setup stays exactly as before.
;
; Register roles:
;   rsi  = current ref row          rax = ref stride
;   rdi  = current src row          rdx = src stride
;   rcx  = rows remaining
;   xmm0 = zero (byte -> word unpacking)
;   xmm5 = horizontal average of the previous ref row (at loop entry)
;   xmm6 = per-lane word sums of differences
;   xmm7 = per-lane dword sums of squared differences
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero for unpacking
    pxor        xmm6, xmm6                      ; sum accumulator
    pxor        xmm7, xmm7                      ; sse accumulator

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

    ; Prime xmm5 with the horizontal average of the first ref row.
    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3                      ; xmm5 = avg(ref[x], ref[x+1]), row 0
    add         rsi, rax                        ; step to ref row 1

.hv16_next_row:
    ; Horizontal average of the current ref row.
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                      ; xmm1 = avg(ref[x], ref[x+1]), this row

    ; Vertical average of the previous and current horizontal averages.
    pavgb       xmm5, xmm1
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                      ; prediction, pixels 0..7 as words
    punpckhbw   xmm4, xmm0                      ; prediction, pixels 8..15 as words

    ; Sixteen src pixels, widened to words.
    movq        xmm2, QWORD PTR [rdi]           ; src pixels 0..7
    movq        xmm3, QWORD PTR [rdi+8]         ; src pixels 8..15
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubw       xmm5, xmm2                      ; differences, pixels 0..7
    psubw       xmm4, xmm3                      ; differences, pixels 8..15

    paddw       xmm6, xmm5                      ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5                      ; square and add adjacent pairs
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5                      ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1                      ; this row becomes the previous row

    add         rsi, rax                        ; flags are rewritten by sub below
    add         rdi, rdx

    sub         rcx, 1
    jnz         .hv16_next_row

    ; Sign-extend the eight word sums to dwords and fold them to four
    ; lanes.  The unpacks place each xmm6 word in the upper half of a
    ; dword; the arithmetic shift then discards whatever was in the
    ; lower half, so xmm0/xmm1 need no clearing first.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    ; Fold the four dword lanes of each accumulator into lane 0.
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 8
    psrldq      xmm6, 8
    paddd       xmm0, xmm1                      ; lane0 = d0+d2, lane1 = d1+d3
    paddd       xmm7, xmm6

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    paddd       xmm0, xmm1                      ; lane0 = total of differences
    paddd       xmm7, xmm6                      ; lane0 = total of squares

    mov         rsi, arg(5)                     ; sum
    mov         rdi, arg(6)                     ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
127 |
|
;-----------------------------------------------------------------------
; void vp9_half_vert_variance16x_h_sse2
; (
;     unsigned char *ref_ptr,
;     int            ref_pixels_per_line,
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     unsigned int   Height,
;     int           *sum,
;     unsigned int  *sumsquared
; )
;
; For a block 16 pixels wide and Height rows tall, every ref pixel is
; replaced by pavgb(r[y][x], r[y+1][x]) and the src pixel is subtracted
; from it.  The total of the differences is stored to *sum and the total
; of their squares to *sumsquared.
;
; Notes:
;   - ref is read at byte offsets 0..15 of Height + 1 consecutive rows;
;     src is read at byte offsets 0..15 of Height rows.
;   - The row counter is only tested after a row has been processed, so
;     Height must be at least 1.
;   - Differences are summed in signed 16-bit lanes, two per lane per
;     row, each in [-255, 255]; the lanes cannot wrap for Height <= 64.
;   - sym(), arg() and the prologue/epilogue macros come from
;     vpx_ports/x86_abi_support.asm.
;   - NOTE(review): nothing in this routine addresses data through the
;     register set up by GET_GOT; the macro pair is kept so the frame
;     setup stays exactly as before.
;
; Register roles:
;   rsi  = current ref row          rax = ref stride
;   rdi  = current src row          rdx = src stride
;   rcx  = rows remaining
;   xmm0 = zero (byte -> word unpacking)
;   xmm5 = previous ref row (at loop entry)
;   xmm6 = per-lane word sums of differences
;   xmm7 = per-lane dword sums of squared differences
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero for unpacking
    pxor        xmm6, xmm6                      ; sum accumulator
    pxor        xmm7, xmm7                      ; sse accumulator

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

    ; Prime xmm5 with the first ref row.
    movdqu      xmm5, XMMWORD PTR [rsi]
    add         rsi, rax                        ; step to ref row 1

.v16_next_row:
    movdqu      xmm3, XMMWORD PTR [rsi]         ; current ref row

    ; Vertical average of the previous and current ref rows.
    pavgb       xmm5, xmm3
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0                      ; prediction, pixels 0..7 as words
    punpckhbw   xmm4, xmm0                      ; prediction, pixels 8..15 as words

    ; Sixteen src pixels, widened to words.
    movq        xmm2, QWORD PTR [rdi]           ; src pixels 0..7
    movq        xmm1, QWORD PTR [rdi+8]         ; src pixels 8..15
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm1, xmm0

    psubw       xmm5, xmm2                      ; differences, pixels 0..7
    psubw       xmm4, xmm1                      ; differences, pixels 8..15

    paddw       xmm6, xmm5                      ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5                      ; square and add adjacent pairs
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5                      ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3                      ; this row becomes the previous row

    add         rsi, rax                        ; flags are rewritten by sub below
    add         rdi, rdx

    sub         rcx, 1
    jnz         .v16_next_row

    ; Sign-extend the eight word sums to dwords and fold them to four
    ; lanes.  The unpacks place each xmm6 word in the upper half of a
    ; dword; the arithmetic shift then discards whatever was in the
    ; lower half, so xmm0/xmm1 need no clearing first.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    ; Fold the four dword lanes of each accumulator into lane 0.
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 8
    psrldq      xmm6, 8
    paddd       xmm0, xmm1                      ; lane0 = d0+d2, lane1 = d1+d3
    paddd       xmm7, xmm6

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    paddd       xmm0, xmm1                      ; lane0 = total of differences
    paddd       xmm7, xmm6                      ; lane0 = total of squares

    mov         rsi, arg(5)                     ; sum
    mov         rdi, arg(6)                     ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
234 |
|
;-----------------------------------------------------------------------
; void vp9_half_horiz_variance16x_h_sse2
; (
;     unsigned char *ref_ptr,
;     int            ref_pixels_per_line,
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     unsigned int   Height,
;     int           *sum,
;     unsigned int  *sumsquared
; )
;
; For a block 16 pixels wide and Height rows tall, every ref pixel is
; replaced by pavgb(r[y][x], r[y][x+1]) and the src pixel is subtracted
; from it.  The total of the differences is stored to *sum and the total
; of their squares to *sumsquared.
;
; Notes:
;   - ref is read at byte offsets 0..16 of Height rows; src is read at
;     byte offsets 0..15 of Height rows.
;   - The row counter is only tested after a row has been processed, so
;     Height must be at least 1.
;   - Differences are summed in signed 16-bit lanes, two per lane per
;     row, each in [-255, 255]; the lanes cannot wrap for Height <= 64.
;   - sym(), arg() and the prologue/epilogue macros come from
;     vpx_ports/x86_abi_support.asm.
;   - NOTE(review): nothing in this routine addresses data through the
;     register set up by GET_GOT; the macro pair is kept so the frame
;     setup stays exactly as before.
;
; Register roles:
;   rsi  = current ref row          rax = ref stride
;   rdi  = current src row          rdx = src stride
;   rcx  = rows remaining
;   xmm0 = zero (byte -> word unpacking)
;   xmm6 = per-lane word sums of differences
;   xmm7 = per-lane dword sums of squared differences
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero for unpacking
    pxor        xmm6, xmm6                      ; sum accumulator
    pxor        xmm7, xmm7                      ; sse accumulator

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; Height

.h16_next_row:
    ; Horizontal average of the current ref row.
    movdqu      xmm1, XMMWORD PTR [rsi]         ; ref bytes 0..15
    movdqu      xmm2, XMMWORD PTR [rsi+1]       ; ref bytes 1..16
    pavgb       xmm1, xmm2                      ; xmm1 = avg(ref[x], ref[x+1])
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0                      ; prediction, pixels 0..7 as words
    punpckhbw   xmm4, xmm0                      ; prediction, pixels 8..15 as words

    ; Sixteen src pixels, widened to words.
    movq        xmm2, QWORD PTR [rdi]           ; src pixels 0..7
    movq        xmm3, QWORD PTR [rdi+8]         ; src pixels 8..15
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm2                      ; differences, pixels 0..7
    psubw       xmm4, xmm3                      ; differences, pixels 8..15

    paddw       xmm6, xmm1                      ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm1, xmm1                      ; square and add adjacent pairs
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm1                      ; accumulate squared differences
    paddd       xmm7, xmm4

    add         rsi, rax                        ; flags are rewritten by sub below
    add         rdi, rdx

    sub         rcx, 1
    jnz         .h16_next_row

    ; Sign-extend the eight word sums to dwords and fold them to four
    ; lanes.  The unpacks place each xmm6 word in the upper half of a
    ; dword; the arithmetic shift then discards whatever was in the
    ; lower half, so xmm0/xmm1 need no clearing first.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1

    ; Fold the four dword lanes of each accumulator into lane 0.
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 8
    psrldq      xmm6, 8
    paddd       xmm0, xmm1                      ; lane0 = d0+d2, lane1 = d1+d3
    paddd       xmm7, xmm6

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7
    psrldq      xmm1, 4
    psrldq      xmm6, 4
    paddd       xmm0, xmm1                      ; lane0 = total of differences
    paddd       xmm7, xmm6                      ; lane0 = total of squares

    mov         rsi, arg(5)                     ; sum
    mov         rdi, arg(6)                     ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret