|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;-----------------------------------------------------------------------------
; int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences between two buffers of 16 signed words (SSE2).
;
;   arg(0)  coeff_ptr  - 32 bytes, accessed with aligned 128-bit operands
;   arg(1)  dcoef_ptr  - 32 bytes, accessed with aligned 128-bit operands
;
; Result: the 32-bit total is left in the low dword of xmm0 (the dword above
;         it is zero) and moved to rax.
; Touches xmm0-xmm2, rsi and rdi (rsi/rdi are saved and restored here).
;-----------------------------------------------------------------------------
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; coeff_ptr
    mov         rdi, arg(1)             ; dcoef_ptr

    ; words 0-7: difference, then pairwise sum of squares -> 4 dwords
    movdqa      xmm0, [rsi]
    psubw       xmm0, [rdi]
    pmaddwd     xmm0, xmm0

    ; words 8-15: same treatment
    movdqa      xmm1, [rsi+16]
    psubw       xmm1, [rdi+16]
    pmaddwd     xmm1, xmm1

    paddd       xmm0, xmm1              ; xmm0 = d3 : d2 : d1 : d0

    ; horizontal add of the four dwords
    pxor        xmm2, xmm2              ; zero, used to widen dwords
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2              ; 0 : d1 : 0 : d0
    punpckhdq   xmm1, xmm2              ; 0 : d3 : 0 : d2
    paddd       xmm0, xmm1              ; 0 : d1+d3 : 0 : d0+d2

    movdqa      xmm1, xmm0
    psrldq      xmm0, 8                 ; bring d1+d3 down to the low dword
    paddd       xmm0, xmm1              ; low dword = d0+d1+d2+d3

    movq        rax, xmm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
|
61 |
|
;-----------------------------------------------------------------------------
; int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences between two buffers of 16 signed words (MMX).
;
;   arg(0)  coeff_ptr  - 32 bytes, read as four 64-bit loads
;   arg(1)  dcoef_ptr  - 32 bytes, read as four 64-bit loads
;
; Result: the total is in the low dword of mm0 and is moved to rax.  The
;         upper dword of mm0 still holds a partial sum, so only the low
;         32 bits of the returned register are meaningful.
;
; The previous version built an all-ones mask (a leftover from the variant
; that takes a dc argument, with dc fixed at 0) and ANDed it into the first
; difference; that sequence had no effect and has been dropped.
;
; NOTE(review): no emms is issued, so MMX state is still live on return --
; presumably the caller resets it; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; coeff_ptr
    mov         rdi, arg(1)             ; dcoef_ptr

    ; words 0-7
    movq        mm1, [rsi]
    movq        mm4, [rdi]
    movq        mm5, [rsi+8]
    movq        mm6, [rdi+8]

    psubw       mm1, mm4
    psubw       mm5, mm6
    pmaddwd     mm1, mm1                ; pairwise sums of squares (2 dwords)
    pmaddwd     mm5, mm5
    paddd       mm1, mm5                ; mm1 = running total

    ; words 8-15
    movq        mm3, [rsi+16]
    movq        mm4, [rdi+16]
    movq        mm5, [rsi+24]
    movq        mm6, [rdi+24]

    psubw       mm3, mm4
    psubw       mm5, mm6
    pmaddwd     mm3, mm3
    pmaddwd     mm5, mm5
    paddd       mm3, mm5

    paddd       mm1, mm3                ; mm1 = hi : lo partial sums

    ; fold the two dwords together
    movq        mm0, mm1
    psrlq       mm1, 32                 ; mm1 = 0 : hi
    paddd       mm0, mm1                ; mm0 = hi : lo+hi

    movq        rax, mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
|
126 |
|
127 |
|
;-----------------------------------------------------------------------------
; int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences over 16 consecutive groups of 16 signed words
; (512 bytes from each pointer), using MMX.
;
;   arg(0)  coeff_ptr
;   arg(1)  dcoef_ptr
;   arg(2)  dc         - when its low 16 bits are non-zero, the difference in
;                        word 0 of every 16-word group is forced to zero
;                        before squaring.
;                        Assumes dc is 0 or 1 -- TODO confirm; a value with
;                        bits set in 16..31 would also mask word 1.
;
; Result: the total is in the low dword of mm0 and is moved to rax.  The
;         upper dword of mm0 still holds a partial sum, so only the low
;         32 bits of the returned register are meaningful.
;
; The previous version ORed the freshly zeroed accumulator into the dc value
; before the compare; that instruction had no effect and has been dropped.
;
; NOTE(review): no emms is issued, so MMX state is still live on return --
; presumably the caller resets it; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; coeff_ptr
    mov         rdi, arg(1)             ; dcoef_ptr

    pxor        mm7, mm7                ; constant zero
    pxor        mm2, mm2                ; mm2 = running total (2 dwords)

    ; Build the word mask: a word becomes 0xFFFF where the matching word of
    ; dc is zero, and 0x0000 where it is not.
    movd        mm1, dword ptr arg(2)   ; dc
    pcmpeqw     mm1, mm7

    mov         rcx, 16                 ; number of 32-byte groups

.mberror_loop_mmx:
    ; words 0-7 of the group; the mask is applied to words 0-3 only
    movq        mm3, [rsi]
    movq        mm4, [rdi]
    movq        mm5, [rsi+8]
    movq        mm6, [rdi+8]

    psubw       mm3, mm4
    pand        mm3, mm1                ; drop word 0 when dc is set
    pmaddwd     mm3, mm3

    psubw       mm5, mm6
    pmaddwd     mm5, mm5

    paddd       mm2, mm3
    paddd       mm2, mm5

    ; words 8-15 of the group
    movq        mm3, [rsi+16]
    movq        mm4, [rdi+16]
    movq        mm5, [rsi+24]
    movq        mm6, [rdi+24]

    psubw       mm3, mm4
    pmaddwd     mm3, mm3

    psubw       mm5, mm6
    pmaddwd     mm5, mm5

    paddd       mm2, mm3
    paddd       mm2, mm5

    add         rsi, 32
    add         rdi, 32
    sub         rcx, 1
    jnz         .mberror_loop_mmx

    ; fold the two dwords together
    movq        mm0, mm2
    psrlq       mm2, 32                 ; mm2 = 0 : hi
    paddd       mm0, mm2                ; mm0 = hi : lo+hi

    movq        rax, mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
|
203 |
|
204 |
|
;-----------------------------------------------------------------------------
; int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences over 16 consecutive groups of 16 signed words
; (512 bytes from each pointer), using SSE2 with aligned 128-bit loads.
;
;   arg(0)  coeff_ptr
;   arg(1)  dcoef_ptr
;   arg(2)  dc         - when its low 16 bits are non-zero, the difference in
;                        word 0 of every 16-word group is forced to zero
;                        before squaring.
;                        Assumes dc is 0 or 1 -- TODO confirm; a value with
;                        bits set in 16..31 would also mask word 1.
;
; Result: the 32-bit total is left in the low dword of xmm0 (the dword above
;         it is zero) and moved to rax.
;
; xmm6 is used as a zero constant; SAVE_XMM 6 / RESTORE_XMM bracket the body
; (presumably to preserve it on ABIs where xmm6 is callee-saved -- the macros
; are defined in the included ABI header).
;
; The previous version ORed the freshly zeroed accumulator into the dc value
; before the compare; that instruction had no effect and has been dropped.
;-----------------------------------------------------------------------------
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; coeff_ptr
    mov         rdi, arg(1)             ; dcoef_ptr

    pxor        xmm6, xmm6              ; constant zero
    pxor        xmm4, xmm4              ; xmm4 = running total (4 dwords)

    ; Build the word mask: a word becomes 0xFFFF where the matching word of
    ; the zero-extended dc is zero, and 0x0000 where it is not.
    movd        xmm5, dword ptr arg(2)  ; dc
    pcmpeqw     xmm5, xmm6

    mov         rcx, 16                 ; number of 32-byte groups

.mberror_loop:
    movdqa      xmm0, [rsi]             ; words 0-7 of the group
    movdqa      xmm1, [rdi]
    movdqa      xmm2, [rsi+16]          ; words 8-15 of the group
    movdqa      xmm3, [rdi+16]

    psubw       xmm0, xmm1
    pand        xmm0, xmm5              ; drop word 0 when dc is set
    pmaddwd     xmm0, xmm0

    psubw       xmm2, xmm3
    pmaddwd     xmm2, xmm2

    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rsi, 32
    add         rdi, 32
    sub         rcx, 1
    jnz         .mberror_loop

    ; horizontal add of the four dwords in xmm4
    movdqa      xmm0, xmm4
    punpckldq   xmm0, xmm6              ; 0 : d1 : 0 : d0
    punpckhdq   xmm4, xmm6              ; 0 : d3 : 0 : d2
    paddd       xmm0, xmm4              ; 0 : d1+d3 : 0 : d0+d2

    movdqa      xmm1, xmm0
    psrldq      xmm0, 8                 ; bring d1+d3 down to the low dword
    paddd       xmm0, xmm1              ; low dword = d0+d1+d2+d3

    movq        rax, xmm0

    pop         rdi
    pop         rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
273 |
|
274 |
|
;-----------------------------------------------------------------------------
; int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences over 128 signed words (256 bytes from each
; pointer), processed 8 words per iteration with MMX.
;
;   arg(0)  s_ptr
;   arg(1)  d_ptr
;
; Result: the total is in the low dword of mm0 and is moved to rax.  The
;         upper dword of mm0 still holds a partial sum, so only the low
;         32 bits of the returned register are meaningful.
;
; NOTE(review): no emms is issued, so MMX state is still live on return --
; presumably the caller resets it; confirm.
;-----------------------------------------------------------------------------
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s_ptr
    mov         rdi, arg(1)             ; d_ptr

    pxor        mm0, mm0                ; mm0 = running total (2 dwords)
    mov         rcx, 16                 ; number of 16-byte steps

.mbuverror_loop_mmx:
    movq        mm1, [rsi]
    movq        mm2, [rsi+8]

    psubw       mm1, [rdi]
    psubw       mm2, [rdi+8]

    pmaddwd     mm1, mm1                ; pairwise sums of squares
    pmaddwd     mm2, mm2

    paddd       mm0, mm1
    paddd       mm0, mm2

    add         rsi, 16
    add         rdi, 16

    dec         rcx
    jnz         .mbuverror_loop_mmx

    ; fold the two dwords together
    movq        mm1, mm0
    psrlq       mm1, 32                 ; mm1 = 0 : hi
    paddd       mm0, mm1                ; mm0 = hi : lo+hi

    movq        rax, mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
|
330 |
|
331 |
|
;-----------------------------------------------------------------------------
; int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences over 128 signed words (256 bytes from each
; pointer), processed 8 words per iteration with SSE2.  Both buffers are
; accessed with aligned 128-bit operands.
;
;   arg(0)  s_ptr
;   arg(1)  d_ptr
;
; Result: the 32-bit total is left in the low dword of xmm1 (the dword above
;         it is zero) and moved to rax.
; Touches xmm0-xmm3, rcx, rsi and rdi (rsi/rdi are saved and restored here).
;-----------------------------------------------------------------------------
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s_ptr
    mov         rdi, arg(1)             ; d_ptr

    pxor        xmm3, xmm3              ; xmm3 = running total (4 dwords)
    mov         rcx, 16                 ; number of 16-byte steps

.mbuverror_loop:
    movdqa      xmm1, [rsi]
    psubw       xmm1, [rdi]
    pmaddwd     xmm1, xmm1              ; pairwise sums of squares
    paddd       xmm3, xmm1

    add         rsi, 16
    add         rdi, 16

    dec         rcx
    jnz         .mbuverror_loop

    ; horizontal add of the four dwords in xmm3
    pxor        xmm0, xmm0              ; zero, used to widen dwords
    movdqa      xmm1, xmm3
    punpckldq   xmm1, xmm0              ; 0 : d1 : 0 : d0
    punpckhdq   xmm3, xmm0              ; 0 : d3 : 0 : d2
    paddd       xmm1, xmm3              ; 0 : d1+d3 : 0 : d0+d2

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8                 ; bring d1+d3 down to the low dword
    paddd       xmm1, xmm2              ; low dword = d0+d1+d2+d3

    movq        rax, xmm1

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret