media/libvpx/vp8/encoder/x86/encodeopt.asm

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:dd056ee0cba9
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences over 16 coefficients (32 bytes) of
; coeff_ptr and dcoef_ptr.
;
; Both pointers are accessed with aligned 128-bit operations, so they
; must be 16-byte aligned. Differences are formed with wrapping 16-bit
; subtraction (psubw) before being squared.
; Uses xmm0-xmm2 as scratch; the total is returned in rax.
;-----------------------------------------------------------------------
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; --- prologue done ---

    mov         rsi, arg(0)             ; rsi = coeff_ptr
    mov         rdi, arg(1)             ; rdi = dcoef_ptr

    ; Differences for coefficients 0-7 and 8-15.
    movdqa      xmm0, [rsi]
    movdqa      xmm2, [rsi+16]
    psubw       xmm0, [rdi]
    psubw       xmm2, [rdi+16]

    ; Square each difference; pmaddwd also adds adjacent pairs, leaving
    ; four dword partial sums per register.
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm2, xmm2
    paddd       xmm0, xmm2              ; xmm0 = {s0, s1, s2, s3}

    ; Horizontal add of the four dwords.
    pxor        xmm1, xmm1              ; xmm1 = 0, used to widen dwords
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1              ; {s0, 0, s1, 0}
    punpckhdq   xmm2, xmm1              ; {s2, 0, s3, 0}
    paddd       xmm0, xmm2              ; {s0+s2, 0, s1+s3, 0}

    movdqa      xmm2, xmm0
    psrldq      xmm2, 8                 ; bring s1+s3 down to lane 0
    paddd       xmm0, xmm2              ; lane 0 = s0+s1+s2+s3

    movq        rax, xmm0               ; return the total

    pop         rdi
    pop         rsi
    ; --- epilogue ---
    UNSHADOW_ARGS
    pop         rbp
    ret
61
;-----------------------------------------------------------------------
;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences over 16 coefficients (32 bytes) of
; coeff_ptr and dcoef_ptr, using MMX registers.
;
; Every coefficient contributes to the sum; no DC masking is applied
; here (compare vp8_mbblock_error_mmx_impl, which takes a dc argument).
; Differences are formed with wrapping 16-bit subtraction (psubw).
; Uses mm0-mm2 as scratch. No emms is executed in this routine, so the
; MMX state is left in use on return -- presumably the caller clears
; it; confirm against call sites.
; The total is in the low 32 bits of rax.
;-----------------------------------------------------------------------
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = coeff_ptr
    mov         rdi, arg(1)             ; rdi = dcoef_ptr

    ; Coefficients 0-7: difference, square, pairwise add.
    movq        mm0, [rsi]
    movq        mm1, [rsi+8]
    psubw       mm0, [rdi]
    psubw       mm1, [rdi+8]
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    paddd       mm0, mm1                ; mm0 = running {lo, hi} dword sums

    ; Coefficients 8-15.
    movq        mm1, [rsi+16]
    movq        mm2, [rsi+24]
    psubw       mm1, [rdi+16]
    psubw       mm2, [rdi+24]
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    paddd       mm0, mm1
    paddd       mm0, mm2

    ; Fold the high dword into the low dword.
    movq        mm1, mm0
    psrlq       mm1, 32
    paddd       mm0, mm1                ; low dword = total

    movq        rax, mm0                ; high dword still holds the
                                        ; upper partial sum; the int
                                        ; result is the low dword

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
126
127
;-----------------------------------------------------------------------
;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences between coeff_ptr and dcoef_ptr over
; 16 groups of 16 coefficients (16 iterations x 32 bytes).
;
; dc builds a per-word mask that is applied to the first four
; differences of every group:
;   word 0 is kept only if the low 16 bits of dc are zero,
;   word 1 is kept only if bits 16-31 of dc are zero,
;   words 2 and 3 are always kept.
; With dc == 0 nothing is masked; with dc == 1 the first coefficient
; of each group is excluded. Presumably callers only pass 0 or 1 --
; confirm against call sites.
;
; Differences are formed with wrapping 16-bit subtraction (psubw).
; Uses rcx, mm0-mm3 and mm7 as scratch. No emms is executed in this
; routine. The total is in the low 32 bits of rax.
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = coeff_ptr
    mov         rdi, arg(1)             ; rdi = dcoef_ptr

    pxor        mm7, mm7                ; mm7 = 0 (comparison reference)
    pxor        mm2, mm2                ; mm2 = accumulator {lo, hi}

    movd        mm1, dword ptr arg(2)   ; dc in the low dword, rest zero
    pcmpeqw     mm1, mm7                ; mm1 = keep-mask (see header)

    mov         rcx, 16                 ; groups remaining

.next_group:
    ; Coefficients 0-3 of the group, with the dc mask applied.
    movq        mm3, [rsi]
    psubw       mm3, [rdi]
    pand        mm3, mm1
    pmaddwd     mm3, mm3
    paddd       mm2, mm3

    ; Coefficients 4-7.
    movq        mm3, [rsi+8]
    psubw       mm3, [rdi+8]
    pmaddwd     mm3, mm3
    paddd       mm2, mm3

    ; Coefficients 8-11.
    movq        mm3, [rsi+16]
    psubw       mm3, [rdi+16]
    pmaddwd     mm3, mm3
    paddd       mm2, mm3

    ; Coefficients 12-15.
    movq        mm3, [rsi+24]
    psubw       mm3, [rdi+24]
    pmaddwd     mm3, mm3
    paddd       mm2, mm3

    add         rsi, 32
    add         rdi, 32
    sub         rcx, 1
    jnz         .next_group

    ; Fold the high dword into the low dword.
    movq        mm0, mm2
    psrlq       mm2, 32
    paddd       mm0, mm2                ; low dword = total
    movq        rax, mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
203
204
;-----------------------------------------------------------------------
;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences between coeff_ptr and dcoef_ptr over
; 16 groups of 16 coefficients (16 iterations x 32 bytes).
;
; dc builds a per-word mask that is applied to the first eight
; differences of every group:
;   word 0 is kept only if the low 16 bits of dc are zero,
;   word 1 is kept only if bits 16-31 of dc are zero,
;   words 2-7 are always kept.
; With dc == 0 nothing is masked; with dc == 1 the first coefficient
; of each group is excluded. Presumably callers only pass 0 or 1 --
; confirm against call sites.
;
; Both pointers are accessed with aligned 128-bit operations, so they
; must be 16-byte aligned. Differences are formed with wrapping 16-bit
; subtraction (psubw).
; Uses rcx and xmm0-xmm6 (xmm3 excluded) as scratch; xmm6 is covered by
; SAVE_XMM / RESTORE_XMM. The total is returned in rax.
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = coeff_ptr
    mov         rdi, arg(1)             ; rdi = dcoef_ptr

    pxor        xmm6, xmm6              ; xmm6 = 0 (compare / widen)
    pxor        xmm4, xmm4              ; xmm4 = four dword accumulators

    movd        xmm5, dword ptr arg(2)  ; dc in the low dword, rest zero
    pcmpeqw     xmm5, xmm6              ; xmm5 = keep-mask (see header)

    mov         rcx, 16                 ; groups remaining

.next_group:
    movdqa      xmm0, [rsi]             ; coefficients 0-7
    movdqa      xmm2, [rsi+16]          ; coefficients 8-15
    psubw       xmm0, [rdi]
    psubw       xmm2, [rdi+16]

    pand        xmm0, xmm5              ; apply the dc mask to words 0-7

    pmaddwd     xmm0, xmm0
    pmaddwd     xmm2, xmm2
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rsi, 32
    add         rdi, 32
    sub         rcx, 1
    jnz         .next_group

    ; Horizontal add of the four dword accumulators.
    movdqa      xmm0, xmm4
    punpckldq   xmm0, xmm6              ; {s0, 0, s1, 0}
    punpckhdq   xmm4, xmm6              ; {s2, 0, s3, 0}
    paddd       xmm0, xmm4              ; {s0+s2, 0, s1+s3, 0}

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                 ; bring s1+s3 down to lane 0
    paddd       xmm0, xmm1              ; lane 0 = total
    movq        rax, xmm0

    pop         rdi
    pop         rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
273
274
;-----------------------------------------------------------------------
;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences between s_ptr and d_ptr over 128
; coefficients (16 iterations x 16 bytes), using MMX registers.
;
; Differences are formed with wrapping 16-bit subtraction (psubw).
; Uses rcx, mm0, mm1, mm3 and mm7 as scratch. No emms is executed in
; this routine. The total is in the low 32 bits of rax.
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; --- prologue done ---

    mov         rsi, arg(0)             ; rsi = s_ptr
    mov         rdi, arg(1)             ; rdi = d_ptr

    pxor        mm7, mm7                ; mm7 = accumulator {lo, hi}
    mov         rcx, 16                 ; 16-byte chunks remaining

.next_chunk:
    ; First four coefficients of the chunk.
    movq        mm1, [rsi]
    psubw       mm1, [rdi]
    pmaddwd     mm1, mm1
    paddd       mm7, mm1

    ; Second four coefficients of the chunk.
    movq        mm3, [rsi+8]
    psubw       mm3, [rdi+8]
    pmaddwd     mm3, mm3
    paddd       mm7, mm3

    add         rsi, 16
    add         rdi, 16
    dec         rcx
    jnz         .next_chunk

    ; Fold the high dword into the low dword.
    movq        mm0, mm7
    psrlq       mm7, 32
    paddd       mm0, mm7                ; low dword = total
    movq        rax, mm0

    pop         rdi
    pop         rsi
    ; --- epilogue ---
    UNSHADOW_ARGS
    pop         rbp
    ret
330
331
;-----------------------------------------------------------------------
;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences between s_ptr and d_ptr over 128
; coefficients (16 iterations x 16 bytes).
;
; Both pointers are accessed with aligned 128-bit operations, so they
; must be 16-byte aligned. Differences are formed with wrapping 16-bit
; subtraction (psubw).
; Uses rcx and xmm0-xmm3 as scratch; the total is returned in rax.
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; --- prologue done ---

    mov         rsi, arg(0)             ; rsi = s_ptr
    mov         rdi, arg(1)             ; rdi = d_ptr

    pxor        xmm3, xmm3              ; xmm3 = four dword accumulators
    mov         rcx, 16                 ; 16-byte chunks remaining

.next_chunk:
    movdqa      xmm1, [rsi]             ; eight coefficients
    psubw       xmm1, [rdi]
    pmaddwd     xmm1, xmm1              ; squares, pairwise added
    paddd       xmm3, xmm1

    add         rsi, 16
    add         rdi, 16
    dec         rcx
    jnz         .next_chunk

    ; Horizontal add of the four dword accumulators.
    pxor        xmm0, xmm0              ; xmm0 = 0, used to widen dwords
    movdqa      xmm1, xmm3
    movdqa      xmm2, xmm3
    punpckldq   xmm1, xmm0              ; {s0, 0, s1, 0}
    punpckhdq   xmm2, xmm0              ; {s2, 0, s3, 0}
    paddd       xmm1, xmm2              ; {s0+s2, 0, s1+s3, 0}

    movdqa      xmm2, xmm1
    psrldq      xmm2, 8                 ; bring s1+s3 down to lane 0
    paddd       xmm1, xmm2              ; lane 0 = total

    movq        rax, xmm1

    pop         rdi
    pop         rsi
    ; --- epilogue ---
    UNSHADOW_ARGS
    pop         rbp
    ret

mercurial