media/libvpx/vp8/common/x86/sad_sse4.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:e7836ff1cd81
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;
; Handles two consecutive 16-byte-wide rows.  For each row it computes
; the sum of absolute differences between the 16 source bytes and the
; 16 reference bytes starting at byte offsets 0..7 of the reference row,
; giving eight 16-bit totals that are accumulated in xmm1.
;
;   first != 0 : xmm1 is overwritten by the first row's totals
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (loaded with movdqa: must be 16-byte aligned)
;        rdi = reference row (bytes 0..23 of each row are read)
;        rax = source stride, rdx = reference stride
; Out:   xmm1 = eight running word totals
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
;-----------------------------------------------------------------------

; SAD16_LOAD_ROW ref_lo, src_addr, ref_addr
;   xmm0   <- 16 source bytes at [src_addr]
;   ref_lo <- reference bytes 0..15
;   xmm3   <- reference bytes 8..23   (xmm2 is a temporary)
%macro SAD16_LOAD_ROW 3
    movdqa      xmm0,   XMMWORD PTR [%2]
    movq        %1,     MMWORD PTR [%3]
    movq        xmm3,   MMWORD PTR [%3+8]
    movq        xmm2,   MMWORD PTR [%3+16]
    punpcklqdq  %1,     xmm3
    punpcklqdq  xmm3,   xmm2
%endmacro

; SAD16_SUM_ROW acc
;   Reduces the row prepared by SAD16_LOAD_ROW to eight word totals in
;   acc.  mpsadbw works on four source bytes at a time, so each half of
;   the source row needs the 0x0 / 0x5 pair of immediates.
%macro SAD16_SUM_ROW 1
    movdqa      xmm2,   %1
    mpsadbw     %1,     xmm0, 0x0       ; source bytes 0..3
    mpsadbw     xmm2,   xmm0, 0x5       ; source bytes 4..7

    psrldq      xmm0,   8               ; move source bytes 8..15 down

    movdqa      xmm4,   xmm3
    mpsadbw     xmm3,   xmm0, 0x0       ; source bytes 8..11
    mpsadbw     xmm4,   xmm0, 0x5       ; source bytes 12..15

    paddw       %1,     xmm2
    paddw       %1,     xmm3
    paddw       %1,     xmm4
%endmacro

%macro PROCESS_16X2X8 1
%if %1
    ; first row of the block: its totals initialise xmm1
    SAD16_LOAD_ROW  xmm1, rsi, rdi
    SAD16_SUM_ROW   xmm1
%else
    ; later row: total it in xmm5, then fold into xmm1
    SAD16_LOAD_ROW  xmm5, rsi, rdi
    SAD16_SUM_ROW   xmm5

    paddw       xmm1,   xmm5
%endif
    ; second row of the pair
    SAD16_LOAD_ROW  xmm5, rsi + rax, rdi + rdx

    ; step both pointers past the two rows just loaded
    lea         rsi,    [rsi+rax*2]
    lea         rdi,    [rdi+rdx*2]

    SAD16_SUM_ROW   xmm5

    paddw       xmm1,   xmm5
%endmacro
85
;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;
; Handles two consecutive 8-byte-wide rows.  For each row it computes
; the sum of absolute differences between the 8 source bytes and the
; 8 reference bytes starting at byte offsets 0..7 of the reference row,
; giving eight 16-bit totals that are accumulated in xmm1.
;
;   first != 0 : xmm1 is overwritten by the first row's totals
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (8 bytes read per row)
;        rdi = reference row (16 bytes read per row)
;        rax = source stride, rdx = reference stride
; Out:   xmm1 = eight running word totals
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm5
;-----------------------------------------------------------------------

; SAD8_LOAD_ROW ref, src_addr, ref_addr
;   xmm0 <- 8 source bytes at [src_addr]
;   ref  <- reference bytes 0..15      (xmm3 is a temporary)
%macro SAD8_LOAD_ROW 3
    movq        xmm0,   MMWORD PTR [%2]
    movq        %1,     MMWORD PTR [%3]
    movq        xmm3,   MMWORD PTR [%3+8]
    punpcklqdq  %1,     xmm3
%endmacro

; SAD8_SUM_ROW acc
;   Reduces the row prepared by SAD8_LOAD_ROW to eight word totals in
;   acc (xmm2 is a temporary).
%macro SAD8_SUM_ROW 1
    movdqa      xmm2,   %1
    mpsadbw     %1,     xmm0, 0x0       ; source bytes 0..3
    mpsadbw     xmm2,   xmm0, 0x5       ; source bytes 4..7
    paddw       %1,     xmm2
%endmacro

%macro PROCESS_8X2X8 1
%if %1
    ; first row of the block: its totals initialise xmm1
    SAD8_LOAD_ROW   xmm1, rsi, rdi
    SAD8_SUM_ROW    xmm1
%else
    ; later row: total it in xmm5, then fold into xmm1
    SAD8_LOAD_ROW   xmm5, rsi, rdi
    SAD8_SUM_ROW    xmm5

    paddw       xmm1,   xmm5
%endif
    ; second row of the pair
    SAD8_LOAD_ROW   xmm5, rsi + rax, rdi + rdx

    ; step both pointers past the two rows just loaded
    lea         rsi,    [rsi+rax*2]
    lea         rdi,    [rdi+rdx*2]

    SAD8_SUM_ROW    xmm5

    paddw       xmm1,   xmm5
%endmacro
125
;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;
; Handles two consecutive 4-byte-wide rows.  For each row it computes
; the sum of absolute differences between the 4 source bytes and the
; 4 reference bytes starting at byte offsets 0..7 of the reference row,
; giving eight 16-bit totals that are accumulated in xmm1.
;
;   first != 0 : xmm1 is overwritten by the first row's totals
;   first == 0 : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (4 bytes read per row)
;        rdi = reference row (16 bytes read per row)
;        rax = source stride, rdx = reference stride
; Out:   xmm1 = eight running word totals
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm3, xmm5
;-----------------------------------------------------------------------

; SAD4_LOAD_ROW ref, src_addr, ref_addr
;   xmm0 <- 4 source bytes at [src_addr] (upper bytes zeroed by movd)
;   ref  <- reference bytes 0..15      (xmm3 is a temporary)
%macro SAD4_LOAD_ROW 3
    movd        xmm0,   [%2]
    movq        %1,     MMWORD PTR [%3]
    movq        xmm3,   MMWORD PTR [%3+8]
    punpcklqdq  %1,     xmm3
%endmacro

%macro PROCESS_4X2X8 1
%if %1
    ; first row of the block: its totals initialise xmm1
    SAD4_LOAD_ROW   xmm1, rsi, rdi

    mpsadbw     xmm1,   xmm0, 0x0
%else
    ; later row: total it in xmm5, then fold into xmm1
    SAD4_LOAD_ROW   xmm5, rsi, rdi

    mpsadbw     xmm5,   xmm0, 0x0

    paddw       xmm1,   xmm5
%endif
    ; second row of the pair
    SAD4_LOAD_ROW   xmm5, rsi + rax, rdi + rdx

    ; step both pointers past the two rows just loaded
    lea         rsi,    [rsi+rax*2]
    lea         rdi,    [rdi+rdx*2]

    mpsadbw     xmm5,   xmm0, 0x0

    paddw       xmm1,   xmm5
%endmacro
156
157
;-----------------------------------------------------------------------
;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs of the 16x16 source block against the
; reference blocks starting at ref_ptr+0 .. ref_ptr+7 into sad_array.
;
; src rows are loaded and sad_array is stored with movdqa, so both must
; be 16-byte aligned.
; sym(), arg() and the SHADOW/UNSHADOW macros are presumably supplied by
; the included vpx_ports/x86_abi_support.asm (not visible in this file).
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x8_sse4) PRIVATE
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,    arg(0)              ; src_ptr
    mov         rdi,    arg(2)              ; ref_ptr

    movsxd      rax,    dword ptr arg(1)    ; src_stride
    movsxd      rdx,    dword ptr arg(3)    ; ref_stride

    ; rows 0-1 initialise the totals in xmm1
    PROCESS_16X2X8 1
    ; rows 2-15, two rows per pass
%rep 7
    PROCESS_16X2X8 0
%endrep

    mov         rdi,    arg(4)              ; Results
    movdqa      XMMWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
197
198
;-----------------------------------------------------------------------
;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Stores eight 16-bit SADs of the 16-wide, 8-row source block against
; the reference blocks starting at ref_ptr+0 .. ref_ptr+7 into sad_array.
;
; src rows are loaded and sad_array is stored with movdqa, so both must
; be 16-byte aligned.
; sym(), arg() and the SHADOW/UNSHADOW macros are presumably supplied by
; the included vpx_ports/x86_abi_support.asm (not visible in this file).
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x8_sse4) PRIVATE
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,    arg(0)              ; src_ptr
    mov         rdi,    arg(2)              ; ref_ptr

    movsxd      rax,    dword ptr arg(1)    ; src_stride
    movsxd      rdx,    dword ptr arg(3)    ; ref_stride

    ; rows 0-1 initialise the totals in xmm1
    PROCESS_16X2X8 1
    ; rows 2-7, two rows per pass
%rep 3
    PROCESS_16X2X8 0
%endrep

    mov         rdi,    arg(4)              ; Results
    movdqa      XMMWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
235
236
;-----------------------------------------------------------------------
;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Stores eight 16-bit SADs of the 8x8 source block against the
; reference blocks starting at ref_ptr+0 .. ref_ptr+7 into sad_array.
;
; sad_array is stored with movdqa, so it must be 16-byte aligned.
; sym(), arg() and the SHADOW/UNSHADOW macros are presumably supplied by
; the included vpx_ports/x86_abi_support.asm (not visible in this file).
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x8_sse4) PRIVATE
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,    arg(0)              ; src_ptr
    mov         rdi,    arg(2)              ; ref_ptr

    movsxd      rax,    dword ptr arg(1)    ; src_stride
    movsxd      rdx,    dword ptr arg(3)    ; ref_stride

    ; rows 0-1 initialise the totals in xmm1
    PROCESS_8X2X8 1
    ; rows 2-7, two rows per pass
%rep 3
    PROCESS_8X2X8 0
%endrep

    mov         rdi,    arg(4)              ; Results
    movdqa      XMMWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
273
274
;-----------------------------------------------------------------------
;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Stores eight 16-bit SADs of the 8-wide, 16-row source block against
; the reference blocks starting at ref_ptr+0 .. ref_ptr+7 into sad_array.
;
; sad_array is stored with movdqa, so it must be 16-byte aligned.
; sym(), arg() and the SHADOW/UNSHADOW macros are presumably supplied by
; the included vpx_ports/x86_abi_support.asm (not visible in this file).
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x8_sse4) PRIVATE
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,    arg(0)              ; src_ptr
    mov         rdi,    arg(2)              ; ref_ptr

    movsxd      rax,    dword ptr arg(1)    ; src_stride
    movsxd      rdx,    dword ptr arg(3)    ; ref_stride

    ; rows 0-1 initialise the totals in xmm1
    PROCESS_8X2X8 1
    ; rows 2-15, two rows per pass
%rep 7
    PROCESS_8X2X8 0
%endrep

    mov         rdi,    arg(4)              ; Results
    movdqa      XMMWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
314
315
;-----------------------------------------------------------------------
;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Stores eight 16-bit SADs of the 4x4 source block against the
; reference blocks starting at ref_ptr+0 .. ref_ptr+7 into sad_array.
;
; sad_array is stored with movdqa, so it must be 16-byte aligned.
; sym(), arg() and the SHADOW/UNSHADOW macros are presumably supplied by
; the included vpx_ports/x86_abi_support.asm (not visible in this file).
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x8_sse4) PRIVATE
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,    arg(0)              ; src_ptr
    mov         rdi,    arg(2)              ; ref_ptr

    movsxd      rax,    dword ptr arg(1)    ; src_stride
    movsxd      rdx,    dword ptr arg(3)    ; ref_stride

    ; rows 0-1 initialise the totals in xmm1
    PROCESS_4X2X8 1
    ; rows 2-3 are added to them
    PROCESS_4X2X8 0

    mov         rdi,    arg(4)              ; Results
    movdqa      XMMWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
350
351
352
353

mercurial