;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros.  Names used below that are not defined in this file
; (sym, arg, SHADOW_ARGS_TO_STACK, SAVE_XMM, XMMWORD, PTR, ...) are
; presumably provided by this include -- confirm against that file.
%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;
; Accumulates the sum of absolute differences (SAD) of two 16-byte source
; rows against three reference candidates starting at rdi, rdi+1 and
; rdi+2.  Reference rows are read with lddqu, so rdi may have any
; alignment.
;
; %1    non-zero: first invocation -- the row-0 SADs are written straight
;                 into xmm5/xmm6/xmm7, so no prior clearing is needed
;       zero:     the row-0 SADs are added to the existing accumulators
;
; In:   rsi = source row pointer; read with movdqa, so rsi and rsi+rax
;             must be 16-byte aligned
;       rdi = reference row pointer
;       rax = source stride in bytes
;       rdx = reference stride in bytes
; Out:  xmm5/xmm6/xmm7 = running SADs for reference offsets +0/+1/+2
;             (psadbw leaves one partial sum in each 64-bit half)
;       rsi, rdi advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        ; first pair of rows: row 0 initialises the accumulators
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; later pairs: row 0 is computed in scratch and accumulated
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; row 1 of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; step both pointers past the two rows just loaded
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;
; Same accumulation as an unaligned 16x2x3 SAD step, but the reference
; data is fetched with two aligned 16-byte loads per row and the three
; candidate windows are cut out with palignr.
;
; palignr dst, src, imm  treats dst:src as one 32-byte value (dst high),
; shifts it right by imm bytes and keeps the low 16 bytes.  With
; dst = [rdi+16] and src = [rdi] this yields the 16 bytes that start at
; rdi+imm.
;
; %1    non-zero: first invocation -- row-0 SADs initialise xmm5/6/7
;       zero:     row-0 SADs are added to xmm5/6/7
; %2    byte offset of the first candidate from rdi; the windows used
;       are %2, %2+1 and %2+2
;
; In:   rsi = source row pointer (movdqa: rsi, rsi+rax 16-byte aligned)
;       rdi = reference pointer already rounded down to 16 bytes
;             (movdqa: rdi and rdi+rdx must be 16-byte aligned)
;       rax = source stride in bytes
;       rdx = reference stride in bytes
; Out:  xmm5/xmm6/xmm7 = running SADs for windows %2/%2+1/%2+2
;       rsi, rdi advanced by two rows
; Clobb: xmm0-xmm4
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        ; first pair of rows: row 0 initialises the accumulators
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        ; xmm7 is consumed in place, so it must be extracted last
        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; later pairs: row 0 is computed in scratch and accumulated
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        ; xmm3 is consumed in place, so it must be extracted last
        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; row 1 of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; step both pointers past the two rows just loaded
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, label_prefix
;
; Emits one complete 16-row code path for a reference pointer whose low
; four address bits equal `offset`.
;
; %1    misalignment of rdi in bytes.  NOTE: this is the FIRST argument
;       here but is forwarded as the SECOND argument of
;       PROCESS_16X2X3_OFFSET.
; %2    label prefix; the macro defines <%2>_aligned_by_<%1> and ends
;       with a jump to <%2>_store_off, which the user of the macro must
;       define.
;
; rdi is moved back by %1 bytes before the first load; register usage
; is otherwise that of PROCESS_16X2X3_OFFSET (invoked 8 times, two rows
; each).
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, label_prefix
;
; Emits one complete 8-row code path for a reference pointer whose low
; four address bits equal `offset`.
;
; %1    misalignment of rdi in bytes.  NOTE: this is the FIRST argument
;       here but is forwarded as the SECOND argument of
;       PROCESS_16X2X3_OFFSET.
; %2    label prefix; the macro defines <%2>_aligned_by_<%1> and ends
;       with a jump to <%2>_store_off, which the user of the macro must
;       define.
;
; rdi is moved back by %1 bytes before the first load; register usage
; is otherwise that of PROCESS_16X2X3_OFFSET (invoked 4 times, two rows
; each).
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
;void vp9_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 sums of absolute differences between the source
; block and the reference blocks starting at ref_ptr, ref_ptr+1 and
; ref_ptr+2, storing them to results[0], results[1] and results[2].
;
; The low four bits of ref_ptr select one of 16 code paths through a
; jump table: paths 0..14 use aligned loads + palignr, path 15 uses
; unaligned lddqu loads.
;
; Register roles after dispatch:
;       rsi = source row pointer (read with movdqa: must be 16-byte
;             aligned, as must every following row)
;       rdi = reference row pointer, later the results pointer
;       rax = src_stride, rdx = ref_stride (sign-extended)
;       rcx = dispatch target
;       xmm5/xmm6/xmm7 = SAD accumulators for ref offsets +0/+1/+2
;       xmm0-xmm4 = scratch
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; that
; macro, SAVE_XMM/RESTORE_XMM and UNSHADOW_ARGS are not defined in this
; file (presumably they implement the per-ABI argument and xmm6/xmm7
; handling -- confirm in the include).
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x3_ssse3) PRIVATE
sym(vp9_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15 : index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives inline in the code and holds 32-bit offsets
        ; relative to the do_jump label, so it needs no relocations.
        jmp .vp9_sad16x16x3_ssse3_skiptable
.vp9_sad16x16x3_ssse3_jumptable:
        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump without any
        ; absolute address in the code.
        call .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected path

        ; rax/rdx are free again: load the strides the row macros expect
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Paths for ref_ptr misaligned by 0..14 bytes; each ends with a
        ; jump to .vp9_sad16x16x3_ssse3_store_off.
        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3

        ; Misaligned by 15: the +1/+2 windows would reach past the second
        ; aligned block, so this path uses unaligned loads instead and
        ; falls through into store_off.
.vp9_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds one partial sum per 64-bit half; fold
        ; the halves and store the 32-bit result.  The word add cannot
        ; overflow: the total is at most 16*16*255 = 65280.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp9_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x8 sums of absolute differences between the source
; block and the reference blocks starting at ref_ptr, ref_ptr+1 and
; ref_ptr+2, storing them to results[0], results[1] and results[2].
;
; The low four bits of ref_ptr select one of 16 code paths through a
; jump table: paths 0..14 use aligned loads + palignr, path 15 uses
; unaligned lddqu loads.
;
; Register roles after dispatch:
;       rsi = source row pointer (read with movdqa: must be 16-byte
;             aligned, as must every following row)
;       rdi = reference row pointer, later the results pointer
;       rax = src_stride, rdx = ref_stride (sign-extended)
;       rcx = dispatch target
;       xmm5/xmm6/xmm7 = SAD accumulators for ref offsets +0/+1/+2
;       xmm0-xmm4 = scratch
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; that
; macro, SAVE_XMM/RESTORE_XMM and UNSHADOW_ARGS are not defined in this
; file (presumably they implement the per-ABI argument and xmm6/xmm7
; handling -- confirm in the include).
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x3_ssse3) PRIVATE
sym(vp9_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15 : index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives inline in the code and holds 32-bit offsets
        ; relative to the do_jump label, so it needs no relocations.
        jmp .vp9_sad16x8x3_ssse3_skiptable
.vp9_sad16x8x3_ssse3_jumptable:
        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump without any
        ; absolute address in the code.
        call .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected path

        ; rax/rdx are free again: load the strides the row macros expect
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Paths for ref_ptr misaligned by 0..14 bytes; each ends with a
        ; jump to .vp9_sad16x8x3_ssse3_store_off.
        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3

        ; Misaligned by 15: the +1/+2 windows would reach past the second
        ; aligned block, so this path uses unaligned loads instead and
        ; falls through into store_off.
.vp9_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds one partial sum per 64-bit half; fold
        ; the halves and store the 32-bit result.  The word add cannot
        ; overflow: the total is at most 8*16*255 = 32640.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret