;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
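; pw_8 is the bilinear rounding constant (1 << (4 - 1)). Each filter table
; below holds one row per subpel offset i = 0..15 with the weight pair
; (16 - i, i): the sse2 table stores two 8-word vectors per row (32 bytes),
; the ssse3 table stores the weights interleaved as byte pairs (16 bytes)
; ready for pmaddubsw.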
|
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15

bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 15,  1
                      times  8 db 14,  2
                      times  8 db 13,  3
                      times  8 db 12,  4
                      times  8 db 11,  5
                      times  8 db 10,  6
                      times  8 db  9,  7
                      times 16 db  8
                      times  8 db  7,  9
                      times  8 db  6, 10
                      times  8 db  5, 11
                      times  8 db  4, 12
                      times  8 db  3, 13
                      times  8 db  2, 14
                      times  8 db  1, 15
|

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the given pointer.
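;
; A usage note (not part of the original interface comment): the caller
; typically derives the block variance as  variance = SSE - SE*SE / (W*height),
; where W is the block width; this file only computes SE and SSE.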
|

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw      %3, %4
  psubw      %1, %2
  paddw      %5, %3
  pmaddwd    %3, %3
  paddw      %5, %1
  pmaddwd    %1, %1
  paddd      %6, %3
  paddd      %6, %1
%endmacro
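
; A scalar sketch of what SUM_SSE accumulates per word lane:
;   d0 = src1 - dst1, d1 = src2 - dst2
;   sum += d0 + d1              (16-bit lanes in %5)
;   sse += d0*d0 + d1*d1        (pmaddwd folds adjacent squares into the
;                                32-bit lanes of %6)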
|

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw    m5, m6              ; mask for 0 > x
  movhlps    m3, m7
  punpcklwd  m4, m6, m5
  punpckhwd  m6, m5              ; sign-extend m6 word->dword
  paddd      m7, m3
  paddd      m6, m4
  pshufd     m3, m7, 0x1
  movhlps    m4, m6
  paddd      m7, m3
  paddd      m6, m4
  mov        r1, ssem            ; r1 = unsigned int *sse
  pshufd     m4, m6, 0x1
  movd       [r1], m7            ; store sse
  paddd      m6, m4
  movd       rax, m6             ; store sum as return value
%else ; mmsize == 8
  pshufw     m4, m6, 0xe
  pshufw     m3, m7, 0xe
  paddw      m6, m4
  paddd      m7, m3
  pcmpgtw    m5, m6              ; mask for 0 > x
  mov        r1, ssem            ; r1 = unsigned int *sse
  punpcklwd  m6, m5              ; sign-extend m6 word->dword
  movd       [r1], m7            ; store sse
  pshufw     m4, m6, 0xe
  paddd      m6, m4
  movd       rax, m6             ; store sum as return value
%endif
  RET
%endmacro
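
; In the xmm path above, the four sse dwords in m7 are folded 4 -> 2 -> 1
; with movhlps/pshufd, while the eight signed word sums in m6 are first
; sign-extended to dwords (using the pcmpgtw mask) and then folded the same
; way.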
|

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add        srcq, src_stridemp
%else
  add        srcq, src_strideq
%endif
%endmacro
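
; On x86-32 PIC builds the source stride lives on the stack (src_stridemp)
; because its register is repurposed as a temporary below; everywhere else
; it stays in src_strideq.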
|

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
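; filter_idx_shift turns a subpel offset (1..15) into a byte offset into the
; filter table: ssse3 rows are 16 bytes of interleaved byte pairs (1 << 4),
; sse2 rows are two 16-byte word vectors, i.e. 32 bytes (1 << 5).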
|

; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse, g_bilin_filter, g_pw_8
%define h dword heightm
%define sec_str sec_stridemp

  ; Store the bilin_filter and pw_8 addresses on the stack.
  GET_GOT    eax
  add        esp, 4              ; restore esp

  lea        ecx, [GLOBAL(bilin_filter_m)]
  mov        g_bilin_filterm, ecx

  lea        ecx, [GLOBAL(pw_8)]
  mov        g_pw_8m, ecx

  LOAD_IF_USED 0, 1              ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse, \
                                g_bilin_filter, g_pw_8
%define h heightd

  ; Store the bilin_filter and pw_8 addresses on the stack.
  GET_GOT    eax
  add        esp, 4              ; restore esp

  lea        ecx, [GLOBAL(bilin_filter_m)]
  mov        g_bilin_filterm, ecx

  lea        ecx, [GLOBAL(pw_8)]
  mov        g_pw_8m, ecx

  LOAD_IF_USED 0, 1              ; load eax, ecx back
%endif
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%define h heightd
%endif

%define bilin_filter bilin_filter_m
%endif
%endif

  ASSERT     %1 <= 16            ; m6 overflows if w > 16
  pxor       m6, m6              ; sum
  pxor       m7, m7              ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor       m5, m5              ; dedicated zero register
%if %1 < 16
  sar        h, 1
%if %2 == 1 ; avg
  shl        sec_str, 1
%endif
%endif
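
; For blocks narrower than 16 pixels the main loops below process two rows
; per iteration, so the row count is halved here and, for the avg variant,
; the second-prediction stride is doubled to match.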
|

  ; FIXME(rbultje) replace by jumptable?
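  ; The branches below dispatch on (x_offset, y_offset), where each offset is
  ; either 0 (copy), 8 (half-pel, a single pavgb) or anything else (full
  ; bilinear filter), giving nine specialized loops.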
|
  test       x_offsetd, x_offsetd
  jnz        .x_nonzero
  ; x_offset == 0
  test       y_offsetd, y_offsetd
  jnz        .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  mova       m1, [dstq]
%if %2 == 1 ; avg
  pavgb      m0, [secq]
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%if %2 == 0 ; !avg
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq  m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh       m2, [srcq+src_strideq]
%endif
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp        y_offsetd, 8
  jne        .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh       m1, [srcq+src_strideq*2]
  punpckldq  m2, m1
%else
  punpckldq  m2, [srcq+src_strideq*2]
%endif
%endif
  movh       m1, [dstq]
%if mmsize == 16
  movlhps    m0, m2
%else ; mmsize == 8
  punpckldq  m0, m2
%endif
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, m2
  punpcklbw  m1, m5
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m4, [srcq+src_strideq*2]
  movh       m1, [dstq]
  pavgb      m0, m2
  movh       m3, [dstq+dst_strideq]
  pavgb      m2, m4
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift
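  ; y_offsetd is now the byte offset of the chosen filter row in the table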
|
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
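  ; (For example, with num=16 and x=5 the two forms are
  ; out=(11*in1+5*in2+8)>>4 and out=in1+((5*(in2-in1)+8)>>4), which agree
  ; because the arithmetic shift distributes over the 16*in1 term exactly.)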
|
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m2, [srcq+src_strideq]
  movh       m4, [srcq+src_strideq*2]
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movh       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp        x_offsetd, 8
  jne        .x_nonhalf
  ; x_offset == 0.5
  test       y_offsetd, y_offsetd
  jnz        .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m0, [srcq+src_strideq]
  movhps     m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq  m0, [srcq+src_strideq]
  punpckldq  m4, [srcq+src_strideq+1]
%endif
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, m4
  punpcklbw  m3, m5
  pavgb      m0, [secq]
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m2, [srcq+src_strideq]
  movh       m1, [dstq]
  pavgb      m0, m4
  movh       m4, [srcq+src_strideq+1]
  movh       m3, [dstq+dst_strideq]
  pavgb      m2, m4
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp        y_offsetd, 8
  jne        .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
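  ; (pavgb computes (a + b + 1) >> 1, so cascading the horizontal and
  ; vertical averages below approximates the exact (a+b+c+d+2)>>2 bilinear
  ; value with a slight upward rounding bias.)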
|
%if %1 == 16
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m3
  punpckhbw  m3, m1, m5
  pavgb      m0, m4
%if %2 == 1 ; avg
  punpcklbw  m1, m5
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movh       m2, [srcq]
  movh       m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m2, [srcq+src_strideq]
  movhps     m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh       m1, [srcq+src_strideq]
  punpckldq  m2, m1
  movh       m1, [srcq+src_strideq+1]
  punpckldq  m3, m1
%else
  punpckldq  m2, [srcq+src_strideq]
  punpckldq  m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb      m2, m3
%if mmsize == 16
  movlhps    m0, m2
  movhlps    m4, m2
%else ; mmsize == 8
  punpckldq  m0, m2
  pshufw     m4, m2, 0xe
%endif
  movh       m1, [dstq]
  pavgb      m0, m2
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m4, [srcq+src_strideq]
  movh       m1, [srcq+src_strideq+1]
  pavgb      m2, m3
  pavgb      m4, m1
  pavgb      m0, m2
  pavgb      m2, m4
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_half_y_half_loop
  STORE_AND_RET
|

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_other_loop:
  movu       m4, [srcq]
  movu       m2, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m2
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  punpcklbw  m0, m5
  paddw      m2, m3
  punpcklbw  m3, m4, m5
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
%endif
  punpckhbw  m3, m1, m5
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
%if notcpuflag(ssse3)
  punpcklbw  m0, m5
%endif
.x_half_y_other_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]
  movh       m4, [srcq+src_strideq]
  movh       m3, [srcq+src_strideq+1]
  pavgb      m2, m1
  pavgb      m4, m3
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  paddw      m0, m1
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m2, m1
  movh       m1, [dstq]
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
|

.x_nonhalf:
  test       y_offsetd, y_offsetd
  jnz        .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse the y_offset reg.
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
  movh       m2, [srcq+src_strideq]
  movh       m4, [srcq+src_strideq+1]
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  movh       m1, [dstq]
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_x_a
  pmaddubsw  m2, filter_x_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movh       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET
|

.x_nonhalf_y_nonzero:
  cmp        y_offsetd, 8
  jne        .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4
  add        srcq, src_strideq
  packuswb   m0, m2
.x_other_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
%if cpuflag(ssse3)
  mova       m1, [dstq]
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%else
  punpckhbw  m2, m4, m5
  punpckhbw  m1, m3, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  paddw      m4, m3
  paddw      m2, m1
  mova       m1, [dstq]
  psraw      m4, 4
  psraw      m2, 4
  punpckhbw  m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb   m4, m2
  punpcklbw  m1, m5
  pavgb      m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb      m0, [secq]
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  add        srcq, src_strideq
  psraw      m0, 4
.x_other_y_half_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]
  movh       m4, [srcq+src_strideq]
  movh       m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  movh       m1, [dstq]
  paddw      m4, m3
  movh       m3, [dstq+dst_strideq]
%endif
  psraw      m2, 4
  psraw      m4, 4
  pavgw      m0, m2
  pavgw      m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET
|

.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m11, [bilin_filter+y_offsetq+16]
%endif
  mova       m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case there is NO unused register, so we reuse the src_stride
  ; register; src_stride then has to be loaded from the stack whenever it is
  ; needed (see INC_SRC_BY_SRC_STRIDE).
%define tempq src_strideq
  mov        tempq, g_bilin_filterm
  add        x_offsetq, tempq
  add        y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
  add        y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
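  ; Both filters are applied separably: each iteration filters the new row(s)
  ; horizontally, then filters vertically against the previous iteration's
  ; horizontal result, which is carried across iterations in m0 (note the
  ; mova m0, m4 at the bottom of the loop).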
|
%if %1 == 16
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb   m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  punpckhbw  m3, m1, m5
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  punpcklbw  m1, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
  psraw      m0, 4
%else
  movu       m3, [srcq]
  movu       m4, [srcq+1]
  punpckhbw  m1, m3, m5
  punpckhbw  m2, m4, m5
  punpcklbw  m3, m5
  punpcklbw  m4, m5
  pmullw     m3, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m3, filter_rnd
  pmullw     m1, filter_x_a
  pmullw     m2, filter_x_b
  paddw      m1, filter_rnd
  paddw      m3, m4
  paddw      m1, m2
  psraw      m3, 4
  psraw      m1, 4
  packuswb   m4, m3, m1
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  pmullw     m2, filter_y_a
  pmullw     m1, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, m1
  mova       m1, [dstq]
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
  punpckhbw  m3, m1, m5
  psraw      m0, 4
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  INC_SRC_BY_SRC_STRIDE
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  psraw      m0, 4
%if cpuflag(ssse3)
  packuswb   m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh       m4, [srcq]
  movh       m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movh       m3, [dstq+dst_strideq]
  movh       m1, [dstq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m2, m2
  packuswb   m4, m4
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m1, m5
%else
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  paddw      m4, m3
  psraw      m2, 4
  psraw      m4, 4
  pmullw     m0, filter_y_a
  pmullw     m3, m2, filter_y_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m0, m3
  movh       m3, [dstq+dst_strideq]
  paddw      m2, m1
  movh       m1, [dstq]
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg         .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
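;
; The instantiations below expand, via x86inc's cglobal name mangling, into
; the plain and -avg variants for 4-, 8- and 16-pixel-wide blocks, e.g.
; (assuming the vp9_ prefix from the prototype comment above)
; vp9_sub_pixel_variance4xh_sse through vp9_sub_pixel_avg_variance16xh_ssse3.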
|

INIT_MMX sse
SUBPEL_VARIANCE  4
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE  4
INIT_XMM ssse3
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE  4, 1
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE  4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1