media/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 15
                     times 8 dw 1
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 13
                     times 8 dw 3
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 11
                     times 8 dw 5
                     times 8 dw 10
                     times 8 dw 6
                     times 8 dw 9
                     times 8 dw 7
                     times 16 dw 8
                     times 8 dw 7
                     times 8 dw 9
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 5
                     times 8 dw 11
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 3
                     times 8 dw 13
                     times 8 dw 2
                     times 8 dw 14
                     times 8 dw 1
                     times 8 dw 15

bilin_filter_m_ssse3: times 8 db 16, 0
                      times 8 db 15, 1
                      times 8 db 14, 2
                      times 8 db 13, 3
                      times 8 db 12, 4
                      times 8 db 11, 5
                      times 8 db 10, 6
                      times 8 db 9, 7
                      times 16 db 8
                      times 8 db 7, 9
                      times 8 db 6, 10
                      times 8 db 5, 11
                      times 8 db 4, 12
                      times 8 db 3, 13
                      times 8 db 2, 14
                      times 8 db 1, 15

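; Each 32-byte sse2 entry (and 16-byte ssse3 entry) above holds the bilinear
; tap pair (16 - x, x) for one sub-pel step x = 0..15; the kernels below
; compute out = (a * (16 - x) + b * x + 8) >> 4. Note x = 0 reduces to a
; plain copy and x = 8 to pavgb-style rounding. A hedged C sketch (not part
; of the build) that prints the same tap layout; all names are illustrative:
;
;   #include <stdio.h>
;
;   int main(void) {
;     for (int x = 0; x < 16; x++) {        // sse2: 8 words of each tap
;       for (int i = 0; i < 8; i++) printf("%2d ", 16 - x);
;       for (int i = 0; i < 8; i++) printf("%2d ", x);
;       printf("\n");
;     }
;     for (int x = 0; x < 16; x++) {        // ssse3: 8 interleaved byte pairs
;       for (int i = 0; i < 8; i++) printf("(%2d,%2d) ", 16 - x, x);
;       printf("\n");
;     }
;     return 0;
;   }
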
SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of differences (SE) and stores the sum of
; squared errors (SSE) in the given pointer.

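; A minimal scalar C sketch of the computation, assuming the bilinear
; formula out = (a * (16 - x) + b * x + 8) >> 4 noted above; the function
; name is illustrative, not an exported symbol:
;
;   #include <stddef.h>
;   #include <stdint.h>
;
;   static int ref_subpel_variance(const uint8_t *src, ptrdiff_t src_stride,
;                                  int x_offset, int y_offset,
;                                  const uint8_t *dst, ptrdiff_t dst_stride,
;                                  int w, int h, unsigned int *sse) {
;     int sum = 0;
;     unsigned int sse_acc = 0;
;     for (int r = 0; r < h; r++) {
;       for (int c = 0; c < w; c++) {
;         const uint8_t *s = &src[r * src_stride + c];
;         // horizontal taps on this row and the next one
;         int a = (s[0] * (16 - x_offset) + s[1] * x_offset + 8) >> 4;
;         int b = (s[src_stride] * (16 - x_offset) +
;                  s[src_stride + 1] * x_offset + 8) >> 4;
;         // vertical tap between the two horizontally filtered rows
;         int p = (a * (16 - y_offset) + b * y_offset + 8) >> 4;
;         int d = p - dst[r * dst_stride + c];
;         sum += d;
;         sse_acc += (unsigned int)(d * d);
;       }
;     }
;     *sse = sse_acc;
;     return sum;
;   }
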
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw %3, %4
  psubw %1, %2
  paddw %5, %3
  pmaddwd %3, %3
  paddw %5, %1
  pmaddwd %1, %1
  paddd %6, %3
  paddd %6, %1
%endmacro
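
; A hedged scalar equivalent of one SUM_SSE step over two 8-lane word
; vectors; the name and types are illustrative:
;
;   #include <stdint.h>
;
;   static void sum_sse_step(const int16_t s1[8], const int16_t d1[8],
;                            const int16_t s2[8], const int16_t d2[8],
;                            int16_t sum[8], int32_t sse[4]) {
;     for (int i = 0; i < 8; i++) {
;       int16_t e1 = (int16_t)(s1[i] - d1[i]);   // psubw
;       int16_t e2 = (int16_t)(s2[i] - d2[i]);
;       sum[i] = (int16_t)(sum[i] + e1 + e2);    // paddw, per word lane
;       sse[i / 2] += e1 * e1 + e2 * e2;         // pmaddwd + paddd
;     }
;   }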

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw m5, m6 ; mask for 0 > x
  movhlps m3, m7
  punpcklwd m4, m6, m5
  punpckhwd m6, m5 ; sign-extend m6 word->dword
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1
  movhlps m4, m6
  paddd m7, m3
  paddd m6, m4
  mov r1, ssem ; r1 = unsigned int *sse
  pshufd m4, m6, 0x1
  movd [r1], m7 ; store sse
  paddd m6, m4
  movd rax, m6 ; store sum as return value
%else ; mmsize == 8
  pshufw m4, m6, 0xe
  pshufw m3, m7, 0xe
  paddw m6, m4
  paddd m7, m3
  pcmpgtw m5, m6 ; mask for 0 > x
  mov r1, ssem ; r1 = unsigned int *sse
  punpcklwd m6, m5 ; sign-extend m6 word->dword
  movd [r1], m7 ; store sse
  pshufw m4, m6, 0xe
  paddd m6, m4
  movd rax, m6 ; store sum as return value
%endif
  RET
%endmacro
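
; Worked bound for the word accumulator m6 (W=16, H=64): each of the 8 word
; lanes accumulates 2 differences per row over 64 rows, each difference in
; [-255, 255], so |lane| <= 2 * 64 * 255 = 32640 < 32767. This is the
; "exactly fits in a signed word" claim above.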

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
%else
  add srcq, src_strideq
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC ; 64bit PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse, g_bilin_filter, g_pw_8
%define h dword heightm
%define sec_str sec_stridemp

  ; Store the bilin_filter and pw_8 locations on the stack
  GET_GOT eax
  add esp, 4 ; restore esp

  lea ecx, [GLOBAL(bilin_filter_m)]
  mov g_bilin_filterm, ecx

  lea ecx, [GLOBAL(pw_8)]
  mov g_pw_8m, ecx

  LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse, \
                                g_bilin_filter, g_pw_8
%define h heightd

  ; Store the bilin_filter and pw_8 locations on the stack
  GET_GOT eax
  add esp, 4 ; restore esp

  lea ecx, [GLOBAL(bilin_filter_m)]
  mov g_bilin_filterm, ecx

  lea ecx, [GLOBAL(pw_8)]
  mov g_pw_8m, ecx

  LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%define h heightd
%endif

%define bilin_filter bilin_filter_m
%endif
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor m5, m5 ; dedicated zero register
%if %1 < 16
  sar h, 1
%if %2 == 1 ; avg
  shl sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
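  ; The branches below specialize nine cases, depending on whether each
  ; offset is zero, exactly half (8), or a general bilinear tap. A hedged C
  ; sketch of the dispatch; the handler names are illustrative placeholders:
  ;
  ;   if (x_offset == 0) {
  ;     if (y_offset == 0)      variance_copy();          // no filtering
  ;     else if (y_offset == 8) variance_v_avg();         // vertical pavgb
  ;     else                    variance_v_bilin();       // vertical filter
  ;   } else if (x_offset == 8) {
  ;     if (y_offset == 0)      variance_h_avg();         // horizontal pavgb
  ;     else if (y_offset == 8) variance_hv_avg();        // pavgb both ways
  ;     else                    variance_h_avg_v_bilin();
  ;   } else {
  ;     if (y_offset == 0)      variance_h_bilin();       // horizontal filter
  ;     else if (y_offset == 8) variance_h_bilin_v_avg();
  ;     else                    variance_hv_bilin();      // filter both ways
  ;   }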
  test x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  mova m1, [dstq]
%if %2 == 1 ; avg
  pavgb m0, [secq]
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%endif
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%if %2 == 0 ; !avg
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh m2, [srcq+src_strideq]
%endif
  movh m1, [dstq]
  movh m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb m0, [secq]
  punpcklbw m3, m5
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; !avg
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+src_strideq]
  mova m1, [dstq]
  pavgb m0, m4
  punpckhbw m3, m1, m5
%if %2 == 1 ; avg
  pavgb m0, [secq]
%endif
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh m1, [srcq+src_strideq*2]
  punpckldq m2, m1
%else
  punpckldq m2, [srcq+src_strideq*2]
%endif
%endif
  movh m1, [dstq]
%if mmsize == 16
  movlhps m0, m2
%else ; mmsize == 8
  punpckldq m0, m2
%endif
  movh m3, [dstq+dst_strideq]
  pavgb m0, m2
  punpcklbw m1, m5
  pavgb m0, [secq]
  punpcklbw m3, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; !avg
  movh m4, [srcq+src_strideq*2]
  movh m1, [dstq]
  pavgb m0, m2
  movh m3, [dstq+dst_strideq]
  pavgb m2, m4
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+y_offsetq+16]
%endif
  mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+src_strideq]
  mova m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  punpcklbw m0, m5
  punpcklbw m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
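  ; Worked example of the two (exactly equivalent) formulations, with
  ; num=16, x=5, in1=100, in2=200, rnd=8:
  ;   ((16-5)*100 + 5*200 + 8) >> 4 = 2108 >> 4 = 131
  ;   100 + ((5*(200-100) + 8) >> 4) = 100 + (508 >> 4) = 131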
  pmullw m2, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m2, m3
  paddw m0, m4
%endif
  psraw m2, 4
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m2, [srcq+src_strideq]
  movh m4, [srcq+src_strideq*2]
  movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh m1, [dstq]
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_y_a
  pmullw m1, m2, filter_y_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, m1
  paddw m2, filter_rnd
  movh m1, [dstq]
  paddw m2, m4
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+1]
  mova m1, [dstq]
  pavgb m0, m4
  punpckhbw m3, m1, m5
%if %2 == 1 ; avg
  pavgb m0, [secq]
%endif
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps m0, [srcq+src_strideq]
  movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq m0, [srcq+src_strideq]
  punpckldq m4, [srcq+src_strideq+1]
%endif
  movh m1, [dstq]
  movh m3, [dstq+dst_strideq]
  pavgb m0, m4
  punpcklbw m3, m5
  pavgb m0, [secq]
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; !avg
  movh m2, [srcq+src_strideq]
  movh m1, [dstq]
  pavgb m0, m4
  movh m4, [srcq+src_strideq+1]
  movh m3, [dstq+dst_strideq]
  pavgb m2, m4
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu m0, [srcq]
  movu m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_half_loop:
  movu m4, [srcq]
  movu m3, [srcq+1]
  mova m1, [dstq]
  pavgb m4, m3
  punpckhbw m3, m1, m5
  pavgb m0, m4
%if %2 == 1 ; avg
  punpcklbw m1, m5
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_half_loop:
  movh m2, [srcq]
  movh m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps m2, [srcq+src_strideq]
  movhps m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh m1, [srcq+src_strideq]
  punpckldq m2, m1
  movh m1, [srcq+src_strideq+1]
  punpckldq m3, m1
%else
  punpckldq m2, [srcq+src_strideq]
  punpckldq m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb m2, m3
%if mmsize == 16
  movlhps m0, m2
  movhlps m4, m2
%else ; mmsize == 8
  punpckldq m0, m2
  pshufw m4, m2, 0xe
%endif
  movh m1, [dstq]
  pavgb m0, m2
  movh m3, [dstq+dst_strideq]
  pavgb m0, [secq]
  punpcklbw m3, m5
  punpcklbw m1, m5
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%else ; !avg
  movh m4, [srcq+src_strideq]
  movh m1, [srcq+src_strideq+1]
  pavgb m2, m3
  pavgb m4, m1
  pavgb m0, m2
  pavgb m2, m4
  movh m1, [dstq]
  movh m3, [dstq+dst_strideq]
  punpcklbw m0, m5
  punpcklbw m2, m5
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+y_offsetq+16]
%endif
  mova m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
.x_half_y_other_loop:
  movu m4, [srcq]
  movu m2, [srcq+1]
  mova m1, [dstq]
  pavgb m4, m2
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
  psraw m2, 4
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  pmullw m2, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, filter_rnd
  punpcklbw m0, m5
  paddw m2, m3
  punpcklbw m3, m4, m5
  pmullw m0, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, filter_rnd
  psraw m2, 4
  paddw m0, m3
%endif
  punpckhbw m3, m1, m5
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m3, [srcq+1]
  add srcq, src_strideq
  pavgb m0, m3
%if notcpuflag(ssse3)
  punpcklbw m0, m5
%endif
.x_half_y_other_loop:
  movh m2, [srcq]
  movh m1, [srcq+1]
  movh m4, [srcq+src_strideq]
  movh m3, [srcq+src_strideq+1]
  pavgb m2, m1
  pavgb m4, m3
  movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh m1, [dstq]
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
%else
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_y_a
  pmullw m1, m2, filter_y_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  paddw m0, m1
  pmullw m1, m4, filter_y_b
  paddw m2, filter_rnd
  paddw m2, m1
  movh m1, [dstq]
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m4, [srcq+1]
  mova m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m4, m5
  punpcklbw m0, m5
  punpcklbw m4, m5
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  pmullw m0, filter_x_a
  pmullw m4, filter_x_b
  paddw m0, filter_rnd
  paddw m2, m3
  paddw m0, m4
%endif
  psraw m2, 4
  psraw m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m1, [srcq+1]
  movh m2, [srcq+src_strideq]
  movh m4, [srcq+src_strideq+1]
  movh m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  movh m1, [dstq]
  punpcklbw m2, m4
  pmaddubsw m0, filter_x_a
  pmaddubsw m2, filter_x_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  punpcklbw m2, m5
  punpcklbw m4, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  punpcklbw m3, m5
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m0, m1
  paddw m2, filter_rnd
  movh m1, [dstq]
  paddw m2, m4
%endif
  psraw m0, 4
  psraw m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m1
  punpcklbw m0, m1
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m1, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  paddw m0, m1
  paddw m2, m3
%endif
  psraw m0, 4
  psraw m2, 4
  add srcq, src_strideq
  packuswb m0, m2
.x_other_y_half_loop:
  movu m4, [srcq]
  movu m3, [srcq+1]
%if cpuflag(ssse3)
  mova m1, [dstq]
  punpckhbw m2, m4, m3
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m4, m2
  pavgb m0, m4
  punpckhbw m3, m1, m5
  punpcklbw m1, m5
%else
  punpckhbw m2, m4, m5
  punpckhbw m1, m3, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  paddw m4, m3
  paddw m2, m1
  mova m1, [dstq]
  psraw m4, 4
  psraw m2, 4
  punpckhbw m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb m4, m2
  punpcklbw m1, m5
  pavgb m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb m0, [secq]
%endif
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  add srcq, src_strideq
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  pmaddubsw m0, filter_x_a
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m1
%endif
  add srcq, src_strideq
  psraw m0, 4
.x_other_y_half_loop:
  movh m2, [srcq]
  movh m1, [srcq+1]
  movh m4, [srcq+src_strideq]
  movh m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw m2, m1
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  movh m1, [dstq]
  movh m3, [dstq+dst_strideq]
  paddw m2, filter_rnd
  paddw m4, filter_rnd
%else
  punpcklbw m2, m5
  punpcklbw m1, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  paddw m2, m1
  movh m1, [dstq]
  paddw m4, m3
  movh m3, [dstq+dst_strideq]
%endif
  psraw m2, 4
  psraw m4, 4
  pavgw m0, m2
  pavgw m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  punpcklbw m3, m5
  punpcklbw m1, m5
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  lea srcq, [srcq+src_strideq*2]
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea bilin_filter, [bilin_filter_m]
%endif
  shl x_offsetd, filter_idx_shift
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m9, [bilin_filter+x_offsetq+16]
%endif
  mova m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova m11, [bilin_filter+y_offsetq+16]
%endif
  mova m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case there is NO unused register. We reuse the src_stride
  ; register; src_stride has to be reloaded from the stack later when needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw m2, m0, m1
  punpcklbw m0, m1
  pmaddubsw m2, filter_x_a
  pmaddubsw m0, filter_x_a
  paddw m2, filter_rnd
  paddw m0, filter_rnd
%else
  punpckhbw m2, m0, m5
  punpckhbw m3, m1, m5
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  pmullw m2, filter_x_a
  pmullw m3, filter_x_b
  paddw m2, filter_rnd
  paddw m0, m1
  paddw m2, m3
%endif
  psraw m0, 4
  psraw m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu m4, [srcq]
  movu m3, [srcq+1]
  mova m1, [dstq]
  punpckhbw m2, m4, m3
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  punpckhbw m3, m1, m5
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m4, m2
  punpckhbw m2, m0, m4
  punpcklbw m0, m4
  pmaddubsw m2, filter_y_a
  pmaddubsw m0, filter_y_a
  punpcklbw m1, m5
  paddw m2, filter_rnd
  paddw m0, filter_rnd
  psraw m2, 4
  psraw m0, 4
%else
  movu m3, [srcq]
  movu m4, [srcq+1]
  punpckhbw m1, m3, m5
  punpckhbw m2, m4, m5
  punpcklbw m3, m5
  punpcklbw m4, m5
  pmullw m3, filter_x_a
  pmullw m4, filter_x_b
  paddw m3, filter_rnd
  pmullw m1, filter_x_a
  pmullw m2, filter_x_b
  paddw m1, filter_rnd
  paddw m3, m4
  paddw m1, m2
  psraw m3, 4
  psraw m1, 4
  packuswb m4, m3, m1
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
  pmullw m2, filter_y_a
  pmullw m1, filter_y_b
  paddw m2, filter_rnd
  pmullw m0, filter_y_a
  pmullw m3, filter_y_b
  paddw m2, m1
  mova m1, [dstq]
  paddw m0, filter_rnd
  psraw m2, 4
  paddw m0, m3
  punpckhbw m3, m1, m5
  psraw m0, 4
  punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  INC_SRC_BY_SRC_STRIDE
  add dstq, dst_strideq
%else ; %1 < 16
  movh m0, [srcq]
  movh m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw m0, m1
  pmaddubsw m0, filter_x_a
  paddw m0, filter_rnd
%else
  punpcklbw m0, m5
  punpcklbw m1, m5
  pmullw m0, filter_x_a
  pmullw m1, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m1
%endif
  psraw m0, 4
%if cpuflag(ssse3)
  packuswb m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh m2, [srcq]
  movh m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh m4, [srcq]
  movh m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw m2, m1
  punpcklbw m4, m3
  pmaddubsw m2, filter_x_a
  pmaddubsw m4, filter_x_a
  movh m3, [dstq+dst_strideq]
  movh m1, [dstq]
  paddw m2, filter_rnd
  paddw m4, filter_rnd
  psraw m2, 4
  psraw m4, 4
  packuswb m2, m2
  packuswb m4, m4
  punpcklbw m0, m2
  punpcklbw m2, m4
  pmaddubsw m0, filter_y_a
  pmaddubsw m2, filter_y_a
  punpcklbw m3, m5
  paddw m0, filter_rnd
  paddw m2, filter_rnd
  psraw m0, 4
  psraw m2, 4
  punpcklbw m1, m5
%else
  punpcklbw m2, m5
  punpcklbw m1, m5
  punpcklbw m4, m5
  punpcklbw m3, m5
  pmullw m2, filter_x_a
  pmullw m1, filter_x_b
  paddw m2, filter_rnd
  pmullw m4, filter_x_a
  pmullw m3, filter_x_b
  paddw m4, filter_rnd
  paddw m2, m1
  paddw m4, m3
  psraw m2, 4
  psraw m4, 4
  pmullw m0, filter_y_a
  pmullw m3, m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m2, filter_y_a
  pmullw m1, m4, filter_y_b
  paddw m2, filter_rnd
  paddw m0, m3
  movh m3, [dstq+dst_strideq]
  paddw m2, m1
  movh m1, [dstq]
  psraw m0, 4
  psraw m2, 4
  punpcklbw m3, m5
  punpcklbw m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb m0, m2
  pavgb m0, [secq]
  punpckhbw m2, m0, m5
  punpcklbw m0, m5
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  mova m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add secq, sec_str
%endif
  dec h
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/sse2 version, rather than duplicating that code in the
; binary.

INIT_MMX sse
SUBPEL_VARIANCE 4
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE 4
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE 4, 1
INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
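
; A hedged usage sketch: callers derive the variance from the returned sum
; (SE) and the stored SSE. The symbol name and the exact rounding below are
; illustrative assumptions, not guaranteed by this file:
;
;   unsigned int sse;
;   int se = vp9_sub_pixel_variance16xh_sse2(src, src_stride, x_off, y_off,
;                                            dst, dst_stride, h, &sse);
;   // variance = SSE - SE^2 / (number of pixels)
;   unsigned int var = sse - (unsigned int)(((int64_t)se * se) / (16 * h));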
