;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff = src - pred over a rows x cols block, widening each
; unsigned 8-bit pixel to a signed 16-bit difference.  cols is dispatched
; for 4, 8, 16 and 32; any other width falls through to the 64-wide loop.
; diff_stride is in int16_t elements, so the row advance for diff is
; diff_stride*2 bytes, while src/pred advance by their byte strides.
;
; cglobal requests only 7 registers, so the 8th argument (pred_stride)
; stays in memory and is reloaded via pred_stridemp; colsq is dead once
; the width has been dispatched and is recycled as pred_str.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq                 ; colsq is free after the dispatch below
  pxor                  m7, m7         ; dedicated zero register for unpacking
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; Subtract 32 pixels of a row: two 16-byte loads from src (offsets %1, %2)
; and pred (offsets %3, %4), unpacked to words against zero (m7),
; subtracted, and stored as four word vectors at diff byte offsets %5, %6.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src bytes  -> words
  punpckhbw             m3, m1, m7     ; high 8 pred bytes -> words
  punpcklbw             m0, m7         ; low 8 src bytes   -> words
  punpcklbw             m1, m7         ; low 8 pred bytes  -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; cols == 64 (fall-through): two loop16 expansions per row.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]   ; *2: int16_t elements
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:                              ; one loop16 == one 32-pixel row
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:                              ; two 16-pixel rows per iteration;
  mov             pred_str, pred_stridemp
.loop_16:
  ; second-row offsets are one stride below the first row
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]   ; 2 rows of int16_t
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; Subtract two rows of 8 (XMM) or 4 (MMX) pixels: movh loads half a
; register (8 or 4 bytes), mova stores a full register of words.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX
.case_4:
  pxor                  m7, m7         ; re-establish the zero register: the
                                       ; SSE2 pxor above cleared xmm7, not mm7
                                       ; (MMX is a separate register file whose
                                       ; content is undefined at entry)
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET