;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
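;
; For reference, the blend performed below corresponds roughly to this C
; sketch (illustrative only, not part of the library; it assumes
; MFQE_PRECISION == 4, matching the tMFQE/tMFQE_round constants at the end
; of this file):
;
;   void filter_by_weight_c(const unsigned char *src, int src_stride,
;                           unsigned char *dst, int dst_stride,
;                           int block_size, int src_weight) {
;     int dst_weight = (1 << 4) - src_weight;
;     int rounding = 1 << 3;
;     int r, c;
;     for (r = 0; r < block_size; ++r) {
;       for (c = 0; c < block_size; ++c) {
;         dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding) >> 4;
;       }
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }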
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
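;
; This 8x8 variant applies the same (src * src_weight + dst * dst_weight +
; rounding) >> 4 blend sketched above, but processes 8 bytes per row, so only
; the low halves of the XMM registers carry pixel data.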
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int stride1,                  1
;    unsigned char *src2,          2
;    int stride2,                  3
;    unsigned int *variance,       4
;    unsigned int *sad,            5
;)
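;
; In C terms this routine computes roughly the following, an illustrative
; sketch (the helper name and locals are not part of the library):
;
;   void variance_and_sad_16x16_c(const unsigned char *src1, int stride1,
;                                 const unsigned char *src2, int stride2,
;                                 unsigned int *variance, unsigned int *sad) {
;     unsigned int sad_acc = 0, sum = 0, sum_sq = 0;
;     int r, c;
;     for (r = 0; r < 16; ++r) {
;       for (c = 0; c < 16; ++c) {
;         int a = src1[c], b = src2[c];
;         sad_acc += (a > b) ? (a - b) : (b - a);
;         sum += b;
;         sum_sq += b * b;
;       }
;       src1 += stride1;
;       src2 += stride2;
;     }
;     *sad = (sad_acc + 128) >> 8;                       /* rounded mean |diff| */
;     *variance = (sum_sq - sum * sum / 256 + 128) >> 8; /* rounded per-pixel variance */
;   }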
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
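    ; With the bytes zero extended to words, multiplying each register by
    ; itself with pmaddwd yields x0*x0 + x1*x1 per dword lane, so xmm0 and
    ; xmm1 below each contribute four dword partial sums of squares to xmm5.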
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
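    ; The two 64-bit psadbw partials accumulated in xmm3 are combined, then
    ; (SAD + 128) >> 8 stores the rounded mean absolute difference over the
    ; block's 256 pixels.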
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0
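    ; xmm1 now holds SUM(src2^2) - SUM(src2)^2 / 256, the sum of squared
    ; deviations from the mean (256 times the per-pixel variance).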
    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE:  ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round:  ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08