;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Rounding terms used by the dc_predictor_NxN functions in this file.
; Each label holds eight 16-bit words (16 bytes) with the same value, which
; is added to the edge-pixel sum just before the arithmetic right shift:
;   pw_4  -> 4x4   (shift by 3)
;   pw_8  -> 8x8   (shift by 4)
;   pw_16 -> 16x16 (shift by 5)
;   pw_32 -> 32x32 (shift by 6)
pw_4:       times 8 dw 4
pw_8:       times 8 dw 8
pw_16:      times 8 dw 16
pw_32:      times 8 dw 32

SECTION .text

;------------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
;
; Fills a 4x4 byte block at dst (rows `stride` bytes apart) with one value:
;   (above[0] + .. + above[3] + left[0] + .. + left[3] + 4) >> 3
;
; Uses MMX registers m0-m1.
; NOTE(review): no EMMS is executed here; presumably the caller is
; responsible for clearing MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq                    ; presumably PIC setup for GLOBAL() - see x86inc

    movd        m0, [aboveq]                ; low dword  = above[0..3]
    punpckldq   m0, [leftq]                 ; high dword = left[0..3]
    pxor        m1, m1
    psadbw      m0, m1                      ; |byte - 0| summed over all 8 bytes
    paddw       m0, [GLOBAL(pw_4)]          ; + 4 (rounding)
    psraw       m0, 3                       ; / 8
    pshufw      m0, m0, 0x0                 ; broadcast the result word
    packuswb    m0, m0                      ; -> 8 identical bytes

    ; Four rows of 4 bytes each.
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    movd        [dstq+strideq*2], m0
    lea         dstq, [dstq+strideq*2]
    movd        [dstq+strideq], m0          ; row 3 = original dst + 3*stride

    RESTORE_GOT
    RET

;------------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
;
; Fills an 8x8 byte block at dst (rows `stride` bytes apart) with one value:
;   (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4
;
; Uses MMX registers m0-m2.
; NOTE(review): no EMMS is executed here; presumably the caller is
; responsible for clearing MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq                    ; presumably PIC setup for GLOBAL() - see x86inc

    movq        m0, [aboveq]                ; above[0..7]
    movq        m2, [leftq]                 ; left[0..7]
    ; Both edge pointers are consumed; the third argument slot now holds
    ; stride3 instead of above.
    DEFINE_ARGS dst, stride, stride3
    pxor        m1, m1
    psadbw      m0, m1                      ; sum of the above bytes
    psadbw      m2, m1                      ; sum of the left bytes
    lea         stride3q, [strideq*3]
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]          ; + 8 (rounding)
    psraw       m0, 4                       ; / 16
    pshufw      m0, m0, 0x0                 ; broadcast the result word
    packuswb    m0, m0                      ; -> 8 identical bytes

    ; Rows 0-3.
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    ; Rows 4-7.
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET

;------------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
;
; Fills a 16x16 byte block at dst (rows `stride` bytes apart) with one value:
;   (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5
;
; above, left and every destination row are accessed with mova (aligned
; 16-byte accesses). NOTE(review): assumes callers keep these 16-byte
; aligned - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq                    ; presumably PIC setup for GLOBAL() - see x86inc

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [leftq]                 ; left[0..15]
    ; Edge pointers are consumed; their slots become stride3 and rows4.
    DEFINE_ARGS dst, stride, stride3, rows4
    pxor        m1, m1
    psadbw      m0, m1                      ; one partial sum per 64-bit half
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the upper-half sum down
    paddw       m0, m2                      ; low word = sum of all 32 bytes
    paddw       m0, [GLOBAL(pw_16)]         ; + 16 (rounding)
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0
    punpcklqdq  m0, m0                      ; result word in all 8 lanes
    packuswb    m0, m0                      ; -> 16 identical bytes
    lea         stride3q, [strideq*3]
    mov         rows4d, 4                   ; 4 iterations x 4 rows = 16 rows
.fill:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .fill

    RESTORE_GOT
    REP_RET

;------------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
;
; Fills a 32x32 byte block at dst (rows `stride` bytes apart) with one value:
;   (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6
;
; above, left and every destination row are accessed with mova (aligned
; 16-byte accesses). NOTE(review): assumes callers keep these 16-byte
; aligned - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq                    ; presumably PIC setup for GLOBAL() - see x86inc

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    mova        m3, [leftq]                 ; left[0..15]
    mova        m4, [leftq+16]              ; left[16..31]
    ; Edge pointers are consumed; their slots become stride3 and rows4.
    DEFINE_ARGS dst, stride, stride3, rows4
    pxor        m1, m1
    psadbw      m0, m1                      ; each psadbw: one partial sum per
    psadbw      m2, m1                      ; 64-bit half of the register
    psadbw      m3, m1
    psadbw      m4, m1
    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4
    movhlps     m2, m0                      ; bring the upper-half sum down
    paddw       m0, m2                      ; low word = sum of all 64 bytes
    paddw       m0, [GLOBAL(pw_32)]         ; + 32 (rounding)
    psraw       m0, 6                       ; / 64
    pshuflw     m0, m0, 0x0
    punpcklqdq  m0, m0                      ; result word in all 8 lanes
    packuswb    m0, m0                      ; -> 16 identical bytes
    lea         stride3q, [strideq*3]
    mov         rows4d, 8                   ; 8 iterations x 4 rows = 32 rows
.fill:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .fill

    RESTORE_GOT
    REP_RET

;------------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
;
; Copies the 4 bytes at above into each of the 4 rows of the block at dst
; (rows `stride` bytes apart).
;
; Uses MMX register m0.
; NOTE(review): no EMMS is executed here; presumably the caller is
; responsible for clearing MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]                ; above[0..3]

    movd        [dstq], m0
    movd        [dstq+strideq], m0
    movd        [dstq+strideq*2], m0
    lea         dstq, [dstq+strideq*2]
    movd        [dstq+strideq], m0          ; row 3 = original dst + 3*stride
    RET

;------------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
;
; Copies the 8 bytes at above into each of the 8 rows of the block at dst
; (rows `stride` bytes apart).
;
; Uses MMX register m0.
; NOTE(review): no EMMS is executed here; presumably the caller is
; responsible for clearing MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; above[0..7]
    ; above is consumed; its slot now holds stride3.
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]

    ; Rows 0-3.
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    ; Rows 4-7.
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RET

;------------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
;
; Copies the 16 bytes at above into each of the 16 rows of the block at dst
; (rows `stride` bytes apart).
;
; above and every destination row are accessed with mova (aligned 16-byte
; accesses). NOTE(review): assumes callers keep these 16-byte aligned -
; confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]                ; above[0..15]
    ; above is consumed; its slot becomes stride3, a 4th register counts rows.
    DEFINE_ARGS dst, stride, stride3, rows4
    mov         rows4d, 4                   ; 4 iterations x 4 rows = 16 rows
    lea         stride3q, [strideq*3]
.fill:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .fill
    REP_RET

;------------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
;
; Copies the 32 bytes at above into each of the 32 rows of the block at dst
; (rows `stride` bytes apart).
;
; above and every destination row are accessed with mova (aligned 16-byte
; accesses). NOTE(review): assumes callers keep these 16-byte aligned -
; confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; above[0..15]
    mova        m1, [aboveq+16]             ; above[16..31]
    ; above is consumed; its slot becomes stride3, a 4th register counts rows.
    DEFINE_ARGS dst, stride, stride3, rows4
    mov         rows4d, 8                   ; 8 iterations x 4 rows = 32 rows
    lea         stride3q, [strideq*3]
.fill:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .fill
    REP_RET

;------------------------------------------------------------------------------
; tm_predictor_4x4(dst, stride, above, left)
;
; For each row r and column c of the 4x4 block at dst:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done on 16-bit words; packuswb performs the saturation.
; Two rows are produced per loop iteration.
;
; Reads above[-1..3]. Each left pixel is fetched with a 4-byte movd of which
; only the low byte is used, so the last load (at left+3) touches left[3..6].
; NOTE(review): assumes 3 bytes past left[3] are readable - confirm.
; NOTE(review): no EMMS is executed here; presumably the caller is
; responsible for clearing MMX state - confirm.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
    movd        m0, [aboveq]                ; above[0..3]
    movd        m2, [aboveq-1]              ; byte 0 = above[-1]
    pxor        m1, m1                      ; zero, used to widen bytes
    punpcklbw   m0, m1                      ; above[0..3] as words
    punpcklbw   m2, m1
    pshufw      m2, m2, 0x0                 ; above[-1] in all 4 words
    psubw       m0, m2                      ; m0 = above[c] - above[-1]
    ; above is consumed; its slot becomes the (negative) row-pair counter.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 4                    ; point one past the last left pixel
    mov         pairq, -2                   ; counts -2, -1; loop ends at 0
.next_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left pixel, even row
    movd        m3, [leftq+pairq*2+1]       ; low byte = left pixel, odd row
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshufw      m2, m2, 0x0                 ; broadcast left pixel as words
    pshufw      m3, m3, 0x0
    paddw       m2, m0
    paddw       m3, m0
    packuswb    m2, m2                      ; saturate to bytes
    packuswb    m3, m3
    movd        [dstq], m2
    movd        [dstq+strideq], m3
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .next_pair
    REP_RET

;------------------------------------------------------------------------------
; tm_predictor_8x8(dst, stride, above, left)
;
; For each row r and column c of the 8x8 block at dst:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done on 16-bit words; packuswb performs the saturation.
; Two rows are produced per loop iteration: the even row ends up in the low
; 8 bytes of m2 and the odd row in the high 8 bytes.
;
; Reads above[-1..7]. Each left pixel is fetched with a 4-byte movd of which
; only the low byte is used, so the last load (at left+7) touches left[7..10].
; NOTE(review): assumes 3 bytes past left[7] are readable - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
    movq        m0, [aboveq]                ; above[0..7]
    movd        m2, [aboveq-1]              ; byte 0 = above[-1]
    pxor        m1, m1                      ; zero, used to widen bytes
    punpcklbw   m0, m1                      ; above[0..7] as words
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    psubw       m0, m2                      ; m0 = above[c] - above[-1]
    ; above is consumed; its slot becomes the (negative) row-pair counter.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 8                    ; point one past the last left pixel
    mov         pairq, -4                   ; counts -4 .. -1; loop ends at 0
.next_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left pixel, even row
    movd        m3, [leftq+pairq*2+1]       ; low byte = left pixel, odd row
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m3, m3, 0x0
    punpcklqdq  m2, m2                      ; left pixel in all 8 words
    punpcklqdq  m3, m3
    paddw       m2, m0
    paddw       m3, m0
    packuswb    m2, m3                      ; low half: even row, high half: odd row
    movq        [dstq], m2
    movhps      [dstq+strideq], m2
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .next_pair
    REP_RET

;------------------------------------------------------------------------------
; tm_predictor_16x16(dst, stride, above, left)
;
; For each row r and column c of the 16x16 block at dst:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done on 16-bit words; packuswb performs the saturation.
; Two rows are produced per loop iteration.
;
; Loop-invariant registers:
;   m0 = above[0..7]  - above[-1]   (words)
;   m4 = above[8..15] - above[-1]   (words)
;   m1 = zero
;
; above and every destination row are accessed with mova (aligned 16-byte
; accesses). NOTE(review): assumes callers keep these 16-byte aligned -
; confirm.
; Each left pixel is fetched with a 4-byte movd of which only the low byte is
; used, so the last load (at left+15) touches left[15..18].
; NOTE(review): assumes 3 bytes past left[15] are readable - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
    mova        m0, [aboveq]                ; above[0..15]
    movd        m2, [aboveq-1]              ; byte 0 = above[-1]
    pxor        m1, m1                      ; zero, used to widen bytes
    punpckhbw   m4, m0, m1                  ; above[8..15] as words (before m0 is widened)
    punpcklbw   m0, m1                      ; above[0..7] as words
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    psubw       m0, m2
    psubw       m4, m2
    ; above is consumed; its slot becomes the (negative) row-pair counter.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 16                   ; point one past the last left pixel
    mov         pairq, -8                   ; counts -8 .. -1; loop ends at 0
.next_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left pixel, even row
    movd        m3, [leftq+pairq*2+1]       ; low byte = left pixel, odd row
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m3, m3, 0x0
    punpcklqdq  m2, m2                      ; left pixel in all 8 words
    punpcklqdq  m3, m3
    paddw       m5, m2, m0                  ; even row, columns 0-7
    paddw       m6, m3, m0                  ; odd row,  columns 0-7
    paddw       m2, m4                      ; even row, columns 8-15
    paddw       m3, m4                      ; odd row,  columns 8-15
    packuswb    m5, m2                      ; even row as 16 bytes
    packuswb    m6, m3                      ; odd row as 16 bytes
    mova        [dstq], m5
    mova        [dstq+strideq], m6
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .next_pair
    REP_RET

; Built only for x86-64: the routine below uses m8 and m9.
%if ARCH_X86_64
;------------------------------------------------------------------------------
; tm_predictor_32x32(dst, stride, above, left)
;
; For each row r and column c of the 32x32 block at dst:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The arithmetic is done on 16-bit words; packuswb performs the saturation.
; Two rows are produced per loop iteration.
;
; Loop-invariant registers (all words, each minus above[-1]):
;   m0 = above[0..7]     m3 = above[8..15]
;   m4 = above[16..23]   m5 = above[24..31]
;   m1 = zero
;
; above and every destination row are accessed with mova (aligned 16-byte
; accesses). NOTE(review): assumes callers keep these 16-byte aligned -
; confirm.
; Each left pixel is fetched with a 4-byte movd of which only the low byte is
; used, so the last load (at left+31) touches left[31..34].
; NOTE(review): assumes 3 bytes past left[31] are readable - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
    mova        m0, [aboveq]                ; above[0..15]
    mova        m4, [aboveq+16]             ; above[16..31]
    movd        m2, [aboveq-1]              ; byte 0 = above[-1]
    pxor        m1, m1                      ; zero, used to widen bytes
    punpckhbw   m3, m0, m1                  ; above[8..15] as words
    punpckhbw   m5, m4, m1                  ; above[24..31] as words
    punpcklbw   m0, m1                      ; above[0..7] as words
    punpcklbw   m4, m1                      ; above[16..23] as words
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    psubw       m0, m2
    psubw       m3, m2
    psubw       m4, m2
    psubw       m5, m2
    ; above is consumed; its slot becomes the (negative) row-pair counter.
    DEFINE_ARGS dst, stride, pair, left
    add         leftq, 32                   ; point one past the last left pixel
    mov         pairq, -16                  ; counts -16 .. -1; loop ends at 0
.next_pair:
    ; Instruction order inside the loop is kept as written: m2, m7 and m8
    ; are reused for the odd row once their even-row values are consumed.
    movd        m2, [leftq+pairq*2]         ; low byte = left pixel, even row
    movd        m6, [leftq+pairq*2+1]       ; low byte = left pixel, odd row
    punpcklbw   m2, m1
    punpcklbw   m6, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m6, m6, 0x0
    punpcklqdq  m2, m2                      ; left pixel in all 8 words
    punpcklqdq  m6, m6
    paddw       m7, m2, m0                  ; even row, columns 0-7
    paddw       m8, m2, m3                  ; even row, columns 8-15
    paddw       m9, m2, m4                  ; even row, columns 16-23
    paddw       m2, m5                      ; even row, columns 24-31
    packuswb    m7, m8                      ; even row, bytes 0-15
    packuswb    m9, m2                      ; even row, bytes 16-31
    paddw       m2, m6, m0                  ; odd row, columns 0-7
    paddw       m8, m6, m3                  ; odd row, columns 8-15
    mova        [dstq], m7
    paddw       m7, m6, m4                  ; odd row, columns 16-23
    paddw       m6, m5                      ; odd row, columns 24-31
    mova        [dstq+16], m9
    packuswb    m2, m8                      ; odd row, bytes 0-15
    packuswb    m7, m6                      ; odd row, bytes 16-31
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m7
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .next_pair
    REP_RET
%endif