|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 %include "third_party/x86inc/x86inc.asm" |
|
12 |
|
SECTION_RODATA

; Byte constant and pshufb control masks used by the predictors below.
; Every table is exactly 16 bytes long. A mask's name spells out, in hex,
; the source byte index chosen for each leading output byte; where a name
; is shorter than 16 digits the remaining control bytes are zero (i.e. they
; select source byte 0).

pb_1:                   times 16 db 1

sh_b01234577:           db 0, 1, 2, 3, 4, 5, 7, 7
                        times 8 db 0
sh_b12345677:           db 1, 2, 3, 4, 5, 6, 7, 7
                        times 8 db 0
sh_b23456777:           db 2, 3, 4, 5, 6, 7, 7, 7
                        times 8 db 0

sh_b0123456777777777:   db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777:   db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777:   db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7

sh_b123456789abcdeff:   db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff:   db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15

sh_b32104567:           db 3, 2, 1, 0, 4, 5, 6, 7
                        times 8 db 0
sh_b8091a2b345:         db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5
                        times 6 db 0
sh_b76543210:           db 7, 6, 5, 4, 3, 2, 1, 0
                        times 8 db 0
sh_b65432108:           db 6, 5, 4, 3, 2, 1, 0, 8
                        times 8 db 0
sh_b54321089:           db 5, 4, 3, 2, 1, 0, 8, 9
                        times 8 db 0
sh_b89abcdef:           db 8, 9, 10, 11, 12, 13, 14, 15
                        times 8 db 0

sh_bfedcba9876543210:   db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

sh_b1233:               db 1, 2, 3, 3
                        times 12 db 0
sh_b2333:               db 2, 3, 3, 3
                        times 12 db 0
|
33 |
|
34 SECTION .text |
|
35 |
|
; Horizontal predictor, 4x4: row r of dst is filled with byte left[r].
; cglobal loads only dst and stride; the third register serves as the loop
; counter `line`, and left is fetched explicitly from its argument slot.
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
    movifnidn   leftq, leftmp
    pxor        m0, m0                      ; zero pshufb mask => broadcast byte 0
    add         leftq, 4                    ; leftq = one past the last left pixel
    mov         lineq, -2                   ; negative index, counts up to zero
.loop:                                      ; two rows per iteration
    ; NOTE(review): movd reads 4 bytes while only byte 0 is used, so the last
    ; iteration reads past left[3] -- confirm callers keep that readable.
    movd        m1, [leftq+lineq*2]
    pshufb      m1, m0
    movd        m2, [leftq+lineq*2+1]
    pshufb      m2, m0
    movd        [dstq], m1
    movd        [dstq+strideq], m2
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET
|
53 |
|
; Horizontal predictor, 8x8: row r of dst is filled with byte left[r].
; cglobal loads only dst and stride; the third register serves as the loop
; counter `line`, and left is fetched explicitly from its argument slot.
INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
    movifnidn   leftq, leftmp
    pxor        m0, m0                      ; zero pshufb mask => broadcast byte 0
    add         leftq, 8                    ; leftq = one past the last left pixel
    mov         lineq, -4                   ; negative index, counts up to zero
.loop:                                      ; two rows per iteration
    ; NOTE(review): movd reads 4 bytes while only byte 0 is used, so the last
    ; iteration reads past left[7] -- confirm callers keep that readable.
    movd        m1, [leftq+lineq*2]
    pshufb      m1, m0
    movd        m2, [leftq+lineq*2+1]
    pshufb      m2, m0
    movq        [dstq], m1
    movq        [dstq+strideq], m2
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET
|
71 |
|
; Horizontal predictor, 16x16: row r of dst is filled with byte left[r].
; Rows are written with mova, so every dst row must satisfy mova's
; alignment requirement.
INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
    movifnidn   leftq, leftmp
    pxor        m0, m0                      ; zero pshufb mask => broadcast byte 0
    add         leftq, 16                   ; leftq = one past the last left pixel
    mov         lineq, -8                   ; negative index, counts up to zero
.loop:                                      ; two rows per iteration
    ; NOTE(review): movd reads 4 bytes while only byte 0 is used, so the last
    ; iteration reads past left[15] -- confirm callers keep that readable.
    movd        m1, [leftq+lineq*2]
    pshufb      m1, m0
    movd        m2, [leftq+lineq*2+1]
    pshufb      m2, m0
    mova        [dstq], m1
    mova        [dstq+strideq], m2
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET
|
89 |
|
; Horizontal predictor, 32x32: row r of dst is filled with byte left[r].
; Each row is written as two 16-byte mova stores.
INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
    movifnidn   leftq, leftmp
    pxor        m0, m0                      ; zero pshufb mask => broadcast byte 0
    add         leftq, 32                   ; leftq = one past the last left pixel
    mov         lineq, -16                  ; negative index, counts up to zero
.loop:                                      ; two rows per iteration
    ; NOTE(review): movd reads 4 bytes while only byte 0 is used, so the last
    ; iteration reads past left[31] -- confirm callers keep that readable.
    movd        m1, [leftq+lineq*2]
    pshufb      m1, m0
    movd        m2, [leftq+lineq*2+1]
    pshufb      m2, m0
    mova        [dstq], m1
    mova        [dstq+16], m1
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m2
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET
|
109 |
|
; d45 predictor, 4x4. Reads 8 bytes of the above row, forms three shuffled
; copies x / y / z and combines them as (x + 2y + z + 2) >> 2 using
; avg(avg(x,z) - ((x^z)&1), y). Row r is that result shifted right by
; r bytes; 4 bytes are stored per row.
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]
    pshufb      m1, m0, [GLOBAL(sh_b01234577)]  ; x
    pshufb      m2, m0, [GLOBAL(sh_b23456777)]  ; z
    pshufb      m0, [GLOBAL(sh_b12345677)]      ; y
    pavgb       m3, m2, m1                      ; avg(z, x), rounded up
    pxor        m2, m1
    pand        m2, [GLOBAL(pb_1)]              ; 1 where x + z is odd
    psubb       m3, m2                          ; undo the round-up
    pavgb       m0, m3                          ; m0 = filtered row

    ; rows 0 and 1
    movd        [dstq], m0
    psrlq       m0, 8
    movd        [dstq+strideq], m0
    psrlq       m0, 8
    lea         dstq, [dstq+strideq*2]
    ; rows 2 and 3
    movd        [dstq], m0
    psrlq       m0, 8
    movd        [dstq+strideq], m0

    RESTORE_GOT
    RET
|
136 |
|
; d45 predictor, 8x8. Row 0 is the (x + 2y + z + 2) >> 2 filter of the above
; row (x = above, y = above shifted by 1, z = above shifted by 2, last pixel
; repeated). Each following row is the previous row passed through the
; "shift by one, repeat last byte" shuffle held in m1.
INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]
    mova        m1, [GLOBAL(sh_b12345677)]      ; per-row shift mask
    DEFINE_ARGS dst, stride, stride3            ; above's register becomes stride3
    lea         stride3q, [strideq*3]
    pshufb      m2, m0, [GLOBAL(sh_b23456777)]  ; z
    pavgb       m3, m2, m0                      ; avg(z, x), rounded up
    pxor        m2, m0
    pand        m2, [GLOBAL(pb_1)]              ; 1 where x + z is odd
    pshufb      m0, m1                          ; y
    psubb       m3, m2                          ; undo the round-up
    pavgb       m0, m3                          ; m0 = row 0

    ; rows 0-3
    movq        [dstq], m0
    pshufb      m0, m1
    movq        [dstq+strideq], m0
    pshufb      m0, m1
    movq        [dstq+strideq*2], m0
    pshufb      m0, m1
    movq        [dstq+stride3q], m0
    pshufb      m0, m1
    lea         dstq, [dstq+strideq*4]

    ; rows 4-7
    movq        [dstq], m0
    pshufb      m0, m1
    movq        [dstq+strideq], m0
    pshufb      m0, m1
    movq        [dstq+strideq*2], m0
    pshufb      m0, m1
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
|
175 |
|
; d45 predictor, 16x16. Row 0 is the (x + 2y + z + 2) >> 2 filter of the
; 16-byte above row; each later row is the previous one shifted by one byte
; with the last byte repeated (mask in m1). The loop writes rows 0-7 in full
; and, via dst8, the left 8 bytes of rows 8-15 (row r+8's left half equals
; row r's right half). The tail fills the right half of rows 8-15.
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3, dst8, line
    mova        m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-row shift mask
    lea         stride3q, [strideq*3]
    lea         dst8q, [dstq+strideq*8]
    pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; z
    pavgb       m3, m2, m0                              ; avg(z, x), rounded up
    pxor        m2, m0
    pand        m2, [GLOBAL(pb_1)]                      ; 1 where x + z is odd
    pshufb      m0, m1                                  ; y
    psubb       m3, m2                                  ; undo the round-up
    pavgb       m0, m3                                  ; m0 = row 0

    ; rows 0-7 (full width) and the left half of rows 8-15
    mov         lined, 2
.loop:
    mova        [dstq], m0
    movhps      [dst8q], m0
    pshufb      m0, m1
    mova        [dstq+strideq], m0
    movhps      [dst8q+strideq], m0
    pshufb      m0, m1
    mova        [dstq+strideq*2], m0
    movhps      [dst8q+strideq*2], m0
    pshufb      m0, m1
    mova        [dstq+stride3q], m0
    movhps      [dst8q+stride3q], m0
    pshufb      m0, m1
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]
    dec         lined
    jnz         .loop

    ; bottom-right 8x8 block: dstq now addresses row 8; every row gets the
    ; high 8 bytes of m0
    movhps      [dstq+8], m0
    movhps      [dstq+strideq+8], m0
    movhps      [dstq+strideq*2+8], m0
    movhps      [dstq+stride3q+8], m0
    lea         dstq, [dstq+strideq*4]
    movhps      [dstq+8], m0
    movhps      [dstq+strideq+8], m0
    movhps      [dstq+strideq*2+8], m0
    movhps      [dstq+stride3q+8], m0

    RESTORE_GOT
    RET
|
226 |
|
; d45 predictor, 32x32. m5/m4 hold the low/high 16 bytes of the current row.
; The high half is filtered with the "repeat last byte" shuffles; the low
; half uses palignr so that it pulls in bytes from the high half.
; The loop writes rows 0-15 in full and, via dst16, the left half of rows
; 16-31 (which equals the right half of the row 16 lines above). The tail
; fills the right half of rows 16-31 with the final value of m4.
INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                            ; above[ 0..15]
    mova        m4, [aboveq+16]                         ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, dst16, line
    lea         stride3q, [strideq*3]
    lea         dst16q, [dstq+strideq*8]
    lea         dst16q, [dst16q+strideq*8]              ; dst16 = dst + 16 rows
    mova        m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-row shift mask

    ; high half: (x + 2y + z + 2) >> 2 -> m4
    pshufb      m2, m4, [GLOBAL(sh_b23456789abcdefff)]
    pavgb       m3, m2, m4
    pxor        m2, m4
    palignr     m5, m4, m0, 1                           ; above[1..16]
    palignr     m6, m4, m0, 2                           ; above[2..17]
    pshufb      m4, m1
    pand        m2, [GLOBAL(pb_1)]
    psubb       m3, m2
    pavgb       m4, m3

    ; low half: (x + 2y + z + 2) >> 2 -> m5
    pavgb       m3, m0, m6
    pxor        m0, m6
    pand        m0, [GLOBAL(pb_1)]
    psubb       m3, m0
    pavgb       m5, m3

    ; rows 0-15 (full width) and the left half of rows 16-31
    mov         lined, 4
.loop:
    mova        [dstq], m5
    mova        [dstq+16], m4
    mova        [dst16q], m4
    palignr     m3, m4, m5, 1
    pshufb      m4, m1
    mova        [dstq+strideq], m3
    mova        [dstq+strideq+16], m4
    mova        [dst16q+strideq], m4
    palignr     m5, m4, m3, 1
    pshufb      m4, m1
    mova        [dstq+strideq*2], m5
    mova        [dstq+strideq*2+16], m4
    mova        [dst16q+strideq*2], m4
    palignr     m3, m4, m5, 1
    pshufb      m4, m1
    mova        [dstq+stride3q], m3
    mova        [dstq+stride3q+16], m4
    mova        [dst16q+stride3q], m4
    palignr     m5, m4, m3, 1
    pshufb      m4, m1
    lea         dstq, [dstq+strideq*4]
    lea         dst16q, [dst16q+strideq*4]
    dec         lined
    jnz         .loop

    ; right half of rows 16-31: dstq now addresses row 16
    mova        [dstq+16], m4
    mova        [dstq+strideq+16], m4
    mova        [dstq+strideq*2+16], m4
    mova        [dstq+stride3q+16], m4
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+16], m4
    mova        [dstq+strideq+16], m4
    mova        [dstq+strideq*2+16], m4
    mova        [dstq+stride3q+16], m4
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+16], m4
    mova        [dstq+strideq+16], m4
    mova        [dstq+strideq*2+16], m4
    mova        [dstq+stride3q+16], m4
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+16], m4
    mova        [dstq+strideq+16], m4
    mova        [dstq+strideq*2+16], m4
    mova        [dstq+stride3q+16], m4

    RESTORE_GOT
    RET
|
304 |
|
; ---------------------------------------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per-byte 3-tap filter: result = (x + 2*y + z + 2) >> 2  (trick from pascal)
;
;   result  = avg(x, z)           ; pavgb rounds up
;   result -= (x ^ z) & 1         ; take the round-up back when x + z is odd
;   result  = avg(result, y)
;
; x and y are only read. z (%3) is overwritten with (x ^ z) & 1.
; Uses GLOBAL(pb_1), so it must be expanded where GLOBAL is usable.
; ---------------------------------------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
    pavgb       %4, %1, %3
    pxor        %3, %1
    pand        %3, [GLOBAL(pb_1)]
    psubb       %4, %3
    pavgb       %4, %2
%endmacro
|
321 |
|
; d63 predictor, 4x4. m3 = 2-tap average of the above row with itself
; shifted by one; m4 = 3-tap (x + 2y + z + 2) >> 2 filter of the above row.
; Rows 0/2 come from m3, rows 1/3 from m4; rows 2 and 3 are shifted by one
; byte. 4 bytes are stored per row.
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
    GET_GOT     goffsetq

    movq        m3, [aboveq]
    pshufb      m2, m3, [GLOBAL(sh_b12345677)]  ; above shifted by 1
    pshufb      m1, m3, [GLOBAL(sh_b23456777)]  ; above shifted by 2

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
    pavgb       m3, m2

    ; rows 0 and 1
    movd        [dstq], m3
    movd        [dstq+strideq], m4
    psrldq      m3, 1
    psrldq      m4, 1
    lea         dstq, [dstq+strideq*2]
    ; rows 2 and 3
    movd        [dstq], m3
    movd        [dstq+strideq], m4
    RESTORE_GOT
    RET
|
343 |
|
; d63 predictor, 8x8. The 8 above bytes are widened to 16 with the last
; pixel repeated. m3 = 2-tap average (even rows), m4 = 3-tap filter (odd
; rows). After every pair of rows both registers are shifted by one byte.
INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
    GET_GOT     goffsetq

    movq        m3, [aboveq]
    DEFINE_ARGS dst, stride, stride3            ; above's register becomes stride3
    lea         stride3q, [strideq*3]
    pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]  ; shifted by 2
    pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]  ; unshifted
    pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]  ; shifted by 1
    pshufb      m3, [GLOBAL(sh_b0123456777777777)]      ; unshifted

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
    pavgb       m3, m2

    ; rows 0-3
    movq        [dstq], m3
    movq        [dstq+strideq], m4
    psrldq      m3, 1
    psrldq      m4, 1
    movq        [dstq+strideq*2], m3
    movq        [dstq+stride3q], m4
    psrldq      m3, 1
    psrldq      m4, 1
    lea         dstq, [dstq+strideq*4]

    ; rows 4-7
    movq        [dstq], m3
    movq        [dstq+strideq], m4
    psrldq      m3, 1
    psrldq      m4, 1
    movq        [dstq+strideq*2], m3
    movq        [dstq+stride3q], m4
    RESTORE_GOT
    RET
|
379 |
|
; d63 predictor, 16x16. m0 = 2-tap average of the above row (even rows),
; m4 = 3-tap filter of the above row (odd rows). After each pair of rows
; both registers are shifted by one byte with the last byte repeated (m1).
INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]
    DEFINE_ARGS dst, stride, stride3, line
    mova        m1, [GLOBAL(sh_b123456789abcdeff)]      ; per-pair shift mask
    lea         stride3q, [strideq*3]
    pshufb      m3, m0, m1                              ; above shifted by 1
    pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; above shifted by 2

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
    pavgb       m0, m3

    mov         lined, 4                                ; 4 rows per iteration
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m4
    pshufb      m0, m1
    pshufb      m4, m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m4
    pshufb      m0, m1
    pshufb      m4, m1
    lea         dstq, [dstq+strideq*4]
    dec         lined
    jnz         .loop
    RESTORE_GOT
    REP_RET
|
409 |
|
; d63 predictor, 32x32. Even rows use the 2-tap average (m0 = low 16 bytes,
; m7 = high 16 bytes); odd rows use the 3-tap filter (m2 = low, m4 = high).
; After each pair of rows the 32-byte row is shifted by one byte: palignr
; carries a byte from the high half into the low half, and pshufb with m1
; shifts the high half while repeating its last byte.
INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                            ; above[ 0..15]
    mova        m7, [aboveq+16]                         ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, line
    mova        m1, [GLOBAL(sh_b123456789abcdeff)]      ; high-half shift mask
    lea         stride3q, [strideq*3]
    pshufb      m2, m7, [GLOBAL(sh_b23456789abcdefff)]
    pshufb      m3, m7, m1

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4        ; 3-tap, high half
    palignr     m6, m7, m0, 1                           ; above[1..16]
    palignr     m5, m7, m0, 2                           ; above[2..17]
    pavgb       m7, m3                                  ; 2-tap, high half

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2        ; 3-tap, low half
    pavgb       m0, m6                                  ; 2-tap, low half

    mov         lined, 8                                ; 4 rows per iteration
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m7
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m4
    palignr     m3, m7, m0, 1
    palignr     m5, m4, m2, 1
    pshufb      m7, m1
    pshufb      m4, m1

    mova        [dstq+strideq*2], m3
    mova        [dstq+strideq*2+16], m7
    mova        [dstq+stride3q], m5
    mova        [dstq+stride3q+16], m4
    palignr     m0, m7, m3, 1
    palignr     m2, m4, m5, 1
    pshufb      m7, m1
    pshufb      m4, m1
    lea         dstq, [dstq+strideq*4]
    dec         lined
    jnz         .loop
    RESTORE_GOT
    REP_RET
|
454 |
|
; d153 predictor, 4x4. Builds one vector l4 l3 l2 l1 tl t1 t2 t3 from the
; left column (reversed) and above[-1..2], then takes its 2-tap (A) and
; 3-tap (B) averages. The block layout is:
;
;   A1 B1 C1 D1
;   A2 B2 A1 B1
;   A3 B3 A2 B2
;   A4 B4 A3 B3
;
; so each row is the row below it shifted by two bytes. Rows are written
; bottom-up from one register.
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
    GET_GOT     goffsetq
    movd        m0, [leftq]                     ; l1, l2, l3, l4
    movd        m1, [aboveq-1]                  ; tl, t1, t2, t3
    punpckldq   m0, m1                          ; l1, l2, l3, l4, tl, t1, t2, t3
    pshufb      m0, [GLOBAL(sh_b32104567)]      ; l4, l3, l2, l1, tl, t1, t2, t3
    psrldq      m1, m0, 1                       ; l3, l2, l1, tl, t1, t2, t3
    psrldq      m2, m0, 2                       ; l2, l1, tl, t1, t2, t3

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap: B4 B3 B2 B1 C1 D1
    pavgb       m1, m0                          ; 2-tap: A4 A3 A2 A1

    punpcklqdq  m3, m1                          ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..

    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pshufb      m3, [GLOBAL(sh_b8091a2b345)]    ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
    movd        [dstq+stride3q], m3             ; row 3
    psrldq      m3, 2                           ; A3 B3 A2 B2 A1 B1 C1 D1 ..
    movd        [dstq+strideq*2], m3            ; row 2
    psrldq      m3, 2                           ; A2 B2 A1 B1 C1 D1 ..
    movd        [dstq+strideq], m3              ; row 1
    psrldq      m3, 2                           ; A1 B1 C1 D1 ..
    movd        [dstq], m3                      ; row 0
    RESTORE_GOT
    RET
|
486 |
|
; d153 predictor, 8x8. Inputs: 8 left pixels (l1-l8) and above[-1..6]
; (tl, t1-t7). A = 2-tap averages and B = 3-tap averages along the reversed
; left column; C1..H1 = 3-tap averages along the top row. Block layout:
;
;   A1 B1 C1 D1 E1 F1 G1 H1
;   A2 B2 A1 B1 C1 D1 E1 F1
;   A3 B3 A2 B2 A1 B1 C1 D1
;   A4 B4 A3 B3 A2 B2 A1 B1
;   A5 B5 A4 B4 A3 B3 A2 B2
;   A6 B6 A5 B5 A4 B4 A3 B3
;   A7 B7 A6 B6 A5 B5 A4 B4
;   A8 B8 A7 B7 A6 B6 A5 B5
;
; Each row is the row below it shifted by two bytes.
INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
    GET_GOT     goffsetq
    movq        m0, [leftq]                         ; bytes 0-7:  l1-l8
    movhps      m0, [aboveq-1]                      ; bytes 8-15: tl, t1-t7
    pshufb      m1, m0, [GLOBAL(sh_b76543210)]      ; l8 .. l1
    pshufb      m2, m0, [GLOBAL(sh_b65432108)]      ; l7 .. l1, tl
    pshufb      m3, m0, [GLOBAL(sh_b54321089)]      ; l6 .. l1, tl, t1
    pshufb      m0, [GLOBAL(sh_b89abcdef)]          ; tl, t1 .. t7
    psrldq      m4, m0, 1                           ; t1 .. t7
    psrldq      m5, m0, 2                           ; t2 .. t7

    pavgb       m6, m1, m2                          ; 2-tap: A8 .. A1

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7    ; 3-tap: C1 .. H1

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0    ; 3-tap: B8 .. B1

    punpcklbw   m6, m0                              ; A8 B8, A7 B7 ... A1 B1

    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]

    ; rows 3..0, bottom-up
    movhps      [dstq+stride3q], m6                 ; A-B4, A-B3, A-B2, A-B1
    palignr     m0, m7, m6, 10                      ; A-B3, A-B2, A-B1, C-H1
    movq        [dstq+strideq*2], m0
    psrldq      m0, 2                               ; A-B2, A-B1, C-H1
    movq        [dstq+strideq], m0
    psrldq      m0, 2                               ; A-H1
    movq        [dstq], m0
    lea         dstq, [dstq+strideq*4]

    ; rows 7..4, bottom-up
    movq        [dstq+stride3q], m6                 ; A-B8, A-B7, A-B6, A-B5
    psrldq      m6, 2                               ; A-B7, A-B6, A-B5, A-B4
    movq        [dstq+strideq*2], m6
    psrldq      m6, 2                               ; A-B6, A-B5, A-B4, A-B3
    movq        [dstq+strideq], m6
    psrldq      m6, 2                               ; A-B5, A-B4, A-B3, A-B2
    movq        [dstq], m6
    RESTORE_GOT
    RET
|
535 |
|
; d153 predictor, 16x16. Inputs: 16 left pixels (aligned load) and
; above[-1..14] (unaligned load). A = 2-tap and B = 3-tap averages along the
; left column; C1..P1 = 3-tap averages along the top row. Block layout:
;
;   A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
;   A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
;   A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
;   A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
;   A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
;   A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
;   A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
;   A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
;   A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
;   Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
;   Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
;   Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
;   Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
;   Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
;   Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
;   Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
;
; Each row is the row below it shifted by two bytes, so rows are produced
; by palignr over adjacent pairs of the three 16-byte source vectors
; m4 (A-B g..9), m6 (A-B 8..1) and m1 (C1..P1).
;
; Change from the previous version: a "pshufb m1, m0, [sh_b123456789abcdeff]"
; that sat between the first filter and "pavgb m5, m0" was removed. Its
; result in m1 was never read before m1 was overwritten as the result
; register of the second filter, so the output is unchanged.
INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
    GET_GOT     goffsetq
    mova        m0, [leftq]                             ; l1 .. l16
    movu        m7, [aboveq-1]                          ; tl, t1 .. t15

    pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)]  ; t15 .. t1, tl
    palignr     m5, m0, m6, 15                          ; tl, l1 .. l15
    palignr     m3, m0, m6, 14                          ; t1, tl, l1 .. l14

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4        ; 3-tap along left (B)
    pavgb       m5, m0                                  ; 2-tap along left (A)

    punpcklbw   m0, m4, m5                              ; pairs for rows 1-8
    punpckhbw   m4, m5                                  ; pairs for rows 9-16

    pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]  ; t1 .. t15, t15
    pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]  ; t2 .. t15, t15, t15

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1        ; 3-tap along top: C1-P1

    pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)]  ; A8 B8 ... A1 B1
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]

    ; rows 0-7: windows over m1:m6
    palignr     m2, m1, m6, 14
    mova        [dstq], m2
    palignr     m2, m1, m6, 12
    mova        [dstq+strideq], m2
    palignr     m2, m1, m6, 10
    mova        [dstq+strideq*2], m2
    palignr     m2, m1, m6, 8
    mova        [dstq+stride3q], m2
    lea         dstq, [dstq+strideq*4]
    palignr     m2, m1, m6, 6
    mova        [dstq], m2
    palignr     m2, m1, m6, 4
    mova        [dstq+strideq], m2
    palignr     m2, m1, m6, 2
    mova        [dstq+strideq*2], m2
    pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]      ; Ag Bg ... A9 B9
    mova        [dstq+stride3q], m6
    lea         dstq, [dstq+strideq*4]

    ; rows 8-15: windows over m6:m4
    palignr     m2, m6, m4, 14
    mova        [dstq], m2
    palignr     m2, m6, m4, 12
    mova        [dstq+strideq], m2
    palignr     m2, m6, m4, 10
    mova        [dstq+strideq*2], m2
    palignr     m2, m6, m4, 8
    mova        [dstq+stride3q], m2
    lea         dstq, [dstq+strideq*4]
    palignr     m2, m6, m4, 6
    mova        [dstq], m2
    palignr     m2, m6, m4, 4
    mova        [dstq+strideq], m2
    palignr     m2, m6, m4, 2
    mova        [dstq+strideq*2], m2
    mova        [dstq+stride3q], m4
    RESTORE_GOT
    RET
|
614 |
|
; d153 predictor, 32x32. Same construction as the 16x16 version, extended
; to 32 columns and 32 rows:
;   * m2 / m1 : 3-tap filtered top row, high / low 16 bytes
;   * m6 / m4 : interleaved 2-tap/3-tap pairs from left[0..15], reversed
;   * m3 / m2 : (second half) the same pairs built from left[1..31]
; Every output row is the row below it shifted by two bytes, so each 16-byte
; store is a palignr window over two neighbouring source vectors.
; left is read with mova (aligned); above is read with movu.
; `line` is named by DEFINE_ARGS but not referenced in this function.
INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
    GET_GOT     goffsetq
    mova        m0, [leftq]
    movu        m7, [aboveq-1]
    movu        m1, [aboveq+15]

    pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
    pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2        ; 3-tap above [high]

    palignr     m3, m1, m7, 1
    palignr     m5, m1, m7, 2

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1        ; 3-tap above [low]

    pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]      ; reversed above[-1..14]
    palignr     m5, m0, m7, 15
    palignr     m3, m0, m7, 14

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4        ; 3-tap along left
    pavgb       m5, m0                                  ; 2-tap along left
    punpcklbw   m6, m4, m5                              ; pairs for rows 1-8
    punpckhbw   m4, m5                                  ; pairs for rows 9-16
    pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]
    pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]

    DEFINE_ARGS dst, stride, stride3, left, line
    lea         stride3q, [strideq*3]

    ; rows 0-7: left half from m1:m6, right half from m2:m1
    palignr     m5, m2, m1, 14
    palignr     m7, m1, m6, 14
    mova        [dstq], m7
    mova        [dstq+16], m5
    palignr     m5, m2, m1, 12
    palignr     m7, m1, m6, 12
    mova        [dstq+strideq], m7
    mova        [dstq+strideq+16], m5
    palignr     m5, m2, m1, 10
    palignr     m7, m1, m6, 10
    mova        [dstq+strideq*2], m7
    mova        [dstq+strideq*2+16], m5
    palignr     m5, m2, m1, 8
    palignr     m7, m1, m6, 8
    mova        [dstq+stride3q], m7
    mova        [dstq+stride3q+16], m5
    lea         dstq, [dstq+strideq*4]
    palignr     m5, m2, m1, 6
    palignr     m7, m1, m6, 6
    mova        [dstq], m7
    mova        [dstq+16], m5
    palignr     m5, m2, m1, 4
    palignr     m7, m1, m6, 4
    mova        [dstq+strideq], m7
    mova        [dstq+strideq+16], m5
    palignr     m5, m2, m1, 2
    palignr     m7, m1, m6, 2
    mova        [dstq+strideq*2], m7
    mova        [dstq+strideq*2+16], m5
    mova        [dstq+stride3q], m6
    mova        [dstq+stride3q+16], m1
    lea         dstq, [dstq+strideq*4]

    ; rows 8-15: left half from m6:m4, right half from m1:m6
    palignr     m5, m1, m6, 14
    palignr     m3, m6, m4, 14
    mova        [dstq], m3
    mova        [dstq+16], m5
    palignr     m5, m1, m6, 12
    palignr     m3, m6, m4, 12
    mova        [dstq+strideq], m3
    mova        [dstq+strideq+16], m5
    palignr     m5, m1, m6, 10
    palignr     m3, m6, m4, 10
    mova        [dstq+strideq*2], m3
    mova        [dstq+strideq*2+16], m5
    palignr     m5, m1, m6, 8
    palignr     m3, m6, m4, 8
    mova        [dstq+stride3q], m3
    mova        [dstq+stride3q+16], m5
    lea         dstq, [dstq+strideq*4]
    palignr     m5, m1, m6, 6
    palignr     m3, m6, m4, 6
    mova        [dstq], m3
    mova        [dstq+16], m5
    palignr     m5, m1, m6, 4
    palignr     m3, m6, m4, 4
    mova        [dstq+strideq], m3
    mova        [dstq+strideq+16], m5
    palignr     m5, m1, m6, 2
    palignr     m3, m6, m4, 2
    mova        [dstq+strideq*2], m3
    mova        [dstq+strideq*2+16], m5
    mova        [dstq+stride3q], m4
    mova        [dstq+stride3q+16], m6
    lea         dstq, [dstq+strideq*4]

    ; second half of the left column: build the pairs for rows 16-31
    mova        m7, [leftq]
    mova        m3, [leftq+16]
    palignr     m5, m3, m7, 15                          ; left[15..30]
    palignr     m0, m3, m7, 14                          ; left[14..29]

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2        ; 3-tap along left
    pavgb       m5, m3                                  ; 2-tap along left
    punpcklbw   m3, m2, m5                              ; pairs for rows 17-24
    punpckhbw   m2, m5                                  ; pairs for rows 25-32
    pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
    pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]

    ; rows 16-23: left half from m4:m3, right half from m6:m4
    palignr     m7, m6, m4, 14
    palignr     m0, m4, m3, 14
    mova        [dstq], m0
    mova        [dstq+16], m7
    palignr     m7, m6, m4, 12
    palignr     m0, m4, m3, 12
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m7
    palignr     m7, m6, m4, 10
    palignr     m0, m4, m3, 10
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m7
    palignr     m7, m6, m4, 8
    palignr     m0, m4, m3, 8
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m7
    lea         dstq, [dstq+strideq*4]
    palignr     m7, m6, m4, 6
    palignr     m0, m4, m3, 6
    mova        [dstq], m0
    mova        [dstq+16], m7
    palignr     m7, m6, m4, 4
    palignr     m0, m4, m3, 4
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m7
    palignr     m7, m6, m4, 2
    palignr     m0, m4, m3, 2
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m7
    mova        [dstq+stride3q], m3
    mova        [dstq+stride3q+16], m4
    lea         dstq, [dstq+strideq*4]

    ; rows 24-31: left half from m3:m2, right half from m4:m3
    palignr     m7, m4, m3, 14
    palignr     m0, m3, m2, 14
    mova        [dstq], m0
    mova        [dstq+16], m7
    palignr     m7, m4, m3, 12
    palignr     m0, m3, m2, 12
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m7
    palignr     m7, m4, m3, 10
    palignr     m0, m3, m2, 10
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m7
    palignr     m7, m4, m3, 8
    palignr     m0, m3, m2, 8
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m7
    lea         dstq, [dstq+strideq*4]
    palignr     m7, m4, m3, 6
    palignr     m0, m3, m2, 6
    mova        [dstq], m0
    mova        [dstq+16], m7
    palignr     m7, m4, m3, 4
    palignr     m0, m3, m2, 4
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m7
    palignr     m7, m4, m3, 2
    palignr     m0, m3, m2, 2
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m7
    mova        [dstq+stride3q], m2
    mova        [dstq+stride3q+16], m3

    RESTORE_GOT
    RET
|
791 |
|
; d207 predictor, 4x4. Uses only the left column (a, b, c, d); the third
; argument is named `unused` and never referenced. 2-tap and 3-tap averages
; of the column are interleaved, and each row is the previous one shifted
; by two bytes; the last row is the final 16-bit pair broadcast.
INIT_MMX ssse3
cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
    GET_GOT     goffsetq
    movd        m0, [leftq]                     ; a b c d
    pshufb      m1, m0, [GLOBAL(sh_b1233)]      ; b c d d
    pshufb      m3, m0, [GLOBAL(sh_b2333)]      ; c d d d

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 ; 3-tap: a2bc, b2cd, c3d, d
    pavgb       m1, m0                          ; 2-tap: ab, bc, cd, d

    punpcklbw   m1, m2                          ; ab, a2bc, bc, b2cd, cd, c3d, d, d
    movd        [dstq], m1                      ; row 0
    psrlq       m1, 16                          ; bc, b2cd, cd, c3d, d, d
    movd        [dstq+strideq], m1              ; row 1
    psrlq       m1, 16                          ; cd, c3d, d, d
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m1                      ; row 2
    pshufw      m1, m1, q1111                   ; d, d, d, d
    movd        [dstq+strideq], m1              ; row 3
    RESTORE_GOT
    RET
|
813 |
|
; d207 predictor, 8x8. Uses only the left column (a..h). The third argument
; register is named stride3 and is overwritten with stride*3. 2-tap and
; 3-tap averages of the column (last pixel repeated) are interleaved into
; one 16-byte vector; each row is the previous one shifted by two bytes.
INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
    GET_GOT     goffsetq
    movq        m3, [leftq]                             ; a b c d e f g h
    lea         stride3q, [strideq*3]

    pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]  ; unshifted
    pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]  ; shifted by 1
    pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]  ; shifted by 2

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3        ; 3-tap
    pavgb       m0, m2                                  ; 2-tap
    punpcklbw   m0, m3                                  ; interleaved output

    ; rows 0-3
    movq        [dstq], m0
    psrldq      m0, 2
    movq        [dstq+strideq], m0
    psrldq      m0, 2
    movq        [dstq+strideq*2], m0
    psrldq      m0, 2
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]

    ; replicate word 4 across the high quadword so the shifts below keep
    ; feeding in h:
    pshufhw     m0, m0, q0000   ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh

    ; rows 4-7
    psrldq      m0, 2
    movq        [dstq], m0
    psrldq      m0, 2
    movq        [dstq+strideq], m0
    psrldq      m0, 2
    movq        [dstq+strideq*2], m0
    psrldq      m0, 2
    movq        [dstq+stride3q], m0
    RESTORE_GOT
    RET
|
847 |
|
; d207 predictor, 16x16. Uses only the left column (a..p, aligned load).
; 2-tap and 3-tap averages of the column (last pixel repeated) are
; interleaved into m1 (low pairs) and m4 (high pairs). Rows 0-7 are palignr
; windows over m4:m1, two bytes further per row. Rows 8-15 start at m4 and
; shift by two bytes per row with the last byte repeated.
INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
    GET_GOT     goffsetq
    lea         stride3q, [strideq*3]
    mova        m0, [leftq]                             ; abcdefghijklmnop
    pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)]  ; bcdefghijklmnopp
    pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; cdefghijklmnoppp

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3        ; 3-tap
    pavgb       m1, m0                                  ; 2-tap: ab, bc .. op, pp

    punpckhbw   m4, m1, m3                              ; interleaved, high pairs
    punpcklbw   m1, m3                                  ; interleaved, low pairs

    ; rows 0-7
    mova        [dstq], m1
    palignr     m3, m4, m1, 2
    mova        [dstq+strideq], m3
    palignr     m3, m4, m1, 4
    mova        [dstq+strideq*2], m3
    palignr     m3, m4, m1, 6
    mova        [dstq+stride3q], m3
    lea         dstq, [dstq+strideq*4]
    palignr     m3, m4, m1, 8
    mova        [dstq], m3
    palignr     m3, m4, m1, 10
    mova        [dstq+strideq], m3
    palignr     m3, m4, m1, 12
    mova        [dstq+strideq*2], m3
    palignr     m3, m4, m1, 14
    mova        [dstq+stride3q], m3

    ; rows 8-15: left's register is reused as the loop counter
    DEFINE_ARGS dst, stride, stride3, line
    mov         lined, 2
    mova        m0, [GLOBAL(sh_b23456789abcdefff)]      ; shift-by-2 mask
.loop:
    lea         dstq, [dstq+strideq*4]
    mova        [dstq], m4
    pshufb      m4, m0
    mova        [dstq+strideq], m4
    pshufb      m4, m0
    mova        [dstq+strideq*2], m4
    pshufb      m4, m0
    mova        [dstq+stride3q], m4
    pshufb      m4, m0
    dec         lined
    jnz         .loop
    RESTORE_GOT
    REP_RET
|
894 |
|
; d207 predictor, 32x32. Uses only the left column (32 bytes, aligned
; loads). 2-tap and 3-tap averages of the column are interleaved into four
; 16-byte vectors m1, m6, m2, m7 ("interleaved output 1-4"). Every row is
; the previous one shifted by two bytes, so the left half of row r+8 equals
; the right half of row r; dst8 (= dst + 8 rows) is used to write both at
; once. Once the data runs out the last byte is repeated.
INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
    GET_GOT     goffsetq
    lea         stride3q, [strideq*3]
    mova        m1, [leftq]                             ; left[ 0..15]
    mova        m2, [leftq+16]                          ; left[16..31]
    pshufb      m0, m2, [GLOBAL(sh_b23456789abcdefff)]
    pshufb      m4, m2, [GLOBAL(sh_b123456789abcdeff)]

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3        ; 3-tap, high 16
    palignr     m6, m2, m1, 1                           ; left[1..16]
    palignr     m5, m2, m1, 2                           ; left[2..17]
    pavgb       m2, m4                                  ; 2-tap, high 16

    X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0        ; 3-tap, low 16
    pavgb       m1, m6                                  ; 2-tap, low 16

    punpckhbw   m6, m1, m0                              ; interleaved output 2
    punpcklbw   m1, m0                                  ; interleaved output 1

    punpckhbw   m7, m2, m3                              ; interleaved output 4
    punpcklbw   m2, m3                                  ; interleaved output 3

    ; rows 0-7 (full width) and the left half of rows 8-15
    DEFINE_ARGS dst, stride, stride3, dst8
    lea         dst8q, [dstq+strideq*8]
    mova        [dstq], m1
    mova        [dstq+16], m6
    mova        [dst8q], m6
    palignr     m0, m6, m1, 2
    palignr     m4, m2, m6, 2
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m4
    mova        [dst8q+strideq], m4
    palignr     m0, m6, m1, 4
    palignr     m4, m2, m6, 4
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m4
    mova        [dst8q+strideq*2], m4
    palignr     m0, m6, m1, 6
    palignr     m4, m2, m6, 6
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m4
    mova        [dst8q+stride3q], m4
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]
    palignr     m0, m6, m1, 8
    palignr     m4, m2, m6, 8
    mova        [dstq], m0
    mova        [dstq+16], m4
    mova        [dst8q], m4
    palignr     m0, m6, m1, 10
    palignr     m4, m2, m6, 10
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m4
    mova        [dst8q+strideq], m4
    palignr     m0, m6, m1, 12
    palignr     m4, m2, m6, 12
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m4
    mova        [dst8q+strideq*2], m4
    palignr     m0, m6, m1, 14
    palignr     m4, m2, m6, 14
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m4
    mova        [dst8q+stride3q], m4
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]

    ; right half of rows 8-15 and the left half of rows 16-23
    mova        [dstq+16], m2
    mova        [dst8q], m2
    palignr     m4, m7, m2, 2
    mova        [dstq+strideq+16], m4
    mova        [dst8q+strideq], m4
    palignr     m4, m7, m2, 4
    mova        [dstq+strideq*2+16], m4
    mova        [dst8q+strideq*2], m4
    palignr     m4, m7, m2, 6
    mova        [dstq+stride3q+16], m4
    mova        [dst8q+stride3q], m4
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]
    palignr     m4, m7, m2, 8
    mova        [dstq+16], m4
    mova        [dst8q], m4
    palignr     m4, m7, m2, 10
    mova        [dstq+strideq+16], m4
    mova        [dst8q+strideq], m4
    palignr     m4, m7, m2, 12
    mova        [dstq+strideq*2+16], m4
    mova        [dst8q+strideq*2], m4
    palignr     m4, m7, m2, 14
    mova        [dstq+stride3q+16], m4
    mova        [dst8q+stride3q], m4
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]

    ; right half of rows 16-23 and the left half of rows 24-31
    mova        m0, [GLOBAL(sh_b23456789abcdefff)]      ; shift-by-2 mask
    mova        [dstq+16], m7
    mova        [dst8q], m7
    pshufb      m7, m0
    mova        [dstq+strideq+16], m7
    mova        [dst8q+strideq], m7
    pshufb      m7, m0
    mova        [dstq+strideq*2+16], m7
    mova        [dst8q+strideq*2], m7
    pshufb      m7, m0
    mova        [dstq+stride3q+16], m7
    mova        [dst8q+stride3q], m7
    pshufb      m7, m0
    lea         dstq, [dstq+strideq*4]
    lea         dst8q, [dst8q+strideq*4]
    mova        [dstq+16], m7
    mova        [dst8q], m7
    pshufb      m7, m0
    mova        [dstq+strideq+16], m7
    mova        [dst8q+strideq], m7
    pshufb      m7, m0
    mova        [dstq+strideq*2+16], m7
    mova        [dst8q+strideq*2], m7
    pshufb      m7, m0
    mova        [dstq+stride3q+16], m7
    mova        [dst8q+stride3q], m7
    pshufb      m7, m0
    lea         dstq, [dstq+strideq*4]

    ; right half of rows 24-31: m7 is stored unchanged to every row
    mova        [dstq+16], m7
    mova        [dstq+strideq+16], m7
    mova        [dstq+strideq*2+16], m7
    mova        [dstq+stride3q+16], m7
    lea         dstq, [dstq+strideq*4]
    mova        [dstq+16], m7
    mova        [dstq+strideq+16], m7
    mova        [dstq+strideq*2+16], m7
    mova        [dstq+stride3q+16], m7

    ; done!
    RESTORE_GOT
    RET