Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
11 %include "third_party/x86inc/x86inc.asm"
; Read-only tables, 16 bytes each.
;   pb_1  - the byte value 1 in every lane (masks the rounding bit in the
;           3-tap average).
;   sh_b* - pshufb control masks. The characters after "sh_b" list the source
;           byte index chosen for each destination byte (hex digits for
;           indices above 9). Entries after the named ones are 0 and pad the
;           table to 16 bytes.
13 SECTION_RODATA
15 pb_1: times 16 db 1
; 8-byte row helpers with byte 7 repeated at the end. Note that the first
; table is the identity except that destination byte 6 also selects byte 7.
16 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
17 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
18 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
; 16-lane variants of an 8-byte source: shift by 0/1/2 with byte 7 repeated
; through lane 15.
19 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
20 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
21 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
; 16-byte shifts by 1 and 2 bytes with byte 15 repeated at the end.
22 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
23 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
; Reordering masks (byte reversal, interleave, high-half extraction).
24 sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
25 sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
26 sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
27 sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
28 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
29 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
30 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; 4-byte shifts by 1 and 2 bytes with byte 3 repeated at the end.
31 sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
32 sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
34 SECTION .text
; h_predictor_4x4: horizontal prediction of a 4x4 block.
; Output row r is four copies of the byte left[r]; rows are `stride` bytes
; apart starting at dst.
; The cglobal operands follow the x86inc convention (name, #args loaded,
; #GPRs, #vector regs, register names): dst and stride are loaded, the third
; register is used as the loop counter `line`, and `left` is fetched
; explicitly with movifnidn.
; NOTE(review): each movd loads 4 bytes although only byte 0 is broadcast, so
; the last pass reads up to left[6] - confirm the caller's buffer allows it.
36 INIT_MMX ssse3
37 cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
38 movifnidn leftq, leftmp
; leftq points just past the 4 left pixels; line runs -2, -1 (two rows per
; pass), so [leftq+lineq*2] walks left[0], left[2].
39 add leftq, 4
40 mov lineq, -2
; An all-zero pshufb mask copies source byte 0 into every lane.
41 pxor m0, m0
42 .loop:
; Byte 0 of m1 / m2 is the left pixel for the even / odd row of this pass.
43 movd m1, [leftq+lineq*2 ]
44 movd m2, [leftq+lineq*2+1]
45 pshufb m1, m0
46 pshufb m2, m0
47 movd [dstq ], m1
48 movd [dstq+strideq], m2
49 lea dstq, [dstq+strideq*2]
50 inc lineq
51 jnz .loop
52 REP_RET
; h_predictor_8x8: horizontal prediction of an 8x8 block.
; Output row r is eight copies of the byte left[r]. Same structure as the 4x4
; version, with four passes of two rows and 8-byte (movq) stores.
; NOTE(review): each movd loads 4 bytes although only byte 0 is broadcast, so
; the last pass reads up to left[10] - confirm the caller's buffer allows it.
54 INIT_MMX ssse3
55 cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
56 movifnidn leftq, leftmp
; leftq points just past the 8 left pixels; line runs -4..-1.
57 add leftq, 8
58 mov lineq, -4
; An all-zero pshufb mask copies source byte 0 into every lane.
59 pxor m0, m0
60 .loop:
; Byte 0 of m1 / m2 is the left pixel for the even / odd row of this pass.
61 movd m1, [leftq+lineq*2 ]
62 movd m2, [leftq+lineq*2+1]
63 pshufb m1, m0
64 pshufb m2, m0
65 movq [dstq ], m1
66 movq [dstq+strideq], m2
67 lea dstq, [dstq+strideq*2]
68 inc lineq
69 jnz .loop
70 REP_RET
; h_predictor_16x16: horizontal prediction of a 16x16 block.
; Output row r is sixteen copies of the byte left[r]; eight passes of two
; rows each.
; NOTE(review): rows are written with mova (x86inc's aligned move), so dst and
; stride are presumably expected to be 16-byte aligned - confirm with callers.
; NOTE(review): each movd loads 4 bytes although only byte 0 is broadcast, so
; the last pass reads up to left[18] - confirm the caller's buffer allows it.
72 INIT_XMM ssse3
73 cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
74 movifnidn leftq, leftmp
; leftq points just past the 16 left pixels; line runs -8..-1.
75 add leftq, 16
76 mov lineq, -8
; An all-zero pshufb mask copies source byte 0 into every lane.
77 pxor m0, m0
78 .loop:
; Byte 0 of m1 / m2 is the left pixel for the even / odd row of this pass.
79 movd m1, [leftq+lineq*2 ]
80 movd m2, [leftq+lineq*2+1]
81 pshufb m1, m0
82 pshufb m2, m0
83 mova [dstq ], m1
84 mova [dstq+strideq], m2
85 lea dstq, [dstq+strideq*2]
86 inc lineq
87 jnz .loop
88 REP_RET
; h_predictor_32x32: horizontal prediction of a 32x32 block.
; Output row r is thirty-two copies of the byte left[r], written as two
; 16-byte stores per row; sixteen passes of two rows each.
; NOTE(review): rows are written with mova (x86inc's aligned move), so dst and
; stride are presumably expected to be 16-byte aligned - confirm with callers.
; NOTE(review): each movd loads 4 bytes although only byte 0 is broadcast, so
; the last pass reads up to left[34] - confirm the caller's buffer allows it.
90 INIT_XMM ssse3
91 cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
92 movifnidn leftq, leftmp
; leftq points just past the 32 left pixels; line runs -16..-1.
93 add leftq, 32
94 mov lineq, -16
; An all-zero pshufb mask copies source byte 0 into every lane.
95 pxor m0, m0
96 .loop:
; Byte 0 of m1 / m2 is the left pixel for the even / odd row of this pass.
97 movd m1, [leftq+lineq*2 ]
98 movd m2, [leftq+lineq*2+1]
99 pshufb m1, m0
100 pshufb m2, m0
; Each broadcast register fills both 16-byte halves of its row.
101 mova [dstq ], m1
102 mova [dstq +16], m1
103 mova [dstq+strideq ], m2
104 mova [dstq+strideq+16], m2
105 lea dstq, [dstq+strideq*2]
106 inc lineq
107 jnz .loop
108 REP_RET
; d45_predictor_4x4: 45-degree (down-left) prediction of a 4x4 block.
; Reads the 8 bytes above[0..7] and builds one filtered row
;   F[i] = (x[i] + 2*y[i] + z[i] + 2) >> 2
; where x, y, z are the sh_b01234577, sh_b12345677 and sh_b23456777 shuffles
; of that input. Output row r is F[r..r+3]. Because all three masks select
; byte 7 for lane 6, F[6] (the bottom-right pixel) equals above[7].
; The left column is not read.
110 INIT_MMX ssse3
111 cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
112 GET_GOT goffsetq
114 movq m0, [aboveq]
; m2 = z, m1 = x, m0 = y
115 pshufb m2, m0, [GLOBAL(sh_b23456777)]
116 pshufb m1, m0, [GLOBAL(sh_b01234577)]
117 pshufb m0, [GLOBAL(sh_b12345677)]
; pavgb rounds up, so subtracting (x ^ z) & 1 yields floor((x + z) / 2);
; averaging that with y gives the 3-tap result in m0.
118 pavgb m3, m2, m1
119 pxor m2, m1
120 pand m2, [GLOBAL(pb_1)]
121 psubb m3, m2
122 pavgb m0, m3
124 ; store 4 lines
; Each psrlq by 8 bits advances the filtered row by one pixel.
125 movd [dstq ], m0
126 psrlq m0, 8
127 movd [dstq+strideq], m0
128 lea dstq, [dstq+strideq*2]
129 psrlq m0, 8
130 movd [dstq ], m0
131 psrlq m0, 8
132 movd [dstq+strideq], m0
134 RESTORE_GOT
135 RET
; d45_predictor_8x8: 45-degree (down-left) prediction of an 8x8 block.
; Reads only the 8 bytes above[0..7] (movq). The first row is the 3-tap
; filter (x + 2*y + z + 2) >> 2 of the input and its shift-by-1 / shift-by-2
; copies, both with byte 7 repeated. Every following row is the previous row
; shifted left by one byte with its last byte repeated (pshufb with
; sh_b12345677 kept in m1).
; After DEFINE_ARGS the register that held `above` is reused as stride3.
137 INIT_MMX ssse3
138 cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
139 GET_GOT goffsetq
141 movq m0, [aboveq]
142 mova m1, [GLOBAL(sh_b12345677)]
143 DEFINE_ARGS dst, stride, stride3
144 lea stride3q, [strideq*3]
; m2 = z (shift by 2), m0 = x, later m0 = y (shift by 1)
145 pshufb m2, m0, [GLOBAL(sh_b23456777)]
; floor((x + z) / 2) in m3: pavgb rounds up, so remove the carry bit
; (x ^ z) & 1 before averaging with y.
146 pavgb m3, m2, m0
147 pxor m2, m0
148 pshufb m0, m1
149 pand m2, [GLOBAL(pb_1)]
150 psubb m3, m2
151 pavgb m0, m3
153 ; store 4 lines
154 movq [dstq ], m0
155 pshufb m0, m1
156 movq [dstq+strideq ], m0
157 pshufb m0, m1
158 movq [dstq+strideq*2], m0
159 pshufb m0, m1
160 movq [dstq+stride3q ], m0
161 pshufb m0, m1
162 lea dstq, [dstq+strideq*4]
164 ; store next 4 lines
165 movq [dstq ], m0
166 pshufb m0, m1
167 movq [dstq+strideq ], m0
168 pshufb m0, m1
169 movq [dstq+strideq*2], m0
170 pshufb m0, m1
171 movq [dstq+stride3q ], m0
173 RESTORE_GOT
174 RET
; d45_predictor_16x16: 45-degree (down-left) prediction of a 16x16 block.
; Reads the 16 bytes above[0..15] with an aligned load. m0 holds the current
; filtered row: the 3-tap filter (x + 2*y + z + 2) >> 2 of the input and its
; shift-by-1 / shift-by-2 copies (byte 15 repeated). Each pshufb with m1
; (sh_b123456789abcdeff) advances the row by one pixel.
; Layout of the stores:
;   - the loop writes rows 0-7 in full and, from the high half of the same
;     register, the left 8 bytes of rows 8-15 (via dst8);
;   - after eight shifts the high half of m0 holds eight copies of its last
;     byte, which fills the right 8 bytes of rows 8-15.
; NOTE(review): mova is x86inc's aligned move, so above, dst and stride are
; presumably 16-byte aligned - confirm with callers.
176 INIT_XMM ssse3
177 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
178 GET_GOT goffsetq
180 mova m0, [aboveq]
181 DEFINE_ARGS dst, stride, stride3, dst8, line
182 lea stride3q, [strideq*3]
; dst8 points at row 8
183 lea dst8q, [dstq+strideq*8]
184 mova m1, [GLOBAL(sh_b123456789abcdeff)]
; m2 = z (shift by 2), m0 = x, later m0 = y (shift by 1)
185 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
; floor((x + z) / 2) in m3, then average with y for the 3-tap result
186 pavgb m3, m2, m0
187 pxor m2, m0
188 pshufb m0, m1
189 pand m2, [GLOBAL(pb_1)]
190 psubb m3, m2
191 pavgb m0, m3
193 ; first 4 lines and first half of 3rd 4 lines
; Two passes of four rows each.
194 mov lined, 2
195 .loop:
196 mova [dstq ], m0
197 movhps [dst8q ], m0
198 pshufb m0, m1
199 mova [dstq +strideq ], m0
200 movhps [dst8q+strideq ], m0
201 pshufb m0, m1
202 mova [dstq +strideq*2 ], m0
203 movhps [dst8q+strideq*2 ], m0
204 pshufb m0, m1
205 mova [dstq +stride3q ], m0
206 movhps [dst8q+stride3q ], m0
207 pshufb m0, m1
208 lea dstq, [dstq +strideq*4]
209 lea dst8q, [dst8q+strideq*4]
210 dec lined
211 jnz .loop
213 ; bottom-right 8x8 block
; dstq now points at row 8; the high half of m0 is a single repeated byte.
214 movhps [dstq +8], m0
215 movhps [dstq+strideq +8], m0
216 movhps [dstq+strideq*2+8], m0
217 movhps [dstq+stride3q +8], m0
218 lea dstq, [dstq+strideq*4]
219 movhps [dstq +8], m0
220 movhps [dstq+strideq +8], m0
221 movhps [dstq+strideq*2+8], m0
222 movhps [dstq+stride3q +8], m0
224 RESTORE_GOT
225 RET
; d45_predictor_32x32: 45-degree (down-left) prediction of a 32x32 block.
; Reads above[0..31] as two aligned 16-byte loads. The filtered row is kept
; in two registers: m5 (columns 0-15) and m4 (columns 16-31), each the 3-tap
; filter (x + 2*y + z + 2) >> 2. The high half uses pshufb shifts with byte
; 15 repeated; the low half uses palignr so bytes flow in from the high half.
; Layout of the stores:
;   - the loop writes rows 0-15 in full and copies each row's high half to
;     the left half of rows 16-31 (via dst16);
;   - after sixteen shifts m4 holds sixteen copies of its last byte, which
;     fills the right half of rows 16-31.
; NOTE(review): mova is x86inc's aligned move, so above, dst and stride are
; presumably 16-byte aligned - confirm with callers.
227 INIT_XMM ssse3
228 cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
229 GET_GOT goffsetq
231 mova m0, [aboveq]
232 mova m4, [aboveq+16]
233 DEFINE_ARGS dst, stride, stride3, dst16, line
234 lea stride3q, [strideq*3]
; dst16 = dst + 16 rows, built from two *8 steps
235 lea dst16q, [dstq +strideq*8]
236 lea dst16q, [dst16q+strideq*8]
237 mova m1, [GLOBAL(sh_b123456789abcdeff)]
; High half: m2 = z, m4 = x (later y), m3 = floor((x + z) / 2)
238 pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
239 pavgb m3, m2, m4
240 pxor m2, m4
; Low half shifted by 1 and 2 bytes, taking the incoming bytes from the
; still-unmodified high half.
241 palignr m5, m4, m0, 1
242 palignr m6, m4, m0, 2
243 pshufb m4, m1
244 pand m2, [GLOBAL(pb_1)]
245 psubb m3, m2
; m4 = filtered columns 16-31
246 pavgb m4, m3
; Low half: m0 = x, m5 = y, m6 = z
247 pavgb m3, m0, m6
248 pxor m0, m6
249 pand m0, [GLOBAL(pb_1)]
250 psubb m3, m0
; m5 = filtered columns 0-15
251 pavgb m5, m3
253 ; write 4x4 lines (and the first half of the second 4x4 lines)
; Four passes of four rows each. The low half alternates between m5 and m3
; so that each palignr can read the previous row's low half.
254 mov lined, 4
255 .loop:
256 mova [dstq ], m5
257 mova [dstq +16], m4
258 mova [dst16q ], m4
259 palignr m3, m4, m5, 1
260 pshufb m4, m1
261 mova [dstq +strideq ], m3
262 mova [dstq +strideq +16], m4
263 mova [dst16q+strideq ], m4
264 palignr m5, m4, m3, 1
265 pshufb m4, m1
266 mova [dstq +strideq*2 ], m5
267 mova [dstq +strideq*2+16], m4
268 mova [dst16q+strideq*2 ], m4
269 palignr m3, m4, m5, 1
270 pshufb m4, m1
271 mova [dstq +stride3q ], m3
272 mova [dstq +stride3q +16], m4
273 mova [dst16q+stride3q ], m4
274 palignr m5, m4, m3, 1
275 pshufb m4, m1
276 lea dstq, [dstq +strideq*4]
277 lea dst16q, [dst16q+strideq*4]
278 dec lined
279 jnz .loop
281 ; write second half of second 4x4 lines
; dstq now points at row 16; m4 is a single repeated byte. Fill the right
; half of rows 16-31 in four groups of four rows.
282 mova [dstq +16], m4
283 mova [dstq +strideq +16], m4
284 mova [dstq +strideq*2+16], m4
285 mova [dstq +stride3q +16], m4
286 lea dstq, [dstq +strideq*4]
287 mova [dstq +16], m4
288 mova [dstq +strideq +16], m4
289 mova [dstq +strideq*2+16], m4
290 mova [dstq +stride3q +16], m4
291 lea dstq, [dstq +strideq*4]
292 mova [dstq +16], m4
293 mova [dstq +strideq +16], m4
294 mova [dstq +strideq*2+16], m4
295 mova [dstq +stride3q +16], m4
296 lea dstq, [dstq +strideq*4]
297 mova [dstq +16], m4
298 mova [dstq +strideq +16], m4
299 mova [dstq +strideq*2+16], m4
300 mova [dstq +stride3q +16], m4
302 RESTORE_GOT
303 RET
305 ; ------------------------------------------
306 ; input: x, y, z, result
307 ;
308 ; trick from pascal
309 ; (x+2y+z+2)>>2 can be calculated as:
310 ; result = avg(x,z)
311 ; result -= xor(x,z) & 1
312 ; result = avg(result,y)
313 ; ------------------------------------------
; Operands: %1 = x, %2 = y, %3 = z, %4 = result (all vector registers).
; Per byte lane, %4 = (x + 2*y + z + 2) >> 2.
; pavgb rounds up, so (x ^ z) & 1 is subtracted to turn avg(x, z) into
; floor((x + z) / 2) before the final rounding average with y.
; Clobbers %3 (it is overwritten with (x ^ z) & 1); %1 and %2 are only read.
; Reads pb_1 through GLOBAL(), so GET_GOT is presumably required to have run
; in the enclosing function for PIC builds - confirm against GLOBAL's
; definition.
314 %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
315 pavgb %4, %1, %3
316 pxor %3, %1
317 pand %3, [GLOBAL(pb_1)]
318 psubb %4, %3
319 pavgb %4, %2
320 %endmacro
; d63_predictor_4x4: 63-degree prediction of a 4x4 block from above[0..7].
; Two filtered rows are built from the input a and its shift-by-1 / shift-by-2
; copies (byte 7 repeated):
;   m3 = 2-tap average avg(a[i], a[i+1])           -> rows 0 and 2
;   m4 = 3-tap (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 -> rows 1 and 3
; Rows 2 and 3 use the same registers advanced by one pixel.
; The left column is not read.
322 INIT_XMM ssse3
323 cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
324 GET_GOT goffsetq
326 movq m3, [aboveq]
; m1 = shift by 2, m2 = shift by 1
327 pshufb m1, m3, [GLOBAL(sh_b23456777)]
328 pshufb m2, m3, [GLOBAL(sh_b12345677)]
; m4 = 3-tap result (m1 is clobbered by the macro)
330 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
; m3 = 2-tap result
331 pavgb m3, m2
333 ; store 4 lines
334 movd [dstq ], m3
335 movd [dstq+strideq], m4
336 lea dstq, [dstq+strideq*2]
; advance both filtered rows by one pixel
337 psrldq m3, 1
338 psrldq m4, 1
339 movd [dstq ], m3
340 movd [dstq+strideq], m4
341 RESTORE_GOT
342 RET
; d63_predictor_8x8: 63-degree prediction of an 8x8 block from above[0..7].
; The 8 input bytes are widened to 16 lanes with byte 7 repeated, then two
; filtered rows are built from the input a and its shift-by-1 / shift-by-2
; copies:
;   m3 = 2-tap average avg(a[i], a[i+1])           -> even rows
;   m4 = 3-tap (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 -> odd rows
; Every pair of rows uses the same registers advanced by one more pixel.
; The left column is not read. After DEFINE_ARGS the register that held
; `above` is reused as stride3.
; The widened input is kept only in m3: the 3-tap macro reads its first
; operand without modifying it, so no separate copy is needed.
344 INIT_XMM ssse3
345 cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
346 GET_GOT goffsetq
348 movq m3, [aboveq]
349 DEFINE_ARGS dst, stride, stride3
350 lea stride3q, [strideq*3]
; m1 = shift by 2, m2 = shift by 1, m3 = input widened to 16 lanes
351 pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
353 pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
354 pshufb m3, [GLOBAL(sh_b0123456777777777)]
; m4 = 3-tap result (m1 is clobbered by the macro, m3 is only read)
356 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
; m3 = 2-tap result
357 pavgb m3, m2
359 ; store 4 lines
360 movq [dstq ], m3
361 movq [dstq+strideq], m4
; advance both filtered rows by one pixel
362 psrldq m3, 1
363 psrldq m4, 1
364 movq [dstq+strideq*2], m3
365 movq [dstq+stride3q ], m4
366 lea dstq, [dstq+strideq*4]
367 psrldq m3, 1
368 psrldq m4, 1
370 ; store 4 lines
371 movq [dstq ], m3
372 movq [dstq+strideq], m4
373 psrldq m3, 1
374 psrldq m4, 1
375 movq [dstq+strideq*2], m3
376 movq [dstq+stride3q ], m4
377 RESTORE_GOT
378 RET
; d63_predictor_16x16: 63-degree prediction of a 16x16 block from
; above[0..15] (aligned load).
;   m0 = 2-tap average avg(a[i], a[i+1])           -> even rows
;   m4 = 3-tap (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 -> odd rows
; where the shifted copies repeat byte 15. After every pair of rows both
; registers are advanced by one pixel with pshufb m1 (sh_b123456789abcdeff).
; The loop runs four times and writes four rows per pass.
; NOTE(review): mova is x86inc's aligned move, so above, dst and stride are
; presumably 16-byte aligned - confirm with callers.
380 INIT_XMM ssse3
381 cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
382 GET_GOT goffsetq
384 mova m0, [aboveq]
385 DEFINE_ARGS dst, stride, stride3, line
386 lea stride3q, [strideq*3]
387 mova m1, [GLOBAL(sh_b123456789abcdeff)]
; m2 = shift by 2, m3 = shift by 1
388 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
389 pshufb m3, m0, m1
; m4 = 3-tap result (m2 is clobbered by the macro)
391 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
; m0 = 2-tap result
392 pavgb m0, m3
394 mov lined, 4
395 .loop:
396 mova [dstq ], m0
397 mova [dstq+strideq ], m4
398 pshufb m0, m1
399 pshufb m4, m1
400 mova [dstq+strideq*2], m0
401 mova [dstq+stride3q ], m4
402 pshufb m0, m1
403 pshufb m4, m1
404 lea dstq, [dstq+strideq*4]
405 dec lined
406 jnz .loop
407 RESTORE_GOT
408 REP_RET
; d63_predictor_32x32: 63-degree prediction of a 32x32 block from
; above[0..31] (two aligned 16-byte loads).
; Four filtered registers are built, low half = columns 0-15, high half =
; columns 16-31:
;   m0 / m7 = 2-tap average, low / high half -> even rows
;   m2 / m4 = 3-tap average, low / high half -> odd rows
; The high halves advance with pshufb m1 (byte 15 repeated); the low halves
; advance with palignr so bytes flow in from the matching high half.
; The loop runs eight times and writes four rows per pass.
; NOTE(review): mova is x86inc's aligned move, so above, dst and stride are
; presumably 16-byte aligned - confirm with callers.
410 INIT_XMM ssse3
411 cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
412 GET_GOT goffsetq
414 mova m0, [aboveq]
415 mova m7, [aboveq+16]
416 DEFINE_ARGS dst, stride, stride3, line
417 mova m1, [GLOBAL(sh_b123456789abcdeff)]
418 lea stride3q, [strideq*3]
; High half: m2 = shift by 2, m3 = shift by 1
419 pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
420 pshufb m3, m7, m1
; m4 = 3-tap result, high half (m2 is clobbered)
422 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
; Low half shifted by 1 (m6) and 2 (m5), taken before m7 is overwritten
423 palignr m6, m7, m0, 1
424 palignr m5, m7, m0, 2
; m7 = 2-tap result, high half
425 pavgb m7, m3
; m2 = 3-tap result, low half (m5 is clobbered)
427 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
; m0 = 2-tap result, low half
428 pavgb m0, m6
430 mov lined, 8
431 .loop:
432 mova [dstq ], m0
433 mova [dstq +16], m7
434 mova [dstq+strideq ], m2
435 mova [dstq+strideq +16], m4
; advance by one pixel: low halves into m3 / m5, high halves in place
436 palignr m3, m7, m0, 1
437 palignr m5, m4, m2, 1
438 pshufb m7, m1
439 pshufb m4, m1
441 mova [dstq+strideq*2 ], m3
442 mova [dstq+strideq*2+16], m7
443 mova [dstq+stride3q ], m5
444 mova [dstq+stride3q +16], m4
; advance again, low halves back into m0 / m2 for the next pass
445 palignr m0, m7, m3, 1
446 palignr m2, m4, m5, 1
447 pshufb m7, m1
448 pshufb m4, m1
449 lea dstq, [dstq+strideq*4]
450 dec lined
451 jnz .loop
452 RESTORE_GOT
453 REP_RET
; d153_predictor_4x4: 153-degree prediction of a 4x4 block.
; Reads 4 bytes at left[0..3] and 4 bytes starting at above[-1] (the top-left
; pixel followed by the first three above pixels). The edge is arranged as
; one vector running from the bottom of the left column up to the top-left
; and then along the above row; 2-tap and 3-tap averages of neighbouring edge
; pixels are interleaved and the rows are stored bottom-up, each row starting
; two bytes further into the interleaved vector.
; After DEFINE_ARGS the register that held `above` is reused as stride3.
455 INIT_XMM ssse3
456 cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
457 GET_GOT goffsetq
458 movd m0, [leftq] ; l1, l2, l3, l4
459 movd m1, [aboveq-1] ; tl, t1, t2, t3
460 punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
461 pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
462 psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
463 psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
464 ; comments below are for a predictor like this
465 ; A1 B1 C1 D1
466 ; A2 B2 A1 B1
467 ; A3 B3 A2 B2
468 ; A4 B4 A3 B3
469 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
470 pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
; low qword of m3 = 3-tap results, high qword = 2-tap results
472 punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
474 DEFINE_ARGS dst, stride, stride3
475 lea stride3q, [strideq*3]
476 pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
; rows are written from the bottom (row 3) to the top (row 0)
477 movd [dstq+stride3q ], m3
478 psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
479 movd [dstq+strideq*2], m3
480 psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
481 movd [dstq+strideq ], m3
482 psrldq m3, 2 ; A1 B1 C1 D1 ..
483 movd [dstq ], m3
484 RESTORE_GOT
485 RET
; d153_predictor_8x8: 153-degree prediction of an 8x8 block.
; Reads 8 bytes at left[0..7] into the low half of m0 and 8 bytes starting at
; above[-1] (top-left pixel first) into the high half. Three vectors are
; built:
;   m6 = 2-tap averages along the left column (A values),
;   m0 = 3-tap averages along the left column (B values),
;   m7 = 3-tap averages along the above row   (C..H values),
; then A and B are byte-interleaved into m6. Rows are stored bottom-up within
; each group of four; each row starts two bytes further into the interleaved
; data, and rows 0-2 take their trailing bytes from m7.
; After DEFINE_ARGS the register that held `above` is reused as stride3.
487 INIT_XMM ssse3
488 cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
489 GET_GOT goffsetq
490 movq m0, [leftq] ; [0- 7] l1-8 [byte]
491 movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
492 pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
493 pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
494 pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
495 pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
496 psrldq m4, m0, 1 ; t1-7 [word]
497 psrldq m5, m0, 2 ; t2-7 [word]
498 ; comments below are for a predictor like this
499 ; A1 B1 C1 D1 E1 F1 G1 H1
500 ; A2 B2 A1 B1 C1 D1 E1 F1
501 ; A3 B3 A2 B2 A1 B1 C1 D1
502 ; A4 B4 A3 B3 A2 B2 A1 B1
503 ; A5 B5 A4 B4 A3 B3 A2 B2
504 ; A6 B6 A5 B5 A4 B4 A3 B3
505 ; A7 B7 A6 B6 A5 B5 A4 B4
506 ; A8 B8 A7 B7 A6 B6 A5 B5
507 pavgb m6, m1, m2 ; 2-tap avg A8-A1
; m5 is clobbered by the macro
509 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
; m3 is clobbered by the macro; m0 is reused as the result register
511 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
513 punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
515 DEFINE_ARGS dst, stride, stride3
516 lea stride3q, [strideq*3]
; rows 3..0: row 3 is the high half of m6; rows 2..0 pull in bytes from m7
518 movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
519 palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
520 movq [dstq+strideq*2], m0
521 psrldq m0, 2 ; A-B2, A-B1, C-H1
522 movq [dstq+strideq ], m0
523 psrldq m0, 2 ; A-H1
524 movq [dstq ], m0
525 lea dstq, [dstq+strideq*4]
; rows 7..4 come entirely from the interleaved left-column data
526 movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
527 psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
528 movq [dstq+strideq*2], m6
529 psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
530 movq [dstq+strideq ], m6
531 psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
532 movq [dstq ], m6
533 RESTORE_GOT
534 RET
; d153_predictor_16x16: 153-degree prediction of a 16x16 block.
; Reads left[0..15] (aligned load) and 16 bytes starting at above[-1]
; (unaligned load, top-left pixel first). Three sets of values are built:
;   A = 2-tap averages along the left column,
;   B = 3-tap averages along the left column,
;   C.. = 3-tap averages along the above row (kept in m1).
; A and B are byte-interleaved and byte-reversed so that palignr can slide a
; 16-byte window over [m4 | m6 | m1]: rows 0-7 read from m1:m6, rows 8-15
; from m6:m4, moving two bytes per row.
; After DEFINE_ARGS the register that held `above` is reused as stride3.
; NOTE(review): mova is x86inc's aligned move, so left, dst and stride are
; presumably 16-byte aligned - confirm with callers.
536 INIT_XMM ssse3
537 cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
538 GET_GOT goffsetq
539 mova m0, [leftq]
540 movu m7, [aboveq-1]
541 ; comments below are for a predictor like this
542 ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
543 ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
544 ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
545 ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
546 ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
547 ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
548 ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
549 ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
550 ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
551 ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
552 ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
553 ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
554 ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
555 ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
556 ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
557 ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
; m6 = above row reversed, so its top bytes are the pixels nearest the
; top-left corner; m5 / m3 = left column shifted down by 1 / 2 with those
; pixels flowing in.
558 pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
559 palignr m5, m0, m6, 15
560 palignr m3, m0, m6, 14
; m4 = 3-tap result along the left column (m3 is clobbered)
562 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
564 pavgb m5, m0 ; A1 - Ag
566 punpcklbw m0, m4, m5 ; A-B8 ... A-B1
567 punpckhbw m4, m5 ; A-B9 ... A-Bg
; above row shifted by 1 (m3) and 2 (m5), byte 15 repeated
569 pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
570 pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
; m1 = 3-tap result along the above row (m5 is clobbered)
572 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
; reverse the interleaved pairs so palignr can step two bytes per row
574 pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
575 DEFINE_ARGS dst, stride, stride3
576 lea stride3q, [strideq*3]
; rows 0-7: window over m1:m6
577 palignr m2, m1, m6, 14
578 mova [dstq ], m2
579 palignr m2, m1, m6, 12
580 mova [dstq+strideq ], m2
581 palignr m2, m1, m6, 10
582 mova [dstq+strideq*2], m2
583 palignr m2, m1, m6, 8
584 mova [dstq+stride3q ], m2
585 lea dstq, [dstq+strideq*4]
586 palignr m2, m1, m6, 6
587 mova [dstq ], m2
588 palignr m2, m1, m6, 4
589 mova [dstq+strideq ], m2
590 palignr m2, m1, m6, 2
591 mova [dstq+strideq*2], m2
592 pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
593 mova [dstq+stride3q ], m6
594 lea dstq, [dstq+strideq*4]
; rows 8-15: window over m6:m4
596 palignr m2, m6, m4, 14
597 mova [dstq ], m2
598 palignr m2, m6, m4, 12
599 mova [dstq+strideq ], m2
600 palignr m2, m6, m4, 10
601 mova [dstq+strideq*2], m2
602 palignr m2, m6, m4, 8
603 mova [dstq+stride3q ], m2
604 lea dstq, [dstq+strideq*4]
605 palignr m2, m6, m4, 6
606 mova [dstq ], m2
607 palignr m2, m6, m4, 4
608 mova [dstq+strideq ], m2
609 palignr m2, m6, m4, 2
610 mova [dstq+strideq*2], m2
611 mova [dstq+stride3q ], m4
612 RESTORE_GOT
613 RET
; d153_predictor_32x32: 153-degree prediction of a 32x32 block.
; Reads left[0..31] (aligned loads) and 32 bytes starting at above[-1]
; (two unaligned loads, top-left pixel first). The values built are:
;   m1 / m2 = 3-tap averages along the above row, low / high 16 bytes,
;   m6, m4  = reversed, interleaved (2-tap, 3-tap) pairs for left[0..7] and
;             left[8..15],
;   m3, m2  = the same for left[16..23] and left[24..31] (computed after the
;             first 16 rows are written, reusing m2 and m3).
; Every row is a 32-byte window over this chain that moves two bytes per row;
; palignr with offsets 14, 12, ..., 2, 0 produces eight rows per group.
; DEFINE_ARGS keeps `left` named because it is reloaded for rows 16-31; the
; register that held `above` is reused as stride3.
; NOTE(review): mova is x86inc's aligned move, so left, dst and stride are
; presumably 16-byte aligned - confirm with callers.
615 INIT_XMM ssse3
616 cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
617 GET_GOT goffsetq
618 mova m0, [leftq]
619 movu m7, [aboveq-1]
620 movu m1, [aboveq+15]
; high half of the above row shifted by 1 (m4) and 2 (m6), byte 15 repeated
622 pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
623 pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
625 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
; low half shifted by 1 (m3) and 2 (m5), bytes flow in from the high half
627 palignr m3, m1, m7, 1
628 palignr m5, m1, m7, 2
630 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
; reverse the low above bytes so the pixels nearest the top-left corner can
; be shifted into the left column
632 pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
633 palignr m5, m0, m7, 15
634 palignr m3, m0, m7, 14
636 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
637 pavgb m5, m0 ; A1 - Ag
638 punpcklbw m6, m4, m5 ; A-B8 ... A-B1
639 punpckhbw m4, m5 ; A-B9 ... A-Bg
640 pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
641 pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
643 DEFINE_ARGS dst, stride, stride3, left, line
644 lea stride3q, [strideq*3]
; rows 0-7: left half from m1:m6, right half from m2:m1
646 palignr m5, m2, m1, 14
647 palignr m7, m1, m6, 14
648 mova [dstq ], m7
649 mova [dstq+16 ], m5
650 palignr m5, m2, m1, 12
651 palignr m7, m1, m6, 12
652 mova [dstq+strideq ], m7
653 mova [dstq+strideq+16 ], m5
654 palignr m5, m2, m1, 10
655 palignr m7, m1, m6, 10
656 mova [dstq+strideq*2 ], m7
657 mova [dstq+strideq*2+16], m5
658 palignr m5, m2, m1, 8
659 palignr m7, m1, m6, 8
660 mova [dstq+stride3q ], m7
661 mova [dstq+stride3q+16 ], m5
662 lea dstq, [dstq+strideq*4]
663 palignr m5, m2, m1, 6
664 palignr m7, m1, m6, 6
665 mova [dstq ], m7
666 mova [dstq+16 ], m5
667 palignr m5, m2, m1, 4
668 palignr m7, m1, m6, 4
669 mova [dstq+strideq ], m7
670 mova [dstq+strideq+16 ], m5
671 palignr m5, m2, m1, 2
672 palignr m7, m1, m6, 2
673 mova [dstq+strideq*2 ], m7
674 mova [dstq+strideq*2+16], m5
675 mova [dstq+stride3q ], m6
676 mova [dstq+stride3q+16 ], m1
677 lea dstq, [dstq+strideq*4]
; rows 8-15: left half from m6:m4, right half from m1:m6
679 palignr m5, m1, m6, 14
680 palignr m3, m6, m4, 14
681 mova [dstq ], m3
682 mova [dstq+16 ], m5
683 palignr m5, m1, m6, 12
684 palignr m3, m6, m4, 12
685 mova [dstq+strideq ], m3
686 mova [dstq+strideq+16 ], m5
687 palignr m5, m1, m6, 10
688 palignr m3, m6, m4, 10
689 mova [dstq+strideq*2 ], m3
690 mova [dstq+strideq*2+16], m5
691 palignr m5, m1, m6, 8
692 palignr m3, m6, m4, 8
693 mova [dstq+stride3q ], m3
694 mova [dstq+stride3q+16 ], m5
695 lea dstq, [dstq+strideq*4]
696 palignr m5, m1, m6, 6
697 palignr m3, m6, m4, 6
698 mova [dstq ], m3
699 mova [dstq+16 ], m5
700 palignr m5, m1, m6, 4
701 palignr m3, m6, m4, 4
702 mova [dstq+strideq ], m3
703 mova [dstq+strideq+16 ], m5
704 palignr m5, m1, m6, 2
705 palignr m3, m6, m4, 2
706 mova [dstq+strideq*2 ], m3
707 mova [dstq+strideq*2+16], m5
708 mova [dstq+stride3q ], m4
709 mova [dstq+stride3q+16 ], m6
710 lea dstq, [dstq+strideq*4]
; second half of the left column: m7 = left[0..15], m3 = left[16..31];
; m5 / m0 = left[16..31] shifted down by 1 / 2 with bytes from m7
712 mova m7, [leftq]
713 mova m3, [leftq+16]
714 palignr m5, m3, m7, 15
715 palignr m0, m3, m7, 14
717 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
718 pavgb m5, m3 ; Ah -
719 punpcklbw m3, m2, m5 ; A-B8 ... A-B1
720 punpckhbw m2, m5 ; A-B9 ... A-Bg
721 pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
722 pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
; rows 16-23: left half from m4:m3, right half from m6:m4
724 palignr m7, m6, m4, 14
725 palignr m0, m4, m3, 14
726 mova [dstq ], m0
727 mova [dstq+16 ], m7
728 palignr m7, m6, m4, 12
729 palignr m0, m4, m3, 12
730 mova [dstq+strideq ], m0
731 mova [dstq+strideq+16 ], m7
732 palignr m7, m6, m4, 10
733 palignr m0, m4, m3, 10
734 mova [dstq+strideq*2 ], m0
735 mova [dstq+strideq*2+16], m7
736 palignr m7, m6, m4, 8
737 palignr m0, m4, m3, 8
738 mova [dstq+stride3q ], m0
739 mova [dstq+stride3q+16 ], m7
740 lea dstq, [dstq+strideq*4]
741 palignr m7, m6, m4, 6
742 palignr m0, m4, m3, 6
743 mova [dstq ], m0
744 mova [dstq+16 ], m7
745 palignr m7, m6, m4, 4
746 palignr m0, m4, m3, 4
747 mova [dstq+strideq ], m0
748 mova [dstq+strideq+16 ], m7
749 palignr m7, m6, m4, 2
750 palignr m0, m4, m3, 2
751 mova [dstq+strideq*2 ], m0
752 mova [dstq+strideq*2+16], m7
753 mova [dstq+stride3q ], m3
754 mova [dstq+stride3q+16 ], m4
755 lea dstq, [dstq+strideq*4]
; rows 24-31: left half from m3:m2, right half from m4:m3
757 palignr m7, m4, m3, 14
758 palignr m0, m3, m2, 14
759 mova [dstq ], m0
760 mova [dstq+16 ], m7
761 palignr m7, m4, m3, 12
762 palignr m0, m3, m2, 12
763 mova [dstq+strideq ], m0
764 mova [dstq+strideq+16 ], m7
765 palignr m7, m4, m3, 10
766 palignr m0, m3, m2, 10
767 mova [dstq+strideq*2 ], m0
768 mova [dstq+strideq*2+16], m7
769 palignr m7, m4, m3, 8
770 palignr m0, m3, m2, 8
771 mova [dstq+stride3q ], m0
772 mova [dstq+stride3q+16 ], m7
773 lea dstq, [dstq+strideq*4]
774 palignr m7, m4, m3, 6
775 palignr m0, m3, m2, 6
776 mova [dstq ], m0
777 mova [dstq+16 ], m7
778 palignr m7, m4, m3, 4
779 palignr m0, m3, m2, 4
780 mova [dstq+strideq ], m0
781 mova [dstq+strideq+16 ], m7
782 palignr m7, m4, m3, 2
783 palignr m0, m3, m2, 2
784 mova [dstq+strideq*2 ], m0
785 mova [dstq+strideq*2+16], m7
786 mova [dstq+stride3q ], m2
787 mova [dstq+stride3q+16 ], m3
789 RESTORE_GOT
790 RET
; d207_predictor_4x4: 207-degree prediction of a 4x4 block from left[0..3].
; With the left pixels a, b, c, d and the shifted copies repeating d:
;   m1 = 2-tap averages  avg(a,b), avg(b,c), avg(c,d), d
;   m2 = 3-tap averages  (x + 2*y + z + 2) >> 2 of the three copies
; The two are byte-interleaved; row r starts 2*r bytes into the interleaved
; vector, and the last row is filled with the final pair (d, d).
; The third argument is named `unused` and is never read.
792 INIT_MMX ssse3
793 cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
794 GET_GOT goffsetq
795 movd m0, [leftq] ; abcd [byte]
796 pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
797 pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
; m2 = 3-tap result (m3 is clobbered by the macro)
799 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
800 pavgb m1, m0 ; ab, bc, cd, d [byte]
802 punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
803 movd [dstq ], m1
804 psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
805 movd [dstq+strideq], m1
806 lea dstq, [dstq+strideq*2]
807 psrlq m1, 16 ; cd, c3d, d, d
808 movd [dstq ], m1
; broadcast word 1 (the d, d pair) to all four words
809 pshufw m1, m1, q1111 ; d, d, d, d
810 movd [dstq+strideq], m1
811 RESTORE_GOT
812 RET
; d207_predictor_8x8: 207-degree prediction of an 8x8 block from left[0..7].
; The 8 left pixels are widened to 16 lanes with byte 7 repeated; the 2-tap
; averages (m0) and 3-tap averages (m3) of the input and its shift-by-1 /
; shift-by-2 copies are byte-interleaved into m0. Row r starts 2*r bytes into
; the interleaved vector. Before rows 4-7 the upper four words are overwritten
; with the last pair so the shifts keep feeding the repeated final pixel.
; The third register is named stride3 in cglobal and is overwritten with
; stride*3 immediately, so the third argument's value is not used.
814 INIT_XMM ssse3
815 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
816 GET_GOT goffsetq
817 movq m3, [leftq] ; abcdefgh [byte]
818 lea stride3q, [strideq*3]
; m1 = shift by 2, m0 = input widened, m2 = shift by 1
820 pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
821 pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
822 pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
; m3 = 3-tap result (m1 is clobbered by the macro)
824 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
; m0 = 2-tap result
825 pavgb m0, m2
826 punpcklbw m0, m3 ; interleaved output
828 movq [dstq ], m0
829 psrldq m0, 2
830 movq [dstq+strideq ], m0
831 psrldq m0, 2
832 movq [dstq+strideq*2], m0
833 psrldq m0, 2
834 movq [dstq+stride3q ], m0
835 lea dstq, [dstq+strideq*4]
; psrldq shifted zeros into the top; refill the high four words with word 4
; (the h, h pair) before continuing
836 pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
837 psrldq m0, 2
838 movq [dstq ], m0
839 psrldq m0, 2
840 movq [dstq+strideq ], m0
841 psrldq m0, 2
842 movq [dstq+strideq*2], m0
843 psrldq m0, 2
844 movq [dstq+stride3q ], m0
845 RESTORE_GOT
846 RET
; d207_predictor_16x16: 207-degree prediction of a 16x16 block from
; left[0..15] (aligned load).
; The 2-tap averages (m1) and 3-tap averages (m3) of the left column and its
; shift-by-1 / shift-by-2 copies (byte 15 repeated) are byte-interleaved into
; a 32-byte sequence: low half in m1, high half in m4. Row r starts 2*r bytes
; into that sequence:
;   rows 0-7  - palignr window over m4:m1,
;   rows 8-15 - m4 shifted two bytes per row with its last byte repeated
;               (pshufb with sh_b23456789abcdefff).
; The third register is named stride3 in cglobal and is overwritten with
; stride*3 immediately; after DEFINE_ARGS the register that held `left`
; becomes the loop counter.
; NOTE(review): mova is x86inc's aligned move, so left, dst and stride are
; presumably 16-byte aligned - confirm with callers.
848 INIT_XMM ssse3
849 cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
850 GET_GOT goffsetq
851 lea stride3q, [strideq*3]
852 mova m0, [leftq] ; abcdefghijklmnop [byte]
853 pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
854 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
; m3 = 3-tap result (m2 is clobbered by the macro)
856 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
857 pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
859 punpckhbw m4, m1, m3 ; interleaved output, high half
860 punpcklbw m1, m3 ; interleaved output
861 mova [dstq ], m1
862 palignr m3, m4, m1, 2
863 mova [dstq+strideq ], m3
864 palignr m3, m4, m1, 4
865 mova [dstq+strideq*2], m3
866 palignr m3, m4, m1, 6
867 mova [dstq+stride3q ], m3
868 lea dstq, [dstq+strideq*4]
869 palignr m3, m4, m1, 8
870 mova [dstq ], m3
871 palignr m3, m4, m1, 10
872 mova [dstq+strideq ], m3
873 palignr m3, m4, m1, 12
874 mova [dstq+strideq*2], m3
875 palignr m3, m4, m1, 14
876 mova [dstq+stride3q ], m3
877 DEFINE_ARGS dst, stride, stride3, line
; two passes of four rows each cover rows 8-15; dstq is advanced at the top
; of the loop because it still points at row 4 here
878 mov lined, 2
879 mova m0, [GLOBAL(sh_b23456789abcdefff)]
880 .loop:
881 lea dstq, [dstq+strideq*4]
882 mova [dstq ], m4
883 pshufb m4, m0
884 mova [dstq+strideq ], m4
885 pshufb m4, m0
886 mova [dstq+strideq*2], m4
887 pshufb m4, m0
888 mova [dstq+stride3q ], m4
889 pshufb m4, m0
890 dec lined
891 jnz .loop
892 RESTORE_GOT
893 REP_RET
; d207_predictor_32x32: 207-degree prediction of a 32x32 block from
; left[0..31] (two aligned loads).
; The 2-tap and 3-tap averages of the left column and its shift-by-1 /
; shift-by-2 copies (last byte repeated) are byte-interleaved into a 64-byte
; sequence held in m1, m6, m2, m7 (in that order). Row r is the 32 bytes that
; start 2*r bytes into the sequence, so the right half of row r equals the
; left half of row r+8; the code exploits this by storing each computed
; 16-byte piece twice (through dst and dst8). Past the end of the sequence
; the last byte is repeated (pshufb with sh_b23456789abcdefff).
; The third register is named stride3 in cglobal and is overwritten with
; stride*3 immediately; after DEFINE_ARGS the register that held `left`
; becomes dst8.
; NOTE(review): mova is x86inc's aligned move, so left, dst and stride are
; presumably 16-byte aligned - confirm with callers.
895 INIT_XMM ssse3
896 cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
897 GET_GOT goffsetq
898 lea stride3q, [strideq*3]
899 mova m1, [leftq] ; 0-15 [byte]
900 mova m2, [leftq+16] ; 16-31 [byte]
; high half shifted by 2 (m0) and 1 (m4), byte 15 repeated
901 pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
902 pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
; m3 = 3-tap result, high half (m0 is clobbered by the macro)
904 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
; low half shifted by 1 (m6) and 2 (m5), bytes flow in from the high half
905 palignr m6, m2, m1, 1
906 palignr m5, m2, m1, 2
907 pavgb m2, m4 ; high 16px even lines
; m0 = 3-tap result, low half (m5 is clobbered by the macro)
909 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
910 pavgb m1, m6 ; low 16px even lines
912 punpckhbw m6, m1, m0 ; interleaved output 2
913 punpcklbw m1, m0 ; interleaved output 1
915 punpckhbw m7, m2, m3 ; interleaved output 4
916 punpcklbw m2, m3 ; interleaved output 3
918 ; output 1st 8 lines (and half of 2nd 8 lines)
919 DEFINE_ARGS dst, stride, stride3, dst8
920 lea dst8q, [dstq+strideq*8]
921 mova [dstq ], m1
922 mova [dstq +16], m6
923 mova [dst8q ], m6
924 palignr m0, m6, m1, 2
925 palignr m4, m2, m6, 2
926 mova [dstq +strideq ], m0
927 mova [dstq +strideq +16], m4
928 mova [dst8q+strideq ], m4
929 palignr m0, m6, m1, 4
930 palignr m4, m2, m6, 4
931 mova [dstq +strideq*2 ], m0
932 mova [dstq +strideq*2+16], m4
933 mova [dst8q+strideq*2 ], m4
934 palignr m0, m6, m1, 6
935 palignr m4, m2, m6, 6
936 mova [dstq +stride3q ], m0
937 mova [dstq +stride3q +16], m4
938 mova [dst8q+stride3q ], m4
939 lea dstq, [dstq +strideq*4]
940 lea dst8q, [dst8q+strideq*4]
941 palignr m0, m6, m1, 8
942 palignr m4, m2, m6, 8
943 mova [dstq ], m0
944 mova [dstq +16], m4
945 mova [dst8q ], m4
946 palignr m0, m6, m1, 10
947 palignr m4, m2, m6, 10
948 mova [dstq +strideq ], m0
949 mova [dstq +strideq +16], m4
950 mova [dst8q+strideq ], m4
951 palignr m0, m6, m1, 12
952 palignr m4, m2, m6, 12
953 mova [dstq +strideq*2 ], m0
954 mova [dstq +strideq*2+16], m4
955 mova [dst8q+strideq*2 ], m4
956 palignr m0, m6, m1, 14
957 palignr m4, m2, m6, 14
958 mova [dstq +stride3q ], m0
959 mova [dstq +stride3q +16], m4
960 mova [dst8q+stride3q ], m4
961 lea dstq, [dstq+strideq*4]
962 lea dst8q, [dst8q+strideq*4]
964 ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
; dstq is at row 8, dst8q at row 16; the window now slides over m7:m2
965 mova [dstq +16], m2
966 mova [dst8q ], m2
967 palignr m4, m7, m2, 2
968 mova [dstq +strideq +16], m4
969 mova [dst8q+strideq ], m4
970 palignr m4, m7, m2, 4
971 mova [dstq +strideq*2+16], m4
972 mova [dst8q+strideq*2 ], m4
973 palignr m4, m7, m2, 6
974 mova [dstq +stride3q +16], m4
975 mova [dst8q+stride3q ], m4
976 lea dstq, [dstq+strideq*4]
977 lea dst8q, [dst8q+strideq*4]
978 palignr m4, m7, m2, 8
979 mova [dstq +16], m4
980 mova [dst8q ], m4
981 palignr m4, m7, m2, 10
982 mova [dstq +strideq +16], m4
983 mova [dst8q+strideq ], m4
984 palignr m4, m7, m2, 12
985 mova [dstq +strideq*2+16], m4
986 mova [dst8q+strideq*2 ], m4
987 palignr m4, m7, m2, 14
988 mova [dstq +stride3q +16], m4
989 mova [dst8q+stride3q ], m4
990 lea dstq, [dstq+strideq*4]
991 lea dst8q, [dst8q+strideq*4]
993 ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
; dstq is at row 16, dst8q at row 24; m7 is the last piece of the sequence,
; so each step shifts it by two bytes and repeats its final byte
994 mova m0, [GLOBAL(sh_b23456789abcdefff)]
995 mova [dstq +16], m7
996 mova [dst8q ], m7
997 pshufb m7, m0
998 mova [dstq +strideq +16], m7
999 mova [dst8q+strideq ], m7
1000 pshufb m7, m0
1001 mova [dstq +strideq*2+16], m7
1002 mova [dst8q+strideq*2 ], m7
1003 pshufb m7, m0
1004 mova [dstq +stride3q +16], m7
1005 mova [dst8q+stride3q ], m7
1006 pshufb m7, m0
1007 lea dstq, [dstq+strideq*4]
1008 lea dst8q, [dst8q+strideq*4]
1009 mova [dstq +16], m7
1010 mova [dst8q ], m7
1011 pshufb m7, m0
1012 mova [dstq +strideq +16], m7
1013 mova [dst8q+strideq ], m7
1014 pshufb m7, m0
1015 mova [dstq +strideq*2+16], m7
1016 mova [dst8q+strideq*2 ], m7
1017 pshufb m7, m0
1018 mova [dstq +stride3q +16], m7
1019 mova [dst8q+stride3q ], m7
1020 pshufb m7, m0
1021 lea dstq, [dstq+strideq*4]
1023 ; output last half of 4th 8 lines
; dstq is at row 24; after eight shifts m7 is one repeated byte
1024 mova [dstq +16], m7
1025 mova [dstq +strideq +16], m7
1026 mova [dstq +strideq*2+16], m7
1027 mova [dstq +stride3q +16], m7
1028 lea dstq, [dstq+strideq*4]
1029 mova [dstq +16], m7
1030 mova [dstq +strideq +16], m7
1031 mova [dstq +strideq*2+16], m7
1032 mova [dstq +stride3q +16], m7
1034 ; done!
1035 RESTORE_GOT
1036 RET