media/libvpx/vp8/common/x86/idctllm_sse2.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:c2708fafe3e5
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ;void vp8_idct_dequant_0_2x_sse2
15 ; (
16 ; short *qcoeff - 0
17 ; short *dequant - 1
18 ; unsigned char *dst - 2
19 ; int dst_stride - 3
20 ; )
21 ; DC-only path: word 0 of qcoeff and the word 32 bytes later are each multiplied by dequant word 0, rounded ((x + 4) >> 3) and added to four 8-byte rows of dst.
22 global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
23 sym(vp8_idct_dequant_0_2x_sse2):
24 push rbp
25 mov rbp, rsp
26 SHADOW_ARGS_TO_STACK 4
27 GET_GOT rbx
28 ; end prolog
29
30 mov rdx, arg(1) ; dequant
31 mov rax, arg(0) ; qcoeff
32 ; load the first dword (words 0-1) of each buffer; only word 0 is used after the broadcast below
33 movd xmm4, [rax]
34 movd xmm5, [rdx]
35 ; word 4 <- the coefficient at qcoeff+32 bytes, paired with dequant word 0 again
36 pinsrw xmm4, [rax+32], 4
37 pinsrw xmm5, [rdx], 4
38 ; dequantize: per-word multiply, low 16 bits kept
39 pmullw xmm4, xmm5
40
41 ; Zero out xmm5, for use unpacking
42 pxor xmm5, xmm5
43
44 ; clear coeffs (4 bytes at qcoeff and 4 bytes at qcoeff+32)
45 movd [rax], xmm5
46 movd [rax+32], xmm5
47 ; the broadcast is done with pshuflw/pshufhw below (pshufb is not used)
48 mov rax, arg(2) ; dst
49 movsxd rdx, dword ptr arg(3) ; dst_stride
50 ; replicate word 0 across the low four words and word 4 across the high four
51 pshuflw xmm4, xmm4, 00000000b
52 pshufhw xmm4, xmm4, 00000000b
53 ; rcx = 3 * dst_stride (offset of the fourth row)
54 lea rcx, [rdx + rdx*2]
55 paddw xmm4, [GLOBAL(fours)]
56 ; (value + 4) >> 3, arithmetic shift
57 psraw xmm4, 3
58 ; load 8 bytes from each of four consecutive dst rows
59 movq xmm0, [rax]
60 movq xmm1, [rax+rdx]
61 movq xmm2, [rax+2*rdx]
62 movq xmm3, [rax+rcx]
63 ; widen the bytes to words by interleaving with the zero register
64 punpcklbw xmm0, xmm5
65 punpcklbw xmm1, xmm5
66 punpcklbw xmm2, xmm5
67 punpcklbw xmm3, xmm5
68
69
70 ; Add to predict buffer
71 paddw xmm0, xmm4
72 paddw xmm1, xmm4
73 paddw xmm2, xmm4
74 paddw xmm3, xmm4
75
76 ; pack up before storing (words -> bytes with unsigned saturation)
77 packuswb xmm0, xmm5
78 packuswb xmm1, xmm5
79 packuswb xmm2, xmm5
80 packuswb xmm3, xmm5
81
82 ; store blocks back out
83 movq [rax], xmm0
84 movq [rax + rdx], xmm1
85 ; step rax down two rows so the last two stores need only rax and rdx
86 lea rax, [rax + 2*rdx]
87
88 movq [rax], xmm2
89 movq [rax + rdx], xmm3
90
91 ; begin epilog
92 RESTORE_GOT
93 UNSHADOW_ARGS
94 pop rbp
95 ret
96
97 ;void vp8_idct_dequant_full_2x_sse2
98 ; (
99 ; short *qcoeff - 0
100 ; short *dequant - 1
101 ; unsigned char *dst - 2
102 ; int dst_stride - 3
103 ; )
104 global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
105 sym(vp8_idct_dequant_full_2x_sse2):
106 push rbp
107 mov rbp, rsp
108 SHADOW_ARGS_TO_STACK 4
109 SAVE_XMM 7
110 GET_GOT rbx
111 push rsi ; NOTE(review): rsi is not referenced anywhere in this routine
112 push rdi
113 ; end prolog
114
115 ; full path: 64 bytes of qcoeff (two blocks) are loaded, cleared, dequantized,
116 ; run through two butterfly passes and added to four 8-byte rows of dst
117 mov rax, arg(0) ; qcoeff
118 mov rdx, arg(1) ; dequant
119 mov rdi, arg(2) ; dst
120
121
122 ; Zero out xmm7; used just below to clear the coefficient buffer (it is re-zeroed before unpacking)
123 pxor xmm7, xmm7
124
125
126 ; note the swapped load order of xmm1 and xmm2, necessary for the shuffle
127 ; to produce sensible data
128 movdqa xmm0, [rax]
129 movdqa xmm2, [rax+16]
130 movdqa xmm1, [rax+32]
131 movdqa xmm3, [rax+48]
132
133 ; Clear out coeffs
134 movdqa [rax], xmm7
135 movdqa [rax+16], xmm7
136 movdqa [rax+32], xmm7
137 movdqa [rax+48], xmm7
138
139 ; dequantize qcoeff buffer (the same 32 bytes of dequant are applied to both blocks)
140 pmullw xmm0, [rdx]
141 pmullw xmm2, [rdx+16]
142 pmullw xmm1, [rdx]
143 pmullw xmm3, [rdx+16]
144 movsxd rdx, dword ptr arg(3) ; dst_stride
145
146 ; repack so block 0 row x and block 1 row x are together
147 movdqa xmm4, xmm0
148 punpckldq xmm0, xmm1
149 punpckhdq xmm4, xmm1
150
151 pshufd xmm0, xmm0, 11011000b
152 pshufd xmm1, xmm4, 11011000b
153
154 movdqa xmm4, xmm2
155 punpckldq xmm2, xmm3
156 punpckhdq xmm4, xmm3
157
158 pshufd xmm2, xmm2, 11011000b
159 pshufd xmm3, xmm4, 11011000b
160
161 ; first pass
162 psubw xmm0, xmm2 ; b1 = 0-2
163 paddw xmm2, xmm2 ;
164
165 movdqa xmm5, xmm1
166 paddw xmm2, xmm0 ; a1 = 0+2
167 ; pmulhw keeps the high 16 bits of the signed product; the following paddw adds the input back
168 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
169 lea rcx, [rdx + rdx*2] ;dst_stride * 3
170 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
171
172 movdqa xmm7, xmm3
173 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
174
175 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
176 psubw xmm7, xmm5 ; c1
177
178 movdqa xmm5, xmm1
179 movdqa xmm4, xmm3
180
181 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
182 paddw xmm5, xmm1
183
184 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
185 paddw xmm3, xmm4
186
187 paddw xmm3, xmm5 ; d1
188 movdqa xmm6, xmm2 ; a1
189
190 movdqa xmm4, xmm0 ; b1
191 paddw xmm2, xmm3 ;0
192
193 paddw xmm4, xmm7 ;1
194 psubw xmm0, xmm7 ;2
195
196 psubw xmm6, xmm3 ;3
197
198 ; transpose for the second pass
199 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
200 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
201 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
202
203 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
204 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
205 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
206
207
208 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
209 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
210 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
211
212 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
213 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
214 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
215
216
217 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
218 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
219 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
220
221 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
222 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
223 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
224
225 pshufd xmm0, xmm2, 11011000b
226 pshufd xmm2, xmm1, 11011000b
227
228 pshufd xmm1, xmm5, 11011000b
229 pshufd xmm3, xmm7, 11011000b
230
231 ; second pass
232 psubw xmm0, xmm2 ; b1 = 0-2
233 paddw xmm2, xmm2
234
235 movdqa xmm5, xmm1
236 paddw xmm2, xmm0 ; a1 = 0+2
237
238 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
239 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
240
241 movdqa xmm7, xmm3
242 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
243
244 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
245 psubw xmm7, xmm5 ; c1
246
247 movdqa xmm5, xmm1
248 movdqa xmm4, xmm3
249
250 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
251 paddw xmm5, xmm1
252
253 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
254 paddw xmm3, xmm4
255
256 paddw xmm3, xmm5 ; d1
257 paddw xmm0, [GLOBAL(fours)]
258 ; the +4 rounding bias is added to a1 and b1 once, so all four outputs below carry it
259 paddw xmm2, [GLOBAL(fours)]
260 movdqa xmm6, xmm2 ; a1
261
262 movdqa xmm4, xmm0 ; b1
263 paddw xmm2, xmm3 ;0
264
265 paddw xmm4, xmm7 ;1
266 psubw xmm0, xmm7 ;2
267
268 psubw xmm6, xmm3 ;3
269 psraw xmm2, 3
270
271 psraw xmm0, 3
272 psraw xmm4, 3
273
274 psraw xmm6, 3
275
276 ; transpose to save
277 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
278 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
279 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
280
281 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
282 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
283 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
284
285
286 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
287 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
288 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
289
290 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
291 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
292 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
293
294
295 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
296 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
297 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
298
299 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
300 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
301 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
302
303 pshufd xmm0, xmm2, 11011000b
304 pshufd xmm2, xmm1, 11011000b
305
306 pshufd xmm1, xmm5, 11011000b
307 pshufd xmm3, xmm7, 11011000b
308 ; xmm7 was used as scratch above; zero it again for the byte->word unpack
309 pxor xmm7, xmm7
310
311 ; Load up predict blocks
312 movq xmm4, [rdi]
313 movq xmm5, [rdi+rdx]
314
315 punpcklbw xmm4, xmm7
316 punpcklbw xmm5, xmm7
317
318 paddw xmm0, xmm4
319 paddw xmm1, xmm5
320
321 movq xmm4, [rdi+2*rdx]
322 movq xmm5, [rdi+rcx]
323
324 punpcklbw xmm4, xmm7
325 punpcklbw xmm5, xmm7
326
327 paddw xmm2, xmm4
328 paddw xmm3, xmm5
329
330 .finish: ; local label; nothing in this routine jumps to it
331
332 ; pack up before storing (words -> bytes with unsigned saturation)
333 packuswb xmm0, xmm7
334 packuswb xmm1, xmm7
335 packuswb xmm2, xmm7
336 packuswb xmm3, xmm7
337
338 ; store blocks back out
339 movq [rdi], xmm0
340 movq [rdi + rdx], xmm1
341 movq [rdi + rdx*2], xmm2
342 movq [rdi + rcx], xmm3
343
344 ; begin epilog
345 pop rdi
346 pop rsi
347 RESTORE_GOT
348 RESTORE_XMM
349 UNSHADOW_ARGS
350 pop rbp
351 ret
352
353 ;void vp8_idct_dequant_dc_0_2x_sse2
354 ; (
355 ; short *qcoeff - 0
356 ; short *dequant - 1
357 ; unsigned char *dst - 2
358 ; int dst_stride - 3
359 ; short *dc - 4
360 ; )
361 global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
362 sym(vp8_idct_dequant_dc_0_2x_sse2):
363 push rbp
364 mov rbp, rsp
365 SHADOW_ARGS_TO_STACK 5
366 GET_GOT rbx
367 push rdi
368 ; end prolog
369
370 ; special case when 2 blocks have 0 or 1 coeffs
371 ; the two DC values are read from dc (arg 4), so qcoeff and dequant are not read
372 ; qcoeff (arg 0) is deliberately not loaded: no instruction below uses it
373
374 mov rdi, arg(2) ; dst
375 mov rdx, arg(4) ; dc
376
377 ; Zero out xmm5, for use unpacking
378 pxor xmm5, xmm5
379
380 ; load up 2 dc words here == 2*16 = doubleword
381 movd xmm4, [rdx]
382 ; rdx is reused: from here on it holds the sign-extended stride, rcx = 3 * stride
383 movsxd rdx, dword ptr arg(3) ; dst_stride
384 lea rcx, [rdx + rdx*2]
385 ; Load up predict blocks
386 movq xmm0, [rdi]
387 movq xmm1, [rdi+rdx*1]
388 movq xmm2, [rdi+rdx*2]
389 movq xmm3, [rdi+rcx]
390
391 ; Duplicate and expand dc across: words 0-3 <- dc[0], words 4-7 <- dc[1]
392 punpcklwd xmm4, xmm4
393 punpckldq xmm4, xmm4
394
395 ; Round and downshift: (dc + 4) >> 3 (no dequant multiply happens in this routine)
396 paddw xmm4, [GLOBAL(fours)]
397 psraw xmm4, 3
398
399 ; Predict buffer needs to be expanded from bytes to words
400 punpcklbw xmm0, xmm5
401 punpcklbw xmm1, xmm5
402 punpcklbw xmm2, xmm5
403 punpcklbw xmm3, xmm5
404
405 ; Add to predict buffer
406 paddw xmm0, xmm4
407 paddw xmm1, xmm4
408 paddw xmm2, xmm4
409 paddw xmm3, xmm4
410
411 ; pack up before storing (words -> bytes with unsigned saturation)
412 packuswb xmm0, xmm5
413 packuswb xmm1, xmm5
414 packuswb xmm2, xmm5
415 packuswb xmm3, xmm5
416
417 ; store blocks back out
418 movq [rdi], xmm0
419 movq [rdi + rdx], xmm1
420 movq [rdi + rdx*2], xmm2
421 movq [rdi + rcx], xmm3
422
423 ; begin epilog
424 pop rdi
425 RESTORE_GOT
426 UNSHADOW_ARGS
427 pop rbp
428 ret
429 ;void vp8_idct_dequant_dc_full_2x_sse2
430 ; (
431 ; short *qcoeff - 0
432 ; short *dequant - 1
433 ; unsigned char *dst - 2
434 ; int dst_stride - 3
435 ; short *dc - 4
436 ; )
437 global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
438 sym(vp8_idct_dequant_dc_full_2x_sse2):
439 push rbp
440 mov rbp, rsp
441 SHADOW_ARGS_TO_STACK 5
442 SAVE_XMM 7
443 GET_GOT rbx
444 push rdi
445 ; end prolog
446
447 ; full path: 64 bytes of qcoeff (two blocks) are loaded, cleared and dequantized;
448 ; word 0 of each block is then overwritten with a value read from dc (arg 4)
449 mov rax, arg(0) ; qcoeff
450 mov rdx, arg(1) ; dequant
451
452 mov rdi, arg(2) ; dst
453
454 ; Zero out xmm7; used just below to clear the coefficient buffer (it is re-zeroed before unpacking)
455 pxor xmm7, xmm7
456
457
458 ; note the swapped load order of xmm1 and xmm2, necessary for the shuffle
459 ; to produce sensible data
460 movdqa xmm0, [rax]
461 movdqa xmm2, [rax+16]
462 movdqa xmm1, [rax+32]
463 movdqa xmm3, [rax+48]
464
465 ; Clear out coeffs
466 movdqa [rax], xmm7
467 movdqa [rax+16], xmm7
468 movdqa [rax+32], xmm7
469 movdqa [rax+48], xmm7
470
471 ; dequantize qcoeff buffer (the same 32 bytes of dequant are applied to both blocks)
472 pmullw xmm0, [rdx]
473 pmullw xmm2, [rdx+16]
474 pmullw xmm1, [rdx]
475 pmullw xmm3, [rdx+16]
476
477 ; DC component (rdx now points at dc; dequant is no longer needed)
478 mov rdx, arg(4)
479
480 ; repack so block 0 row x and block 1 row x are together
481 movdqa xmm4, xmm0
482 punpckldq xmm0, xmm1
483 punpckhdq xmm4, xmm1
484
485 pshufd xmm0, xmm0, 11011000b
486 pshufd xmm1, xmm4, 11011000b
487
488 movdqa xmm4, xmm2
489 punpckldq xmm2, xmm3
490 punpckhdq xmm4, xmm3
491
492 pshufd xmm2, xmm2, 11011000b
493 pshufd xmm3, xmm4, 11011000b
494
495 ; insert DC component: word 0 <- dc[0], word 4 <- dc[1], replacing the dequantized values
496 pinsrw xmm0, [rdx], 0
497 pinsrw xmm0, [rdx+2], 4
498
499 ; first pass
500 psubw xmm0, xmm2 ; b1 = 0-2
501 paddw xmm2, xmm2 ;
502
503 movdqa xmm5, xmm1
504 paddw xmm2, xmm0 ; a1 = 0+2
505 ; pmulhw keeps the high 16 bits of the signed product; the following paddw adds the input back
506 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
507 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
508
509 movdqa xmm7, xmm3
510 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
511
512 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
513 psubw xmm7, xmm5 ; c1
514
515 movdqa xmm5, xmm1
516 movdqa xmm4, xmm3
517
518 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
519 paddw xmm5, xmm1
520
521 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
522 paddw xmm3, xmm4
523
524 paddw xmm3, xmm5 ; d1
525 movdqa xmm6, xmm2 ; a1
526
527 movdqa xmm4, xmm0 ; b1
528 paddw xmm2, xmm3 ;0
529
530 paddw xmm4, xmm7 ;1
531 psubw xmm0, xmm7 ;2
532
533 psubw xmm6, xmm3 ;3
534
535 ; transpose for the second pass
536 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
537 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
538 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
539
540 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
541 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
542 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
543
544
545 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
546 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
547 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
548
549 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
550 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
551 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
552
553
554 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
555 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
556 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
557
558 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
559 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
560 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
561
562 pshufd xmm0, xmm2, 11011000b
563 pshufd xmm2, xmm1, 11011000b
564
565 pshufd xmm1, xmm5, 11011000b
566 pshufd xmm3, xmm7, 11011000b
567
568 ; second pass
569 psubw xmm0, xmm2 ; b1 = 0-2
570 paddw xmm2, xmm2
571
572 movdqa xmm5, xmm1
573 paddw xmm2, xmm0 ; a1 = 0+2
574
575 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
576 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
577
578 movdqa xmm7, xmm3
579 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
580
581 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
582 psubw xmm7, xmm5 ; c1
583
584 movdqa xmm5, xmm1
585 movdqa xmm4, xmm3
586
587 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
588 paddw xmm5, xmm1
589
590 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
591 paddw xmm3, xmm4
592
593 paddw xmm3, xmm5 ; d1
594 paddw xmm0, [GLOBAL(fours)]
595 ; the +4 rounding bias is added to a1 and b1 once, so all four outputs below carry it
596 paddw xmm2, [GLOBAL(fours)]
597 movdqa xmm6, xmm2 ; a1
598
599 movdqa xmm4, xmm0 ; b1
600 paddw xmm2, xmm3 ;0
601
602 paddw xmm4, xmm7 ;1
603 psubw xmm0, xmm7 ;2
604
605 psubw xmm6, xmm3 ;3
606 psraw xmm2, 3
607
608 psraw xmm0, 3
609 psraw xmm4, 3
610
611 psraw xmm6, 3
612
613 ; transpose to save
614 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
615 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
616 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
617
618 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
619 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
620 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
621
622
623 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
624 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
625 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
626
627 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
628 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
629 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
630
631
632 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
633 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
634 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
635
636 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
637 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
638 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
639
640 pshufd xmm0, xmm2, 11011000b
641 pshufd xmm2, xmm1, 11011000b
642
643 pshufd xmm1, xmm5, 11011000b
644 pshufd xmm3, xmm7, 11011000b
645 ; xmm7 was used as scratch above; zero it again for the byte->word unpack
646 pxor xmm7, xmm7
647
648 ; Load up predict blocks (rdx switches from the dc pointer to the stride here)
649 movsxd rdx, dword ptr arg(3) ; dst_stride
650 movq xmm4, [rdi]
651 movq xmm5, [rdi+rdx]
652 lea rcx, [rdx + rdx*2]
653
654 punpcklbw xmm4, xmm7
655 punpcklbw xmm5, xmm7
656
657 paddw xmm0, xmm4
658 paddw xmm1, xmm5
659
660 movq xmm4, [rdi+rdx*2]
661 movq xmm5, [rdi+rcx]
662
663 punpcklbw xmm4, xmm7
664 punpcklbw xmm5, xmm7
665
666 paddw xmm2, xmm4
667 paddw xmm3, xmm5
668
669 .finish: ; local label; nothing in this routine jumps to it
670
671 ; pack up before storing (words -> bytes with unsigned saturation)
672 packuswb xmm0, xmm7
673 packuswb xmm1, xmm7
674 packuswb xmm2, xmm7
675 packuswb xmm3, xmm7
676
677 ; Load destination stride before writing out.
678 ; NOTE(review): rdx already holds the stride from the load above, so this reload looks redundant
679 movsxd rdx, dword ptr arg(3) ; dst_stride
680
681 ; store blocks back out
682 movq [rdi], xmm0
683 movq [rdi + rdx], xmm1
684 ; step rdi down two rows so the last two stores need only rdi and rdx
685 lea rdi, [rdi + 2*rdx]
686
687 movq [rdi], xmm2
688 movq [rdi + rdx], xmm3
689
690
691 ; begin epilog
692 pop rdi
693 RESTORE_GOT
694 RESTORE_XMM
695 UNSHADOW_ARGS
696 pop rbp
697 ret
698
699 SECTION_RODATA
700 align 16
701 fours: ; eight words of 4: rounding bias added before each arithmetic shift right by 3
702 times 8 dw 0x0004
703 align 16
704 x_s1sqr2: ; 0x8A8C = 35468 ~ sin(pi/8)*sqrt(2) * 65536; negative as a signed word, so callers add the input back after pmulhw
705 times 8 dw 0x8A8C
706 align 16
707 x_c1sqr2less1: ; 0x4E7B = 20091 ~ (cos(pi/8)*sqrt(2) - 1) * 65536; callers add the input back after pmulhw
708 times 8 dw 0x4E7B

mercurial