Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI.
This fixes Tor bug #9701 and complies with the disk-avoidance requirements documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void vp8_idct_dequant_0_2x_sse2
15 ; (
16 ; short *qcoeff - 0
17 ; short *dequant - 1
18 ; unsigned char *dst - 2
19 ; int dst_stride - 3
20 ; )
;
; Behaviour, as read from the instructions below:
;   - Multiplies the 16-bit value at qcoeff[0] and the 16-bit value at byte
;     offset 32 of qcoeff by dequant[0].
;   - Rounds both products with (x + 4) >> 3 (arithmetic shift).
;   - Adds the first result to bytes 0..3 and the second to bytes 4..7 of
;     the four 8-byte rows at dst, dst + stride, dst + 2*stride and
;     dst + 3*stride, saturates each sum to 0..255 and stores the rows back.
;   - Zeroes 4 bytes at qcoeff and 4 bytes at qcoeff + 32.
; NOTE(review): the 32-byte spacing presumably separates two adjacent 4x4
; coefficient blocks of 16 shorts each -- confirm against the caller.
; Registers written directly here: rax, rcx, rdx, xmm0-xmm5 (plus whatever
; the prologue/epilogue macros touch).
22 global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
23 sym(vp8_idct_dequant_0_2x_sse2):
24 push rbp
25 mov rbp, rsp
; The next two macros are defined in x86_abi_support.asm; presumably they
; make arg(n) and GLOBAL(sym) usable in the body -- see that file.
26 SHADOW_ARGS_TO_STACK 4
27 GET_GOT rbx
28 ; end prolog
30 mov rdx, arg(1) ; dequant
31 mov rax, arg(0) ; qcoeff
; xmm4 words 0..1 = qcoeff[0..1], xmm5 words 0..1 = dequant[0..1]
33 movd xmm4, [rax]
34 movd xmm5, [rdx]
; word 4 of xmm4 = short at qcoeff + 32 bytes; word 4 of xmm5 = dequant[0]
36 pinsrw xmm4, [rax+32], 4
37 pinsrw xmm5, [rdx], 4
; element-wise 16-bit multiply; only words 0 and 4 are used afterwards
39 pmullw xmm4, xmm5
41 ; Zero out xmm5, for use unpacking
42 pxor xmm5, xmm5
44 ; clear coeffs
; (must happen before rax is reloaded with dst below)
45 movd [rax], xmm5
46 movd [rax+32], xmm5
47 ;pshufb
48 mov rax, arg(2) ; dst
49 movsxd rdx, dword ptr arg(3) ; dst_stride
; broadcast word 0 into words 0..3 and word 4 into words 4..7
51 pshuflw xmm4, xmm4, 00000000b
52 pshufhw xmm4, xmm4, 00000000b
; rcx = 3 * dst_stride (offset of the fourth row)
54 lea rcx, [rdx + rdx*2]
; round: (x + 4) >> 3
55 paddw xmm4, [GLOBAL(fours)]
57 psraw xmm4, 3
; load 8 bytes from each of the four destination rows
59 movq xmm0, [rax]
60 movq xmm1, [rax+rdx]
61 movq xmm2, [rax+2*rdx]
62 movq xmm3, [rax+rcx]
; widen bytes to words by interleaving with the zero register
64 punpcklbw xmm0, xmm5
65 punpcklbw xmm1, xmm5
66 punpcklbw xmm2, xmm5
67 punpcklbw xmm3, xmm5
70 ; Add to predict buffer
71 paddw xmm0, xmm4
72 paddw xmm1, xmm4
73 paddw xmm2, xmm4
74 paddw xmm3, xmm4
76 ; pack up before storing
; (packuswb saturates each signed word to an unsigned byte)
77 packuswb xmm0, xmm5
78 packuswb xmm1, xmm5
79 packuswb xmm2, xmm5
80 packuswb xmm3, xmm5
82 ; store blocks back out
83 movq [rax], xmm0
84 movq [rax + rdx], xmm1
; advance rax by two rows so rows 2 and 3 can be addressed without rcx
86 lea rax, [rax + 2*rdx]
88 movq [rax], xmm2
89 movq [rax + rdx], xmm3
91 ; begin epilog
92 RESTORE_GOT
93 UNSHADOW_ARGS
94 pop rbp
95 ret
97 ;void vp8_idct_dequant_full_2x_sse2
98 ; (
99 ; short *qcoeff - 0
100 ; short *dequant - 1
101 ; unsigned char *dst - 2
102 ; int dst_stride - 3
103 ; )
;
; Behaviour, as read from the instructions below:
;   - Loads 64 bytes (32 shorts) from qcoeff, then overwrites those 64 bytes
;     with zeros.
;   - Multiplies the shorts at byte offsets 0 and 32 element-wise by
;     dequant[0..7] and those at byte offsets 16 and 48 by dequant[8..15].
;   - Runs two butterfly passes with a 16-bit transpose in between; the
;     second pass adds 4 and shifts right by 3, followed by a final
;     transpose.
;   - Adds the four resulting 8-word rows to the four 8-byte rows at dst
;     (row pitch dst_stride), saturates to 0..255 and stores them back.
; qcoeff and dequant must be 16-byte aligned: they are accessed with movdqa
; and as SSE memory operands.
; NOTE(review): the layout suggests two 4x4 blocks of 16 shorts each that
; are processed side by side -- confirm against the caller.
104 global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
105 sym(vp8_idct_dequant_full_2x_sse2):
106 push rbp
107 mov rbp, rsp
; SHADOW_ARGS_TO_STACK, SAVE_XMM and GET_GOT are macros from
; x86_abi_support.asm; their expansion is not visible in this file.
108 SHADOW_ARGS_TO_STACK 4
109 SAVE_XMM 7
110 GET_GOT rbx
; rsi is saved and restored but not otherwise referenced in this routine
111 push rsi
112 push rdi
113 ; end prolog
115 ; fetch the pointer arguments; all 64 bytes at qcoeff are read and
116 ; transformed below
117 mov rax, arg(0) ; qcoeff
118 mov rdx, arg(1) ; dequant
119 mov rdi, arg(2) ; dst
122 ; Zero out xmm7; it is used just below to clear the coefficient buffer
123 pxor xmm7, xmm7
126 ; note the swap of xmm1 and xmm2 (xmm2 <- offset 16, xmm1 <- offset 32),
127 ; necessary for the shuffle below to produce sensible data
128 movdqa xmm0, [rax]
129 movdqa xmm2, [rax+16]
130 movdqa xmm1, [rax+32]
131 movdqa xmm3, [rax+48]
133 ; Clear out coeffs
134 movdqa [rax], xmm7
135 movdqa [rax+16], xmm7
136 movdqa [rax+32], xmm7
137 movdqa [rax+48], xmm7
139 ; dequantize qcoeff buffer
; (offsets 0/32 use dequant[0..7], offsets 16/48 use dequant[8..15])
140 pmullw xmm0, [rdx]
141 pmullw xmm2, [rdx+16]
142 pmullw xmm1, [rdx]
143 pmullw xmm3, [rdx+16]
; rdx is reused: from here on it holds the sign-extended dst_stride
144 movsxd rdx, dword ptr arg(3) ; dst_stride
146 ; repack so block 0 row x and block 1 row x are together
; punpck{l,h}dq interleaves dwords of the two sources; pshufd 11011000b then
; swaps the two middle dwords. Result (low words first, q = dequantized
; short index): xmm0 = q[0..3] | q[16..19], xmm1 = q[4..7] | q[20..23],
; xmm2 = q[8..11] | q[24..27], xmm3 = q[12..15] | q[28..31].
147 movdqa xmm4, xmm0
148 punpckldq xmm0, xmm1
149 punpckhdq xmm4, xmm1
151 pshufd xmm0, xmm0, 11011000b
152 pshufd xmm1, xmm4, 11011000b
154 movdqa xmm4, xmm2
155 punpckldq xmm2, xmm3
156 punpckhdq xmm4, xmm3
158 pshufd xmm2, xmm2, 11011000b
159 pshufd xmm3, xmm4, 11011000b
161 ; first pass
; xmm0..xmm3 act as ip0..ip3; all arithmetic is per 16-bit lane.
162 psubw xmm0, xmm2 ; b1 = 0-2
; a1 is built as 2*ip2 + (ip0 - ip2) to avoid an extra register copy
163 paddw xmm2, xmm2 ;
165 movdqa xmm5, xmm1
166 paddw xmm2, xmm0 ; a1 = 0+2
; pmulhw keeps the high 16 bits of the signed product; adding the operand
; back afterwards yields the scaled value named in the comments.
168 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
169 lea rcx, [rdx + rdx*2] ;dst_stride * 3
170 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
172 movdqa xmm7, xmm3
173 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
175 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
176 psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
178 movdqa xmm5, xmm1
179 movdqa xmm4, xmm3
; xmm5 = ip1 scaled by the cos constant, xmm3 = ip3 scaled by the sin constant
181 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
182 paddw xmm5, xmm1
184 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
185 paddw xmm3, xmm4
187 paddw xmm3, xmm5 ; d1
188 movdqa xmm6, xmm2 ; a1
190 movdqa xmm4, xmm0 ; b1
; outputs: xmm2 = a1 + d1, xmm4 = b1 + xmm7, xmm0 = b1 - xmm7, xmm6 = a1 - d1
191 paddw xmm2, xmm3 ;0
193 paddw xmm4, xmm7 ;1
194 psubw xmm0, xmm7 ;2
196 psubw xmm6, xmm3 ;3
; NOTE(review): per the lane comments below, the transpose consumes xmm2,
; xmm0, xmm4, xmm6 as elements x00-x03, x04-x07, x08-x11, x12-x15 in that
; order; the ';1' / ';2' tags above do not follow that numbering.
198 ; transpose for the second pass
199 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
200 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
201 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
203 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
204 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
205 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
208 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
209 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
210 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
212 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
213 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
214 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
217 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
218 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
219 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
221 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
222 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
223 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
; swap the middle dwords so the 0xx lanes sit in the low half and the 1xx
; lanes in the high half of each register
225 pshufd xmm0, xmm2, 11011000b
226 pshufd xmm2, xmm1, 11011000b
228 pshufd xmm1, xmm5, 11011000b
229 pshufd xmm3, xmm7, 11011000b
231 ; second pass
; same butterfly as the first pass, now on the transposed data
232 psubw xmm0, xmm2 ; b1 = 0-2
233 paddw xmm2, xmm2
235 movdqa xmm5, xmm1
236 paddw xmm2, xmm0 ; a1 = 0+2
238 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
239 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
241 movdqa xmm7, xmm3
242 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
244 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
245 psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
247 movdqa xmm5, xmm1
248 movdqa xmm4, xmm3
250 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
251 paddw xmm5, xmm1
253 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
254 paddw xmm3, xmm4
256 paddw xmm3, xmm5 ; d1
; the rounding bias of 4 is added to b1 and a1 only; every output below
; contains exactly one of them, so all four outputs end up biased by 4
257 paddw xmm0, [GLOBAL(fours)]
259 paddw xmm2, [GLOBAL(fours)]
260 movdqa xmm6, xmm2 ; a1
262 movdqa xmm4, xmm0 ; b1
263 paddw xmm2, xmm3 ;0
265 paddw xmm4, xmm7 ;1
266 psubw xmm0, xmm7 ;2
268 psubw xmm6, xmm3 ;3
; arithmetic shift right by 3 completes (x + 4) >> 3
269 psraw xmm2, 3
271 psraw xmm0, 3
272 psraw xmm4, 3
274 psraw xmm6, 3
276 ; transpose to save
277 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
278 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
279 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
281 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
282 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
283 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
286 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
287 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
288 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
290 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
291 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
292 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
295 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
296 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
297 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
299 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
300 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
301 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
303 pshufd xmm0, xmm2, 11011000b
304 pshufd xmm2, xmm1, 11011000b
306 pshufd xmm1, xmm5, 11011000b
307 pshufd xmm3, xmm7, 11011000b
; xmm7 was used as scratch above; zero it again for the byte<->word unpack
; and pack steps
309 pxor xmm7, xmm7
311 ; Load up predict blocks
; (8 bytes per row; rows are added to xmm0..xmm3 in order)
312 movq xmm4, [rdi]
313 movq xmm5, [rdi+rdx]
315 punpcklbw xmm4, xmm7
316 punpcklbw xmm5, xmm7
318 paddw xmm0, xmm4
319 paddw xmm1, xmm5
321 movq xmm4, [rdi+2*rdx]
322 movq xmm5, [rdi+rcx]
324 punpcklbw xmm4, xmm7
325 punpcklbw xmm5, xmm7
327 paddw xmm2, xmm4
328 paddw xmm3, xmm5
; this local label is not the target of any jump in this routine
330 .finish:
332 ; pack up before storing
; (packuswb saturates each signed word to an unsigned byte)
333 packuswb xmm0, xmm7
334 packuswb xmm1, xmm7
335 packuswb xmm2, xmm7
336 packuswb xmm3, xmm7
338 ; store blocks back out
339 movq [rdi], xmm0
340 movq [rdi + rdx], xmm1
341 movq [rdi + rdx*2], xmm2
342 movq [rdi + rcx], xmm3
344 ; begin epilog
; pops and RESTORE_* mirror the prologue in reverse order
345 pop rdi
346 pop rsi
347 RESTORE_GOT
348 RESTORE_XMM
349 UNSHADOW_ARGS
350 pop rbp
351 ret
353 ;void vp8_idct_dequant_dc_0_2x_sse2
354 ; (
355 ; short *qcoeff - 0
356 ; short *dequant - 1
357 ; unsigned char *dst - 2
358 ; int dst_stride - 3
359 ; short *dc - 4
360 ; )
;
; Behaviour, as read from the instructions below:
;   - Reads the two shorts dc[0] and dc[1].
;   - Rounds each with (x + 4) >> 3 (arithmetic shift).
;   - Adds the first result to bytes 0..3 and the second to bytes 4..7 of
;     the four 8-byte rows at dst, dst + stride, dst + 2*stride and
;     dst + 3*stride, saturates to 0..255 and stores the rows back.
; The dequant argument is never read, and nothing is written through qcoeff.
361 global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
362 sym(vp8_idct_dequant_dc_0_2x_sse2):
363 push rbp
364 mov rbp, rsp
; SHADOW_ARGS_TO_STACK / GET_GOT are macros from x86_abi_support.asm; their
; expansion is not visible in this file.
365 SHADOW_ARGS_TO_STACK 5
366 GET_GOT rbx
367 push rdi
368 ; end prolog
370 ; special case when 2 blocks have 0 or 1 coeffs
371 ; dc is set as first coeff, so no need to load qcoeff
; NOTE(review): rax receives the qcoeff pointer here but is not referenced
; again in this routine.
372 mov rax, arg(0) ; qcoeff
374 mov rdi, arg(2) ; dst
375 mov rdx, arg(4) ; dc
377 ; Zero out xmm5, for use unpacking
378 pxor xmm5, xmm5
380 ; load up 2 dc words here == 2*16 = doubleword
381 movd xmm4, [rdx]
; rdx is reused: from here on it holds the sign-extended dst_stride
383 movsxd rdx, dword ptr arg(3) ; dst_stride
; rcx = 3 * dst_stride (offset of the fourth row)
384 lea rcx, [rdx + rdx*2]
385 ; Load up predict blocks
386 movq xmm0, [rdi]
387 movq xmm1, [rdi+rdx*1]
388 movq xmm2, [rdi+rdx*2]
389 movq xmm3, [rdi+rcx]
391 ; Duplicate and expand dc across
; words become dc0 dc0 dc1 dc1, then dc0 x4 in the low half and dc1 x4 in
; the high half
392 punpcklwd xmm4, xmm4
393 punpckldq xmm4, xmm4
395 ; Rounding to dequant and downshift
396 paddw xmm4, [GLOBAL(fours)]
397 psraw xmm4, 3
399 ; Predict buffer needs to be expanded from bytes to words
400 punpcklbw xmm0, xmm5
401 punpcklbw xmm1, xmm5
402 punpcklbw xmm2, xmm5
403 punpcklbw xmm3, xmm5
405 ; Add to predict buffer
406 paddw xmm0, xmm4
407 paddw xmm1, xmm4
408 paddw xmm2, xmm4
409 paddw xmm3, xmm4
411 ; pack up before storing
; (packuswb saturates each signed word to an unsigned byte)
412 packuswb xmm0, xmm5
413 packuswb xmm1, xmm5
414 packuswb xmm2, xmm5
415 packuswb xmm3, xmm5
417 ; store blocks back out
418 movq [rdi], xmm0
419 movq [rdi + rdx], xmm1
420 movq [rdi + rdx*2], xmm2
421 movq [rdi + rcx], xmm3
423 ; begin epilog
424 pop rdi
425 RESTORE_GOT
426 UNSHADOW_ARGS
427 pop rbp
428 ret
429 ;void vp8_idct_dequant_dc_full_2x_sse2
430 ; (
431 ; short *qcoeff - 0
432 ; short *dequant - 1
433 ; unsigned char *dst - 2
434 ; int dst_stride - 3
435 ; short *dc - 4
436 ; )
;
; Behaviour, as read from the instructions below:
;   - Loads 64 bytes (32 shorts) from qcoeff, then overwrites those 64 bytes
;     with zeros.
;   - Multiplies the shorts at byte offsets 0 and 32 element-wise by
;     dequant[0..7] and those at byte offsets 16 and 48 by dequant[8..15].
;   - Replaces the products for the shorts at byte offsets 0 and 32 with
;     dc[0] and dc[1] respectively (no dequant multiply is applied to them).
;   - Runs two butterfly passes with a 16-bit transpose in between; the
;     second pass adds 4 and shifts right by 3, followed by a final
;     transpose.
;   - Adds the four resulting 8-word rows to the four 8-byte rows at dst
;     (row pitch dst_stride), saturates to 0..255 and stores them back.
; qcoeff and dequant must be 16-byte aligned: they are accessed with movdqa
; and as SSE memory operands.
; NOTE(review): the layout suggests two 4x4 blocks of 16 shorts each that
; are processed side by side -- confirm against the caller.
437 global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
438 sym(vp8_idct_dequant_dc_full_2x_sse2):
439 push rbp
440 mov rbp, rsp
; SHADOW_ARGS_TO_STACK, SAVE_XMM and GET_GOT are macros from
; x86_abi_support.asm; their expansion is not visible in this file.
441 SHADOW_ARGS_TO_STACK 5
442 SAVE_XMM 7
443 GET_GOT rbx
444 push rdi
445 ; end prolog
447 ; fetch the pointer arguments; all 64 bytes at qcoeff are read and
448 ; transformed below
449 mov rax, arg(0) ; qcoeff
450 mov rdx, arg(1) ; dequant
452 mov rdi, arg(2) ; dst
454 ; Zero out xmm7; it is used just below to clear the coefficient buffer
455 pxor xmm7, xmm7
458 ; note the swap of xmm1 and xmm2 (xmm2 <- offset 16, xmm1 <- offset 32),
459 ; necessary for the shuffle below to produce sensible data
460 movdqa xmm0, [rax]
461 movdqa xmm2, [rax+16]
462 movdqa xmm1, [rax+32]
463 movdqa xmm3, [rax+48]
465 ; Clear out coeffs
466 movdqa [rax], xmm7
467 movdqa [rax+16], xmm7
468 movdqa [rax+32], xmm7
469 movdqa [rax+48], xmm7
471 ; dequantize qcoeff buffer
; (offsets 0/32 use dequant[0..7], offsets 16/48 use dequant[8..15])
472 pmullw xmm0, [rdx]
473 pmullw xmm2, [rdx+16]
474 pmullw xmm1, [rdx]
475 pmullw xmm3, [rdx+16]
477 ; DC component
; rdx is reused: it now points at the dc array
478 mov rdx, arg(4)
480 ; repack so block 0 row x and block 1 row x are together
; punpck{l,h}dq interleaves dwords of the two sources; pshufd 11011000b then
; swaps the two middle dwords. Result (low words first, q = dequantized
; short index): xmm0 = q[0..3] | q[16..19], xmm1 = q[4..7] | q[20..23],
; xmm2 = q[8..11] | q[24..27], xmm3 = q[12..15] | q[28..31].
481 movdqa xmm4, xmm0
482 punpckldq xmm0, xmm1
483 punpckhdq xmm4, xmm1
485 pshufd xmm0, xmm0, 11011000b
486 pshufd xmm1, xmm4, 11011000b
488 movdqa xmm4, xmm2
489 punpckldq xmm2, xmm3
490 punpckhdq xmm4, xmm3
492 pshufd xmm2, xmm2, 11011000b
493 pshufd xmm3, xmm4, 11011000b
495 ; insert DC component
; dc[0] overwrites word 0 and dc[1] overwrites word 4 of xmm0, i.e. the
; first element of each half
496 pinsrw xmm0, [rdx], 0
497 pinsrw xmm0, [rdx+2], 4
499 ; first pass
; xmm0..xmm3 act as ip0..ip3; all arithmetic is per 16-bit lane.
500 psubw xmm0, xmm2 ; b1 = 0-2
; a1 is built as 2*ip2 + (ip0 - ip2) to avoid an extra register copy
501 paddw xmm2, xmm2 ;
503 movdqa xmm5, xmm1
504 paddw xmm2, xmm0 ; a1 = 0+2
; pmulhw keeps the high 16 bits of the signed product; adding the operand
; back afterwards yields the scaled value named in the comments.
506 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
507 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
509 movdqa xmm7, xmm3
510 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
512 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
513 psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
515 movdqa xmm5, xmm1
516 movdqa xmm4, xmm3
; xmm5 = ip1 scaled by the cos constant, xmm3 = ip3 scaled by the sin constant
518 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
519 paddw xmm5, xmm1
521 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
522 paddw xmm3, xmm4
524 paddw xmm3, xmm5 ; d1
525 movdqa xmm6, xmm2 ; a1
527 movdqa xmm4, xmm0 ; b1
; outputs: xmm2 = a1 + d1, xmm4 = b1 + xmm7, xmm0 = b1 - xmm7, xmm6 = a1 - d1
528 paddw xmm2, xmm3 ;0
530 paddw xmm4, xmm7 ;1
531 psubw xmm0, xmm7 ;2
533 psubw xmm6, xmm3 ;3
; NOTE(review): per the lane comments below, the transpose consumes xmm2,
; xmm0, xmm4, xmm6 as elements x00-x03, x04-x07, x08-x11, x12-x15 in that
; order; the ';1' / ';2' tags above do not follow that numbering.
535 ; transpose for the second pass
536 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
537 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
538 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
540 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
541 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
542 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
545 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
546 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
547 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
549 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
550 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
551 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
554 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
555 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
556 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
558 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
559 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
560 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
; swap the middle dwords so the 0xx lanes sit in the low half and the 1xx
; lanes in the high half of each register
562 pshufd xmm0, xmm2, 11011000b
563 pshufd xmm2, xmm1, 11011000b
565 pshufd xmm1, xmm5, 11011000b
566 pshufd xmm3, xmm7, 11011000b
568 ; second pass
; same butterfly as the first pass, now on the transposed data
569 psubw xmm0, xmm2 ; b1 = 0-2
570 paddw xmm2, xmm2
572 movdqa xmm5, xmm1
573 paddw xmm2, xmm0 ; a1 = 0+2
575 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
576 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
578 movdqa xmm7, xmm3
579 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
581 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
582 psubw xmm7, xmm5 ; c1 (computed here as the ip3 term minus the ip1 term)
584 movdqa xmm5, xmm1
585 movdqa xmm4, xmm3
587 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
588 paddw xmm5, xmm1
590 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
591 paddw xmm3, xmm4
593 paddw xmm3, xmm5 ; d1
; the rounding bias of 4 is added to b1 and a1 only; every output below
; contains exactly one of them, so all four outputs end up biased by 4
594 paddw xmm0, [GLOBAL(fours)]
596 paddw xmm2, [GLOBAL(fours)]
597 movdqa xmm6, xmm2 ; a1
599 movdqa xmm4, xmm0 ; b1
600 paddw xmm2, xmm3 ;0
602 paddw xmm4, xmm7 ;1
603 psubw xmm0, xmm7 ;2
605 psubw xmm6, xmm3 ;3
; arithmetic shift right by 3 completes (x + 4) >> 3
606 psraw xmm2, 3
608 psraw xmm0, 3
609 psraw xmm4, 3
611 psraw xmm6, 3
613 ; transpose to save
614 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
615 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
616 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
618 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
619 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
620 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
623 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
624 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
625 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
627 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
628 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
629 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
632 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
633 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
634 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
636 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
637 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
638 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
640 pshufd xmm0, xmm2, 11011000b
641 pshufd xmm2, xmm1, 11011000b
643 pshufd xmm1, xmm5, 11011000b
644 pshufd xmm3, xmm7, 11011000b
; xmm7 was used as scratch above; zero it again for the byte<->word unpack
; and pack steps
646 pxor xmm7, xmm7
648 ; Load up predict blocks
; rdx is reused once more: it now holds the sign-extended dst_stride
649 movsxd rdx, dword ptr arg(3) ; dst_stride
650 movq xmm4, [rdi]
651 movq xmm5, [rdi+rdx]
; rcx = 3 * dst_stride (offset of the fourth row)
652 lea rcx, [rdx + rdx*2]
654 punpcklbw xmm4, xmm7
655 punpcklbw xmm5, xmm7
657 paddw xmm0, xmm4
658 paddw xmm1, xmm5
660 movq xmm4, [rdi+rdx*2]
661 movq xmm5, [rdi+rcx]
663 punpcklbw xmm4, xmm7
664 punpcklbw xmm5, xmm7
666 paddw xmm2, xmm4
667 paddw xmm3, xmm5
; this local label is not the target of any jump in this routine
669 .finish:
671 ; pack up before storing
; (packuswb saturates each signed word to an unsigned byte)
672 packuswb xmm0, xmm7
673 packuswb xmm1, xmm7
674 packuswb xmm2, xmm7
675 packuswb xmm3, xmm7
677 ; Load destination stride before writing out,
678 ; doesn't need to persist
; NOTE(review): rdx already holds dst_stride from the load ahead of the
; predict-block reads and is not modified in between, so this reload looks
; redundant.
679 movsxd rdx, dword ptr arg(3) ; dst_stride
681 ; store blocks back out
682 movq [rdi], xmm0
683 movq [rdi + rdx], xmm1
; advance rdi by two rows; the caller's rdi is restored by the pop below
685 lea rdi, [rdi + 2*rdx]
687 movq [rdi], xmm2
688 movq [rdi + rdx], xmm3
691 ; begin epilog
; pops and RESTORE_* mirror the prologue in reverse order
692 pop rdi
693 RESTORE_GOT
694 RESTORE_XMM
695 UNSHADOW_ARGS
696 pop rbp
697 ret
699 SECTION_RODATA
; Constants referenced through GLOBAL() by the routines above. Each table is
; 8 identical words (one full XMM register) and is 16-byte aligned so it can
; be used directly as an SSE memory operand.
700 align 16
; Rounding bias added before the arithmetic right shift by 3.
701 fours:
702 times 8 dw 0x0004
703 align 16
; 0x8A8C = 35468; 35468 / 65536 ~= 0.5412 ~= sin(pi/8) * sqrt(2).
; As a signed word this value is negative, so the code follows
; pmulhw x, [x_s1sqr2] with paddw of x to obtain x * 35468 / 65536.
704 x_s1sqr2:
705 times 8 dw 0x8A8C
706 align 16
; 0x4E7B = 20091; 20091 / 65536 ~= 0.3066 ~= cos(pi/8) * sqrt(2) - 1.
; The code follows pmulhw x, [x_c1sqr2less1] with paddw of x to obtain
; x * (1 + 20091 / 65536).
707 x_c1sqr2less1:
708 times 8 dw 0x4E7B