|
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;macro in deblock functions
;
; FIRST_2_ROWS
;   In:    xmm0 = pixels being filtered
;          xmm1, xmm3 = the first pair of neighbouring pixel vectors
;          flimit     = 16 per-byte thresholds (memory operand)
;   Out:   xmm5 = pavgb(xmm1, xmm3)
;          xmm7 = byte mask, 0xFF where |xmm0 - xmm1| >= flimit
;                 or |xmm0 - xmm3| >= flimit
;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
    ; running average starts with the two neighbours
    movdqa      xmm5, xmm1
    pavgb       xmm5, xmm3

    ; xmm4 = |xmm0 - xmm1|: of the two saturating differences one is
    ; always zero, so their sum is the absolute difference
    movdqa      xmm4, xmm0
    psubusb     xmm4, xmm1
    psubusb     xmm1, xmm0
    paddusb     xmm4, xmm1

    ; xmm6 = |xmm0 - xmm3|
    movdqa      xmm6, xmm0
    psubusb     xmm6, xmm3
    psubusb     xmm3, xmm0
    paddusb     xmm6, xmm3

    ; two copies of the threshold plus a zero vector to compare against
    movdqa      xmm2, flimit
    movdqa      xmm7, xmm2
    pxor        xmm1, xmm1

    ; (flimit - absdiff) saturates to 0 exactly when absdiff >= flimit
    psubusb     xmm2, xmm4
    pcmpeqb     xmm2, xmm1
    psubusb     xmm7, xmm6
    pcmpeqb     xmm7, xmm1

    ; a byte is masked if either neighbour is too different
    por         xmm7, xmm2
%endmacro
|
41 |
|
; SECOND_2_ROWS
;   In:    xmm0 = pixels being filtered
;          xmm1, xmm3 = the second pair of neighbouring pixel vectors
;          xmm5, xmm7 = average and mask left by FIRST_2_ROWS
;          flimit     = 16 per-byte thresholds (memory operand)
;   Out:   xmm0 = per byte: original pixel where the mask is set,
;                 otherwise the averaged value
;   Clobb: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
    ; keep an untouched copy of the first neighbour, then fold the
    ; average of this pair into the running average
    movdqa      xmm2, xmm1
    pavgb       xmm1, xmm3
    pavgb       xmm5, xmm1

    ; xmm6 = |xmm0 - first neighbour|
    movdqa      xmm6, xmm0
    psubusb     xmm6, xmm2
    psubusb     xmm2, xmm0
    paddusb     xmm6, xmm2

    ; xmm4 = |xmm0 - second neighbour|
    movdqa      xmm4, xmm0
    psubusb     xmm4, xmm3
    psubusb     xmm3, xmm0
    paddusb     xmm4, xmm3

    ; threshold copies and a zero vector
    movdqa      xmm2, flimit
    movdqa      xmm3, xmm2
    pxor        xmm1, xmm1

    ; 0xFF where absdiff >= flimit
    psubusb     xmm2, xmm6
    pcmpeqb     xmm2, xmm1
    psubusb     xmm3, xmm4
    pcmpeqb     xmm3, xmm1

    ; accumulate into the mask started by FIRST_2_ROWS
    por         xmm7, xmm2
    por         xmm7, xmm3

    ; final average includes the pixel itself
    pavgb       xmm5, xmm0

    ;decide if or not to use filtered value
    pand        xmm0, xmm7              ; original where masked
    pandn       xmm7, xmm5              ; filtered where not masked
    paddusb     xmm0, xmm7              ; merge the disjoint halves
%endmacro
|
79 |
|
; UPDATE_FLIMIT
;   Copies the next 16 threshold bytes from [rbx] into the stack slot at
;   [rsp] and advances rbx by 16.  Both accesses use movdqa, so [rbx] and
;   [rsp] must be 16-byte aligned.
;   Clobb: xmm2, rbx, flags
%macro UPDATE_FLIMIT 0
    movdqa      xmm2, XMMWORD PTR [rbx]
    add         rbx, 16
    movdqa      [rsp], xmm2
%endmacro
|
85 |
|
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass: every 16-pixel group of the source row is filtered
;      against the two rows above and the two rows below it and written
;      to dst_ptr.
;   2. "across" pass: the freshly written destination row is filtered in
;      place against its two left and two right neighbours, after 8 bytes
;      of border have been replicated on each side of the row.
;
; flimits is consumed 16 bytes per 16 columns with aligned loads and is
; compared bytewise, so it must be 16-byte aligned.
; NOTE(review): the prototype above says `int *`, but the code uses one
; threshold byte per column - confirm against the C declaration.
;
; Register roles:
;   rsi = source row, rdi = destination row, rax = source stride,
;   rbx = running flimits pointer, rcx = rows left, rdx = column offset.
; mm0/mm1 are used and no emms is executed here.
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; put flimit on stack
    mov         rbx, arg(5)             ;flimits ptr
    UPDATE_FLIMIT

%define flimit [rsp]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(1)             ;dst_ptr

    movsxd      rax, DWORD PTR arg(2)   ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(6)   ;rows in a macroblock
.nextrow:
    xor         rdx, rdx                ;col
.nextcol:
    ;load current and next 2 rows
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

    FIRST_2_ROWS

    ;load above 2 rows
    neg         rax                     ; negative stride
    movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax]

    SECOND_2_ROWS

    movdqu      XMMWORD PTR [rdi], xmm0

    neg         rax                     ; positive stride
    add         rsi, 16
    add         rdi, 16

    add         rdx, 16
    cmp         edx, dword arg(4)       ;cols
    jge         .downdone
    UPDATE_FLIMIT
    jmp         .nextcol

.downdone:
    ; done with the all cols, start the across filtering in place
    sub         rsi, rdx                ; rewind both pointers to column 0
    sub         rdi, rdx

    mov         rbx, arg(5)             ; flimits
    UPDATE_FLIMIT

    ; dup the first byte into the left border 8 times
    movq        mm1, [rdi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    mov         rdx, -8
    movq        [rdi+rdx], mm1

    ; dup the last byte into the right border
    movsxd      rdx, dword arg(4)
    movq        mm1, [rdi + rdx + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rdi+rdx], mm1

    ; Filtered output is held back one iteration in mm0/mm1 so that each
    ; group still reads unfiltered left neighbours.  Prime the pair with
    ; the bytes already in memory so the first store is a no-op.
    xor         rdx, rdx
    movq        mm0, QWORD PTR [rdi-16]
    movq        mm1, QWORD PTR [rdi-8]

.acrossnextcol:
    movdqu      xmm0, XMMWORD PTR [rdi + rdx]
    movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

    FIRST_2_ROWS

    movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

    SECOND_2_ROWS

    movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
    movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    movdq2q     mm0, xmm0               ; low 8 filtered bytes
    psrldq      xmm0, 8
    movdq2q     mm1, xmm0               ; high 8 filtered bytes

    add         rdx, 16
    cmp         edx, dword arg(4)       ;cols
    jge         .acrossdone
    UPDATE_FLIMIT
    jmp         .acrossnextcol

.acrossdone:
    ; last 16 pixels
    movq        QWORD PTR [rdi+rdx-16], mm0

    ; the upper 8 bytes are only stored when rdx landed exactly on cols
    cmp         edx, dword arg(4)
    jne         .throw_last_8
    movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
    ; done with this row
    add         rsi, rax                ;next src line
    ; Strides are sign-extended, matching the initial load of arg(2), so a
    ; negative stride keeps working on 64-bit targets.
    movsxd      rax, dword arg(3)       ;dst_pixels_per_line
    add         rdi, rax                ;next destination
    movsxd      rax, dword arg(2)       ;src_pixels_per_line

    mov         rbx, arg(5)             ;flimits
    UPDATE_FLIMIT

    dec         rcx                     ;decrement count
    jnz         .nextrow                ;next row

    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit
|
232 |
|
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Processes the plane in vertical strips of 8 columns.  For every pixel a
; sum and sum of squares over a 15-row vertical window are maintained;
; where 15*sumsq - sum*sum - flimit is negative the pixel is replaced by
; (pixel + sum + vp8_rv[...]) >> 4, otherwise it is left unchanged.
; Results are delayed 8 rows through a 16-entry ring buffer on the stack
; so that the window always reads unfiltered pixels.
;
; Side effects visible in the code:
;   * 8 rows above the first row and 8 rows below the last row of each
;     strip are written (border replication and delayed stores).
;   * the shadowed copies of dst, rows and cols are modified in place.
;   * mm0 is used and no emms is executed here.
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4)   ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi, arg(0)             ; s
    pxor        xmm0, xmm0              ; zero vector used for unpacking

    movsxd      rax, dword ptr arg(1)   ;pitch

    ; this copies the last row down into the border 8 rows
    mov         rdi, rsi
    ; rows is a 32-bit int: load it sign-extended instead of reading
    ; 64 bits from the argument slot, whose upper half is not defined.
    movsxd      rdx, dword arg(2)       ; rows + 8
    sub         rdx, 9                  ; index of the last real row
    imul        rdx, rax
    lea         rdi, [rdi+rdx]
    movq        xmm1, QWORD ptr[rdi]    ; last row
    mov         rcx, 8
.init_borderd:                          ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_borderd

    neg         rax                     ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov         rdi, rsi
    movq        xmm1, QWORD ptr[rdi]    ; first row
    mov         rcx, 8
.init_border:                           ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_border

    lea         rsi, [rsi + rax*8]      ; rsi = s[-pitch*8]
    neg         rax                     ; rax = +pitch again

    pxor        xmm5, xmm5              ; sum (8 words)
    pxor        xmm6, xmm6              ; sumsq, columns 0-3 (dwords)

    pxor        xmm7, xmm7              ; sumsq, columns 4-7 (dwords)
    mov         rdi, rsi

    mov         rcx, 15                 ; prime the window with 15 rows

.loop_initvar:
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0

    paddw       xmm5, xmm1
    pmullw      xmm1, xmm1

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm0

    punpckhwd   xmm2, xmm0
    paddd       xmm6, xmm1

    paddd       xmm7, xmm2
    lea         rdi, [rdi+rax]

    dec         rcx
    jne         .loop_initvar
    ;save the var and sum
    xor         rdx, rdx                ; rdx = row counter
.loop_row:
    movq        xmm1, QWORD PTR [rsi]   ; [s-pitch*8]  row leaving the window
    movq        xmm2, QWORD PTR [rdi]   ; [s+pitch*7]  row entering the window

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    ; sum += entering - leaving
    paddw       xmm5, xmm2
    psubw       xmm5, xmm1

    ; sumsq += entering^2
    pmullw      xmm2, xmm2
    movdqa      xmm4, xmm2

    punpcklwd   xmm2, xmm0
    punpckhwd   xmm4, xmm0

    paddd       xmm6, xmm2
    paddd       xmm7, xmm4

    ; sumsq -= leaving^2
    pmullw      xmm1, xmm1
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm0
    psubd       xmm6, xmm1

    punpckhwd   xmm2, xmm0
    psubd       xmm7, xmm2

    ; xmm3 = 15 * sumsq (low four columns)
    movdqa      xmm3, xmm6
    pslld       xmm3, 4

    psubd       xmm3, xmm6
    movdqa      xmm1, xmm5

    ; 32-bit sum*sum assembled from the low and high halves of the product
    movdqa      xmm4, xmm5
    pmullw      xmm1, xmm1

    pmulhw      xmm4, xmm4
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm4
    punpckhwd   xmm2, xmm4

    ; xmm4 = 15 * sumsq (high four columns)
    movdqa      xmm4, xmm7
    pslld       xmm4, 4

    psubd       xmm4, xmm7

    psubd       xmm3, xmm1
    psubd       xmm4, xmm2

    psubd       xmm3, flimit4
    psubd       xmm4, flimit4

    ; sign bit -> all-ones mask where the result is negative
    psrad       xmm3, 31
    psrad       xmm4, 31

    packssdw    xmm3, xmm4
    packsswb    xmm3, xmm0

    movq        xmm1, QWORD PTR [rsi+rax*8]  ; pixels at the window centre

    movq        xmm2, xmm1              ; keep the unfiltered pixels
    punpcklbw   xmm1, xmm0

    paddw       xmm1, xmm5
    mov         rcx, rdx

    and         rcx, 127                ; index into vp8_rv
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp8_rv))]
    movdqu      xmm4, [rax + rcx*2]     ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4, [r8 + rcx*2]      ;vp8_rv[rcx*2]
%else
    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
%endif

    paddw       xmm1, xmm4
    ;paddw     xmm1, eight8s
    psraw       xmm1, 4

    packuswb    xmm1, xmm0
    pand        xmm1, xmm3              ; filtered where the mask is set

    pandn       xmm3, xmm2              ; original elsewhere
    por         xmm1, xmm3

    and         rcx, 15
    movq        QWORD PTR [rsp + rcx*8], xmm1  ;d[rcx*8]

    ; write back the result computed 8 rows earlier
    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movq        mm0, [rsp + rcx*8]      ;d[rcx*8]

    movq        [rsi], mm0
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)       ;rows
    jl          .loop_row

    ; s += 8, done at full pointer width so a carry out of the low
    ; 32 bits is not lost on 64-bit targets (rcx is dead here).
    mov         rcx, arg(0)
    lea         rcx, [rcx + 8]
    mov         arg(0), rcx
    sub         dword arg(3), 8         ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
|
460 |
|
461 |
|
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; In-place horizontal counterpart of the "down" filter.  For every row a
; sum and sum of squares over a 15-pixel horizontal window are kept; four
; pixels are handled per iteration.  Where 15*sumsq - sum*sum - flimit is
; negative the pixel becomes (pixel + sum + 8) >> 4, otherwise it is kept.
; Stores are delayed by two iterations (8 pixels) through mm0/mm1 so the
; window only ever reads unfiltered pixels.
;
; Side effects visible in the code:
;   * 8 bytes left of each row and 8 bytes right of it are written.
;   * the shadowed copies of src and rows are modified in place.
;   * mm0/mm1 are used and no emms is executed here.
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4)   ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx, rdx                ;sumsq=0;
    xor         rcx, rcx                ;sum=0;
    mov         rsi, arg(0)             ; s


    ; dup the first byte into the left border 8 times
    movq        mm1, [rsi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1

    mov         rdi, -8
    movq        [rsi+rdi], mm1

    ; dup the last byte into the right border
    ; cols is held in rax (dead until the loop below reloads it) so that
    ; rdx keeps the sumsq=0 set above; loading cols into rdx would make
    ; every row start with sumsq == cols.
    movsxd      rax, dword arg(3)
    movq        mm1, [rsi + rax + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rsi+rax], mm1

.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;  sumsq += s[i]*s[i];
    ;  sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al                      ; ax = s[i]*s[i]; upper eax stays 0
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop


    ;mov         rax, sumsq
    ;movd        xmm7, rax
    movd        xmm7, edx

    ;mov         rax, sum
    ;movd        xmm6, rax
    movd        xmm6, ecx

    mov         rsi, arg(0)             ;s
    xor         rcx, rcx                ; rcx = column

    movsxd      rdx, dword arg(3)       ;cols
    add         rdx, 8                  ; two extra iterations flush mm0/mm1
    pxor        mm0, mm0
    pxor        mm1, mm1

    pxor        xmm0, xmm0
.nextcol4:

    movd        xmm1, DWORD PTR [rsi+rcx-8]  ; -8 -7 -6 -5
    movd        xmm2, DWORD PTR [rsi+rcx+7]  ; +7 +8 +9 +10

    punpcklbw   xmm1, xmm0              ; expanding
    punpcklbw   xmm2, xmm0              ; expanding

    punpcklwd   xmm1, xmm0              ; expanding to dwords
    punpcklwd   xmm2, xmm0              ; expanding to dwords

    psubd       xmm2, xmm1              ; 7--8   8--7   9--6 10--5
    paddd       xmm1, xmm1              ; -8*2   -7*2   -6*2 -5*2

    paddd       xmm1, xmm2              ; 7+-8   8+-7   9+-6 10+-5
    pmaddwd     xmm1, xmm2              ; squared of 7+-8   8+-7   9+-6 10+-5

    paddd       xmm6, xmm2
    paddd       xmm7, xmm1

    pshufd      xmm6, xmm6, 0           ; duplicate the last ones
    pshufd      xmm7, xmm7, 0           ; duplicate the last ones

    psrldq      xmm1, 4                 ; 8--7   9--6 10--5  0000
    psrldq      xmm2, 4                 ; 8--7   9--6 10--5  0000

    pshufd      xmm3, xmm1, 3           ; 0000  8--7   8--7   8--7 squared
    pshufd      xmm4, xmm2, 3           ; 0000  8--7   8--7   8--7 squared

    paddd       xmm6, xmm4
    paddd       xmm7, xmm3

    pshufd      xmm3, xmm1, 01011111b   ; 0000  0000   9--6   9--6 squared
    pshufd      xmm4, xmm2, 01011111b   ; 0000  0000   9--6   9--6 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    pshufd      xmm3, xmm1, 10111111b   ; 0000  0000   0000  10--5 squared
    pshufd      xmm4, xmm2, 10111111b   ; 0000  0000   0000  10--5 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    ; xmm6/xmm7 now hold sum/sumsq for the four pixels at rcx..rcx+3

    movdqa      xmm3, xmm6
    pmaddwd     xmm3, xmm3              ; sum*sum

    movdqa      xmm5, xmm7
    pslld       xmm5, 4

    psubd       xmm5, xmm7              ; 15*sumsq
    psubd       xmm5, xmm3

    psubd       xmm5, flimit4
    psrad       xmm5, 31                ; all-ones where negative

    packssdw    xmm5, xmm0
    packsswb    xmm5, xmm0

    movd        xmm1, DWORD PTR [rsi+rcx]
    movq        xmm2, xmm1              ; keep the unfiltered pixels

    punpcklbw   xmm1, xmm0
    punpcklwd   xmm1, xmm0

    paddd       xmm1, xmm6
    paddd       xmm1, [GLOBAL(four8s)]

    psrad       xmm1, 4
    packssdw    xmm1, xmm0

    packuswb    xmm1, xmm0
    pand        xmm1, xmm5              ; filtered where the mask is set

    pandn       xmm5, xmm2              ; original elsewhere
    por         xmm5, xmm1

    ; delayed store: write the result from two iterations ago
    movd        [rsi+rcx-8], mm0
    movq        mm0, mm1

    movdq2q     mm1, xmm5
    psrldq      xmm7, 12                ; carry the last lane to lane 0

    psrldq      xmm6, 12
    add         rcx, 4

    cmp         rcx, rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax, dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1         ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
|
653 |
|
654 |
|
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For every row: picks an offset rand() & 0xff into the noise buffer,
; clamps each group of 16 pixels using the three clamp vectors and adds
; the noise bytes (wrapping add), storing the result in place.
; The clamp vectors are used as memory operands of psubusb/paddusb and
; are addressed at blackclamp+0, +16 and +32.
; The shadowed copies of Start and Height are modified in place.
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.noise_row:
    call        sym(rand) WRT_PLT
    and         rax, 0xff               ; random start offset, 0..255
    mov         rdi, arg(1)             ;noise
    add         rdi, rax                ; rdi = noise + offset

    ; The clamping vectors are expected to sit back to back in
    ; black/white/both order.  The pointer is fetched again on every
    ; row because rand() is free to overwrite rdx.
    mov         rdx, arg(2)             ; blackclamp

    mov         rsi, arg(0)             ;Pos
    movsxd      rcx, dword arg(5)       ;[Width]
    xor         rax, rax                ; byte offset within the row

.noise_chunk:
    movdqu      xmm1, [rsi+rax]         ; 16 source pixels

    ; squeeze the pixel range from both ends so the noise cannot
    ; push a value out of range
    psubusb     xmm1, [rdx]             ;blackclamp
    paddusb     xmm1, [rdx+32]          ;bothclamp
    psubusb     xmm1, [rdx+16]          ;whiteclamp

    movdqu      xmm2, [rdi+rax]         ; matching 16 noise bytes
    paddb       xmm1, xmm2              ; apply the noise
    movdqu      [rsi+rax], xmm1         ; write back in place

    add         rax, 16                 ; advance to the next 16 pixels

    cmp         rax, rcx
    jl          .noise_chunk

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .noise_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
716 |
|
717 |
|
SECTION_RODATA
align 16
; rounding constant: four dwords of 8, added before the >> 4 in
; vp8_mbpost_proc_across_ip_xmm
four8s:
    dd 8, 8, 8, 8