|
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
|
10 |
|
11 |
|
; ABI abstraction layer: this file relies on it for sym(), arg(), GLOBAL(),
; the prolog/epilog helper macros and the register-name remapping.
%include "vpx_ports/x86_abi_support.asm"

; Fixed-point scale of the 5-tap blur: the taps stored in the Blur table
; (16, 16, 64, 16, 16) add up to VP9_FILTER_WEIGHT, and the weighted sum is
; brought back to pixel range with a right shift by VP9_FILTER_SHIFT
; (128 == 1 << 7).
%define VP9_FILTER_WEIGHT 128
%define VP9_FILTER_SHIFT  7
|
16 |
|
;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row: a vertical ("down") 5-tap blur from src into dst, followed
; by a horizontal ("across") 5-tap blur performed in place on the dst row.
; A pixel is replaced by its blurred value only when none of its four
; neighbours differs from it by more than flimit; otherwise the unblurred
; value is kept.
;
; Register roles inside the loops:
;   rsi = current src row          rdi = current dst row
;   rax = src pitch (negated temporarily inside the down pass)
;   rcx = rows still to process    rdx = column offset, stepped by 4
;   rbx = address of the Blur tap table
;   mm0 = zero                     mm2 = flimit replicated into 4 words
;   mm1 = centre pixels p0..p3     mm3 = weighted sum
;   mm7 = "keep the original" mask
;
; Notes:
;   * Four pixels are produced per iteration and both column loops test
;     their exit condition at the bottom, so at least four columns are
;     always processed; the row loop is dec/jnz, so rows is assumed to be
;     greater than zero (not checked here).
;   * The down pass reads two rows above and two rows below each src row.
;     The across pass reads dst bytes from -4 to cols+3 of the row and
;     writes [rdi-4] back unchanged. Presumably the caller supplies
;     suitable borders - confirm against the callers.
;   * In the 32-bit PIC configuration rd is copied to the stack because rbx
;     (the GOT base there) is reused for the Blur table. No push/pop may be
;     added between that stash and its release below: RD is [rsp].
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Save rbx *before* the optional stash below so that the stash stays on
    ; top of the stack and RD = [rsp] remains valid for the whole body.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movq        mm0,        [GLOBAL(rd)]
    sub         rsp,        8
    movq        [rsp],      mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

    lea         rbx,        [GLOBAL(Blur)]
    movd        mm2,        dword ptr arg(6)    ; flimit
    punpcklwd   mm2,        mm2
    punpckldq   mm2,        mm2                 ; mm2 = flimit in all 4 words

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(1)              ; dst_ptr

    movsxd      rcx,        DWORD PTR arg(4)    ; rows
    movsxd      rax,        DWORD PTR arg(2)    ; src_pixels_per_line
    pxor        mm0,        mm0                 ; mm0 = 00000000

.nextrow:

    xor         rdx,        rdx                 ; column offset = 0
.nextcol:

    ; ---- down pass: centre row (tap 2) ----
    pxor        mm7,        mm7                 ; mm7 = 00000000
    movq        mm6,        [rbx + 32 ]         ; mm6 = kernel 2 taps
    movq        mm3,        [rsi]               ; mm3 = r0 p0..p7
    punpcklbw   mm3,        mm0                 ; mm3 = r0 p0..p3 as words
    movq        mm1,        mm3                 ; mm1 = r0 p0..p3 (kept)
    pmullw      mm3,        mm6                 ; mm3 = r0 * kernel 2

    ; ---- row +1 (tap 3) ----
    movq        mm6,        [rbx + 48]          ; mm6 = kernel 3 taps
    movq        mm5,        [rsi + rax]         ; mm5 = r1 p0..p7
    punpcklbw   mm5,        mm0                 ; mm5 = r1 p0..p3
    pmullw      mm6,        mm5                 ; mm6 = r1 * kernel 3
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm7,        mm1                 ; mm7 = r0 p0..p3
    psubusw     mm7,        mm5                 ; mm7 = sat(r0 - r1)
    psubusw     mm5,        mm1                 ; mm5 = sat(r1 - r0)
    paddusw     mm7,        mm5                 ; mm7 = abs(r0 - r1)
    pcmpgtw     mm7,        mm2                 ; all-ones where it exceeds flimit

    ; ---- row +2 (tap 4) ----
    movq        mm6,        [rbx + 64 ]         ; mm6 = kernel 4 taps
    movq        mm5,        [rsi + 2*rax]       ; mm5 = r2 p0..p7
    punpcklbw   mm5,        mm0                 ; mm5 = r2 p0..p3
    pmullw      mm6,        mm5                 ; mm6 = r2 * kernel 4
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = r0 p0..p3
    psubusw     mm6,        mm5                 ; mm6 = sat(r0 - r2)
    psubusw     mm5,        mm1                 ; mm5 = sat(r2 - r0)
    paddusw     mm6,        mm5                 ; mm6 = abs(r0 - r2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    ; ---- row -2 (tap 0); pitch is negated to reach the rows above ----
    neg         rax
    movq        mm6,        [rbx ]              ; mm6 = kernel 0 taps
    movq        mm5,        [rsi+2*rax]         ; mm5 = r-2 p0..p7
    punpcklbw   mm5,        mm0                 ; mm5 = r-2 p0..p3
    pmullw      mm6,        mm5                 ; mm6 = r-2 * kernel 0
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = r0 p0..p3
    psubusw     mm6,        mm5                 ; mm6 = sat(r0 - r-2)
    psubusw     mm5,        mm1                 ; mm5 = sat(r-2 - r0)
    paddusw     mm6,        mm5                 ; mm6 = abs(r0 - r-2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    ; ---- row -1 (tap 1) ----
    movq        mm6,        [rbx + 16]          ; mm6 = kernel 1 taps
    movq        mm4,        [rsi+rax]           ; mm4 = r-1 p0..p7
    punpcklbw   mm4,        mm0                 ; mm4 = r-1 p0..p3
    pmullw      mm6,        mm4                 ; mm6 = r-1 * kernel 1
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = r0 p0..p3
    psubusw     mm6,        mm4                 ; mm6 = sat(r0 - r-1)
    psubusw     mm4,        mm1                 ; mm4 = sat(r-1 - r0)
    paddusw     mm6,        mm4                 ; mm6 = abs(r0 - r-1)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    paddusw     mm3,        RD                  ; mm3 += round value
    psraw       mm3,        VP9_FILTER_SHIFT    ; mm3 /= 128

    pand        mm1,        mm7                 ; source where a diff exceeded flimit
    pandn       mm7,        mm3                 ; blurred result everywhere else
    paddusw     mm1,        mm7                 ; combination

    packuswb    mm1,        mm0                 ; pack to bytes

    movd        [rdi],      mm1                 ; store 4 output pixels
    neg         rax                             ; pitch is positive again

    add         rsi,        4
    add         rdi,        4
    add         rdx,        4

    cmp         edx,        dword ptr arg(5)    ; cols
    jl          .nextcol
    ; done with the all cols, start the across filtering in place
    sub         rsi,        rdx                 ; back to the start of the row
    sub         rdi,        rdx

    ; rax is free from here on: the pitch is reloaded from arg(2) after the
    ; across loop. It carries the previous group of 4 output bytes so that
    ; they are written one iteration late, which keeps the left neighbours
    ; p-2/p-1 unfiltered while the next group is being computed.
    xor         rdx,        rdx
    mov         rax,        [rdi-4];

.acrossnextcol:
    ; ---- across pass: centre pixels (tap 2) ----
    pxor        mm7,        mm7                 ; mm7 = 00000000
    movq        mm6,        [rbx + 32 ]         ; mm6 = kernel 2 taps
    movq        mm4,        [rdi+rdx]           ; mm4 = p0..p7
    movq        mm3,        mm4                 ; mm3 = p0..p7
    punpcklbw   mm3,        mm0                 ; mm3 = p0..p3
    movq        mm1,        mm3                 ; mm1 = p0..p3 (kept)
    pmullw      mm3,        mm6                 ; mm3 = p0..p3 * kernel 2

    ; ---- pixels +1 (tap 3) ----
    movq        mm6,        [rbx + 48]          ; mm6 = kernel 3 taps
    psrlq       mm4,        8                   ; mm4 = p1..p7
    movq        mm5,        mm4                 ; mm5 = p1..p7
    punpcklbw   mm5,        mm0                 ; mm5 = p1..p4
    pmullw      mm6,        mm5                 ; mm6 = p1..p4 * kernel 3
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm7,        mm1                 ; mm7 = p0..p3
    psubusw     mm7,        mm5                 ; mm7 = sat(p0..p3 - p1..p4)
    psubusw     mm5,        mm1                 ; mm5 = sat(p1..p4 - p0..p3)
    paddusw     mm7,        mm5                 ; mm7 = abs(p0..p3 - p1..p4)
    pcmpgtw     mm7,        mm2

    ; ---- pixels +2 (tap 4) ----
    movq        mm6,        [rbx + 64 ]         ; mm6 = kernel 4 taps
    psrlq       mm4,        8                   ; mm4 = p2..p7
    movq        mm5,        mm4                 ; mm5 = p2..p7
    punpcklbw   mm5,        mm0                 ; mm5 = p2..p5
    pmullw      mm6,        mm5                 ; mm6 = p2..p5 * kernel 4
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = p0..p3
    psubusw     mm6,        mm5                 ; mm6 = sat(p0..p3 - p2..p5)
    psubusw     mm5,        mm1                 ; mm5 = sat(p2..p5 - p0..p3)
    paddusw     mm6,        mm5                 ; mm6 = abs(p0..p3 - p2..p5)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    ; ---- pixels -2 (tap 0) ----
    movq        mm6,        [rbx ]              ; mm6 = kernel 0 taps
    movq        mm4,        [rdi+rdx-2]         ; mm4 = p-2..p5
    movq        mm5,        mm4                 ; mm5 = p-2..p5
    punpcklbw   mm5,        mm0                 ; mm5 = p-2..p1
    pmullw      mm6,        mm5                 ; mm6 = p-2..p1 * kernel 0
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = p0..p3
    psubusw     mm6,        mm5                 ; mm6 = sat(p0..p3 - p-2..p1)
    psubusw     mm5,        mm1                 ; mm5 = sat(p-2..p1 - p0..p3)
    paddusw     mm6,        mm5                 ; mm6 = abs(p0..p3 - p-2..p1)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    ; ---- pixels -1 (tap 1) ----
    movq        mm6,        [rbx + 16]          ; mm6 = kernel 1 taps
    psrlq       mm4,        8                   ; mm4 = p-1..p5
    punpcklbw   mm4,        mm0                 ; mm4 = p-1..p2
    pmullw      mm6,        mm4                 ; mm6 = p-1..p2 * kernel 1
    paddusw     mm3,        mm6                 ; mm3 += mm6

    ; thresholding
    movq        mm6,        mm1                 ; mm6 = p0..p3
    psubusw     mm6,        mm4                 ; mm6 = sat(p0..p3 - p-1..p2)
    psubusw     mm4,        mm1                 ; mm4 = sat(p-1..p2 - p0..p3)
    paddusw     mm6,        mm4                 ; mm6 = abs(p0..p3 - p-1..p2)
    pcmpgtw     mm6,        mm2
    por         mm7,        mm6                 ; accumulate thresholds

    paddusw     mm3,        RD                  ; mm3 += round value
    psraw       mm3,        VP9_FILTER_SHIFT    ; mm3 /= 128

    pand        mm1,        mm7                 ; source where a diff exceeded flimit
    pandn       mm7,        mm3                 ; blurred result everywhere else
    paddusw     mm1,        mm7                 ; combination

    packuswb    mm1,        mm0                 ; pack to bytes
    mov         DWORD PTR [rdi+rdx-4],  eax     ; store previous four bytes
    movd        eax,        mm1                 ; hold this group for the next pass

    add         rdx,        4
    cmp         edx,        dword ptr arg(5)    ; cols
    jl          .acrossnextcol;

    mov         DWORD PTR [rdi+rdx-4],  eax     ; flush the last four bytes

    ; done with this row
    movsxd      rax,        dword ptr arg(2)    ; src_pixels_per_line
    add         rsi,        rax                 ; next source line
    movsxd      rax,        dword ptr arg(3)    ; dst_pixels_per_line
    add         rdi,        rax                 ; next destination line
    movsxd      rax,        dword ptr arg(2)    ; rax = src pitch for next row

    dec         rcx                             ; decrement count
    jnz         .nextrow                        ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp,        8                   ; release the stashed copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD
|
263 |
|
264 |
|
;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical filter applied in place, four columns at a time. For each pixel a
; 15-row window (rows r-7 .. r+7) is tracked with a running sum (mm5, four
; words) and a running sum of squares (mm6/mm7, four dwords). Where
;       15 * sumsq - sum * sum  is below  flimit
; the pixel is replaced by (pixel + sum + vp9_rv[row & 127 ...]) >> 4;
; elsewhere it is left unchanged.
;
; Results are written back eight rows late through a 16-entry ring of
; dwords at [rsp], so rows that are still inside the window keep their
; original values; this is why the row count is bumped by 8 up front.
;
; Stack frame after ALIGN_STACK / sub rsp, 136:
;   [rsp      .. rsp+127]  ring buffer d (only the first 16 dwords are
;                          addressed by the code below)
;   [rsp+128  .. rsp+135]  flimit2 = flimit replicated into two dwords
;
; NOTE(review): during the first 8 row iterations of a column group the ring
; slot that is written back has not been filled for that group yet, and it
; lands on the 8 rows above dst. The window also reads up to 7 rows below
; the last row. Presumably the caller provides border rows - confirm.
; NOTE(review): the arg(0)/arg(2)/arg(3) stack copies are modified in place
; as loop state.
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4)           ; flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,         [GLOBAL(sym(vp9_rv))]   ; r8 = &vp9_rv[0]
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
    mov         rsi,        arg(0)              ; s
    pxor        mm0,        mm0                 ; mm0 = 0

    movsxd      rax,        dword ptr arg(1)    ; pitch
    neg         rax                             ; rax = -pitch

    lea         rsi,        [rsi + rax*8];      ; rsi = &s[-pitch*8]
    neg         rax                             ; rax = +pitch again

    pxor        mm5,        mm5                 ; running sum
    pxor        mm6,        mm6                 ; running sumsq, columns 0..1

    pxor        mm7,        mm7                 ; running sumsq, columns 2..3
    mov         rdi,        rsi

    mov         rcx,        15                  ; prime the window with 15 rows

.loop_initvar:
    movd        mm1,        DWORD PTR [rdi];
    punpcklbw   mm1,        mm0                 ; 4 pixels as words

    paddw       mm5,        mm1                 ; sum += x
    pmullw      mm1,        mm1                 ; x * x

    movq        mm2,        mm1
    punpcklwd   mm1,        mm0                 ; squares of columns 0..1

    punpckhwd   mm2,        mm0                 ; squares of columns 2..3
    paddd       mm6,        mm1

    paddd       mm7,        mm2
    lea         rdi,        [rdi+rax]           ; next row

    dec         rcx
    jne         .loop_initvar
    ; here rsi = &s[-pitch*8] and rdi = &s[pitch*7]
    xor         rdx,        rdx                 ; rdx = row counter
.loop_row:
    movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8], leaves the window
    movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7], enters the window

    punpcklbw   mm1,        mm0
    punpcklbw   mm2,        mm0

    paddw       mm5,        mm2                 ; sum += entering row
    psubw       mm5,        mm1                 ; sum -= leaving row

    pmullw      mm2,        mm2
    movq        mm4,        mm2

    punpcklwd   mm2,        mm0
    punpckhwd   mm4,        mm0

    paddd       mm6,        mm2                 ; sumsq += entering^2
    paddd       mm7,        mm4

    pmullw      mm1,        mm1
    movq        mm2,        mm1

    punpcklwd   mm1,        mm0
    psubd       mm6,        mm1                 ; sumsq -= leaving^2

    punpckhwd   mm2,        mm0
    psubd       mm7,        mm2


    movq        mm3,        mm6
    pslld       mm3,        4

    psubd       mm3,        mm6                 ; mm3 = 15 * sumsq (cols 0..1)
    movq        mm1,        mm5

    movq        mm4,        mm5
    pmullw      mm1,        mm1                 ; low  16 bits of sum * sum

    pmulhw      mm4,        mm4                 ; high 16 bits of sum * sum
    movq        mm2,        mm1

    punpcklwd   mm1,        mm4                 ; sum^2 as dwords, cols 0..1
    punpckhwd   mm2,        mm4                 ; sum^2 as dwords, cols 2..3

    movq        mm4,        mm7
    pslld       mm4,        4

    psubd       mm4,        mm7                 ; mm4 = 15 * sumsq (cols 2..3)

    psubd       mm3,        mm1
    psubd       mm4,        mm2

    psubd       mm3,        flimit2
    psubd       mm4,        flimit2

    psrad       mm3,        31                  ; all-ones where the result is negative
    psrad       mm4,        31

    packssdw    mm3,        mm4
    packsswb    mm3,        mm0                 ; mm3 = per-byte "filter" mask

    movd        mm1,        DWORD PTR [rsi+rax*8]   ; pixels of the current row

    movq        mm2,        mm1                 ; mm2 = unfiltered pixels
    punpcklbw   mm1,        mm0

    paddw       mm1,        mm5                 ; pixel + sum
    mov         rcx,        rdx

    and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax,        [GLOBAL(sym(vp9_rv))]
    movq        mm4,        [rax + rcx*2]       ; 4 words at vp9_rv[rcx]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4,        [r8 + rcx*2]        ; 4 words at vp9_rv[rcx]
%else
    movq        mm4,        [sym(vp9_rv) + rcx*2]
%endif
    paddw       mm1,        mm4
    ;paddw     xmm1,       eight8s
    psraw       mm1,        4

    packuswb    mm1,        mm0
    pand        mm1,        mm3                 ; filtered where the mask is set

    pandn       mm3,        mm2                 ; original elsewhere
    por         mm1,        mm3

    and         rcx,        15
    movd        DWORD PTR   [rsp+rcx*4], mm1    ; d[row & 15] = result

    mov         rcx,        rdx
    sub         rcx,        8

    and         rcx,        15
    movd        mm1,        DWORD PTR [rsp+rcx*4]   ; d[(row - 8) & 15]

    movd        [rsi],      mm1                 ; delayed write, 8 rows up
    lea         rsi,        [rsi+rax]

    lea         rdi,        [rdi+rax]
    add         rdx,        1

    cmp         edx,        dword arg(2)        ; rows (already +8)
    jl          .loop_row


    ; s += 4. The pointer slot is full register width, so advance it through
    ; a register instead of a dword add that would drop the carry into the
    ; upper half on 64-bit builds. rax is free here: it is reloaded at
    ; .loop_col and not used after the loop.
    mov         rax,        arg(0)
    add         rax,        4
    mov         arg(0),     rax
    sub         dword arg(3), 4                 ; cols -= 4
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 136
    pop         rsp                             ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
|
460 |
|
461 |
|
;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
;
; Adds noise bytes to every row of a plane, in place. For each row a new
; starting offset (rand() & 0xff) into the noise buffer is chosen; pixels
; are first range-limited with the three clamp vectors and then the noise
; is added with wrapping byte arithmetic.
;
; Only blackclamp (arg 2) is loaded: whiteclamp and bothclamp are read at
; +16 and +32 from it, i.e. the three vectors must be stored contiguously
; in black/white/both order. The whiteclamp/bothclamp arguments themselves
; are not read.
;
; Pixels are handled 8 at a time with the exit test at the bottom of the
; loop, so each row is processed in whole groups of 8 bytes and at least
; one group is always touched.
extern sym(rand)
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(rand) WRT_PLT
    and         rax,        0xff                ; per-row noise offset, 0..255
    mov         rdi,        arg(1)              ; noise
    add         rdi,        rax                 ; rdi = noise + offset

    ; Everything below is (re)loaded after the call because rand() is free
    ; to clobber the volatile registers.
    mov         rdx,        arg(2)              ; blackclamp (base of all three)
    movsxd      rcx,        dword arg(5)        ; width
    mov         rsi,        arg(0)              ; start of the current row
    xor         rax,        rax                 ; byte offset within the row

.next_qword:
    movq        mm1,        [rsi+rax]           ; 8 source pixels

    ; clamp both sides so that adding the noise cannot leave the range
    psubusb     mm1,        [rdx]               ; blackclamp
    paddusb     mm1,        [rdx+32]            ; bothclamp
    psubusb     mm1,        [rdx+16]            ; whiteclamp

    movq        mm2,        [rdi+rax]           ; 8 noise bytes
    paddb       mm1,        mm2                 ; add them in
    movq        [rsi+rax],  mm1                 ; write the pixels back

    add         rax,        8                   ; next group of 8 pixels

    cmp         rax,        rcx
    jl          .next_qword

    movsxd      rax,        dword arg(7)        ; pitch
    add         arg(0),     rax                 ; start += pitch
    sub         dword arg(6), 1                 ; height -= 1
    jg          .next_row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
523 |
|
524 |
|
SECTION_RODATA
align 16
; 5-tap blur kernel. Each tap occupies a 16-byte slot (8 identical words);
; the filter code loads the first 4 words of a slot with movq at
; Blur + 16 * tap_index.
Blur:
    times 8 dw 16       ; +0   tap 0 (two rows / pixels before)
    times 8 dw 16       ; +16  tap 1 (one before)
    times 8 dw 64       ; +32  tap 2 (centre)
    times 8 dw 16       ; +48  tap 3 (one after)
    times 8 dw 16       ; +64  tap 4 (two after)
    times 8 dw 0        ; +80  not referenced by the code in this file

; Rounding term added before the shift by VP9_FILTER_SHIFT (0x40 = 64).
rd:
    times 4 dw 0x40