|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
; Filter scaling constants. Note 128 == 1 << 7, so the weight is expressed
; in units of 2^VP8_FILTER_SHIFT. Neither name is referenced in the visible
; part of this file; they are kept for any code that includes or follows it.
%define VP8_FILTER_SHIFT  7
%define VP8_FILTER_WEIGHT 128
|
16 |
|
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical post-processing filter, handled in strips of 4 pixel columns.
;
; For every strip the code:
;   1. replicates 8 bytes of the last row into the 8 rows below the plane
;      and 8 bytes of the first row into the 8 rows above it (so the
;      buffer must provide at least 8 writable border rows on each side),
;   2. builds, per pixel column, a running sum (mm5, 4 words) and a
;      running sum of squares (mm6/mm7, 4 dwords) over a 15-row window,
;   3. for each row, where 15*sumsq - sum*sum < flimit, replaces the
;      pixel with (pixel + sum + rv) >> 4, rv being 4 words read from
;      vp8_rv; other pixels are left unchanged,
;   4. writes results back 8 rows late through a 16-entry ring buffer on
;      the stack, so the window always reads unfiltered pixels.
;
; Register roles inside the row loop:
;   rax = pitch            rdx = row counter (0 .. rows+8-1)
;   rsi = &s[(row-8)*pitch] (oldest row of the window / write-back row)
;   rdi = &s[(row+7)*pitch] (row entering the window)
;   mm0 = 0                r8  = &vp8_rv (64-bit builds only)
;
; The function does not execute emms; MMX state is left as-is on return.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; [rsp .. rsp+63]   ring buffer d: 16 entries of 4 bytes (indexed rcx*4)
    ; [rsp+128]         flimit2: flimit replicated into two dwords
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;  (8 extra iterations flush the delayed write-back)
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is an int: load it as a sign-extended dword rather than
            ; a full register-width slot, whose upper half is not
            ; guaranteed to be clean on 64-bit ABIs.
            movsxd      rdx,        dword ptr arg(2)
            sub         rdx,        9       ; (rows+8) - 9 = index of last row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        mm1,        QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                  ; initialize bottom border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        mm1,        QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                   ; initialize top border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_border


            lea         rsi,        [rsi + rax*8]               ; rsi = &s[-pitch*8]
            neg         rax                                     ; rax = pitch again


            pxor        mm5,        mm5     ; running sum (4 words)
            pxor        mm6,        mm6     ; running sum of squares, pixels 0-1

            pxor        mm7,        mm7     ; running sum of squares, pixels 2-3
            mov         rdi,        rsi

            mov         rcx,        15      ; prime the window with rows -8 .. +6

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi];
            punpcklbw   mm1,        mm0     ; bytes -> words

            paddw       mm5,        mm1     ; sum += x
            pmullw      mm1,        mm1     ; x*x (fits in 16 bits)

            movq        mm2,        mm1     ;
            punpcklwd   mm1,        mm0     ; squares of pixels 0-1 as dwords

            punpckhwd   mm2,        mm0     ; squares of pixels 2-3 as dwords
            paddd       mm6,        mm1     ;

            paddd       mm7,        mm2     ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ; here rdi = &s[pitch*7]; mm5/mm6/mm7 hold the sum and sum of squares
            xor         rdx,        rdx     ; row = 0
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  leaves the window
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  enters the window

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2
            psubw       mm5,        mm1

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (pixels 0-1)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1     ; low 16 bits of sum*sum

            pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4     ; sum*sum as dwords, pixels 0-1
            punpckhwd   mm2,        mm4     ; sum*sum as dwords, pixels 2-3

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (pixels 2-3)

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            psrad       mm3,        31      ; all ones where 15*sumsq - sum*sum < flimit
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0     ; per-byte select mask in the low 4 bytes

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row pixels

            movq        mm2,        mm1     ; keep the unfiltered pixels
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movq        mm4,        [rax + rcx*2] ; 4 words at byte offset 2*(row & 127)
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2]  ; 4 words at byte offset 2*(row & 127)
%else
            movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
            paddw       mm1,        mm4
            psraw       mm1,        4       ; (pixel + sum + rv) >> 4

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered value where the mask is set

            pandn       mm3,        mm2     ; original value elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR [rsp+rcx*4], mm1 ; d[row & 15] = result

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ; d[(row-8) & 15]

            movd        [rsi],      mm1     ; write back the row that just left the window
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row


            ; s += 4: arg(0) holds a pointer, so update the whole
            ; register-width slot. A dword add would drop the carry into
            ; the upper half of a 64-bit address.
            mov         rcx,        arg(0)
            add         rcx,        4
            mov         arg(0),     rcx
            sub         dword arg(3), 4 ; cols -= 4
            cmp         dword arg(3), 0
            jg          .loop_col

    add         rsp, 136
    pop         rsp                 ; restores the rsp saved by ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
|
239 |
|
240 |
|
;-----------------------------------------------------------------------
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: picks a rand()-derived offset (0..255) into the
; noise buffer, then walks the row 8 bytes at a time, clamping the source
; pixels and adding the noise bytes with wrap-around byte addition.
;
; The three clamp vectors are addressed relative to blackclamp:
;   [blackclamp + 0], [blackclamp + 16] (white), [blackclamp + 32] (both),
; so they must be laid out contiguously in that order by the caller.
; Only the first 8 bytes of each vector are read (MMX operands).
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(rand) WRT_PLT

    ; Every scratch register is (re)loaded after the call because rand()
    ; is free to trash the volatile ones.
    and         rax, 0xff               ; random start offset, 0..255
    mov         rdi, arg(1)             ; noise
    add         rdi, rax                ; rdi = noise + offset
    mov         rdx, arg(2)             ; blackclamp (base of the clamp vectors)
    mov         rsi, arg(0)             ; start of the current row
    movsxd      rcx, dword arg(5)       ; Width
    xor         rax, rax                ; byte index within the row

.next_qword:
    movq        mm1, [rsi+rax]          ; 8 source pixels

    ; squeeze the pixels into a range where adding noise cannot overflow
    psubusb     mm1, [rdx]              ; blackclamp
    paddusb     mm1, [rdx+32]           ; bothclamp
    psubusb     mm1, [rdx+16]           ; whiteclamp

    movq        mm2, [rdi+rax]          ; 8 noise bytes for this position
    paddb       mm1, mm2                ; add the noise
    movq        [rsi+rax], mm1          ; write the pixels back

    add         rax, 8                  ; advance to the next 8 bytes

    cmp         rax, rcx
    jl          .next_qword

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
302 |
|
303 |
|
SECTION_RODATA
align 16
; Blur: 48 words laid out as 16 x 16, 8 x 64, 16 x 16, 8 x 0.
; Not referenced by the code visible in this file.
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw 0

; rd: four words of 0x40 (64).
; Not referenced by the code visible in this file.
rd:
    dw 0x40, 0x40, 0x40, 0x40