media/libvpx/vp8/common/x86/postproc_mmx.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:1ba9c3f89fc3
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 %define VP8_FILTER_WEIGHT 128
15 %define VP8_FILTER_SHIFT 7
16
; vp8_mbpost_proc_down_mmx
;
; Vertical ("down") post-processing pass over an image, handled four pixel
; columns at a time. For every row a 15-row sliding window (7 rows above,
; the row itself, 7 rows below) keeps a running sum and sum of squares per
; column. Where 15*sumsq - sum*sum is below flimit the pixel is replaced by
; (pixel + sum + vp8_rv[row & 127 ...]) >> 4; otherwise it is left unchanged.
;
; Side effects visible in this code: the 8 rows above the first row and the
; 8 rows below the last row of dst are written (border replication, and the
; delayed write-back described in .loop_row), and the argument slots for
; dst, rows and cols are modified in place.
;
; NOTE(review): no emms is executed before returning; presumably the caller
; is responsible for clearing MMX state -- confirm.
17 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
18 ; int pitch, int rows, int cols,int flimit)
19 extern sym(vp8_rv)
20 global sym(vp8_mbpost_proc_down_mmx) PRIVATE
21 sym(vp8_mbpost_proc_down_mmx):
22 push rbp
23 mov rbp, rsp
24 SHADOW_ARGS_TO_STACK 5
25 GET_GOT rbx
26 push rsi
27 push rdi
28 ; end prolog
29
30 ALIGN_STACK 16, rax ; macro defined outside this file; presumably saves the old rsp for the "pop rsp" below -- confirm
31 sub rsp, 136 ; 128 bytes for d[] + 8 bytes for flimit2
32
33 ; unsigned char d[16][8] at [rsp] (the code below indexes it as 16 dword slots: [rsp+rcx*4])
34 ; create flimit2 at [rsp+128]: flimit duplicated into two dwords so it can be used as a packed psubd operand
35 mov eax, dword ptr arg(4) ;flimit
36 mov [rsp+128], eax
37 mov [rsp+128+4], eax
38 %define flimit2 [rsp+128]
39
40 %if ABI_IS_32BIT=0
41 lea r8, [GLOBAL(sym(vp8_rv))] ; 64-bit builds keep the vp8_rv table address in r8 for the whole function
42 %endif
43
44 ;rows +=8; (done once, in the argument slot; the 8 extra iterations flush the write-back that trails by 8 rows)
45 add dword ptr arg(2), 8
46
47 ;for(c=0; c<cols; c+=4)
48 .loop_col:
49 mov rsi, arg(0) ;s
50 pxor mm0, mm0 ; mm0 = 0, used as the zero source for all unpacks
51
52 movsxd rax, dword ptr arg(1) ;pitch ;
53
54 ; this copies the last row down into the border 8 rows
55 mov rdi, rsi
56 mov rdx, arg(2) ; rdx = rows + 8 (slot was incremented above). NOTE(review): full-width load of an int slot -- confirm the upper bits are clean on 64-bit ABIs
57 sub rdx, 9 ; rdx = original rows - 1
58 imul rdx, rax ; byte offset of the last image row
59 lea rdi, [rdi+rdx]
60 movq mm1, QWORD ptr[rdi] ; last row (8 bytes are copied although only 4 columns are filtered per pass)
61 mov rcx, 8
62 .init_borderd ; initialize borders
63 lea rdi, [rdi + rax]
64 movq [rdi], mm1
65
66 dec rcx
67 jne .init_borderd
68
69 neg rax ; rax = -pitch
70
71 ; this copies the first row up into the border 8 rows
72 mov rdi, rsi
73 movq mm1, QWORD ptr[rdi] ; first row
74 mov rcx, 8
75 .init_border ; initialize borders
76 lea rdi, [rdi + rax]
77 movq [rdi], mm1
78
79 dec rcx
80 jne .init_border
81
82
83 lea rsi, [rsi + rax*8]; rsi = s - 8*pitch (rax is still -pitch here)
84 neg rax ; rax = +pitch again
85
86
87 pxor mm5, mm5 ; mm5 = running sum, 4 word lanes (one per column)
88 pxor mm6, mm6 ; mm6 = running sum of squares, columns 0-1 (dwords)
89
90 pxor mm7, mm7 ; mm7 = running sum of squares, columns 2-3 (dwords)
91 mov rdi, rsi
92
93 mov rcx, 15 ; prime the window with 15 rows: s-8*pitch .. s+6*pitch
94
95 .loop_initvar:
96 movd mm1, DWORD PTR [rdi]; 4 pixels of this row
97 punpcklbw mm1, mm0 ; zero-extend bytes to words
98
99 paddw mm5, mm1 ; sum += x
100 pmullw mm1, mm1 ; x*x (fits in an unsigned word for 8-bit x)
101
102 movq mm2, mm1 ;
103 punpcklwd mm1, mm0 ; squares of columns 0-1 as dwords
104
105 punpckhwd mm2, mm0 ; squares of columns 2-3 as dwords
106 paddd mm6, mm1 ;
107
108 paddd mm7, mm2 ;
109 lea rdi, [rdi+rax] ; next row; after the loop rdi = s + 7*pitch
110
111 dec rcx
112 jne .loop_initvar
113 ;save the var and sum
114 xor rdx, rdx ; rdx = row counter, 0 .. rows+8-1
115 .loop_row:
116 movd mm1, DWORD PTR [rsi] ; [s-pitch*8] row leaving the window
117 movd mm2, DWORD PTR [rdi] ; [s+pitch*7] row entering the window
118
119 punpcklbw mm1, mm0
120 punpcklbw mm2, mm0
121
122 paddw mm5, mm2 ; sum += entering
123 psubw mm5, mm1 ; sum -= leaving
124
125 pmullw mm2, mm2
126 movq mm4, mm2
127
128 punpcklwd mm2, mm0
129 punpckhwd mm4, mm0
130
131 paddd mm6, mm2 ; sumsq += entering^2
132 paddd mm7, mm4
133
134 pmullw mm1, mm1
135 movq mm2, mm1
136
137 punpcklwd mm1, mm0
138 psubd mm6, mm1 ; sumsq -= leaving^2
139
140 punpckhwd mm2, mm0
141 psubd mm7, mm2
142
143
144 movq mm3, mm6
145 pslld mm3, 4
146
147 psubd mm3, mm6 ; mm3 = 16*sumsq - sumsq = 15*sumsq (columns 0-1)
148 movq mm1, mm5
149
150 movq mm4, mm5
151 pmullw mm1, mm1 ; low 16 bits of sum*sum
152
153 pmulhw mm4, mm4 ; high 16 bits of sum*sum
154 movq mm2, mm1
155
156 punpcklwd mm1, mm4 ; sum*sum as dwords, columns 0-1
157 punpckhwd mm2, mm4 ; sum*sum as dwords, columns 2-3
158
159 movq mm4, mm7
160 pslld mm4, 4
161
162 psubd mm4, mm7 ; mm4 = 15*sumsq (columns 2-3)
163
164 psubd mm3, mm1 ; 15*sumsq - sum*sum
165 psubd mm4, mm2
166
167 psubd mm3, flimit2 ; ... - flimit
168 psubd mm4, flimit2
169
170 psrad mm3, 31 ; replicate the sign bit: all ones where the value is below flimit
171 psrad mm4, 31
172
173 packssdw mm3, mm4 ; narrow the four dword masks to words ...
174 packsswb mm3, mm0 ; ... and to bytes: mm3 = per-pixel select mask in the low 4 bytes
175
176 movd mm1, DWORD PTR [rsi+rax*8] ; 4 pixels of the current row (rsi trails it by 8 rows)
177
178 movq mm2, mm1 ; mm2 = unfiltered pixels
179 punpcklbw mm1, mm0
180
181 paddw mm1, mm5 ; pixel + sum
182 mov rcx, rdx
183
184 and rcx, 127 ; the vp8_rv offset wraps every 128 rows
185 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
186 push rax ; rax (pitch) is borrowed to form the table address
187 lea rax, [GLOBAL(sym(vp8_rv))]
188 movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
189 pop rax
190 %elif ABI_IS_32BIT=0
191 movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
192 %else
193 movq mm4, [sym(vp8_rv) + rcx*2]
194 %endif
195 paddw mm1, mm4 ; + four words loaded from vp8_rv
196 psraw mm1, 4 ; (pixel + sum + rv) >> 4
197
198 packuswb mm1, mm0 ; back to bytes with unsigned saturation
199 pand mm1, mm3 ; keep the filtered value where the mask is set
200
201 pandn mm3, mm2 ; keep the original pixel elsewhere
202 por mm1, mm3
203
204 and rcx, 15 ; rcx = row & 15
205 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] park this row's result in the 16-slot ring buffer
206
207 mov rcx, rdx
208 sub rcx, 8
209
210 and rcx, 15 ; rcx = (row - 8) & 15
211 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] result parked 8 iterations ago (for the first 8 iterations of a pass this slot has not been written in this pass)
212
213 movd [rsi], mm1 ; write it to the row that left the window at the top of this iteration
214 lea rsi, [rsi+rax]
215
216 lea rdi, [rdi+rax]
217 add rdx, 1
218
219 cmp edx, dword arg(2) ;rows (already includes the +8)
220 jl .loop_row
221
222
223 add dword arg(0), 4 ; s += 4. NOTE(review): 32-bit add on a slot that is reloaded as a full pointer by "mov rsi, arg(0)"; a carry out of bit 31 would be lost on 64-bit builds -- confirm
224 sub dword arg(3), 4 ; cols -= 4
225 cmp dword arg(3), 0
226 jg .loop_col
227
228 add rsp, 136 ; release d[] and flimit2
229 pop rsp ; restore the stack pointer from before ALIGN_STACK (assumes that macro pushed it -- confirm)
230
231 ; begin epilog
232 pop rdi
233 pop rsi
234 RESTORE_GOT
235 UNSHADOW_ARGS
236 pop rbp
237 ret
238 %undef flimit2
239
240
; vp8_plane_add_noise_mmx
;
; Adds noise to a plane in place, one row per outer iteration and 8 bytes
; per inner iteration. For each row a new offset (rand() & 0xff) into the
; noise buffer is chosen. Each group of 8 pixels is first clamped with the
; black/both/white clamp vectors using saturating byte arithmetic, then the
; noise bytes are added with wrapping byte addition and stored back.
;
; Only arg(2) (blackclamp) is read for the clamp vectors; whiteclamp and
; bothclamp are addressed as [blackclamp+16] and [blackclamp+32].
; Rows are processed in whole 8-byte groups, so up to roundup(Width, 8)
; bytes per row are read and written, and the noise buffer is read up to
; noise + 255 + roundup(Width, 8).
; Both loops test their condition after the body, so one 8-byte group of
; one row is processed even when Width or Height is 0.
;
; NOTE(review): no emms is executed before returning; presumably the caller
; is responsible for clearing MMX state -- confirm.
241 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
242 ; unsigned char blackclamp[16],
243 ; unsigned char whiteclamp[16],
244 ; unsigned char bothclamp[16],
245 ; unsigned int Width, unsigned int Height, int Pitch)
246 extern sym(rand)
247 global sym(vp8_plane_add_noise_mmx) PRIVATE
248 sym(vp8_plane_add_noise_mmx):
249 push rbp
250 mov rbp, rsp
251 SHADOW_ARGS_TO_STACK 8
252 GET_GOT rbx
253 push rsi
254 push rdi
255 ; end prolog
256
257 .addnoise_loop:
258 call sym(rand) WRT_PLT ; one rand() call per row
259 mov rcx, arg(1) ;noise
260 and rax, 0xff ; keep the low 8 bits: offset 0..255
261 add rcx, rax ; rcx = noise + offset
262
263 ; we rely on the fact that the clamping vectors are stored contiguously
264 ; in black/white/both order. Note that we have to reload this here because
265 ; rdx could be trashed by rand()
266 mov rdx, arg(2) ; blackclamp
267
268
269 mov rdi, rcx ; rdi = noise source for this row
270 movsxd rcx, dword arg(5) ;[Width]
271 mov rsi, arg(0) ;Pos
272 xor rax,rax ; rax = byte offset within the row
273
274 .addnoise_nextset:
275 movq mm1,[rsi+rax] ; get the source
276
277 psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
278 paddusb mm1, [rdx+32] ;bothclamp
279 psubusb mm1, [rdx+16] ;whiteclamp
280
281 movq mm2,[rdi+rax] ; get the noise for this line
282 paddb mm1,mm2 ; add it in
283 movq [rsi+rax],mm1 ; store the result
284
285 add rax,8 ; advance to the next 8 bytes of this row
286
287 cmp rax, rcx ; signed compare against the sign-extended Width
288 jl .addnoise_nextset
289
290 movsxd rax, dword arg(7) ; Pitch
291 add arg(0), rax ; Start += Pitch
292 sub dword arg(6), 1 ; Height -= 1
293 jg .addnoise_loop ; uses the flags from the sub: loop while the remaining Height is > 0
294
295 ; begin epilog
296 pop rdi
297 pop rsi
298 RESTORE_GOT
299 UNSHADOW_ARGS
300 pop rbp
301 ret
302
303
; Read-only constant tables. Neither Blur nor rd is referenced by the code
; visible in this file. NOTE(review): presumably left over from, or used by,
; routines defined elsewhere -- confirm before removing.
304 SECTION_RODATA
305 align 16
306 Blur: ; 48 words in total: 16 x 16, 8 x 64, 16 x 16, 8 x 0
307 times 16 dw 16
308 times 8 dw 64
309 times 16 dw 16
310 times 8 dw 0
311
312 rd: ; four words of 0x40 (64)
313 times 4 dw 0x40

mercurial