|
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;-----------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                                  short *qcoeff_ptr, short *dequant_ptr,
;                                  short *scan_mask, short *round_ptr,
;                                  short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 coefficients, handled as four groups of four 16-bit words.
; For every word:
;     x       = abs(coeff)
;     x       = (zbin > x) ? 0 : x            (signed 16-bit compare)
;     y       = ((x + round) * quant) >> 16   (16-bit wrapping add,
;                                              unsigned high multiply)
;     qcoeff  = (coeff < 0) ? -y : y
;     dqcoeff = low 16 bits of qcoeff * dequant
;
; Afterwards t = sum over the 16 words of (qcoeff != 0) * scan_mask is
; formed and reduced to its low 16 bits.
;
; ABI:   arguments are read through arg(n) after SHADOW_ARGS_TO_STACK, so
;        the calling convention is whatever x86_abi_support.asm selects.
; Out:   rax = 0 when t == 0, otherwise (index of the highest set bit
;        of t) + 1.
; Clobb: rax, rcx, rdx, flags, mm0, mm1, mm3, mm5-mm7.
;        rsi and rdi are saved and restored.
;
; NOTE(review): scan_mask presumably holds one distinct bit per word
;   (1 << scan position), which would make the return value the
;   end-of-block position -- confirm against the caller.
; NOTE(review): no emms is executed here; callers presumably reset the
;   MMX state before any x87 use -- confirm.
; NOTE(review): pmulhuw on MMX registers is an SSE-era instruction, so
;   despite the _mmx suffix this needs more than a plain MMX CPU.
;-----------------------------------------------------------------------

; FAST_QUANT_MMX_4 byte_offset
; Quantize and dequantize the four words located at byte_offset.
; In:    rsi = coeff_ptr, rcx = round_ptr, rdx = quant_ptr,
;        rdi = qcoeff_ptr
; Clobb: rax, mm0, mm1, mm3
; The memory access order (all inputs read, qcoeff stored, dequant read,
; dqcoeff stored) is the same as in the unrolled original.
%macro FAST_QUANT_MMX_4 1
    movq        mm0,        [rsi+%1]        ; four coefficients

    mov         rax,        arg(1)          ;zbin_ptr
    movq        mm1,        [rax+%1]

    movq        mm3,        mm0
    psraw       mm0,        15              ; mm0 = sign mask, 0 or -1 per word

    pxor        mm3,        mm0
    psubw       mm3,        mm0             ; mm3 = abs(coeff)

    pcmpgtw     mm1,        mm3             ; mm1 = -1 where zbin > abs
    pandn       mm1,        mm3             ; mm1 = abs, cleared where zbin > abs

    paddw       mm1,        [rcx+%1]        ; + round
    pmulhuw     mm1,        [rdx+%1]        ; high word of the product with quant

    pxor        mm1,        mm0
    psubw       mm1,        mm0             ; gain the sign back

    movq        [rdi+%1],   mm1             ; qcoeff

    mov         rax,        arg(3)          ;dequant_ptr
    pmullw      mm1,        [rax+%1]

    mov         rax,        arg(7)          ;dqcoeff_ptr
    movq        [rax+%1],   mm1             ; dqcoeff
%endmacro

; SCAN_MASK_ACC_MMX_4 byte_offset
; Add scan_mask[i] for every non-zero qcoeff[i] of the four words at
; byte_offset into the two dword accumulators held in mm5.
; In:    rdi = qcoeff_ptr, rsi = scan_mask, mm7 = 0, mm6 = all ones
; Clobb: mm0
%macro SCAN_MASK_ACC_MMX_4 1
    movq        mm0,        [rdi+%1]
    pcmpeqw     mm0,        mm7             ; 0xffff where qcoeff == 0
    pxor        mm0,        mm6             ; 0xffff where qcoeff != 0
    psrlw       mm0,        15              ; 1 where qcoeff != 0
    pmaddwd     mm0,        [rsi+%1]        ; flag * scan_mask, adjacent pairs summed
    paddd       mm5,        mm0
%endmacro

global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; Pointers that stay live across all four groups. rax is the only
    ; scratch register left, so the macro reloads zbin/dequant/dqcoeff
    ; into it on each use.
    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rcx,        arg(5)          ;round_ptr
    mov         rdx,        arg(6)          ;quant_ptr
    mov         rdi,        arg(2)          ;qcoeff_ptr

    FAST_QUANT_MMX_4 0
    FAST_QUANT_MMX_4 8
    FAST_QUANT_MMX_4 16
    FAST_QUANT_MMX_4 24

    ; Sum scan_mask over the non-zero quantized coefficients.
    ; rdi still points at qcoeff.
    mov         rsi,        arg(4)          ;scan_mask

    pxor        mm5,        mm5             ; accumulators
    pxor        mm7,        mm7             ; zero, for the compare
    pcmpeqw     mm6,        mm6             ; all ones, for the inversion

    SCAN_MASK_ACC_MMX_4 0
    SCAN_MASK_ACC_MMX_4 8
    SCAN_MASK_ACC_MMX_4 16
    SCAN_MASK_ACC_MMX_4 24

    movq        mm0,        mm5
    psrlq       mm5,        32
    paddd       mm0,        mm5             ; low dword = total of both accumulators

    ; eob adjustment begins here
    movq        rcx,        mm0
    and         rcx,        0xffff          ; keep the low 16 bits of the total

    bsr         rax,        rcx             ; highest set bit (undefined if rcx == 0)
    inc         rax

    neg         rcx                         ; negative iff the total was non-zero
    sar         rcx,        31              ; all ones iff non-zero, else 0
    and         rax,        rcx             ; forces 0 when nothing was set

    ; The branch-free sequence above substitutes the old mmx mixed
    ; assembly/C. The following is kept as reference:
    ;     movq    rcx, mm0
    ;     bsr     rax, rcx
    ;
    ;     mov     eob, rax
    ;     mov     eee, rcx
    ;
    ;     if (eee == 0)
    ;     {
    ;         eob = -1;
    ;     }
    ;     else if (eee < 0)
    ;     {
    ;         eob = 15;
    ;     }
    ;     d->eob = eob + 1;

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret