|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
14 ; /**************************************************************************** |
|
15 ; * Notes: |
|
16 ; * |
|
17 ; * This implementation makes use of 16 bit fixed point version of two multiply |
|
18 ; * constants: |
|
19 ; * 1. sqrt(2) * cos (pi/8) |
|
20 ; * 2. sqrt(2) * sin (pi/8) |
|
21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit |
|
22 ; * fixed point precision as the second one, we use a trick of |
|
23 ; * x * a = x + x*(a-1) |
|
24 ; * so |
|
25 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). |
|
26 ; * |
|
; * For the second constant, the 16 bit version is 35468, which is bigger
; * than 32768, so in a signed 16 bit multiply it becomes a negative
; * number. Adding x back compensates:
|
30 ; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x |
|
31 ; * |
|
32 ; **************************************************************************/ |
|
33 |
|
34 |
|
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;                              int pitch, unsigned char *dest, int stride)
;
; 4x4 inverse transform followed by reconstruction.
;
;   input  : 16 signed 16-bit values, read as four 8-byte rows
;   pred   : prediction bytes, 4 per row, rows 'pitch' bytes apart
;   dest   : output bytes, 4 per row, rows 'stride' bytes apart
;
; Flow: 1-D pass -> 4x4 word transpose -> 1-D pass with (+4) >> 3 rounding
;       -> 4x4 word transpose -> add to pred, saturate to 0..255, store.
;
; Integer registers:  rax = input, later pitch    rsi = pred
;                     rdx = dest                  rdi = stride
; All of mm0-mm7 are overwritten. This routine does not execute emms.
; NOTE(review): presumably the caller clears the MMX state - confirm.

; First half of a 1-D pass. In: mm0..mm3 = lines 0..3.
; Out: mm2 = a1 = line0 + line2
;      mm0 = b1 = line0 - line2
;      mm7 = line3 * sqrt(2)cos(pi/8) - line1 * sqrt(2)sin(pi/8)
;      mm3 = d1 = line1 * sqrt(2)cos(pi/8) + line3 * sqrt(2)sin(pi/8)
; Scratch: mm4, mm5.
%macro IDCT4X4_MMX_ROTATE 0
    psubw       mm0, mm2                        ; b1 = line0 - line2
    paddw       mm2, mm2                        ; 2 * line2
    movq        mm5, mm1
    paddw       mm2, mm0                        ; a1 = line0 + line2

    ; 35468 is negative as a signed word, so the product is fixed up by
    ; adding the multiplicand back (see the notes at the top of the file).
    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; line1 * sqrt(2)sin(pi/8)

    ; x * sqrt(2)cos(pi/8) is formed as x + x * (sqrt(2)cos(pi/8) - 1).
    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]
    paddw       mm7, mm3                        ; line3 * sqrt(2)cos(pi/8)
    psubw       mm7, mm5                        ; line3*cos - line1*sin

    movq        mm5, mm1
    movq        mm4, mm3                        ; keep line3 for the fix-up

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; line1 * sqrt(2)cos(pi/8)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; line3 * sqrt(2)sin(pi/8)

    paddw       mm3, mm5                        ; d1
%endmacro

; Second half of a 1-D pass. In: mm2 = a1, mm0 = b1, mm7, mm3 = d1.
; Out, in the line order consumed by IDCT4X4_MMX_TRANSPOSE:
;      mm2 = a1 + d1, mm0 = b1 - mm7, mm4 = b1 + mm7, mm6 = a1 - d1
%macro IDCT4X4_MMX_BUTTERFLY 0
    movq        mm6, mm2                        ; copy of a1
    movq        mm4, mm0                        ; copy of b1
    paddw       mm2, mm3                        ; a1 + d1
    paddw       mm4, mm7                        ; b1 + mm7
    psubw       mm0, mm7                        ; b1 - mm7
    psubw       mm6, mm3                        ; a1 - d1
%endmacro

; 4x4 transpose of 16-bit words.
; In : lines 0..3 in mm2, mm0, mm4, mm6.
; Out: lines 0..3 in mm0, mm1, mm2, mm5 (mm3 and mm4 are scratch).
; Comments list words high to low as <line><column> of the input.
%macro IDCT4X4_MMX_TRANSPOSE 0
    movq        mm1, mm2                        ; 03 02 01 00
    movq        mm3, mm4                        ; 23 22 21 20

    punpcklwd   mm1, mm0                        ; 11 01 10 00
    punpckhwd   mm2, mm0                        ; 13 03 12 02

    punpcklwd   mm3, mm6                        ; 31 21 30 20
    punpckhwd   mm4, mm6                        ; 33 23 32 22

    movq        mm0, mm1                        ; 11 01 10 00
    movq        mm5, mm2                        ; 13 03 12 02

    punpckldq   mm0, mm3                        ; 30 20 10 00
    punpckhdq   mm1, mm3                        ; 31 21 11 01

    punpckldq   mm2, mm4                        ; 32 22 12 02
    punpckhdq   mm5, mm4                        ; 33 23 13 03
%endmacro

; Reconstruct one row: %1 = four residual words, %2 = prediction memory
; operand, %3 = destination memory operand. Requires mm7 == 0; uses mm4.
%macro IDCT4X4_MMX_RECON_ROW 3
    movd        mm4, %2                         ; four prediction bytes
    punpcklbw   mm4, mm7                        ; widen bytes to words
    paddsw      %1, mm4                         ; prediction + residual
    packuswb    %1, mm7                         ; saturate to unsigned bytes
    movd        %3, %1                          ; store four pixels
%endmacro

global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                     ; input
    mov         rsi, arg(1)                     ; pred

    ; Fetch the four coefficient rows, four words each.
    movq        mm0, [rax   ]
    movq        mm1, [rax+ 8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]

    ; Disabled: would overwrite the 32 input bytes with zeros.
%if 0
    pxor        mm7, mm7
    movq        [rax], mm7
    movq        [rax+8], mm7
    movq        [rax+16],mm7
    movq        [rax+24],mm7
%endif
    movsxd      rax, dword ptr arg(2)           ; pitch  (rax no longer input)
    mov         rdx, arg(3)                     ; dest
    movsxd      rdi, dword ptr arg(4)           ; stride

    ; ---- first 1-D pass ----
    IDCT4X4_MMX_ROTATE
    IDCT4X4_MMX_BUTTERFLY
    IDCT4X4_MMX_TRANSPOSE

    ; The transpose leaves line 3 in mm5; the 1-D pass expects it in mm3.
    movq        mm3, mm5                        ; 33 23 13 03

    ; ---- second 1-D pass, with rounding ----
    IDCT4X4_MMX_ROTATE

    ; Every butterfly output contains exactly one of a1 / b1, so biasing
    ; those two by 4 rounds all four results of the >> 3 below.
    paddw       mm0, [GLOBAL(fours)]
    paddw       mm2, [GLOBAL(fours)]

    IDCT4X4_MMX_BUTTERFLY

    psraw       mm2, 3
    psraw       mm0, 3
    psraw       mm4, 3
    psraw       mm6, 3

    IDCT4X4_MMX_TRANSPOSE

    ; ---- reconstruction: residual rows are now mm0, mm1, mm2, mm5 ----
    pxor        mm7, mm7                        ; zero for unpack / pack

    IDCT4X4_MMX_RECON_ROW mm0, [rsi], [rdx]
    IDCT4X4_MMX_RECON_ROW mm1, [rsi+rax], [rdx+rdi]
    IDCT4X4_MMX_RECON_ROW mm2, [rsi+2*rax], [rdx+rdi*2]

    ; Step both pointers down one row so that base + 2*step addresses row 3.
    add         rdx, rdi
    add         rsi, rax

    IDCT4X4_MMX_RECON_ROW mm5, [rsi+2*rax], [rdx+rdi*2]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
220 |
|
;void vp8_dc_only_idct_add_mmx(
;    short input_dc,
;    unsigned char *pred_ptr,
;    int pred_stride,
;    unsigned char *dst_ptr,
;    int stride)
;
; Reconstruction for a block that has only a DC term: the value
; (input_dc + 4) >> 3 is added to each pixel of a 4x4 prediction block and
; the saturated result (0..255) is written to the destination.
;
;   pred_ptr / pred_stride : 4 bytes are read from each of 4 rows
;   dst_ptr  / stride      : 4 bytes are written to each of 4 rows
;
; Uses rax, rcx, rdx and mm0-mm5. This routine does not execute emms.
; NOTE(review): presumably the caller clears the MMX state - confirm.

; Add the broadcast DC (mm5) to the four pixels held as bytes in the low
; half of %1 and clamp them back to bytes. Requires mm0 == 0.
%macro DC_ONLY_MMX_ADD_CLAMP 1
    punpcklbw   %1, mm0                         ; widen bytes to words
    paddsw      %1, mm5                         ; + rounded DC
    packuswb    %1, mm0                         ; saturate to unsigned bytes
%endmacro

global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    movd        mm5, arg(0)                     ; input_dc in the low word
    mov         rax, arg(1)                     ; pred_ptr
    movsxd      rdx, dword ptr arg(2)           ; pred_stride
    lea         rcx, [rdx + rdx*2]              ; 3 * pred_stride

    ; dc = (input_dc + 4) >> 3, then copy the low word into all four lanes.
    paddw       mm5, [GLOBAL(fours)]
    psraw       mm5, 3
    punpcklwd   mm5, mm5
    punpckldq   mm5, mm5

    ; Read the four prediction rows before rax / rdx / rcx are reused.
    movd        mm1, [rax]
    movd        mm2, [rax+rdx]
    movd        mm3, [rax+2*rdx]
    movd        mm4, [rax+rcx]

    mov         rax, arg(3)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(4)           ; stride
    lea         rcx, [rdx + rdx*2]              ; 3 * stride

    pxor        mm0, mm0                        ; zero for unpack / pack

    DC_ONLY_MMX_ADD_CLAMP mm1
    DC_ONLY_MMX_ADD_CLAMP mm2
    DC_ONLY_MMX_ADD_CLAMP mm3
    DC_ONLY_MMX_ADD_CLAMP mm4

    movd        [rax], mm1
    movd        [rax+rdx], mm2
    movd        [rax+2*rdx], mm3
    movd        [rax+rcx], mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
285 |
|
; Read-only constants, each replicated across the four 16-bit lanes of an
; MMX register.
SECTION_RODATA

; sqrt(2) * sin(pi/8) in 16 bit fixed point: 0x8A8C = 35468. This is
; negative when read as a signed word, so users add the multiplicand back
; after pmulhw.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; sqrt(2) * cos(pi/8) - 1 in 16 bit fixed point: 0x4E7B = 20091. Users add
; the multiplicand back after pmulhw to restore the "+ 1".
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; Rounding bias added before the arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004