|
1 ;******************************************************************** |
|
2 ;* * |
|
3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
|
4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
|
5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
|
6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
|
7 ;* * |
|
8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * |
|
9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
|
10 ;* * |
|
11 ;******************************************************************** |
|
12 ; Original implementation: |
|
13 ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd |
|
14 ; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ |
|
15 ;******************************************************************** |
|
16 |
|
17 AREA |.text|, CODE, READONLY |
|
18 |
|
19 ; Explicitly specifying alignment here because some versions of |
|
20 ; gas don't align code correctly. See |
|
21 ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html |
|
22 ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 |
|
23 ALIGN |
|
24 |
|
25 GET armopts.s |
|
26 |
|
27 EXPORT oc_idct8x8_1_arm |
|
28 EXPORT oc_idct8x8_arm |
|
29 |
|
30 oc_idct8x8_1_arm PROC |
|
31 ; r0 = ogg_int16_t *_y |
|
32 ; r1 = ogg_uint16_t _dc |
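; _dc is duplicated into both halfwords of r1 so that each STMIA below writes
; four packed int16 pairs; the net effect is roughly
;   for (i = 0; i < 64; i++) _y[i] = _dc;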
|
33 ORR r1, r1, r1, LSL #16 |
|
34 MOV r2, r1 |
|
35 MOV r3, r1 |
|
36 MOV r12,r1 |
|
37 STMIA r0!,{r1,r2,r3,r12} |
|
38 STMIA r0!,{r1,r2,r3,r12} |
|
39 STMIA r0!,{r1,r2,r3,r12} |
|
40 STMIA r0!,{r1,r2,r3,r12} |
|
41 STMIA r0!,{r1,r2,r3,r12} |
|
42 STMIA r0!,{r1,r2,r3,r12} |
|
43 STMIA r0!,{r1,r2,r3,r12} |
|
44 STMIA r0!,{r1,r2,r3,r12} |
|
45 MOV PC, r14 |
|
46 ENDP |
|
47 |
|
48 oc_idct8x8_arm PROC |
|
49 ; r0 = ogg_int16_t *_y |
|
50 ; r1 = ogg_int16_t *_x |
|
51 ; r2 = int _last_zzi |
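; _last_zzi appears to be the zig-zag index of the last nonzero coefficient
; plus one, so small values mean only low-frequency coefficients are present
; and a pruned transform can be used.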
|
52 CMP r2, #3 |
|
53 BLE oc_idct8x8_3_arm |
|
54 CMP r2, #6 |
|
55 BLE oc_idct8x8_6_arm |
|
56 CMP r2, #10 |
|
57 BLE oc_idct8x8_10_arm |
|
58 oc_idct8x8_slow_arm |
|
59 STMFD r13!,{r4-r11,r14} |
|
60 SUB r13,r13,#64*2 |
|
61 ; Row transforms |
|
62 STR r0, [r13,#-4]! |
|
63 ADD r0, r13, #4 ; Write to temp storage. |
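; Each idct8core_arm call reads one contiguous row of _x (r1 advances by 16
; bytes) and writes its output down a column of the temp buffer (stride of 16
; bytes, with r0 advancing by 2 per call), so the column pass below can again
; read contiguous rows.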
|
64 BL idct8core_arm |
|
65 BL idct8core_arm |
|
66 BL idct8core_arm |
|
67 BL idct8core_arm |
|
68 BL idct8core_arm |
|
69 BL idct8core_arm |
|
70 BL idct8core_arm |
|
71 BL idct8core_arm |
|
72 LDR r0, [r13], #4 ; Write to the final destination. |
|
73 ; Clear input data for next block (decoder only). |
|
74 SUB r2, r1, #8*16 |
|
75 CMP r0, r2 |
|
76 MOV r1, r13 ; And read from temp storage. |
|
77 BEQ oc_idct8x8_slow_arm_cols |
|
78 MOV r4, #0 |
|
79 MOV r5, #0 |
|
80 MOV r6, #0 |
|
81 MOV r7, #0 |
|
82 STMIA r2!,{r4,r5,r6,r7} |
|
83 STMIA r2!,{r4,r5,r6,r7} |
|
84 STMIA r2!,{r4,r5,r6,r7} |
|
85 STMIA r2!,{r4,r5,r6,r7} |
|
86 STMIA r2!,{r4,r5,r6,r7} |
|
87 STMIA r2!,{r4,r5,r6,r7} |
|
88 STMIA r2!,{r4,r5,r6,r7} |
|
89 STMIA r2!,{r4,r5,r6,r7} |
|
90 oc_idct8x8_slow_arm_cols |
|
91 ; Column transforms |
|
92 BL idct8core_down_arm |
|
93 BL idct8core_down_arm |
|
94 BL idct8core_down_arm |
|
95 BL idct8core_down_arm |
|
96 BL idct8core_down_arm |
|
97 BL idct8core_down_arm |
|
98 BL idct8core_down_arm |
|
99 BL idct8core_down_arm |
|
100 ADD r13,r13,#64*2 |
|
101 LDMFD r13!,{r4-r11,PC} |
|
102 ENDP |
|
103 |
|
104 oc_idct8x8_10_arm PROC |
|
105 STMFD r13!,{r4-r11,r14} |
|
106 SUB r13,r13,#64*2 |
|
107 ; Row transforms |
|
108 MOV r2, r0 |
|
109 MOV r0, r13 ; Write to temp storage. |
|
110 BL idct4core_arm |
|
111 BL idct3core_arm |
|
112 BL idct2core_arm |
|
113 BL idct1core_arm |
|
114 ; Clear input data for next block (decoder only). |
|
115 SUB r0, r1, #4*16 |
|
116 CMP r0, r2 |
|
117 MOV r1, r13 ; Read from temp storage. |
|
118 BEQ oc_idct8x8_10_arm_cols |
|
119 MOV r4, #0 |
|
120 STR r4, [r0] |
|
121 STR r4, [r0,#4] |
|
122 STR r4, [r0,#16] |
|
123 STR r4, [r0,#20] |
|
124 STR r4, [r0,#32] |
|
125 STR r4, [r0,#48] |
|
126 MOV r0, r2 ; Write to the final destination |
|
127 oc_idct8x8_10_arm_cols |
|
128 ; Column transforms |
|
129 BL idct4core_down_arm |
|
130 BL idct4core_down_arm |
|
131 BL idct4core_down_arm |
|
132 BL idct4core_down_arm |
|
133 BL idct4core_down_arm |
|
134 BL idct4core_down_arm |
|
135 BL idct4core_down_arm |
|
136 BL idct4core_down_arm |
|
137 ADD r13,r13,#64*2 |
|
138 LDMFD r13!,{r4-r11,PC} |
|
139 ENDP |
|
140 |
|
141 oc_idct8x8_6_arm PROC |
|
142 STMFD r13!,{r4-r7,r9-r11,r14} |
|
143 SUB r13,r13,#64*2 |
|
144 ; Row transforms |
|
145 MOV r2, r0 |
|
146 MOV r0, r13 ; Write to temp storage. |
|
147 BL idct3core_arm |
|
148 BL idct2core_arm |
|
149 BL idct1core_arm |
|
150 ; Clear input data for next block (decoder only). |
|
151 SUB r0, r1, #3*16 |
|
152 CMP r0, r2 |
|
153 MOV r1, r13 ; Read from temp storage. |
|
154 BEQ oc_idct8x8_6_arm_cols |
|
155 MOV r4, #0 |
|
156 STR r4, [r0] |
|
157 STR r4, [r0,#4] |
|
158 STR r4, [r0,#16] |
|
159 STR r4, [r0,#32] |
|
160 MOV r0, r2 ; Write to the final destination |
|
161 oc_idct8x8_6_arm_cols |
|
162 ; Column transforms |
|
163 BL idct3core_down_arm |
|
164 BL idct3core_down_arm |
|
165 BL idct3core_down_arm |
|
166 BL idct3core_down_arm |
|
167 BL idct3core_down_arm |
|
168 BL idct3core_down_arm |
|
169 BL idct3core_down_arm |
|
170 BL idct3core_down_arm |
|
171 ADD r13,r13,#64*2 |
|
172 LDMFD r13!,{r4-r7,r9-r11,PC} |
|
173 ENDP |
|
174 |
|
175 oc_idct8x8_3_arm PROC |
|
176 STMFD r13!,{r4-r7,r9-r11,r14} |
|
177 SUB r13,r13,#64*2 |
|
178 ; Row transforms |
|
179 MOV r2, r0 |
|
180 MOV r0, r13 ; Write to temp storage. |
|
181 BL idct2core_arm |
|
182 BL idct1core_arm |
|
183 ; Clear input data for next block (decoder only). |
|
184 SUB r0, r1, #2*16 |
|
185 CMP r0, r2 |
|
186 MOV r1, r13 ; Read from temp storage. |
|
187 MOVNE r4, #0 |
|
188 STRNE r4, [r0] |
|
189 STRNE r4, [r0,#16] |
|
190 MOVNE r0, r2 ; Write to the final destination |
|
191 ; Column transforms |
|
192 BL idct2core_down_arm |
|
193 BL idct2core_down_arm |
|
194 BL idct2core_down_arm |
|
195 BL idct2core_down_arm |
|
196 BL idct2core_down_arm |
|
197 BL idct2core_down_arm |
|
198 BL idct2core_down_arm |
|
199 BL idct2core_down_arm |
|
200 ADD r13,r13,#64*2 |
|
201 LDMFD r13!,{r4-r7,r9-r11,PC} |
|
202 ENDP |
|
203 |
|
204 idct1core_arm PROC |
|
205 ; r0 = ogg_int16_t *_y (destination) |
|
206 ; r1 = const ogg_int16_t *_x (source) |
|
207 LDRSH r3, [r1], #16 |
|
208 MOV r12,#0x05 |
|
209 ORR r12,r12,#0xB500 |
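; r12 = 0xB505 = 46341 = OC_C4S4, built from two immediates because the
; constant does not fit a single ARM immediate encoding.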
|
210 MUL r3, r12, r3 |
|
211 ; Stall ? |
|
212 MOV r3, r3, ASR #16 |
|
213 STRH r3, [r0], #2 |
|
214 STRH r3, [r0, #14] |
|
215 STRH r3, [r0, #30] |
|
216 STRH r3, [r0, #46] |
|
217 STRH r3, [r0, #62] |
|
218 STRH r3, [r0, #78] |
|
219 STRH r3, [r0, #94] |
|
220 STRH r3, [r0, #110] |
|
221 MOV PC,R14 |
|
222 ENDP |
|
223 |
|
224 idct2core_arm PROC |
|
225 ; r0 = ogg_int16_t *_y (destination) |
|
226 ; r1 = const ogg_int16_t *_x (source) |
|
227 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
228 LDR r12,OC_C4S4 |
|
229 LDRSH r11,[r1, #-14] ; r11= x[1] |
|
230 LDR r3, OC_C7S1 |
|
231 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
232 LDR r10,OC_C1S7 |
|
233 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] |
|
234 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
235 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
|
236 MOV r3, r3, ASR #16 ; r3 = t[4] |
|
237 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] |
|
238 MOV r11,r11,ASR #16 ; r11= t[7] |
|
239 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
|
240 MOV r10,r10,ASR #16 ; r10= t[5] |
|
241 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6] |
|
242 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5] |
|
243 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5] |
|
244 ADD r3, r3, r9 ; r3 = t[0]+t[4] |
|
245 ADD r11,r11,r9 ; r11= t[0]+t[7] |
|
246 STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
|
247 STRH r12,[r0, #14] ; y[1] = t[0]+t[6] |
|
248 STRH r10,[r0, #30] ; y[2] = t[0]+t[5] |
|
249 STRH r3, [r0, #46] ; y[3] = t[0]+t[4] |
|
250 RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4] |
|
251 RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5] |
|
252 RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6] |
|
253 RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7] |
|
254 STRH r3, [r0, #62] ; y[4] = t[0]-t[4] |
|
255 STRH r10,[r0, #78] ; y[5] = t[0]-t[5] |
|
256 STRH r12,[r0, #94] ; y[6] = t[0]-t[6] |
|
257 STRH r11,[r0, #110] ; y[7] = t[0]-t[7] |
|
258 MOV PC,r14 |
|
259 ENDP |
|
260 |
|
261 idct2core_down_arm PROC |
|
262 ; r0 = ogg_int16_t *_y (destination) |
|
263 ; r1 = const ogg_int16_t *_x (source) |
|
264 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
265 LDR r12,OC_C4S4 |
|
266 LDRSH r11,[r1, #-14] ; r11= x[1] |
|
267 LDR r3, OC_C7S1 |
|
268 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
269 LDR r10,OC_C1S7 |
|
270 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] |
|
271 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
272 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
|
273 ADD r9, r9, #8 ; r9 = t[0]+8 |
|
274 MOV r3, r3, ASR #16 ; r3 = t[4] |
|
275 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] |
|
276 MOV r11,r11,ASR #16 ; r11= t[7] |
|
277 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
|
278 MOV r10,r10,ASR #16 ; r10= t[5] |
|
279 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8 |
|
280 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8 |
|
281 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8 |
|
282 ADD r3, r3, r9 ; r3 = t[0]+t[4]+8 |
|
283 ADD r11,r11,r9 ; r11= t[0]+t[7]+8 |
|
284 ; TODO: This is wrong. |
|
285 ; The C code truncates to 16 bits by storing to RAM and doing the |
|
286 ; shifts later; we've got an extra 4 bits here. |
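; (Presumably the reference C implementation stores the row-pass result in an
; ogg_int16_t buffer, so the value is wrapped to 16 bits before the final
; (t+8)>>4 of the column pass; here the full 32-bit value is shifted instead,
; which can differ when the intermediate overflows 16 bits.)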
|
287 MOV r4, r11,ASR #4 |
|
288 MOV r5, r12,ASR #4 |
|
289 MOV r6, r10,ASR #4 |
|
290 MOV r7, r3, ASR #4 |
|
291 RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8 |
|
292 RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8 |
|
293 RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8 |
|
294 RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8 |
|
295 MOV r3, r3, ASR #4 |
|
296 MOV r10,r10,ASR #4 |
|
297 MOV r12,r12,ASR #4 |
|
298 MOV r11,r11,ASR #4 |
|
299 STRH r4, [r0], #2 ; y[0] = t[0]+t[7] |
|
300 STRH r5, [r0, #14] ; y[1] = t[0]+t[6] |
|
301 STRH r6, [r0, #30] ; y[2] = t[0]+t[5] |
|
302 STRH r7, [r0, #46] ; y[3] = t[0]+t[4] |
|
303 STRH r3, [r0, #62] ; y[4] = t[0]-t[4] |
|
304 STRH r10,[r0, #78] ; y[5] = t[0]-t[5] |
|
305 STRH r12,[r0, #94] ; y[6] = t[0]-t[6] |
|
306 STRH r11,[r0, #110] ; y[7] = t[0]-t[7] |
|
307 MOV PC,r14 |
|
308 ENDP |
|
309 |
|
310 idct3core_arm PROC |
|
311 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
312 LDR r12,OC_C4S4 ; r12= OC_C4S4 |
|
313 LDRSH r3, [r1, #-12] ; r3 = x[2] |
|
314 LDR r10,OC_C6S2 ; r10= OC_C6S2 |
|
315 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
316 LDR r4, OC_C2S6 ; r4 = OC_C2S6 |
|
317 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] |
|
318 LDRSH r11,[r1, #-14] ; r11= x[1] |
|
319 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] |
|
320 LDR r4, OC_C7S1 ; r4 = OC_C7S1 |
|
321 LDR r5, OC_C1S7 ; r5 = OC_C1S7 |
|
322 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
323 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] |
|
324 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3] |
|
325 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
|
326 MOV r4, r4, ASR #16 ; r4 = t[4] |
|
327 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] |
|
328 MOV r11,r11,ASR #16 ; r11= t[7] |
|
329 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
|
330 ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2] |
|
331 RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2] |
|
332 ; r3 = t2[0] = t[0]+t[3] |
|
333 RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3] |
|
334 MOV r12,r12,ASR #16 ; r12= t[6] |
|
335 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] |
|
336 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] |
|
337 ADD r11,r3, r11 ; r11= t2[0]+t[7] |
|
338 ADD r5, r10,r5 ; r5 = t[1]+t2[6] |
|
339 ADD r12,r6, r12 ; r12= t[2]+t2[5] |
|
340 ADD r4, r9, r4 ; r4 = t2[3]+t[4] |
|
341 STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
|
342 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] |
|
343 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] |
|
344 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] |
|
345 RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7] |
|
346 RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6] |
|
347 RSB r12,r12,r6, LSL #1 ; r6 = t[2] - t2[5] |
|
348 RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4] |
|
349 STRH r4, [r0, #62] ; y[4] = t2[3]-t[4] |
|
350 STRH r12,[r0, #78] ; y[5] = t[2]-t2[5] |
|
351 STRH r5, [r0, #94] ; y[6] = t[1]-t2[6] |
|
352 STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] |
|
353 MOV PC,R14 |
|
354 ENDP |
|
355 |
|
356 idct3core_down_arm PROC |
|
357 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
358 LDR r12,OC_C4S4 ; r12= OC_C4S4 |
|
359 LDRSH r3, [r1, #-12] ; r3 = x[2] |
|
360 LDR r10,OC_C6S2 ; r10= OC_C6S2 |
|
361 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
362 LDR r4, OC_C2S6 ; r4 = OC_C2S6 |
|
363 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] |
|
364 LDRSH r11,[r1, #-14] ; r11= x[1] |
|
365 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] |
|
366 LDR r4, OC_C7S1 ; r4 = OC_C7S1 |
|
367 LDR r5, OC_C1S7 ; r5 = OC_C1S7 |
|
368 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
369 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] |
|
370 ADD r9, r9, #8 ; r9 = t[0]+8 |
|
371 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
|
372 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8 |
|
373 MOV r4, r4, ASR #16 ; r4 = t[4] |
|
374 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] |
|
375 MOV r11,r11,ASR #16 ; r11= t[7] |
|
376 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
|
377 ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8 |
|
378 RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8 |
|
379 ; r3 = t2[0]+8 = t[0]+t[3]+8 |
|
380 RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8 |
|
381 MOV r12,r12,ASR #16 ; r12= t[6] |
|
382 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] |
|
383 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] |
|
384 ADD r11,r3, r11 ; r11= t2[0]+t[7] +8 |
|
385 ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8 |
|
386 ADD r12,r6, r12 ; r12= t[2] +t2[5]+8 |
|
387 ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8 |
|
388 RSB r3, r11,r3, LSL #1 ; r11= t2[0] - t[7] + 8 |
|
389 RSB r10,r5, r10,LSL #1 ; r5 = t[1] - t2[6] + 8 |
|
390 RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8 |
|
391 RSB r9, r4, r9, LSL #1 ; r4 = t2[3] - t[4] + 8 |
|
392 ; TODO: This is wrong. |
|
393 ; The C code truncates to 16 bits by storing to RAM and doing the |
|
394 ; shifts later; we've got an extra 4 bits here. |
|
395 MOV r11,r11,ASR #4 |
|
396 MOV r5, r5, ASR #4 |
|
397 MOV r12,r12,ASR #4 |
|
398 MOV r4, r4, ASR #4 |
|
399 MOV r9, r9, ASR #4 |
|
400 MOV r6, r6, ASR #4 |
|
401 MOV r10,r10,ASR #4 |
|
402 MOV r3, r3, ASR #4 |
|
403 STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
|
404 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] |
|
405 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] |
|
406 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] |
|
407 STRH r9, [r0, #62] ; y[4] = t2[3]-t[4] |
|
408 STRH r6, [r0, #78] ; y[5] = t[2]-t2[5] |
|
409 STRH r10,[r0, #94] ; y[6] = t[1]-t2[6] |
|
410 STRH r3, [r0, #110] ; y[7] = t2[0]-t[7] |
|
411 MOV PC,R14 |
|
412 ENDP |
|
413 |
|
414 idct4core_arm PROC |
|
415 ; r0 = ogg_int16_t *_y (destination) |
|
416 ; r1 = const ogg_int16_t *_x (source) |
|
417 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
418 LDR r10,OC_C4S4 ; r10= OC_C4S4 |
|
419 LDRSH r12,[r1, #-12] ; r12= x[2] |
|
420 LDR r4, OC_C6S2 ; r4 = OC_C6S2 |
|
421 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
422 LDR r5, OC_C2S6 ; r5 = OC_C2S6 |
|
423 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] |
|
424 LDRSH r3, [r1, #-14] ; r3 = x[1] |
|
425 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] |
|
426 LDR r6, OC_C7S1 ; r6 = OC_C7S1 |
|
427 LDR r12,OC_C1S7 ; r12= OC_C1S7 |
|
428 LDRSH r11,[r1, #-10] ; r11= x[3] |
|
429 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] |
|
430 LDR r7, OC_C5S3 ; r7 = OC_C5S3 |
|
431 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] |
|
432 LDR r8, OC_C3S5 ; r8 = OC_C3S5 |
|
433 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] |
|
434 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
435 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] |
|
436 MOV r6, r6, ASR #16 ; r6 = t[4] |
|
437 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
|
438 ; before multiplying, not after (this is not equivalent) |
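; (In other words, the reference presumably computes something like
; OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16, so the results only diverge when the
; difference overflows 16 bits.)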
|
439 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) |
|
440 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] |
|
441 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) |
|
442 MOV r3, r3, ASR #16 ; r3 = t[7] |
|
443 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] |
|
444 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] |
|
445 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) |
|
446 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] |
|
447 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] |
|
448 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] |
|
449 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] |
|
450 MOV r3, r3, ASR #16 ; r3 = t2[6] |
|
451 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] |
|
452 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] |
|
453 ADD r11,r5, r11 ; r11= t[0]+t2[7] |
|
454 ADD r6, r4, r6 ; r6 = t[1]+t3[6] |
|
455 ADD r3, r10,r3 ; r3 = t[2]+t3[5] |
|
456 ADD r7, r9, r7 ; r7 = t[3]+t2[4] |
|
457 STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
|
458 STRH r6, [r0, #14] ; y[1] = t[1]+t2[6] |
|
459 STRH r3, [r0, #30] ; y[2] = t[2]+t2[5] |
|
460 STRH r7, [r0, #46] ; y[3] = t2[3]+t[4] |
|
461 RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7] |
|
462 RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6] |
|
463 RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5] |
|
464 RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4] |
|
465 STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] |
|
466 STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] |
|
467 STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] |
|
468 STRH r11, [r0, #110] ; y[7] = t2[0]-t[7] |
|
469 MOV PC,r14 |
|
470 ENDP |
|
471 |
|
472 idct4core_down_arm PROC |
|
473 ; r0 = ogg_int16_t *_y (destination) |
|
474 ; r1 = const ogg_int16_t *_x (source) |
|
475 LDRSH r9, [r1], #16 ; r9 = x[0] |
|
476 LDR r10,OC_C4S4 ; r10= OC_C4S4 |
|
477 LDRSH r12,[r1, #-12] ; r12= x[2] |
|
478 LDR r4, OC_C6S2 ; r4 = OC_C6S2 |
|
479 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
|
480 LDR r5, OC_C2S6 ; r5 = OC_C2S6 |
|
481 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] |
|
482 LDRSH r3, [r1, #-14] ; r3 = x[1] |
|
483 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] |
|
484 LDR r6, OC_C7S1 ; r6 = OC_C7S1 |
|
485 LDR r12,OC_C1S7 ; r12= OC_C1S7 |
|
486 LDRSH r11,[r1, #-10] ; r11= x[3] |
|
487 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] |
|
488 LDR r7, OC_C5S3 ; r7 = OC_C5S3 |
|
489 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] |
|
490 LDR r8, OC_C3S5 ; r8 = OC_C3S5 |
|
491 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] |
|
492 MOV r9, r9, ASR #16 ; r9 = t[0] |
|
493 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] |
|
494 MOV r6, r6, ASR #16 ; r6 = t[4] |
|
495 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
|
496 ; before multiplying, not after (this is not equivalent) |
|
497 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) |
|
498 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] |
|
499 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) |
|
500 MOV r3, r3, ASR #16 ; r3 = t[7] |
|
501 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] |
|
502 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] |
|
503 ADD r9, r9, #8 ; r9 = t[0]+8 |
|
504 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) |
|
505 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8 |
|
506 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8 |
|
507 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8 |
|
508 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8 |
|
509 MOV r3, r3, ASR #16 ; r3 = t2[6] |
|
510 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] |
|
511 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] |
|
512 ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8 |
|
513 ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8 |
|
514 ADD r10,r10,r3 ; r10= t[2]+t3[5]+8 |
|
515 ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8 |
|
516 SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8 |
|
517 SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8 |
|
518 SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8 |
|
519 SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8 |
|
520 ; TODO: This is wrong. |
|
521 ; The C code truncates to 16 bits by storing to RAM and doing the |
|
522 ; shifts later; we've got an extra 4 bits here. |
|
523 MOV r11,r11,ASR #4 |
|
524 MOV r6, r6, ASR #4 |
|
525 MOV r3, r3, ASR #4 |
|
526 MOV r7, r7, ASR #4 |
|
527 MOV r9, r9, ASR #4 |
|
528 MOV r10,r10,ASR #4 |
|
529 MOV r4, r4, ASR #4 |
|
530 MOV r5, r5, ASR #4 |
|
531 STRH r5,[r0], #2 ; y[0] = t[0]+t[7] |
|
532 STRH r4, [r0, #14] ; y[1] = t[1]+t2[6] |
|
533 STRH r10,[r0, #30] ; y[2] = t[2]+t2[5] |
|
534 STRH r9, [r0, #46] ; y[3] = t2[3]+t[4] |
|
535 STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] |
|
536 STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] |
|
537 STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] |
|
538 STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] |
|
539 MOV PC,r14 |
|
540 ENDP |
|
541 |
|
542 idct8core_arm PROC |
|
543 ; r0 = ogg_int16_t *_y (destination) |
|
544 ; r1 = const ogg_int16_t *_x (source) |
|
545 LDRSH r2, [r1],#16 ; r2 = x[0] |
|
546 STMFD r13!,{r1,r14} |
|
547 LDRSH r6, [r1, #-8] ; r6 = x[4] |
|
548 LDR r12,OC_C4S4 ; r12= C4S4 |
|
549 LDRSH r4, [r1, #-12] ; r4 = x[2] |
|
550 ADD r2, r2, r6 ; r2 = x[0] + x[4] |
|
551 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] |
|
552 ; For spec compliance, these sums must be truncated to 16-bit precision |
|
553 ; _before_ the multiply (not after). |
|
554 ; Sadly, ARMv4 provides no simple way to do that. |
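; The LSL #16 / ASR #16 pairs below emulate that truncation by sign-extending
; the low halfword, at the cost of two extra instructions per value.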
|
555 MOV r2, r2, LSL #16 |
|
556 MOV r6, r6, LSL #16 |
|
557 MOV r2, r2, ASR #16 |
|
558 MOV r6, r6, ASR #16 |
|
559 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) |
|
560 LDRSH r8, [r1, #-4] ; r8 = x[6] |
|
561 LDR r7, OC_C6S2 ; r7 = OC_C6S2 |
|
562 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) |
|
563 LDR r14,OC_C2S6 ; r14= OC_C2S6 |
|
564 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] |
|
565 LDR r5, OC_C7S1 ; r5 = OC_C7S1 |
|
566 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] |
|
567 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 |
|
568 MUL r14,r8, r14 ; r14= OC_C2S6*x[6] |
|
569 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 |
|
570 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] |
|
571 LDR r7, OC_C1S7 ; r7 = OC_C1S7 |
|
572 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 |
|
573 LDRSH r14,[r1, #-14] ; r14= x[1] |
|
574 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 |
|
575 LDRSH r8, [r1, #-2] ; r8 = x[7] |
|
576 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] |
|
577 LDRSH r10,[r1, #-6] ; r10= x[5] |
|
578 MUL r14,r7, r14 ; r14= OC_C1S7*x[1] |
|
579 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 |
|
580 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] |
|
581 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 |
|
582 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] |
|
583 LDRSH r1, [r1, #-10] ; r1 = x[3] |
|
584 LDR r5, OC_C3S5 ; r5 = OC_C3S5 |
|
585 LDR r11,OC_C5S3 ; r11= OC_C5S3 |
|
586 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 |
|
587 MUL r14,r5, r10 ; r14= OC_C3S5*x[5] |
|
588 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 |
|
589 MUL r10,r11,r10 ; r10= OC_C5S3*x[5] |
|
590 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 |
|
591 MUL r11,r1, r11 ; r11= OC_C5S3*x[3] |
|
592 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 |
|
593 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] |
|
594 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 |
|
595 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 |
|
596 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] |
|
597 ; r10=t[6] r12=C4S4 r14=t[5] |
|
598 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
|
599 ; before multiplying, not after (this is not equivalent) |
|
600 ; Stage 2 |
|
601 ; 4-5 butterfly |
|
602 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] |
|
603 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] |
|
604 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) |
|
605 ; 7-6 butterfly |
|
606 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] |
|
607 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] |
|
608 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) |
|
609 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] |
|
610 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 |
|
611 ; Stage 3 |
|
612 ; 0-3 butterfly |
|
613 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] |
|
614 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] |
|
615 ; 1-2 butterfly |
|
616 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] |
|
617 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] |
|
618 ; 6-5 butterfly |
|
619 MOV r14,r14,ASR #16 ; r14= t2[5] |
|
620 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] |
|
621 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] |
|
622 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] |
|
623 ; r10=t3[6] r14=t3[5] |
|
624 ; Stage 4 |
|
625 ADD r2, r2, r8 ; r2 = t[0] + t[7] |
|
626 ADD r6, r6, r10 ; r6 = t[1] + t[6] |
|
627 ADD r3, r3, r14 ; r3 = t[2] + t[5] |
|
628 ADD r4, r4, r9 ; r4 = t[3] + t[4] |
|
629 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] |
|
630 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] |
|
631 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] |
|
632 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] |
|
633 STRH r2, [r0], #2 ; y[0] = t[0]+t[7] |
|
634 STRH r6, [r0, #14] ; y[1] = t[1]+t[6] |
|
635 STRH r3, [r0, #30] ; y[2] = t[2]+t[5] |
|
636 STRH r4, [r0, #46] ; y[3] = t[3]+t[4] |
|
637 STRH r9, [r0, #62] ; y[4] = t[3]-t[4] |
|
638 STRH r14,[r0, #78] ; y[5] = t[2]-t[5] |
|
639 STRH r10,[r0, #94] ; y[6] = t[1]-t[6] |
|
640 STRH r8, [r0, #110] ; y[7] = t[0]-t[7] |
|
641 LDMFD r13!,{r1,PC} |
|
642 ENDP |
|
643 |
|
644 idct8core_down_arm PROC |
|
645 ; r0 = ogg_int16_t *_y (destination) |
|
646 ; r1 = const ogg_int16_t *_x (source) |
|
647 LDRSH r2, [r1],#16 ; r2 = x[0] |
|
648 STMFD r13!,{r1,r14} |
|
649 LDRSH r6, [r1, #-8] ; r6 = x[4] |
|
650 LDR r12,OC_C4S4 ; r12= C4S4 |
|
651 LDRSH r4, [r1, #-12] ; r4 = x[2] |
|
652 ADD r2, r2, r6 ; r2 = x[0] + x[4] |
|
653 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] |
|
654 ; For spec compliance, these sums must be truncated to 16-bit precision |
|
655 ; _before_ the multiply (not after). |
|
656 ; Sadly, ARMv4 provides no simple way to do that. |
|
657 MOV r2, r2, LSL #16 |
|
658 MOV r6, r6, LSL #16 |
|
659 MOV r2, r2, ASR #16 |
|
660 MOV r6, r6, ASR #16 |
|
661 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) |
|
662 LDRSH r8, [r1, #-4] ; r8 = x[6] |
|
663 LDR r7, OC_C6S2 ; r7 = OC_C6S2 |
|
664 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) |
|
665 LDR r14,OC_C2S6 ; r14= OC_C2S6 |
|
666 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] |
|
667 LDR r5, OC_C7S1 ; r5 = OC_C7S1 |
|
668 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] |
|
669 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 |
|
670 MUL r14,r8, r14 ; r14= OC_C2S6*x[6] |
|
671 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 |
|
672 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] |
|
673 LDR r7, OC_C1S7 ; r7 = OC_C1S7 |
|
674 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 |
|
675 LDRSH r14,[r1, #-14] ; r14= x[1] |
|
676 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 |
|
677 LDRSH r8, [r1, #-2] ; r8 = x[7] |
|
678 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] |
|
679 LDRSH r10,[r1, #-6] ; r10= x[5] |
|
680 MUL r14,r7, r14 ; r14= OC_C1S7*x[1] |
|
681 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 |
|
682 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] |
|
683 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 |
|
684 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] |
|
685 LDRSH r1, [r1, #-10] ; r1 = x[3] |
|
686 LDR r5, OC_C3S5 ; r5 = OC_C3S5 |
|
687 LDR r11,OC_C5S3 ; r11= OC_C5S3 |
|
688 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 |
|
689 MUL r14,r5, r10 ; r14= OC_C3S5*x[5] |
|
690 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 |
|
691 MUL r10,r11,r10 ; r10= OC_C5S3*x[5] |
|
692 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 |
|
693 MUL r11,r1, r11 ; r11= OC_C5S3*x[3] |
|
694 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 |
|
695 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] |
|
696 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 |
|
697 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 |
|
698 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] |
|
699 ; r10=t[6] r12=C4S4 r14=t[5] |
|
700 ; Stage 2 |
|
701 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
|
702 ; before multiplying, not after (this is not equivalent) |
|
703 ; 4-5 butterfly |
|
704 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] |
|
705 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] |
|
706 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) |
|
707 ; 7-6 butterfly |
|
708 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] |
|
709 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] |
|
710 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6]) |
|
711 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] |
|
712 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 |
|
713 ; Stage 3 |
|
714 ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16 |
|
715 ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16 |
|
716 ; 0-3 butterfly |
|
717 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8 |
|
718 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8 |
|
719 ; 1-2 butterfly |
|
720 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8 |
|
721 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8 |
|
722 ; 6-5 butterfly |
|
723 MOV r14,r14,ASR #16 ; r14= t2[5] |
|
724 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] |
|
725 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] |
|
726 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] |
|
727 ; r10=t3[6] r14=t3[5] |
|
728 ; Stage 4 |
|
729 ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8 |
|
730 ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8 |
|
731 ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8 |
|
732 ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8 |
|
733 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8 |
|
734 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8 |
|
735 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8 |
|
736 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8 |
|
737 ; TODO: This is wrong. |
|
738 ; The C code truncates to 16 bits by storing to RAM and doing the |
|
739 ; shifts later; we've got an extra 4 bits here. |
|
740 MOV r2, r2, ASR #4 |
|
741 MOV r6, r6, ASR #4 |
|
742 MOV r3, r3, ASR #4 |
|
743 MOV r4, r4, ASR #4 |
|
744 MOV r8, r8, ASR #4 |
|
745 MOV r10,r10,ASR #4 |
|
746 MOV r14,r14,ASR #4 |
|
747 MOV r9, r9, ASR #4 |
|
748 STRH r2, [r0], #2 ; y[0] = t[0]+t[7] |
|
749 STRH r6, [r0, #14] ; y[1] = t[1]+t[6] |
|
750 STRH r3, [r0, #30] ; y[2] = t[2]+t[5] |
|
751 STRH r4, [r0, #46] ; y[3] = t[3]+t[4] |
|
752 STRH r9, [r0, #62] ; y[4] = t[3]-t[4] |
|
753 STRH r14,[r0, #78] ; y[5] = t[2]-t[5] |
|
754 STRH r10,[r0, #94] ; y[6] = t[1]-t[6] |
|
755 STRH r8, [r0, #110] ; y[7] = t[0]-t[7] |
|
756 LDMFD r13!,{r1,PC} |
|
757 ENDP |
|
758 |
|
759 [ OC_ARM_ASM_MEDIA |
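; '[' ... ']' is armasm conditional assembly (IF/ENDIF): the block below is
; only assembled when OC_ARM_ASM_MEDIA is set, i.e. when the ARMv6 media
; (packed-halfword SIMD) instructions are available.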
|
760 EXPORT oc_idct8x8_1_v6 |
|
761 EXPORT oc_idct8x8_v6 |
|
762 |
|
763 oc_idct8x8_1_v6 PROC |
|
764 ; r0 = ogg_int16_t *_y |
|
765 ; r1 = ogg_uint16_t _dc |
|
766 ORR r2, r1, r1, LSL #16 |
|
767 ORR r3, r1, r1, LSL #16 |
|
768 STRD r2, [r0], #8 |
|
769 STRD r2, [r0], #8 |
|
770 STRD r2, [r0], #8 |
|
771 STRD r2, [r0], #8 |
|
772 STRD r2, [r0], #8 |
|
773 STRD r2, [r0], #8 |
|
774 STRD r2, [r0], #8 |
|
775 STRD r2, [r0], #8 |
|
776 STRD r2, [r0], #8 |
|
777 STRD r2, [r0], #8 |
|
778 STRD r2, [r0], #8 |
|
779 STRD r2, [r0], #8 |
|
780 STRD r2, [r0], #8 |
|
781 STRD r2, [r0], #8 |
|
782 STRD r2, [r0], #8 |
|
783 STRD r2, [r0], #8 |
|
784 MOV PC, r14 |
|
785 ENDP |
|
786 |
|
787 oc_idct8x8_v6 PROC |
|
788 ; r0 = ogg_int16_t *_y |
|
789 ; r1 = ogg_int16_t *_x |
|
790 ; r2 = int _last_zzi |
|
791 CMP r2, #3 |
|
792 BLE oc_idct8x8_3_v6 |
|
793 ;CMP r2, #6 |
|
794 ;BLE oc_idct8x8_6_v6 |
|
795 CMP r2, #10 |
|
796 BLE oc_idct8x8_10_v6 |
|
797 oc_idct8x8_slow_v6 |
|
798 STMFD r13!,{r4-r11,r14} |
|
799 SUB r13,r13,#64*2 |
|
800 ; Row transforms |
|
801 STR r0, [r13,#-4]! |
|
802 ADD r0, r13, #4 ; Write to temp storage. |
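; Each idct8_8core_v6 call uses the packed-halfword (SIMD) instructions to
; transform two rows at a time, so only four calls are needed per pass.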
|
803 BL idct8_8core_v6 |
|
804 BL idct8_8core_v6 |
|
805 BL idct8_8core_v6 |
|
806 BL idct8_8core_v6 |
|
807 LDR r0, [r13], #4 ; Write to the final destination. |
|
808 ; Clear input data for next block (decoder only). |
|
809 SUB r2, r1, #8*16 |
|
810 CMP r0, r2 |
|
811 MOV r1, r13 ; And read from temp storage. |
|
812 BEQ oc_idct8x8_slow_v6_cols |
|
813 MOV r4, #0 |
|
814 MOV r5, #0 |
|
815 STRD r4, [r2], #8 |
|
816 STRD r4, [r2], #8 |
|
817 STRD r4, [r2], #8 |
|
818 STRD r4, [r2], #8 |
|
819 STRD r4, [r2], #8 |
|
820 STRD r4, [r2], #8 |
|
821 STRD r4, [r2], #8 |
|
822 STRD r4, [r2], #8 |
|
823 STRD r4, [r2], #8 |
|
824 STRD r4, [r2], #8 |
|
825 STRD r4, [r2], #8 |
|
826 STRD r4, [r2], #8 |
|
827 STRD r4, [r2], #8 |
|
828 STRD r4, [r2], #8 |
|
829 STRD r4, [r2], #8 |
|
830 STRD r4, [r2], #8 |
|
831 oc_idct8x8_slow_v6_cols |
|
832 ; Column transforms |
|
833 BL idct8_8core_down_v6 |
|
834 BL idct8_8core_down_v6 |
|
835 BL idct8_8core_down_v6 |
|
836 BL idct8_8core_down_v6 |
|
837 ADD r13,r13,#64*2 |
|
838 LDMFD r13!,{r4-r11,PC} |
|
839 ENDP |
|
840 |
|
841 oc_idct8x8_10_v6 PROC |
|
842 STMFD r13!,{r4-r11,r14} |
|
843 SUB r13,r13,#64*2+4 |
|
844 ; Row transforms |
|
845 MOV r2, r13 |
|
846 STR r0, [r13,#-4]! |
|
847 AND r0, r2, #4 ; Align the stack. |
|
848 ADD r0, r0, r2 ; Write to temp storage. |
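; Adding (sp&4) makes the temp buffer 8-byte aligned, presumably so that the
; LDRD/STRD accesses used on it are doubleword-aligned.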
|
849 BL idct4_3core_v6 |
|
850 BL idct2_1core_v6 |
|
851 LDR r0, [r13], #4 ; Write to the final destination. |
|
852 ; Clear input data for next block (decoder only). |
|
853 SUB r2, r1, #4*16 |
|
854 CMP r0, r2 |
|
855 AND r1, r13,#4 ; Align the stack. |
|
856 BEQ oc_idct8x8_10_v6_cols |
|
857 MOV r4, #0 |
|
858 MOV r5, #0 |
|
859 STRD r4, [r2] |
|
860 STRD r4, [r2,#16] |
|
861 STR r4, [r2,#32] |
|
862 STR r4, [r2,#48] |
|
863 oc_idct8x8_10_v6_cols |
|
864 ; Column transforms |
|
865 ADD r1, r1, r13 ; And read from temp storage. |
|
866 BL idct4_4core_down_v6 |
|
867 BL idct4_4core_down_v6 |
|
868 BL idct4_4core_down_v6 |
|
869 BL idct4_4core_down_v6 |
|
870 ADD r13,r13,#64*2+4 |
|
871 LDMFD r13!,{r4-r11,PC} |
|
872 ENDP |
|
873 |
|
874 oc_idct8x8_3_v6 PROC |
|
875 STMFD r13!,{r4-r8,r14} |
|
876 SUB r13,r13,#64*2 |
|
877 ; Row transforms |
|
878 MOV r8, r0 |
|
879 MOV r0, r13 ; Write to temp storage. |
|
880 BL idct2_1core_v6 |
|
881 ; Clear input data for next block (decoder only). |
|
882 SUB r0, r1, #2*16 |
|
883 CMP r0, r8 |
|
884 MOV r1, r13 ; Read from temp storage. |
|
885 MOVNE r4, #0 |
|
886 STRNE r4, [r0] |
|
887 STRNE r4, [r0,#16] |
|
888 MOVNE r0, r8 ; Write to the final destination. |
|
889 ; Column transforms |
|
890 BL idct2_2core_down_v6 |
|
891 BL idct2_2core_down_v6 |
|
892 BL idct2_2core_down_v6 |
|
893 BL idct2_2core_down_v6 |
|
894 ADD r13,r13,#64*2 |
|
895 LDMFD r13!,{r4-r8,PC} |
|
896 ENDP |
|
897 |
|
898 idct2_1core_v6 PROC |
|
899 ; r0 = ogg_int16_t *_y (destination) |
|
900 ; r1 = const ogg_int16_t *_x (source) |
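; Note: SMULWB/SMULWT multiply a 32-bit register by the signed bottom/top
; halfword of the second operand and keep the top 32 bits of the 48-bit
; product, i.e. they compute OC_CxSy*x>>16 in a single instruction.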
|
901 ; Stage 1: |
|
902 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> |
|
903 LDR r3, OC_C4S4 |
|
904 LDRSH r6, [r1], #16 ; r6 = x[1,0] |
|
905 SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 |
|
906 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 |
|
907 SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16 |
|
908 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
|
909 SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
910 ; Stage 2: |
|
911 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
|
912 PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]> |
|
913 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
|
914 PKHBT r7, r7, r3 ; r7 = <0|t[0,7]> |
|
915 ; Stage 3: |
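; The SASX below puts t[0,6]+t[0,5] in the top halfword and t[0,6]-t[0,5] in
; the bottom, doing the 6-5 butterfly in a single instruction.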
|
916 PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]> |
|
917 PKHBT r4, r4, r3 ; r4 = <0|t[0,4]> |
|
918 SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]> |
|
919 ; Stage 4: |
|
920 PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]> |
|
921 PKHBT r5, r5, r3 ; r5 = <0|t[0,5]> |
|
922 SADD16 r3, r12,r7 ; r3 = t[0]+t[7] |
|
923 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7] |
|
924 SADD16 r3, r12,r6 ; r3 = t[0]+t[6] |
|
925 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6] |
|
926 SADD16 r3, r12,r5 ; r3 = t[0]+t[5] |
|
927 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5] |
|
928 SADD16 r3, r12,r4 ; r3 = t[0]+t[4] |
|
929 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4] |
|
930 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4] |
|
931 STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4] |
|
932 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5] |
|
933 STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5] |
|
934 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6] |
|
935 STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6] |
|
936 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7] |
|
937 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] |
|
938 MOV PC,r14 |
|
939 ENDP |
|
940 ] |
|
941 |
|
942 ALIGN 8 |
|
943 OC_C7S1 |
|
944 DCD 12785 ; 31F1 |
|
945 OC_C1S7 |
|
946 DCD 64277 ; FB15 |
|
947 OC_C6S2 |
|
948 DCD 25080 ; 61F8 |
|
949 OC_C2S6 |
|
950 DCD 60547 ; EC83 |
|
951 OC_C5S3 |
|
952 DCD 36410 ; 8E3A |
|
953 OC_C3S5 |
|
954 DCD 54491 ; D4DB |
|
955 OC_C4S4 |
|
956 DCD 46341 ; B505 |
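; These are the 16.16 fixed-point IDCT constants: OC_CkS(8-k) is
; round(65536*cos(k*pi/16)); the hex in each trailing comment is the low
; 16 bits of the value.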
|
957 |
|
958 [ OC_ARM_ASM_MEDIA |
|
959 idct2_2core_down_v6 PROC |
|
960 ; r0 = ogg_int16_t *_y (destination) |
|
961 ; r1 = const ogg_int16_t *_x (source) |
|
962 ; Stage 1: |
|
963 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> |
|
964 LDR r3, OC_C4S4 |
|
965 MOV r7 ,#8 ; r7 = 8 |
|
966 LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]> |
|
967 SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8 |
|
968 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 |
|
969 SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8 |
|
970 SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
971 PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
|
972 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
|
973 ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition. |
|
974 PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]> |
|
975 ; Stage 2: |
|
976 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
|
977 PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]> |
|
978 SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16 |
|
979 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
|
980 PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
981 SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16 |
|
982 PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]> |
|
983 ; Stage 3: |
|
984 SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]> |
|
985 SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]> |
|
986 ; Stage 4: |
|
987 SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8 |
|
988 MOV r3, r2, ASR #4 |
|
989 MOV r2, r2, LSL #16 |
|
990 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4 |
|
991 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 |
|
992 SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8 |
|
993 MOV r3, r2, ASR #4 |
|
994 MOV r2, r2, LSL #16 |
|
995 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4 |
|
996 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4 |
|
997 SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8 |
|
998 MOV r3, r2, ASR #4 |
|
999 MOV r2, r2, LSL #16 |
|
1000 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4 |
|
1001 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4 |
|
1002 SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8 |
|
1003 MOV r3, r2, ASR #4 |
|
1004 MOV r2, r2, LSL #16 |
|
1005 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4 |
|
1006 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4 |
|
1007 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8 |
|
1008 MOV r3, r4, ASR #4 |
|
1009 MOV r4, r4, LSL #16 |
|
1010 PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4 |
|
1011 STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4 |
|
1012 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8 |
|
1013 MOV r3, r5, ASR #4 |
|
1014 MOV r5, r5, LSL #16 |
|
1015 PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4 |
|
1016 STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4 |
|
1017 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8 |
|
1018 MOV r3, r6, ASR #4 |
|
1019 MOV r6, r6, LSL #16 |
|
1020 PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4 |
|
1021 STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4 |
|
1022 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8 |
|
1023 MOV r3, r7, ASR #4 |
|
1024 MOV r7, r7, LSL #16 |
|
1025 PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4 |
|
1026 STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 |
|
1027 MOV PC,r14 |
|
1028 ENDP |
|
1029 |
|
1030 ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to |
|
1031 ; pay for the increased branch misprediction of getting here, but in practice |
|
1032 ; taking it out doesn't seem to slow anything down, and it's less code this |
|
1033 ; way. |
|
1034 [ 0 |
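; '[ 0' is IF 0: the oc_idct8x8_6_v6 path below is assembled out, as explained
; above.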
|
1035 oc_idct8x8_6_v6 PROC |
|
1036 STMFD r13!,{r4-r8,r10,r11,r14} |
|
1037 SUB r13,r13,#64*2+4 |
|
1038 ; Row transforms |
|
1039 MOV r8, r0 |
|
1040 AND r0, r13,#4 ; Align the stack. |
|
1041 ADD r0, r0, r13 ; Write to temp storage. |
|
1042 BL idct3_2core_v6 |
|
1043 BL idct1core_v6 |
|
1044 ; Clear input data for next block (decoder only). |
|
1045 SUB r0, r1, #3*16 |
|
1046 CMP r0, r8 |
|
1047 AND r1, r13,#4 ; Align the stack. |
|
1048 BEQ oc_idct8x8_6_v6_cols |
|
1049 MOV r4, #0 |
|
1050 MOV r5, #0 |
|
1051 STRD r4, [r0] |
|
1052 STR r4, [r0,#16] |
|
1053 STR r4, [r0,#32] |
|
1054 MOV r0, r8 ; Write to the final destination. |
|
1055 oc_idct8x8_6_v6_cols |
|
1056 ; Column transforms |
|
1057 ADD r1, r1, r13 ; And read from temp storage. |
|
1058 BL idct3_3core_down_v6 |
|
1059 BL idct3_3core_down_v6 |
|
1060 BL idct3_3core_down_v6 |
|
1061 BL idct3_3core_down_v6 |
|
1062 ADD r13,r13,#64*2+4 |
|
1063 LDMFD r13!,{r4-r8,r10,r11,PC} |
|
1064 ENDP |
|
1065 |
|
1066 idct1core_v6 PROC |
|
1067 ; r0 = ogg_int16_t *_y (destination) |
|
1068 ; r1 = const ogg_int16_t *_x (source) |
|
1069 LDRSH r3, [r1], #16 |
|
1070 MOV r12,#0x05 |
|
1071 ORR r12,r12,#0xB500 |
|
1072 MUL r3, r12, r3 |
|
1073 ; Stall ? |
|
1074 MOV r3, r3, ASR #16 |
|
1075 ; Don't need to actually store the odd lines; they won't be read. |
|
1076 STRH r3, [r0], #2 |
|
1077 STRH r3, [r0, #30] |
|
1078 STRH r3, [r0, #62] |
|
1079 STRH r3, [r0, #94] |
|
1080 MOV PC,R14 |
|
1081 ENDP |
|
1082 |
|
1083 idct3_2core_v6 PROC |
|
1084 ; r0 = ogg_int16_t *_y (destination) |
|
1085 ; r1 = const ogg_int16_t *_x (source) |
|
1086 ; Stage 1: |
|
1087 LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]> |
|
1088 LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6 |
|
1089 ; Stall |
|
1090 SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
|
1091 LDR r11,OC_C4S4 |
|
1092 SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
|
1093 LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]> |
|
1094 SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16) |
|
1095 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
|
1096 SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16) |
|
1097 PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]> |
|
1098 SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16 |
|
1099 PKHBT r2, r2, r11 ; r2 = <0|t[0,2]> |
|
1100 SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
1101 PKHBT r3, r3, r11 ; r3 = <0|t[0,3]> |
|
1102 SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16 |
|
1103 PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]> |
|
1104 SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
|
1105 ; Stage 2: |
|
1106 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
|
1107 PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
|
1108 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 |
|
1109 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
|
1110 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1111 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 |
|
1112 ; Stage 3: |
|
1113 B idct4_3core_stage3_v6 |
|
1114 ENDP |
|
1115 |
|
1116 ; Another copy so the LDRD offsets are less than +/- 255. |
|
1117 ALIGN 8 |
|
1118 OC_C7S1_3_v6 |
|
1119 DCD 12785 ; 31F1 |
|
1120 OC_C1S7_3_v6 |
|
1121 DCD 64277 ; FB15 |
|
1122 OC_C6S2_3_v6 |
|
1123 DCD 25080 ; 61F8 |
|
1124 OC_C2S6_3_v6 |
|
1125 DCD 60547 ; EC83 |
|
1126 |
|
1127 idct3_3core_down_v6 PROC |
|
1128 ; r0 = ogg_int16_t *_y (destination) |
|
1129 ; r1 = const ogg_int16_t *_x (source) |
|
1130 ; Stage 1: |
|
1131 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]> |
|
1132 LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
|
1133 LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]> |
|
1134 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
|
1135 MOV r7,#8 |
|
1136 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
|
1137 LDR r11,OC_C4S4 |
|
1138 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 |
|
1139 ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition. |
|
1140 PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]> |
|
1141 SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 |
|
1142 PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]> |
|
1143 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
|
1144 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
|
1145 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
|
1146 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
1147 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
|
1148 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
|
1149 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
|
1150 ; Stage 2: |
|
1151 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
|
1152 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
|
1153 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 |
|
1154 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
|
1155 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1156 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 |
|
1157 ; Stage 3: |
|
1158 B idct4_4core_down_stage3_v6 |
|
1159 ENDP |
|
1160 ] |
|
1161 |
|
1162 idct4_3core_v6 PROC |
|
1163 ; r0 = ogg_int16_t *_y (destination) |
|
1164 ; r1 = const ogg_int16_t *_x (source) |
|
1165 ; Stage 1: |
|
1166 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> |
|
1167 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 |
|
1168 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]> |
|
1169 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 |
|
1170 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16 |
|
1171 PKHBT r9, r9, r2 ; r9 = <0|t[0,6]> |
|
1172 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
|
1173 PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]> |
|
1174 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
|
1175 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
|
1176 LDR r11,OC_C4S4 |
|
1177 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 |
|
1178 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 |
|
1179 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> |
|
1180 SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 |
|
1181 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> |
|
1182 SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16 |
|
1183 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
|
1184 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]> |
|
1185 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
|
1186 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
1187 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
|
1188 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
|
1189 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
|
1190 ; Stage 2: |
|
1191 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] |
|
1192 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
|
1193 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] |
|
1194 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 |
|
1195 SADD16 r5, r4, r8 ; r5 = t[4]-t[5] |
|
1196 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 |
|
1197 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] |
|
1198 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 |
|
1199 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1200 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 |
|
1201 ; Stage 3: |
|
1202 idct4_3core_stage3_v6 |
|
1203 SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2] |
|
1204 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> |
|
1205 SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2] |
|
1206 idct4_3core_stage3_5_v6 |
|
1207 SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] |
|
1208 SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] |
|
1209 SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3] |
|
1210 SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3] |
|
1211 ; Stage 4: |
|
1212 SADD16 r12,r10,r7 ; r12= t[0]+t[7] |
|
1213 STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7] |
|
1214 SADD16 r12,r11,r6 ; r12= t[1]+t[6] |
|
1215 STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6] |
|
1216 SADD16 r12,r2, r5 ; r12= t[2]+t[5] |
|
1217 STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5] |
|
1218 SADD16 r12,r3, r4 ; r12= t[3]+t[4] |
|
1219 STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4] |
|
1220 SSUB16 r4, r3, r4 ; r4 = t[3]-t[4] |
|
1221 STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4] |
|
1222 SSUB16 r5, r2, r5 ; r5 = t[2]-t[5] |
|
1223 STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5] |
|
1224 SSUB16 r6, r11,r6 ; r6 = t[1]-t[6] |
|
1225 STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6] |
|
1226 SSUB16 r7, r10,r7 ; r7 = t[0]-t[7] |
|
1227 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] |
|
1228 MOV PC,r14 |
|
1229 ENDP |
|
1230 |
|
1231 ; Another copy so the LDRD offsets are less than +/- 255. |
|
1232 ALIGN 8 |
|
1233 OC_C7S1_4_v6 |
|
1234 DCD 12785 ; 31F1 |
|
1235 OC_C1S7_4_v6 |
|
1236 DCD 64277 ; FB15 |
|
1237 OC_C6S2_4_v6 |
|
1238 DCD 25080 ; 61F8 |
|
1239 OC_C2S6_4_v6 |
|
1240 DCD 60547 ; EC83 |
|
1241 OC_C5S3_4_v6 |
|
1242 DCD 36410 ; 8E3A |
|
1243 OC_C3S5_4_v6 |
|
1244 DCD 54491 ; D4DB |
|
1245 |
|
1246 idct4_4core_down_v6 PROC |
|
1247 ; r0 = ogg_int16_t *_y (destination) |
|
1248 ; r1 = const ogg_int16_t *_x (source) |
|
1249 ; Stage 1: |
|
1250 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> |
|
1251 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 |
|
1252 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]> |
|
1253 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 |
|
1254 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
|
1255 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16 |
|
1256 ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition. |
|
1257 PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]> |
|
1258 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
|
1259 PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]> |
|
1260 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
|
1261 LDR r11,OC_C4S4 |
|
1262 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 |
|
1263 MOV r7,#8 |
|
1264 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 |
|
1265 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> |
|
1266 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 |
|
1267 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> |
|
1268 SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 |
|
1269 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
|
1270 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
|
1271 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
|
1272 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
|
1273 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
|
1274 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
|
1275 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
|
1276 ; Stage 2: |
|
1277 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] |
|
1278 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
|
1279 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] |
|
1280 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 |
|
1281 SADD16 r5, r4, r8 ; r5 = t[4]-t[5] |
|
1282 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 |
|
1283 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] |
|
1284 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 |
|
1285 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1286 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 |
|
1287 ; Stage 3: |
|
1288 idct4_4core_down_stage3_v6 |
|
1289 SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8 |
|
1290 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> |
|
1291 SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8 |
|
1292 B idct8_8core_down_stage3_5_v6 |
|
1293 ENDP |
|
1294 |
|
1295 idct8_8core_v6 PROC |
|
1296 STMFD r13!,{r0,r14} |
|
1297 ; Stage 1: |
|
1298 ;5-6 rotation by 3pi/16 |
|
1299 LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5 |
|
1300 LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> |
|
1301 LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> |
|
1302 SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 |
|
1303 LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> |
|
1304 SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 |
|
1305 LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> |
|
1306 SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 |
|
1307 SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 |
|
1308 SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) |
|
1309 PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> |
|
1310 SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) |
|
1311 PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> |
|
1312 SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 |
|
1313 PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1314 SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 |
|
1315 ;2-3 rotation by 6pi/16 |
|
1316 LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6 |
|
1317 PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> |
|
1318 LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> |
|
1319 SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 |
|
1320 SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> |
|
1321 SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 |
|
1322 LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> |
|
1323 SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 |
|
1324 SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 |
|
1325 PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2> |
|
1326 SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) |
|
1327 SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) |
|
1328 SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 |
|
1329 PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]> |
|
1330 SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 |
|
1331 ;4-7 rotation by 7pi/16 |
|
1332 LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 |
|
1333 PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9> |
|
1334 LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> |
|
1335 PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]> |
|
1336 SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> |
|
1337 SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 |
|
1338 LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> |
|
1339 SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 |
|
1340 SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 |
|
1341 SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 |
|
1342 SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) |
|
1343 PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8> |
|
1344 SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) |
|
1345 PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> |
|
1346 SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16 |
|
1347 PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> |
|
1348 SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16 |
|
1349 ;0-1 butterfly |
|
1350 LDR r11,OC_C4S4 |
|
1351 PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> |
|
1352 SADD16 r7, r0, r4 ; r7 = x[0]+x[4] |
|
1353 SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> |
|
1354 SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] |
|
1355 SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16 |
|
1356 SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16 |
|
1357 SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16 |
|
1358 PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]> |
|
1359 SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16 |
|
1360 ; Stage 2: |
|
1361 SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] |
|
1362 PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]|t[0,1]> |
|
1363 SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] |
|
1364 SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 |
|
1365 SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] |
|
1366 SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 |
|
1367 SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] |
|
1368 SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 |
|
1369 PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> |
|
1370 SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 |
|
1371 ; Stage 3: |
|
1372 SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2] |
|
1373 PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
|
1374 SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2] |
|
1375 LDMFD r13!,{r0,r14} |
|
1376 B idct4_3core_stage3_5_v6 |
|
1377 ENDP |
|
1378 |
|
1379 ; Another copy so the LDRD offsets are less than +/- 255. |
|
1380 ALIGN 8 |
|
1381 OC_C7S1_8_v6 |
|
1382 DCD 12785 ; 31F1 |
|
1383 OC_C1S7_8_v6 |
|
1384 DCD 64277 ; FB15 |
|
1385 OC_C6S2_8_v6 |
|
1386 DCD 25080 ; 61F8 |
|
1387 OC_C2S6_8_v6 |
|
1388 DCD 60547 ; EC83 |
|
1389 OC_C5S3_8_v6 |
|
1390 DCD 36410 ; 8E3A |
|
1391 OC_C3S5_8_v6 |
|
1392 DCD 54491 ; D4DB |
|
1393 |
|
1394 idct8_8core_down_v6 PROC |
|
	STMFD	r13!,{r0,r14}
	; Stage 1:
	;5-6 rotation by 3pi/16
	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
	;2-3 rotation by 6pi/16
	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
	;4-7 rotation by 7pi/16
	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
	;0-1 butterfly
	LDR	r11,OC_C4S4
	MOV	r14,#8
	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
	; Stage 2:
	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
	; Stage 3:
	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
	LDMFD	r13!,{r0,r14}
idct8_8core_down_stage3_5_v6
	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
	; Stage 4:
	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
	MOV	r10,r12,ASR #4
	MOV	r12,r12,LSL #16
	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
	MOV	r10,r12,ASR #4
	MOV	r12,r12,LSL #16
	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
	MOV	r10,r12,ASR #4
	MOV	r12,r12,LSL #16
	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
	MOV	r10,r12,ASR #4
	MOV	r12,r12,LSL #16
	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
	MOV	r10,r4, ASR #4
	MOV	r4, r4, LSL #16
	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
	MOV	r10,r5, ASR #4
	MOV	r5, r5, LSL #16
	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
	MOV	r10,r6, ASR #4
	MOV	r6, r6, LSL #16
	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
	MOV	r10,r7, ASR #4
	MOV	r7, r7, LSL #16
	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
	MOV	PC,r14
	ENDP
	]

	[ OC_ARM_ASM_NEON
	EXPORT	oc_idct8x8_1_neon
	EXPORT	oc_idct8x8_neon

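	; 16-bit approximations of the IDCT constants, OC_CkS(8-k) =
	;  round(2**16*cos(k*pi/16)), laid out so that after the VLD1 below
	;  D0[1..3] = OC_C1S7, OC_C2S6, OC_C3S5 and
	;  D1[0..3] = OC_C4S4, OC_C5S3, OC_C6S2, OC_C7S1 (D0[0] holds the 8).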
|
	ALIGN 16
OC_IDCT_CONSTS_NEON
	DCW	    8
	DCW	64277	; FB15 (C1S7)
	DCW	60547	; EC83 (C2S6)
	DCW	54491	; D4DB (C3S5)
	DCW	46341	; B505 (C4S4)
	DCW	36410	; 8E3A (C5S3)
	DCW	25080	; 61F8 (C6S2)
	DCW	12785	; 31F1 (C7S1)

oc_idct8x8_1_neon PROC
	; r0 = ogg_int16_t *_y
	; r1 = ogg_uint16_t _dc
	VDUP.S16	Q0, r1
	VMOV		Q1, Q0
	VST1.64		{D0, D1, D2, D3}, [r0@128]!
	VST1.64		{D0, D1, D2, D3}, [r0@128]!
	VST1.64		{D0, D1, D2, D3}, [r0@128]!
	VST1.64		{D0, D1, D2, D3}, [r0@128]
	MOV	PC, r14
	ENDP

oc_idct8x8_neon PROC
	; r0 = ogg_int16_t *_y
	; r1 = ogg_int16_t *_x
	; r2 = int _last_zzi
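	; For _last_zzi <= 10 this dispatches to oc_idct8x8_10_neon; otherwise
	;  the full transform is done as two passes of oc_idct8x8_stage123_neon
	;  plus a stage 4 butterfly (rows, an in-register 8x8 transpose, then
	;  columns). The input block at _x is cleared afterwards unless _y and
	;  _x are the same buffer.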
|
	CMP	r2, #10
	BLE	oc_idct8x8_10_neon
oc_idct8x8_slow_neon
	VPUSH		{D8-D15}
	MOV	r2, r1
	ADR	r3, OC_IDCT_CONSTS_NEON
	; Row transforms (input is pre-transposed)
	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
	VLD1.64		{D28,D29,D30,D31}, [r2@128]
	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
	VLD1.64		{D0,D1},           [r3@128]
	MOV	r12, r14
	BL	oc_idct8x8_stage123_neon
	; Stage 4
	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
	VTRN.16		Q14,Q15
	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
	; 8x8 Transpose
	VTRN.16		Q8, Q9
	VTRN.16		Q10,Q11
	VTRN.16		Q12,Q13
	VTRN.32		Q8, Q10
	VTRN.32		Q9, Q11
	VTRN.32		Q12,Q14
	VTRN.32		Q13,Q15
	VSWP		D17,D24
	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
	VSWP		D19,D26
	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
	VSWP		D21,D28
	VSWP		D23,D30
	; Column transforms
	BL	oc_idct8x8_stage123_neon
	CMP	r0,r1
	; We have to put the return address back in the LR, or the branch
	; predictor will not recognize the function return and mis-predict the
	; entire call stack.
	MOV	r14, r12
	; Stage 4
	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
	BEQ	oc_idct8x8_slow_neon_noclear
	VMOV.I8		Q2,#0
	VPOP		{D8-D15}
	VMOV.I8		Q3,#0
	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
	VST1.64		{D4, D5, D6, D7}, [r1@128]!
	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
	VST1.64		{D4, D5, D6, D7}, [r1@128]!
	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
	VST1.64		{D4, D5, D6, D7}, [r1@128]!
	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
	VST1.64		{D4, D5, D6, D7}, [r1@128]
	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
	VSTMIA		r0, {D16-D31}
	MOV	PC, r14

oc_idct8x8_slow_neon_noclear
	VPOP		{D8-D15}
	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
	VSTMIA		r0, {D16-D31}
	MOV	PC, r14
	ENDP

oc_idct8x8_stage123_neon PROC
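	; Performs stages 1-3 of the 8-point transform on all eight 16-bit
	;  lanes at once.
	; Input:  Q0 = the constant vector, Q8 = x[0]+x[4], Q1 = x[0]-x[4], and
	;         Q9,Q10,Q11,Q13,Q14,Q15 = x[1],x[2],x[3],x[5],x[6],x[7].
	; Output: Q8..Q11 = t[0]''..t[3]'', Q4 = t[4]', Q5 = t[5]'', Q3 = t[6]'',
	;         and Q7 = t[7]'; the caller applies the stage 4 butterflies.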
|
	; Stages 1 & 2
	VMULL.S16	Q4, D18,D1[3]
	VMULL.S16	Q5, D19,D1[3]
	VMULL.S16	Q7, D30,D1[3]
	VMULL.S16	Q6, D31,D1[3]
	VMULL.S16	Q2, D30,D0[1]
	VMULL.S16	Q3, D31,D0[1]
	VSHRN.S32	D8, Q4, #16
	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
	VSHRN.S32	D14,Q7, #16
	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
	VSHRN.S32	D4, Q2, #16
	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
	VSUB.S16	Q4, Q4, Q15
	VADD.S16	Q7, Q7, Q9
	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
	VMULL.S16	Q2, D18,D0[1]
	VMULL.S16	Q9, D19,D0[1]
	VMULL.S16	Q5, D26,D0[3]
	VMULL.S16	Q3, D27,D0[3]
	VMULL.S16	Q6, D22,D0[3]
	VMULL.S16	Q12,D23,D0[3]
	VSHRN.S32	D4, Q2, #16
	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
	VSHRN.S32	D10,Q5, #16
	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
	VSHRN.S32	D12,Q6, #16
	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
	VSUB.S16	Q5, Q5, Q11
	VADD.S16	Q6, Q6, Q11
	VADD.S16	Q5, Q5, Q13
	VADD.S16	Q6, Q6, Q13
	VMULL.S16	Q9, D22,D1[1]
	VMULL.S16	Q11,D23,D1[1]
	VMULL.S16	Q15,D26,D1[1]
	VMULL.S16	Q13,D27,D1[1]
	VMULL.S16	Q2, D20,D1[2]
	VMULL.S16	Q12,D21,D1[2]
	VSHRN.S32	D18,Q9, #16
	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
	VSHRN.S32	D30,Q15,#16
	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
	VSHRN.S32	D4, Q2, #16
	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
	VSUB.S16	Q2, Q2, Q14
	VMULL.S16	Q3, D28,D1[2]
	VMULL.S16	Q11,D29,D1[2]
	VMULL.S16	Q12,D28,D0[2]
	VMULL.S16	Q9, D29,D0[2]
	VMULL.S16	Q13,D20,D0[2]
	VMULL.S16	Q15,D21,D0[2]
	VSHRN.S32	D6, Q3, #16
	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
	VSHRN.S32	D24,Q12,#16
	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
	VSHRN.S32	D26,Q13,#16
	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
	VADD.S16	Q3, Q3, Q10
	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
	VMULL.S16	Q12,D16,D1[0]
	VMULL.S16	Q13,D17,D1[0]
	VMULL.S16	Q14,D2, D1[0]
	VMULL.S16	Q15,D3, D1[0]
	VMULL.S16	Q5, D18,D1[0]
	VMULL.S16	Q6, D22,D1[0]
	VSHRN.S32	D24,Q12,#16
	VSHRN.S32	D25,Q13,#16
	VSHRN.S32	D28,Q14,#16
	VSHRN.S32	D29,Q15,#16
	VMULL.S16	Q13,D19,D1[0]
	VMULL.S16	Q15,D23,D1[0]
	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
	VSHRN.S32	D10,Q5, #16
	VSHRN.S32	D12,Q6, #16
	VSHRN.S32	D11,Q13,#16
	VSHRN.S32	D13,Q15,#16
	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
	; Stage 3
	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
	MOV	PC, r14
	ENDP

oc_idct8x8_10_neon PROC
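	; r0 = ogg_int16_t *_y
	; r1 = ogg_int16_t *_x
	; As oc_idct8x8_neon, but only the first 10 coefficients of _x in
	;  zig-zag order (all of which lie in the upper-left 4x4 corner of the
	;  block) may be non-zero, so only four partial rows are loaded.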
|
	ADR	r3, OC_IDCT_CONSTS_NEON
	VLD1.64		{D0,D1}, [r3@128]
	MOV	r2, r1
	; Row transforms (input is pre-transposed)
	; Stage 1
	VLD1.64		{D16,D17,D18,D19},[r2@128]!
	MOV	r12, #16
	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
	VLD1.64		{D17}, [r2@64], r12
	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
	VLD1.64		{D19}, [r2@64]
	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
	VADD.S16	D4, D4, D18	; D4 = t[7]
	VADD.S16	D6, D6, D19	; D6 = t[6]
	VADD.S16	D7, D7, D19	; D7 = -t[5]
	VADD.S16	Q15,Q15,Q8	; D30= t[0]
					; D31= t[3]
	; Stages 2 & 3
	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
					; D25= t[4]'=t[4]+t[5]
	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
					; D27= t[4]-t[5]
	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
					;       -(t[7]-t[6]<<16)
	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
					;       -(t[4]-t[5]<<16)
	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
					;       -(t[7]-t[6])
	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
					;       -(t[4]-t[5])
	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
	; Stage 4
	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
					; D23= y[5]=t[2]'-t[5]''
	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]''
					; D21= y[4]=t[3]'-t[4]'
	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
					; D17= y[2]=t[2]'+t[5]''
	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]''
					; D19= y[3]=t[3]'+t[4]'
	; 8x4 transpose
	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
					; Q11= d5d4b5b4 d7d6b7b6
	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
					; Q9 = d3d2b3b2 d1d0b1b0
	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
					; Q11= d7d6d5d4 d3d2d1d0
	VMULL.S16	Q15,D18,D0[1]
	VMULL.S16	Q13,D22,D1[1]
	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
					; Q10= c7c6c5c4 c3c2c1c0
	; Column transforms
	; Stages 1, 2, & 3
	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
	VMULL.S16	Q3, D22,D0[3]
	VMULL.S16	Q2, D23,D0[3]	; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
	VSHRN.S32	D30,Q15,#16
	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
	VSHRN.S32	D26,Q13,#16
	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
	VSHRN.S32	D28,Q3, #16
	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
	VMULL.S16	Q12,D18,D1[3]
	VMULL.S16	Q2, D19,D1[3]	; Q2:Q12= OC_C7S1*x[1]
	VMULL.S16	Q1, D16,D1[0]
	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
	VMULL.S16	Q3, D20,D0[2]
	VMULL.S16	Q9, D21,D0[2]	; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
	VSHRN.S32	D24,Q12,#16
	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
	VMULL.S16	Q2, D20,D1[2]
	VSHRN.S32	D2, Q1, #16
	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
	VMULL.S16	Q11,D21,D1[2]	; Q2:Q11= OC_C6S2*x[2]
	VSHRN.S32	D6, Q3, #16
	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
	VSHRN.S32	D4, Q2, #16
	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
	VMULL.S16	Q10,D16,D1[0]
	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
					;           -(t[4]-t[5]<<16)
	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
	VMULL.S16	Q14,D18,D1[0]
	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
					;           -(t[7]-t[6]<<16)
	VSHRN.S32	D20,Q10,#16
	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
					;       -(t[4]-t[5])
	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
	VSHRN.S32	D28,Q14,#16
	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
					;       -(t[7]-t[6])
	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
	; Stage 4
	CMP	r0, r1
	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
	BEQ	oc_idct8x8_10_neon_noclear
	VMOV.I8		D2, #0
	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
	VST1.64		{D2}, [r1@64], r12
	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
	VST1.64		{D2}, [r1@64], r12
	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
	VST1.64		{D2}, [r1@64], r12
	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
	VST1.64		{D2}, [r1@64]
	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
	VSTMIA		r0, {D16-D31}
	MOV	PC, r14

oc_idct8x8_10_neon_noclear
	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
	VSTMIA		r0, {D16-D31}
	MOV	PC, r14
	ENDP
	]

	END