media/libtheora/lib/arm/armidct.s

1 ;********************************************************************
2 ;* *
3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 ;* *
8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 ;* *
11 ;********************************************************************
12 ; Original implementation:
13 ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
14 ; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
15 ;********************************************************************
16
17 AREA |.text|, CODE, READONLY
18
19 ; Explicitly specifying alignment here because some versions of
20 ; gas don't align code correctly. See
21 ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
22 ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
23 ALIGN
24
25 GET armopts.s
26
27 EXPORT oc_idct8x8_1_arm
28 EXPORT oc_idct8x8_arm
29
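; Overview: this file provides three flavours of the Theora 8x8 inverse DCT:
; the plain ARMv4 routines below, ARMv6 media-instruction versions guarded by
; [ OC_ARM_ASM_MEDIA, and NEON versions guarded by [ OC_ARM_ASM_NEON (both
; flags are presumably defined by armopts.s, included above). Each flavour
; runs a row pass into temporary storage on the stack and then a column pass
; (the "_down" cores), which applies the final +8 bias and >>4 shift.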
30 oc_idct8x8_1_arm PROC
31 ; r0 = ogg_int16_t *_y
32 ; r1 = ogg_uint16_t _dc
33 ORR r1, r1, r1, LSL #16
34 MOV r2, r1
35 MOV r3, r1
36 MOV r12,r1
37 STMIA r0!,{r1,r2,r3,r12}
38 STMIA r0!,{r1,r2,r3,r12}
39 STMIA r0!,{r1,r2,r3,r12}
40 STMIA r0!,{r1,r2,r3,r12}
41 STMIA r0!,{r1,r2,r3,r12}
42 STMIA r0!,{r1,r2,r3,r12}
43 STMIA r0!,{r1,r2,r3,r12}
44 STMIA r0!,{r1,r2,r3,r12}
45 MOV PC, r14
46 ENDP
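; Roughly equivalent C for the routine above (the caller is assumed to have
; already folded the DC scaling and rounding into _dc):
;   for(ci=0;ci<64;ci++)_y[ci]=_dc;
; The ORR packs the 16-bit value into both halves of a word, so each STMIA
; writes one 16-byte row of the 8x8 block.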
47
48 oc_idct8x8_arm PROC
49 ; r0 = ogg_int16_t *_y
50 ; r1 = ogg_int16_t *_x
51 ; r2 = int _last_zzi
52 CMP r2, #3
53 BLE oc_idct8x8_3_arm
54 CMP r2, #6
55 BLE oc_idct8x8_6_arm
56 CMP r2, #10
57 BLE oc_idct8x8_10_arm
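; _last_zzi is (roughly) the number of coefficients, in zig-zag order, that
; can be nonzero; the 3-, 6- and 10-coefficient paths below exploit this by
; transforming only the rows that can hold data, using the pruned
; idct1/2/3/4core routines, and fall back to the full transform otherwise.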
58 oc_idct8x8_slow_arm
59 STMFD r13!,{r4-r11,r14}
60 SUB r13,r13,#64*2
61 ; Row transforms
62 STR r0, [r13,#-4]!
63 ADD r0, r13, #4 ; Write to temp storage.
64 BL idct8core_arm
65 BL idct8core_arm
66 BL idct8core_arm
67 BL idct8core_arm
68 BL idct8core_arm
69 BL idct8core_arm
70 BL idct8core_arm
71 BL idct8core_arm
72 LDR r0, [r13], #4 ; Write to the final destination.
73 ; Clear input data for next block (decoder only).
74 SUB r2, r1, #8*16
75 CMP r0, r2
76 MOV r1, r13 ; And read from temp storage.
77 BEQ oc_idct8x8_slow_arm_cols
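; The row cores post-increment r1 by 16 bytes per row, so r1-8*16 recovers
; the original _x pointer; when the transform is done in place (_y == _x)
; the input must not be cleared, hence the branch around the stores below.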
78 MOV r4, #0
79 MOV r5, #0
80 MOV r6, #0
81 MOV r7, #0
82 STMIA r2!,{r4,r5,r6,r7}
83 STMIA r2!,{r4,r5,r6,r7}
84 STMIA r2!,{r4,r5,r6,r7}
85 STMIA r2!,{r4,r5,r6,r7}
86 STMIA r2!,{r4,r5,r6,r7}
87 STMIA r2!,{r4,r5,r6,r7}
88 STMIA r2!,{r4,r5,r6,r7}
89 STMIA r2!,{r4,r5,r6,r7}
90 oc_idct8x8_slow_arm_cols
91 ; Column transforms
92 BL idct8core_down_arm
93 BL idct8core_down_arm
94 BL idct8core_down_arm
95 BL idct8core_down_arm
96 BL idct8core_down_arm
97 BL idct8core_down_arm
98 BL idct8core_down_arm
99 BL idct8core_down_arm
100 ADD r13,r13,#64*2
101 LDMFD r13!,{r4-r11,PC}
102 ENDP
103
104 oc_idct8x8_10_arm PROC
105 STMFD r13!,{r4-r11,r14}
106 SUB r13,r13,#64*2
107 ; Row transforms
108 MOV r2, r0
109 MOV r0, r13 ; Write to temp storage.
110 BL idct4core_arm
111 BL idct3core_arm
112 BL idct2core_arm
113 BL idct1core_arm
114 ; Clear input data for next block (decoder only).
115 SUB r0, r1, #4*16
116 CMP r0, r2
117 MOV r1, r13 ; Read from temp storage.
118 BEQ oc_idct8x8_10_arm_cols
119 MOV r4, #0
120 STR r4, [r0]
121 STR r4, [r0,#4]
122 STR r4, [r0,#16]
123 STR r4, [r0,#20]
124 STR r4, [r0,#32]
125 STR r4, [r0,#48]
126 MOV r0, r2 ; Write to the final destination
127 oc_idct8x8_10_arm_cols
128 ; Column transforms
129 BL idct4core_down_arm
130 BL idct4core_down_arm
131 BL idct4core_down_arm
132 BL idct4core_down_arm
133 BL idct4core_down_arm
134 BL idct4core_down_arm
135 BL idct4core_down_arm
136 BL idct4core_down_arm
137 ADD r13,r13,#64*2
138 LDMFD r13!,{r4-r11,PC}
139 ENDP
140
141 oc_idct8x8_6_arm PROC
142 STMFD r13!,{r4-r7,r9-r11,r14}
143 SUB r13,r13,#64*2
144 ; Row transforms
145 MOV r2, r0
146 MOV r0, r13 ; Write to temp storage.
147 BL idct3core_arm
148 BL idct2core_arm
149 BL idct1core_arm
150 ; Clear input data for next block (decoder only).
151 SUB r0, r1, #3*16
152 CMP r0, r2
153 MOV r1, r13 ; Read from temp storage.
154 BEQ oc_idct8x8_6_arm_cols
155 MOV r4, #0
156 STR r4, [r0]
157 STR r4, [r0,#4]
158 STR r4, [r0,#16]
159 STR r4, [r0,#32]
160 MOV r0, r2 ; Write to the final destination
161 oc_idct8x8_6_arm_cols
162 ; Column transforms
163 BL idct3core_down_arm
164 BL idct3core_down_arm
165 BL idct3core_down_arm
166 BL idct3core_down_arm
167 BL idct3core_down_arm
168 BL idct3core_down_arm
169 BL idct3core_down_arm
170 BL idct3core_down_arm
171 ADD r13,r13,#64*2
172 LDMFD r13!,{r4-r7,r9-r11,PC}
173 ENDP
174
175 oc_idct8x8_3_arm PROC
176 STMFD r13!,{r4-r7,r9-r11,r14}
177 SUB r13,r13,#64*2
178 ; Row transforms
179 MOV r2, r0
180 MOV r0, r13 ; Write to temp storage.
181 BL idct2core_arm
182 BL idct1core_arm
183 ; Clear input data for next block (decoder only).
184 SUB r0, r1, #2*16
185 CMP r0, r2
186 MOV r1, r13 ; Read from temp storage.
187 MOVNE r4, #0
188 STRNE r4, [r0]
189 STRNE r4, [r0,#16]
190 MOVNE r0, r2 ; Write to the final destination
191 ; Column transforms
192 BL idct2core_down_arm
193 BL idct2core_down_arm
194 BL idct2core_down_arm
195 BL idct2core_down_arm
196 BL idct2core_down_arm
197 BL idct2core_down_arm
198 BL idct2core_down_arm
199 BL idct2core_down_arm
200 ADD r13,r13,#64*2
201 LDMFD r13!,{r4-r7,r9-r11,PC}
202 ENDP
203
204 idct1core_arm PROC
205 ; r0 = ogg_int16_t *_y (destination)
206 ; r1 = const ogg_int16_t *_x (source)
207 LDRSH r3, [r1], #16
208 MOV r12,#0x05
209 ORR r12,r12,#0xB500
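; 0xB505 = 46341 = OC_C4S4 (~cos(pi/4)*65536); it is built with MOV+ORR
; because the value cannot be encoded as a single ARM immediate.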
210 MUL r3, r12, r3
211 ; Stall ?
212 MOV r3, r3, ASR #16
213 STRH r3, [r0], #2
214 STRH r3, [r0, #14]
215 STRH r3, [r0, #30]
216 STRH r3, [r0, #46]
217 STRH r3, [r0, #62]
218 STRH r3, [r0, #78]
219 STRH r3, [r0, #94]
220 STRH r3, [r0, #110]
221 MOV PC,R14
222 ENDP
223
224 idct2core_arm PROC
225 ; r0 = ogg_int16_t *_y (destination)
226 ; r1 = const ogg_int16_t *_x (source)
227 LDRSH r9, [r1], #16 ; r9 = x[0]
228 LDR r12,OC_C4S4
229 LDRSH r11,[r1, #-14] ; r11= x[1]
230 LDR r3, OC_C7S1
231 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
232 LDR r10,OC_C1S7
233 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
234 MOV r9, r9, ASR #16 ; r9 = t[0]
235 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
236 MOV r3, r3, ASR #16 ; r3 = t[4]
237 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
238 MOV r11,r11,ASR #16 ; r11= t[7]
239 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
240 MOV r10,r10,ASR #16 ; r10= t[5]
241 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]
242 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
243 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
244 ADD r3, r3, r9 ; r3 = t[0]+t[4]
245 ADD r11,r11,r9 ; r11= t[0]+t[7]
246 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
247 STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
248 STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
249 STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
250 RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
251 RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
252 RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
253 RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
254 STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
255 STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
256 STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
257 STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
258 MOV PC,r14
259 ENDP
260
261 idct2core_down_arm PROC
262 ; r0 = ogg_int16_t *_y (destination)
263 ; r1 = const ogg_int16_t *_x (source)
264 LDRSH r9, [r1], #16 ; r9 = x[0]
265 LDR r12,OC_C4S4
266 LDRSH r11,[r1, #-14] ; r11= x[1]
267 LDR r3, OC_C7S1
268 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
269 LDR r10,OC_C1S7
270 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
271 MOV r9, r9, ASR #16 ; r9 = t[0]
272 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
273 ADD r9, r9, #8 ; r9 = t[0]+8
274 MOV r3, r3, ASR #16 ; r3 = t[4]
275 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
276 MOV r11,r11,ASR #16 ; r11= t[7]
277 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
278 MOV r10,r10,ASR #16 ; r10= t[5]
279 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8
280 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
281 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
282 ADD r3, r3, r9 ; r3 = t[0]+t[4]+8
283 ADD r11,r11,r9 ; r11= t[0]+t[7]+8
284 ; TODO: This is wrong.
285 ; The C code truncates to 16 bits by storing to RAM and doing the
286 ; shifts later; we've got an extra 4 bits here.
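; For reference: the C code's column pass operates on row results that were
; truncated to 16 bits when stored, then emits (t+8)>>4 for each output; the
; ASR #4 shifts below perform that final shift, with the +8 bias having been
; folded into t[0] above.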
287 MOV r4, r11,ASR #4
288 MOV r5, r12,ASR #4
289 MOV r6, r10,ASR #4
290 MOV r7, r3, ASR #4
291 RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
292 RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
293 RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
294 RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
295 MOV r3, r3, ASR #4
296 MOV r10,r10,ASR #4
297 MOV r12,r12,ASR #4
298 MOV r11,r11,ASR #4
299 STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
300 STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
301 STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
302 STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
303 STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
304 STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
305 STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
306 STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
307 MOV PC,r14
308 ENDP
309
310 idct3core_arm PROC
311 LDRSH r9, [r1], #16 ; r9 = x[0]
312 LDR r12,OC_C4S4 ; r12= OC_C4S4
313 LDRSH r3, [r1, #-12] ; r3 = x[2]
314 LDR r10,OC_C6S2 ; r10= OC_C6S2
315 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
316 LDR r4, OC_C2S6 ; r4 = OC_C2S6
317 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
318 LDRSH r11,[r1, #-14] ; r11= x[1]
319 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
320 LDR r4, OC_C7S1 ; r4 = OC_C7S1
321 LDR r5, OC_C1S7 ; r5 = OC_C1S7
322 MOV r9, r9, ASR #16 ; r9 = t[0]
323 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
324 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]
325 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
326 MOV r4, r4, ASR #16 ; r4 = t[4]
327 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
328 MOV r11,r11,ASR #16 ; r11= t[7]
329 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
330 ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
331 RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2]
332 ; r3 = t2[0] = t[0]+t[3]
333 RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3]
334 MOV r12,r12,ASR #16 ; r12= t[6]
335 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
336 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
337 ADD r11,r3, r11 ; r11= t2[0]+t[7]
338 ADD r5, r10,r5 ; r5 = t[1]+t2[6]
339 ADD r12,r6, r12 ; r12= t[2]+t2[5]
340 ADD r4, r9, r4 ; r4 = t2[3]+t[4]
341 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
342 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
343 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
344 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
345 RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
346 RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
347 RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
348 RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4]
349 STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
350 STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
351 STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
352 STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
353 MOV PC,R14
354 ENDP
355
356 idct3core_down_arm PROC
357 LDRSH r9, [r1], #16 ; r9 = x[0]
358 LDR r12,OC_C4S4 ; r12= OC_C4S4
359 LDRSH r3, [r1, #-12] ; r3 = x[2]
360 LDR r10,OC_C6S2 ; r10= OC_C6S2
361 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
362 LDR r4, OC_C2S6 ; r4 = OC_C2S6
363 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
364 LDRSH r11,[r1, #-14] ; r11= x[1]
365 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
366 LDR r4, OC_C7S1 ; r4 = OC_C7S1
367 LDR r5, OC_C1S7 ; r5 = OC_C1S7
368 MOV r9, r9, ASR #16 ; r9 = t[0]
369 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
370 ADD r9, r9, #8 ; r9 = t[0]+8
371 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
372 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8
373 MOV r4, r4, ASR #16 ; r4 = t[4]
374 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
375 MOV r11,r11,ASR #16 ; r11= t[7]
376 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
377 ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8
378 RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8
379 ; r3 = t2[0]+8 = t[0]+t[3]+8
380 RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8
381 MOV r12,r12,ASR #16 ; r12= t[6]
382 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
383 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
384 ADD r11,r3, r11 ; r11= t2[0]+t[7] +8
385 ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8
386 ADD r12,r6, r12 ; r12= t[2] +t2[5]+8
387 ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8
388 RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8
389 RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8
390 RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8
391 RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8
392 ; TODO: This is wrong.
393 ; The C code truncates to 16 bits by storing to RAM and doing the
394 ; shifts later; we've got an extra 4 bits here.
395 MOV r11,r11,ASR #4
396 MOV r5, r5, ASR #4
397 MOV r12,r12,ASR #4
398 MOV r4, r4, ASR #4
399 MOV r9, r9, ASR #4
400 MOV r6, r6, ASR #4
401 MOV r10,r10,ASR #4
402 MOV r3, r3, ASR #4
403 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
404 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
405 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
406 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
407 STRH r9, [r0, #62] ; y[4] = t2[3]-t[4]
408 STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
409 STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
410 STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
411 MOV PC,R14
412 ENDP
413
414 idct4core_arm PROC
415 ; r0 = ogg_int16_t *_y (destination)
416 ; r1 = const ogg_int16_t *_x (source)
417 LDRSH r9, [r1], #16 ; r9 = x[0]
418 LDR r10,OC_C4S4 ; r10= OC_C4S4
419 LDRSH r12,[r1, #-12] ; r12= x[2]
420 LDR r4, OC_C6S2 ; r4 = OC_C6S2
421 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
422 LDR r5, OC_C2S6 ; r5 = OC_C2S6
423 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
424 LDRSH r3, [r1, #-14] ; r3 = x[1]
425 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
426 LDR r6, OC_C7S1 ; r6 = OC_C7S1
427 LDR r12,OC_C1S7 ; r12= OC_C1S7
428 LDRSH r11,[r1, #-10] ; r11= x[3]
429 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
430 LDR r7, OC_C5S3 ; r7 = OC_C5S3
431 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
432 LDR r8, OC_C3S5 ; r8 = OC_C3S5
433 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
434 MOV r9, r9, ASR #16 ; r9 = t[0]
435 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
436 MOV r6, r6, ASR #16 ; r6 = t[4]
437 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
438 ; before multiplying, not after (this is not equivalent)
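; (For example, if t[4]-t[5] overflows 16 bits, the C code wraps it to a
; 16-bit value before the OC_C4S4 multiply, while the code below multiplies
; the full 32-bit difference, so results can differ for extreme inputs.)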
439 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
440 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
441 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
442 MOV r3, r3, ASR #16 ; r3 = t[7]
443 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
444 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
445 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
446 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
447 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2]
448 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
449 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3]
450 MOV r3, r3, ASR #16 ; r3 = t2[6]
451 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
452 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
453 ADD r11,r5, r11 ; r11= t[0]+t2[7]
454 ADD r6, r4, r6 ; r6 = t[1]+t3[6]
455 ADD r3, r10,r3 ; r3 = t[2]+t3[5]
456 ADD r7, r9, r7 ; r7 = t[3]+t2[4]
457 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
458 STRH r6, [r0, #14] ; y[1] = t[1]+t2[6]
459 STRH r3, [r0, #30] ; y[2] = t[2]+t2[5]
460 STRH r7, [r0, #46] ; y[3] = t2[3]+t[4]
461 RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
462 RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
463 RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
464 RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4]
465 STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
466 STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
467 STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
468 STRH r11, [r0, #110] ; y[7] = t2[0]-t[7]
469 MOV PC,r14
470 ENDP
471
472 idct4core_down_arm PROC
473 ; r0 = ogg_int16_t *_y (destination)
474 ; r1 = const ogg_int16_t *_x (source)
475 LDRSH r9, [r1], #16 ; r9 = x[0]
476 LDR r10,OC_C4S4 ; r10= OC_C4S4
477 LDRSH r12,[r1, #-12] ; r12= x[2]
478 LDR r4, OC_C6S2 ; r4 = OC_C6S2
479 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
480 LDR r5, OC_C2S6 ; r5 = OC_C2S6
481 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
482 LDRSH r3, [r1, #-14] ; r3 = x[1]
483 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
484 LDR r6, OC_C7S1 ; r6 = OC_C7S1
485 LDR r12,OC_C1S7 ; r12= OC_C1S7
486 LDRSH r11,[r1, #-10] ; r11= x[3]
487 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
488 LDR r7, OC_C5S3 ; r7 = OC_C5S3
489 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
490 LDR r8, OC_C3S5 ; r8 = OC_C3S5
491 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
492 MOV r9, r9, ASR #16 ; r9 = t[0]
493 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
494 MOV r6, r6, ASR #16 ; r6 = t[4]
495 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
496 ; before multiplying, not after (this is not equivalent)
497 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
498 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
499 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
500 MOV r3, r3, ASR #16 ; r3 = t[7]
501 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
502 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
503 ADD r9, r9, #8 ; r9 = t[0]+8
504 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
505 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8
506 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8
507 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8
508 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8
509 MOV r3, r3, ASR #16 ; r3 = t2[6]
510 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
511 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
512 ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8
513 ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8
514 ADD r10,r10,r3 ; r10= t[2]+t3[5]+8
515 ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8
516 SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8
517 SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8
518 SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8
519 SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8
520 ; TODO: This is wrong.
521 ; The C code truncates to 16 bits by storing to RAM and doing the
522 ; shifts later; we've got an extra 4 bits here.
523 MOV r11,r11,ASR #4
524 MOV r6, r6, ASR #4
525 MOV r3, r3, ASR #4
526 MOV r7, r7, ASR #4
527 MOV r9, r9, ASR #4
528 MOV r10,r10,ASR #4
529 MOV r4, r4, ASR #4
530 MOV r5, r5, ASR #4
531 STRH r5,[r0], #2 ; y[0] = t[0]+t[7]
532 STRH r4, [r0, #14] ; y[1] = t[1]+t2[6]
533 STRH r10,[r0, #30] ; y[2] = t[2]+t2[5]
534 STRH r9, [r0, #46] ; y[3] = t2[3]+t[4]
535 STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
536 STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
537 STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
538 STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
539 MOV PC,r14
540 ENDP
541
542 idct8core_arm PROC
543 ; r0 = ogg_int16_t *_y (destination)
544 ; r1 = const ogg_int16_t *_x (source)
545 LDRSH r2, [r1],#16 ; r2 = x[0]
546 STMFD r13!,{r1,r14}
547 LDRSH r6, [r1, #-8] ; r6 = x[4]
548 LDR r12,OC_C4S4 ; r12= C4S4
549 LDRSH r4, [r1, #-12] ; r4 = x[2]
550 ADD r2, r2, r6 ; r2 = x[0] + x[4]
551 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
552 ; For spec compliance, these sums must be truncated to 16-bit precision
553 ; _before_ the multiply (not after).
554 ; Sadly, ARMv4 provides no simple way to do that.
555 MOV r2, r2, LSL #16
556 MOV r6, r6, LSL #16
557 MOV r2, r2, ASR #16
558 MOV r6, r6, ASR #16
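; (The LSL #16/ASR #16 pairs above sign-extend the low halfword, emulating
; the 16-bit store-and-reload truncation performed by the C code.)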
559 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
560 LDRSH r8, [r1, #-4] ; r8 = x[6]
561 LDR r7, OC_C6S2 ; r7 = OC_C6S2
562 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
563 LDR r14,OC_C2S6 ; r14= OC_C2S6
564 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
565 LDR r5, OC_C7S1 ; r5 = OC_C7S1
566 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
567 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
568 MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
569 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
570 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
571 LDR r7, OC_C1S7 ; r7 = OC_C1S7
572 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
573 LDRSH r14,[r1, #-14] ; r14= x[1]
574 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
575 LDRSH r8, [r1, #-2] ; r8 = x[7]
576 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
577 LDRSH r10,[r1, #-6] ; r10= x[5]
578 MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
579 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
580 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
581 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
582 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
583 LDRSH r1, [r1, #-10] ; r1 = x[3]
584 LDR r5, OC_C3S5 ; r5 = OC_C3S5
585 LDR r11,OC_C5S3 ; r11= OC_C5S3
586 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
587 MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
588 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
589 MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
590 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
591 MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
592 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
593 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
594 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
595 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
596 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
597 ; r10=t[6] r12=C4S4 r14=t[5]
598 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
599 ; before multiplying, not after (this is not equivalent)
600 ; Stage 2
601 ; 4-5 butterfly
602 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
603 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
604 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
605 ; 7-6 butterfly
606 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
607 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
608 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
609 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
610 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
611 ; Stage 3
612 ; 0-3 butterfly
613 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
614 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
615 ; 1-2 butterfly
616 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
617 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
618 ; 6-5 butterfly
619 MOV r14,r14,ASR #16 ; r14= t2[5]
620 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
621 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
622 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
623 ; r10=t3[6] r14=t3[5]
624 ; Stage 4
625 ADD r2, r2, r8 ; r2 = t[0] + t[7]
626 ADD r6, r6, r10 ; r6 = t[1] + t[6]
627 ADD r3, r3, r14 ; r3 = t[2] + t[5]
628 ADD r4, r4, r9 ; r4 = t[3] + t[4]
629 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
630 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
631 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
632 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
633 STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
634 STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
635 STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
636 STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
637 STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
638 STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
639 STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
640 STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
641 LDMFD r13!,{r1,PC}
642 ENDP
643
644 idct8core_down_arm PROC
645 ; r0 = ogg_int16_t *_y (destination)
646 ; r1 = const ogg_int16_t *_x (source)
647 LDRSH r2, [r1],#16 ; r2 = x[0]
648 STMFD r13!,{r1,r14}
649 LDRSH r6, [r1, #-8] ; r6 = x[4]
650 LDR r12,OC_C4S4 ; r12= C4S4
651 LDRSH r4, [r1, #-12] ; r4 = x[2]
652 ADD r2, r2, r6 ; r2 = x[0] + x[4]
653 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
654 ; For spec compliance, these sums must be truncated to 16-bit precision
655 ; _before_ the multiply (not after).
656 ; Sadly, ARMv4 provides no simple way to do that.
657 MOV r2, r2, LSL #16
658 MOV r6, r6, LSL #16
659 MOV r2, r2, ASR #16
660 MOV r6, r6, ASR #16
661 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
662 LDRSH r8, [r1, #-4] ; r8 = x[6]
663 LDR r7, OC_C6S2 ; r7 = OC_C6S2
664 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
665 LDR r14,OC_C2S6 ; r14= OC_C2S6
666 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
667 LDR r5, OC_C7S1 ; r5 = OC_C7S1
668 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
669 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
670 MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
671 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
672 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
673 LDR r7, OC_C1S7 ; r7 = OC_C1S7
674 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
675 LDRSH r14,[r1, #-14] ; r14= x[1]
676 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
677 LDRSH r8, [r1, #-2] ; r8 = x[7]
678 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
679 LDRSH r10,[r1, #-6] ; r10= x[5]
680 MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
681 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
682 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
683 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
684 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
685 LDRSH r1, [r1, #-10] ; r1 = x[3]
686 LDR r5, OC_C3S5 ; r5 = OC_C3S5
687 LDR r11,OC_C5S3 ; r11= OC_C5S3
688 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
689 MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
690 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
691 MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
692 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
693 MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
694 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
695 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
696 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
697 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
698 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
699 ; r10=t[6] r12=C4S4 r14=t[5]
700 ; Stage 2
701 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
702 ; before multiplying, not after (this is not equivalent)
703 ; 4-5 butterfly
704 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
705 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
706 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
707 ; 7-6 butterfly
708 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
709 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
710 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
711 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
712 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
713 ; Stage 3
714 ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16
715 ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16
716 ; 0-3 butterfly
717 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8
718 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8
719 ; 1-2 butterfly
720 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8
721 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8
722 ; 6-5 butterfly
723 MOV r14,r14,ASR #16 ; r14= t2[5]
724 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
725 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
726 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
727 ; r10=t3[6] r14=t3[5]
728 ; Stage 4
729 ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8
730 ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8
731 ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8
732 ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8
733 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8
734 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8
735 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8
736 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8
737 ; TODO: This is wrong.
738 ; The C code truncates to 16 bits by storing to RAM and doing the
739 ; shifts later; we've got an extra 4 bits here.
740 MOV r2, r2, ASR #4
741 MOV r6, r6, ASR #4
742 MOV r3, r3, ASR #4
743 MOV r4, r4, ASR #4
744 MOV r8, r8, ASR #4
745 MOV r10,r10,ASR #4
746 MOV r14,r14,ASR #4
747 MOV r9, r9, ASR #4
748 STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
749 STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
750 STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
751 STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
752 STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
753 STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
754 STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
755 STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
756 LDMFD r13!,{r1,PC}
757 ENDP
758
759 [ OC_ARM_ASM_MEDIA
760 EXPORT oc_idct8x8_1_v6
761 EXPORT oc_idct8x8_v6
762
763 oc_idct8x8_1_v6 PROC
764 ; r0 = ogg_int16_t *_y
765 ; r1 = ogg_uint16_t _dc
766 ORR r2, r1, r1, LSL #16
767 ORR r3, r1, r1, LSL #16
768 STRD r2, [r0], #8
769 STRD r2, [r0], #8
770 STRD r2, [r0], #8
771 STRD r2, [r0], #8
772 STRD r2, [r0], #8
773 STRD r2, [r0], #8
774 STRD r2, [r0], #8
775 STRD r2, [r0], #8
776 STRD r2, [r0], #8
777 STRD r2, [r0], #8
778 STRD r2, [r0], #8
779 STRD r2, [r0], #8
780 STRD r2, [r0], #8
781 STRD r2, [r0], #8
782 STRD r2, [r0], #8
783 STRD r2, [r0], #8
784 MOV PC, r14
785 ENDP
786
787 oc_idct8x8_v6 PROC
788 ; r0 = ogg_int16_t *_y
789 ; r1 = ogg_int16_t *_x
790 ; r2 = int _last_zzi
791 CMP r2, #3
792 BLE oc_idct8x8_3_v6
793 ;CMP r2, #6
794 ;BLE oc_idct8x8_6_v6
795 CMP r2, #10
796 BLE oc_idct8x8_10_v6
797 oc_idct8x8_slow_v6
798 STMFD r13!,{r4-r11,r14}
799 SUB r13,r13,#64*2
800 ; Row transforms
801 STR r0, [r13,#-4]!
802 ADD r0, r13, #4 ; Write to temp storage.
803 BL idct8_8core_v6
804 BL idct8_8core_v6
805 BL idct8_8core_v6
806 BL idct8_8core_v6
807 LDR r0, [r13], #4 ; Write to the final destination.
808 ; Clear input data for next block (decoder only).
809 SUB r2, r1, #8*16
810 CMP r0, r2
811 MOV r1, r13 ; And read from temp storage.
812 BEQ oc_idct8x8_slow_v6_cols
813 MOV r4, #0
814 MOV r5, #0
815 STRD r4, [r2], #8
816 STRD r4, [r2], #8
817 STRD r4, [r2], #8
818 STRD r4, [r2], #8
819 STRD r4, [r2], #8
820 STRD r4, [r2], #8
821 STRD r4, [r2], #8
822 STRD r4, [r2], #8
823 STRD r4, [r2], #8
824 STRD r4, [r2], #8
825 STRD r4, [r2], #8
826 STRD r4, [r2], #8
827 STRD r4, [r2], #8
828 STRD r4, [r2], #8
829 STRD r4, [r2], #8
830 STRD r4, [r2], #8
831 oc_idct8x8_slow_v6_cols
832 ; Column transforms
833 BL idct8_8core_down_v6
834 BL idct8_8core_down_v6
835 BL idct8_8core_down_v6
836 BL idct8_8core_down_v6
837 ADD r13,r13,#64*2
838 LDMFD r13!,{r4-r11,PC}
839 ENDP
840
841 oc_idct8x8_10_v6 PROC
842 STMFD r13!,{r4-r11,r14}
843 SUB r13,r13,#64*2+4
844 ; Row transforms
845 MOV r2, r13
846 STR r0, [r13,#-4]!
847 AND r0, r2, #4 ; Align the stack.
848 ADD r0, r0, r2 ; Write to temp storage.
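; The AND/ADD pair rounds the temp pointer up to 8-byte alignment so that
; the LDRD/STRD accesses in the v6 cores below are aligned.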
849 BL idct4_3core_v6
850 BL idct2_1core_v6
851 LDR r0, [r13], #4 ; Write to the final destination.
852 ; Clear input data for next block (decoder only).
853 SUB r2, r1, #4*16
854 CMP r0, r2
855 AND r1, r13,#4 ; Align the stack.
856 BEQ oc_idct8x8_10_v6_cols
857 MOV r4, #0
858 MOV r5, #0
859 STRD r4, [r2]
860 STRD r4, [r2,#16]
861 STR r4, [r2,#32]
862 STR r4, [r2,#48]
863 oc_idct8x8_10_v6_cols
864 ; Column transforms
865 ADD r1, r1, r13 ; And read from temp storage.
866 BL idct4_4core_down_v6
867 BL idct4_4core_down_v6
868 BL idct4_4core_down_v6
869 BL idct4_4core_down_v6
870 ADD r13,r13,#64*2+4
871 LDMFD r13!,{r4-r11,PC}
872 ENDP
873
874 oc_idct8x8_3_v6 PROC
875 STMFD r13!,{r4-r8,r14}
876 SUB r13,r13,#64*2
877 ; Row transforms
878 MOV r8, r0
879 MOV r0, r13 ; Write to temp storage.
880 BL idct2_1core_v6
881 ; Clear input data for next block (decoder only).
882 SUB r0, r1, #2*16
883 CMP r0, r8
884 MOV r1, r13 ; Read from temp storage.
885 MOVNE r4, #0
886 STRNE r4, [r0]
887 STRNE r4, [r0,#16]
888 MOVNE r0, r8 ; Write to the final destination.
889 ; Column transforms
890 BL idct2_2core_down_v6
891 BL idct2_2core_down_v6
892 BL idct2_2core_down_v6
893 BL idct2_2core_down_v6
894 ADD r13,r13,#64*2
895 LDMFD r13!,{r4-r8,PC}
896 ENDP
897
898 idct2_1core_v6 PROC
899 ; r0 = ogg_int16_t *_y (destination)
900 ; r1 = const ogg_int16_t *_x (source)
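; The ARMv6 cores process two rows (or two columns) per call: 16-bit values
; from both rows are packed into one register with PKHBT/PKHTB, SMULWB/SMULWT
; select the bottom or top halfword for the 32x16 multiplies, and
; SADD16/SSUB16 run the butterflies on both halves at once.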
901 ; Stage 1:
902 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
903 LDR r3, OC_C4S4
904 LDRSH r6, [r1], #16 ; r6 = x[1,0]
905 SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
906 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
907 SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
908 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
909 SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
910 ; Stage 2:
911 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
912 PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]>
913 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
914 PKHBT r7, r7, r3 ; r7 = <0|t[0,7]>
915 ; Stage 3:
916 PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]>
917 PKHBT r4, r4, r3 ; r4 = <0|t[0,4]>
918 SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
919 ; Stage 4:
920 PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]>
921 PKHBT r5, r5, r3 ; r5 = <0|t[0,5]>
922 SADD16 r3, r12,r7 ; r3 = t[0]+t[7]
923 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]
924 SADD16 r3, r12,r6 ; r3 = t[0]+t[6]
925 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]
926 SADD16 r3, r12,r5 ; r3 = t[0]+t[5]
927 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]
928 SADD16 r3, r12,r4 ; r3 = t[0]+t[4]
929 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]
930 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]
931 STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4]
932 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]
933 STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5]
934 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]
935 STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6]
936 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]
937 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
938 MOV PC,r14
939 ENDP
940 ]
941
942 ALIGN 8
943 OC_C7S1
944 DCD 12785 ; 31F1
945 OC_C1S7
946 DCD 64277 ; FB15
947 OC_C6S2
948 DCD 25080 ; 61F8
949 OC_C2S6
950 DCD 60547 ; EC83
951 OC_C5S3
952 DCD 36410 ; 8E3A
953 OC_C3S5
954 DCD 54491 ; D4DB
955 OC_C4S4
956 DCD 46341 ; B505
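; These are the usual Theora iDCT constants: OC_CiSj is
; round(65536*cos(i*pi/16)) (equivalently round(65536*sin(j*pi/16))), stored
; as words here so they can be loaded with LDR/LDRD.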
957
958 [ OC_ARM_ASM_MEDIA
959 idct2_2core_down_v6 PROC
960 ; r0 = ogg_int16_t *_y (destination)
961 ; r1 = const ogg_int16_t *_x (source)
962 ; Stage 1:
963 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
964 LDR r3, OC_C4S4
965 MOV r7 ,#8 ; r7 = 8
966 LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]>
967 SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
968 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
969 SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
970 SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
971 PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
972 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
973 ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
974 PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]>
975 ; Stage 2:
976 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
977 PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]>
978 SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
979 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
980 PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]>
981 SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
982 PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]>
983 ; Stage 3:
984 SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
985 SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
986 ; Stage 4:
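; Each packed output below is shifted with a three-instruction idiom: ASR #4
; shifts the top halfword, LSL #16 isolates the bottom one, and the
; PKHTB ...,ASR #20 recombines them, i.e. an arithmetic >>4 of both lanes.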
987 SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8
988 MOV r3, r2, ASR #4
989 MOV r2, r2, LSL #16
990 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4
991 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
992 SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8
993 MOV r3, r2, ASR #4
994 MOV r2, r2, LSL #16
995 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4
996 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4
997 SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8
998 MOV r3, r2, ASR #4
999 MOV r2, r2, LSL #16
1000 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4
1001 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4
1002 SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8
1003 MOV r3, r2, ASR #4
1004 MOV r2, r2, LSL #16
1005 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4
1006 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4
1007 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8
1008 MOV r3, r4, ASR #4
1009 MOV r4, r4, LSL #16
1010 PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4
1011 STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4
1012 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8
1013 MOV r3, r5, ASR #4
1014 MOV r5, r5, LSL #16
1015 PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4
1016 STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4
1017 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8
1018 MOV r3, r6, ASR #4
1019 MOV r6, r6, LSL #16
1020 PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4
1021 STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4
1022 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8
1023 MOV r3, r7, ASR #4
1024 MOV r7, r7, LSL #16
1025 PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4
1026 STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
1027 MOV PC,r14
1028 ENDP
1029
1030 ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
1031 ; pay for increased branch mis-prediction to get here, but in practice it
1032 ; doesn't seem to slow anything down to take it out, and it's less code this
1033 ; way.
1034 [ 0
1035 oc_idct8x8_6_v6 PROC
1036 STMFD r13!,{r4-r8,r10,r11,r14}
1037 SUB r13,r13,#64*2+4
1038 ; Row transforms
1039 MOV r8, r0
1040 AND r0, r13,#4 ; Align the stack.
1041 ADD r0, r0, r13 ; Write to temp storage.
1042 BL idct3_2core_v6
1043 BL idct1core_v6
1044 ; Clear input data for next block (decoder only).
1045 SUB r0, r1, #3*16
1046 CMP r0, r8
1047 AND r1, r13,#4 ; Align the stack.
1048 BEQ oc_idct8x8_6_v6_cols
1049 MOV r4, #0
1050 MOV r5, #0
1051 STRD r4, [r0]
1052 STR r4, [r0,#16]
1053 STR r4, [r0,#32]
1054 MOV r0, r8 ; Write to the final destination.
1055 oc_idct8x8_6_v6_cols
1056 ; Column transforms
1057 ADD r1, r1, r13 ; And read from temp storage.
1058 BL idct3_3core_down_v6
1059 BL idct3_3core_down_v6
1060 BL idct3_3core_down_v6
1061 BL idct3_3core_down_v6
1062 ADD r13,r13,#64*2+4
1063 LDMFD r13!,{r4-r8,r10,r11,PC}
1064 ENDP
1065
1066 idct1core_v6 PROC
1067 ; r0 = ogg_int16_t *_y (destination)
1068 ; r1 = const ogg_int16_t *_x (source)
1069 LDRSH r3, [r1], #16
1070 MOV r12,#0x05
1071 ORR r12,r12,#0xB500
1072 MUL r3, r12, r3
1073 ; Stall ?
1074 MOV r3, r3, ASR #16
1075 ; Don't need to actually store the odd lines; they won't be read.
1076 STRH r3, [r0], #2
1077 STRH r3, [r0, #30]
1078 STRH r3, [r0, #62]
1079 STRH r3, [r0, #94]
1080 MOV PC,R14
1081 ENDP
1082
1083 idct3_2core_v6 PROC
1084 ; r0 = ogg_int16_t *_y (destination)
1085 ; r1 = const ogg_int16_t *_x (source)
1086 ; Stage 1:
1087 LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
1088 LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6
1089 ; Stall
1090 SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
1091 LDR r11,OC_C4S4
1092 SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
1093 LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]>
1094 SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
1095 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
1096 SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
1097 PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]>
1098 SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16
1099 PKHBT r2, r2, r11 ; r2 = <0|t[0,2]>
1100 SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
1101 PKHBT r3, r3, r11 ; r3 = <0|t[0,3]>
1102 SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
1103 PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]>
1104 SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
1105 ; Stage 2:
1106 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
1107 PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]>
1108 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
1109 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
1110 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
1111 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
1112 ; Stage 3:
1113 B idct4_3core_stage3_v6
1114 ENDP
1115
1116 ; Another copy so the LDRD offsets are less than +/- 255.
1117 ALIGN 8
1118 OC_C7S1_3_v6
1119 DCD 12785 ; 31F1
1120 OC_C1S7_3_v6
1121 DCD 64277 ; FB15
1122 OC_C6S2_3_v6
1123 DCD 25080 ; 61F8
1124 OC_C2S6_3_v6
1125 DCD 60547 ; EC83
1126
1127 idct3_3core_down_v6 PROC
1128 ; r0 = ogg_int16_t *_y (destination)
1129 ; r1 = const ogg_int16_t *_x (source)
1130 ; Stage 1:
1131 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
1132 LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
1133 LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>
1134 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
1135 MOV r7,#8
1136 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
1137 LDR r11,OC_C4S4
1138 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
1139 ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
1140 PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]>
1141 SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
1142 PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]>
1143 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
1144 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
1145 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
1146 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
1147 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
1148 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
1149 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
1150 ; Stage 2:
1151 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
1152 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
1153 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
1154 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
1155 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
1156 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
1157 ; Stage 3:
1158 B idct4_4core_down_stage3_v6
1159 ENDP
1160 ]
1161
1162 idct4_3core_v6 PROC
1163 ; r0 = ogg_int16_t *_y (destination)
1164 ; r1 = const ogg_int16_t *_x (source)
1165 ; Stage 1:
1166 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
1167 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
1168 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
1169 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
1170 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
1171 PKHBT r9, r9, r2 ; r9 = <0|t[0,6]>
1172 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
1173 PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]>
1174 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
1175 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
1176 LDR r11,OC_C4S4
1177 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
1178 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
1179 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
1180 SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
1181 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
1182 SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
1183 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
1184 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]>
1185 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
1186 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
1187 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
1188 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
1189 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
1190 ; Stage 2:
1191 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
1192 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
1193 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
1194 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
1195 SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
1196 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
1197 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
1198 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
1199 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
1200 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
1201 ; Stage 3:
1202 idct4_3core_stage3_v6
1203 SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2]
1204 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
1205 SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2]
1206 idct4_3core_stage3_5_v6
1207 SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
1208 SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
1209 SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3]
1210 SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3]
1211 ; Stage 4:
1212 SADD16 r12,r10,r7 ; r12= t[0]+t[7]
1213 STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7]
1214 SADD16 r12,r11,r6 ; r12= t[1]+t[6]
1215 STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6]
1216 SADD16 r12,r2, r5 ; r12= t[2]+t[5]
1217 STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5]
1218 SADD16 r12,r3, r4 ; r12= t[3]+t[4]
1219 STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4]
1220 SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]
1221 STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4]
1222 SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]
1223 STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5]
1224 SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]
1225 STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6]
1226 SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]
1227 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
1228 MOV PC,r14
1229 ENDP
1230
1231 ; Another copy so the LDRD offsets are less than +/- 255.
1232 ALIGN 8
1233 OC_C7S1_4_v6
1234 DCD 12785 ; 31F1
1235 OC_C1S7_4_v6
1236 DCD 64277 ; FB15
1237 OC_C6S2_4_v6
1238 DCD 25080 ; 61F8
1239 OC_C2S6_4_v6
1240 DCD 60547 ; EC83
1241 OC_C5S3_4_v6
1242 DCD 36410 ; 8E3A
1243 OC_C3S5_4_v6
1244 DCD 54491 ; D4DB
1245
1246 idct4_4core_down_v6 PROC
1247 ; r0 = ogg_int16_t *_y (destination)
1248 ; r1 = const ogg_int16_t *_x (source)
1249 ; Stage 1:
1250 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
1251 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
1252 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
1253 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
1254 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
1255 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
1256 ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
1257 PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]>
1258 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
1259 PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]>
1260 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
1261 LDR r11,OC_C4S4
1262 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
1263 MOV r7,#8
1264 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
1265 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
1266 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
1267 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
1268 SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
1269 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
1270 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
1271 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
1272 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
1273 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
1274 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
1275 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
1276 ; Stage 2:
1277 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
1278 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
1279 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
1280 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
1281 SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
1282 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
1283 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
1284 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
1285 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
1286 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
1287 ; Stage 3:
1288 idct4_4core_down_stage3_v6
1289 SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8
1290 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
1291 SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8
1292 B idct8_8core_down_stage3_5_v6
1293 ENDP
1294
1295 idct8_8core_v6 PROC
1296 STMFD r13!,{r0,r14}
1297 ; Stage 1:
1298 ;5-6 rotation by 3pi/16
1299 LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5
1300 LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
1301 LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
1302 SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
1303 LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
1304 SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
1305 LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
1306 SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
1307 SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
1308 SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
1309 PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
1310 SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
1311 PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
1312 SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
1313 PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
1314 SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
1315 ;2-3 rotation by 6pi/16
1316 LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6
1317 PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
1318 LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
1319 SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
1320 SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
1321 SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
1322 LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
1323 SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
1324 SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
1325 PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
1326 SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
1327 SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
1328 SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
1329 PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
1330 SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
1331 ;4-7 rotation by 7pi/16
1332 LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
1333 PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9>
1334 LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
1335 PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
1336 SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
1337 SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
1338 LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
1339 SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
1340 SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
1341 SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
1342 SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
1343 PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
1344 SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
1345 PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
1346 SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
1347 PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
1348 SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
1349 ;0-1 butterfly
1350 LDR r11,OC_C4S4
1351 PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
1352 SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
1353 SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
1354 SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
1355 SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16
1356 SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16
1357 SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16
1358 PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]>
1359 SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16
1360 ; Stage 2:
1361 SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
1362 PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]|t[0,1]>
1363 SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
1364 SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
1365 SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
1366 SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
1367 SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
1368 SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
1369 PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
1370 SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
1371 ; Stage 3:
1372 SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2]
1373 PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
1374 SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2]
1375 LDMFD r13!,{r0,r14}
1376 B idct4_3core_stage3_5_v6
1377 ENDP
1378
1379 ; Another copy so the LDRD offsets are less than +/- 255.
1380 ALIGN 8
1381 OC_C7S1_8_v6
1382 DCD 12785 ; 31F1
1383 OC_C1S7_8_v6
1384 DCD 64277 ; FB15
1385 OC_C6S2_8_v6
1386 DCD 25080 ; 61F8
1387 OC_C2S6_8_v6
1388 DCD 60547 ; EC83
1389 OC_C5S3_8_v6
1390 DCD 36410 ; 8E3A
1391 OC_C3S5_8_v6
1392 DCD 54491 ; D4DB
1393
1394 idct8_8core_down_v6 PROC
1395 STMFD r13!,{r0,r14}
1396 ; Stage 1:
1397 ;5-6 rotation by 3pi/16
1398 LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5
1399 LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
1400 LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
1401 SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
1402 LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
1403 SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
1404 LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
1405 SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
1406 SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
1407 SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
1408 PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
1409 SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
1410 PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
1411 SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
1412 PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
1413 SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
1414 ;2-3 rotation by 6pi/16
1415 LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6
1416 PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
1417 LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
1418 SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
1419 SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
1420 SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
1421 LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
1422 SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
1423 SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
1424 PKHBT r2, r2, r9, LSL #16 ; r2 = <r2|r9>
1425 SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
1426 SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
1427 SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
1428 PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
1429 SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
1430 ;4-7 rotation by 7pi/16
1431 LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
1432 PKHBT r9, r9, r12,LSL #16 ; r9 = <r9|r12>
1433 LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
1434 PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
1435 SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
1436 SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
1437 LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
1438 SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
1439 SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
1440 SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
1441 SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
1442 PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
1443 SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
1444 PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
1445 SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
1446 PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
1447 SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
1448 ;0-1 butterfly
1449 LDR r11,OC_C4S4
1450 MOV r14,#8
1451 PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
1452 SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
1453 SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
1454 SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
1455 SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
1456 SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
1457 SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
1458 PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
1459 SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
1460 ; Stage 2:
1461 SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
1462 PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]+8|t[0,1]+8>
1463 SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
1464 SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
1465 SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
1466 SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
1467 SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
1468 SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
1469 PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
1470 SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
1471 ; Stage 3:
1472 SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8
1473 PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
1474 SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8
1475 LDMFD r13!,{r0,r14}
1476 idct8_8core_down_stage3_5_v6
1477 SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
1478 SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
1479 SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8
1480 SSUB16 r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8
1481 ; Stage 4:
1482 SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8
1483 SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8
1484 MOV r10,r12,ASR #4
1485 MOV r12,r12,LSL #16
1486 PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4
1487 STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
1488 SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8
1489 SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8
1490 MOV r10,r12,ASR #4
1491 MOV r12,r12,LSL #16
1492 PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4
1493 STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4
1494 SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8
1495 SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8
1496 MOV r10,r12,ASR #4
1497 MOV r12,r12,LSL #16
1498 PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4
1499 STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4
1500 SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8
1501 SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8
1502 MOV r10,r12,ASR #4
1503 MOV r12,r12,LSL #16
1504 PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4
1505 STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4
1506 MOV r10,r4, ASR #4
1507 MOV r4, r4, LSL #16
1508 PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4
1509 STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4
1510 MOV r10,r5, ASR #4
1511 MOV r5, r5, LSL #16
1512 PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4
1513 STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4
1514 MOV r10,r6, ASR #4
1515 MOV r6, r6, LSL #16
1516 PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4
1517 STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4
1518 MOV r10,r7, ASR #4
1519 MOV r7, r7, LSL #16
1520 PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4
1521 STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
1522 MOV PC,r14
1523 ENDP
1524 ]
1525
1526 [ OC_ARM_ASM_NEON
1527 EXPORT oc_idct8x8_1_neon
1528 EXPORT oc_idct8x8_neon
1529
1530 ALIGN 16
1531 OC_IDCT_CONSTS_NEON
1532 DCW 8
1533 DCW 64277 ; FB15 (C1S7)
1534 DCW 60547 ; EC83 (C2S6)
1535 DCW 54491 ; D4DB (C3S5)
1536 DCW 46341 ; B505 (C4S4)
1537 DCW 36410 ; 8E3A (C5S3)
1538 DCW 25080 ; 61F8 (C6S2)
1539 DCW 12785 ; 31F1 (C7S1)
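; After the VLD1.64 {D0,D1} in oc_idct8x8_slow_neon, D0 holds
; <C3S5|C2S6|C1S7|8> and D1 holds <C7S1|C6S2|C5S3|C4S4>, so individual
; constants are used as scalar lanes (e.g. D1[3]=OC_C7S1) by VMULL.S16.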
1540
1541 oc_idct8x8_1_neon PROC
1542 ; r0 = ogg_int16_t *_y
1543 ; r1 = ogg_uint16_t _dc
1544 VDUP.S16 Q0, r1
1545 VMOV Q1, Q0
1546 VST1.64 {D0, D1, D2, D3}, [r0@128]!
1547 VST1.64 {D0, D1, D2, D3}, [r0@128]!
1548 VST1.64 {D0, D1, D2, D3}, [r0@128]!
1549 VST1.64 {D0, D1, D2, D3}, [r0@128]
1550 MOV PC, r14
1551 ENDP
1552
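; oc_idct8x8_neon dispatches on _last_zzi, which bounds how many leading
; zig-zag coefficients of the block can be nonzero: blocks with at most 10
; take the reduced oc_idct8x8_10_neon path; everything else runs the full
; transform below.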
1553 oc_idct8x8_neon PROC
1554 ; r0 = ogg_int16_t *_y
1555 ; r1 = ogg_int16_t *_x
1556 ; r2 = int _last_zzi
1557 CMP r2, #10
1558 BLE oc_idct8x8_10_neon
1559 oc_idct8x8_slow_neon
1560 VPUSH {D8-D15}
1561 MOV r2, r1
1562 ADR r3, OC_IDCT_CONSTS_NEON
1563 ; Row transforms (input is pre-transposed)
1564 VLD1.64 {D16,D17,D18,D19}, [r2@128]!
1565 VLD1.64 {D20,D21,D22,D23}, [r2@128]!
1566 VLD1.64 {D24,D25,D26,D27}, [r2@128]!
1567 VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
1568 VLD1.64 {D28,D29,D30,D31}, [r2@128]
1569 VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
1570 VLD1.64 {D0,D1}, [r3@128]
1571 MOV r12, r14
1572 BL oc_idct8x8_stage123_neon
1573 ; Stage 4
1574 VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
1575 VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
1576 VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
1577 VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
1578 VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
1579 VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
1580 VTRN.16 Q14,Q15
1581 VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
1582 VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
1583 ; 8x8 Transpose
1584 VTRN.16 Q8, Q9
1585 VTRN.16 Q10,Q11
1586 VTRN.16 Q12,Q13
1587 VTRN.32 Q8, Q10
1588 VTRN.32 Q9, Q11
1589 VTRN.32 Q12,Q14
1590 VTRN.32 Q13,Q15
1591 VSWP D17,D24
1592 VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
1593 VSWP D19,D26
1594 VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
1595 VSWP D21,D28
1596 VSWP D23,D30
1597 ; Column transforms
1598 BL oc_idct8x8_stage123_neon
1599 CMP r0,r1
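; If the output block aliases the input (r0 == r1), skip clearing the
; input coefficients below and only store the result.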
1600 ; We have to put the return address back in the LR, or the branch
1601 ; predictor will not recognize the function return and will
1602 ; mispredict the entire call stack.
1603 MOV r14, r12
1604 ; Stage 4
1605 VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
1606 VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
1607 VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
1608 VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
1609 VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
1610 VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
1611 VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
1612 VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
1613 BEQ oc_idct8x8_slow_neon_noclear
1614 VMOV.I8 Q2,#0
1615 VPOP {D8-D15}
1616 VMOV.I8 Q3,#0
1617 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
1618 VST1.64 {D4, D5, D6, D7}, [r1@128]!
1619 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
1620 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
1621 VST1.64 {D4, D5, D6, D7}, [r1@128]!
1622 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
1623 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
1624 VST1.64 {D4, D5, D6, D7}, [r1@128]!
1625 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
1626 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
1627 VST1.64 {D4, D5, D6, D7}, [r1@128]
1628 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
1629 VSTMIA r0, {D16-D31}
1630 MOV PC, r14
1631
1632 oc_idct8x8_slow_neon_noclear
1633 VPOP {D8-D15}
1634 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
1635 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
1636 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
1637 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
1638 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
1639 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
1640 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
1641 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
1642 VSTMIA r0, {D16-D31}
1643 MOV PC, r14
1644 ENDP
1645
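; oc_idct8x8_stage123_neon runs the first three butterfly stages on all
; eight 8-wide vectors at once. A sketch of the arithmetic, reconstructed
; from the register comments below (each product is truncated with >>16,
; using the wraparound compensation described above for the large
; constants):
;   Stage 1: t[0]=C4S4*(x[0]+x[4])       t[1]=C4S4*(x[0]-x[4])
;            t[2]=C6S2*x[2]-C2S6*x[6]    t[3]=C2S6*x[2]+C6S2*x[6]
;            t[4]=C7S1*x[1]-C1S7*x[7]    t[7]=C1S7*x[1]+C7S1*x[7]
;            t[5]=C3S5*x[5]-C5S3*x[3]    t[6]=C5S3*x[5]+C3S5*x[3]
;   Stage 2: t[4]'=t[4]+t[5]             t[5]'=C4S4*(t[4]-t[5])
;            t[7]'=t[7]+t[6]             t[6]'=C4S4*(t[7]-t[6])
;   Stage 3: t[0]''=t[0]+t[3]   t[3]''=t[0]-t[3]
;            t[1]''=t[1]+t[2]   t[2]''=t[1]-t[2]
;            t[6]''=t[6]'+t[5]' t[5]''=t[6]'-t[5]'
; The caller finishes with stage 4 (the final add/subtract pairs) and the
; +8>>4 rounding via VRSHR.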
1646 oc_idct8x8_stage123_neon PROC
1647 ; Stages 1 & 2
1648 VMULL.S16 Q4, D18,D1[3]
1649 VMULL.S16 Q5, D19,D1[3]
1650 VMULL.S16 Q7, D30,D1[3]
1651 VMULL.S16 Q6, D31,D1[3]
1652 VMULL.S16 Q2, D30,D0[1]
1653 VMULL.S16 Q3, D31,D0[1]
1654 VSHRN.S32 D8, Q4, #16
1655 VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16)
1656 VSHRN.S32 D14,Q7, #16
1657 VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16)
1658 VSHRN.S32 D4, Q2, #16
1659 VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7]
1660 VSUB.S16 Q4, Q4, Q15
1661 VADD.S16 Q7, Q7, Q9
1662 VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4]
1663 VMULL.S16 Q2, D18,D0[1]
1664 VMULL.S16 Q9, D19,D0[1]
1665 VMULL.S16 Q5, D26,D0[3]
1666 VMULL.S16 Q3, D27,D0[3]
1667 VMULL.S16 Q6, D22,D0[3]
1668 VMULL.S16 Q12,D23,D0[3]
1669 VSHRN.S32 D4, Q2, #16
1670 VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1]
1671 VSHRN.S32 D10,Q5, #16
1672 VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5]
1673 VSHRN.S32 D12,Q6, #16
1674 VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3]
1675 VADD.S16 Q7, Q7, Q2 ; Q7 = t[7]
1676 VSUB.S16 Q5, Q5, Q11
1677 VADD.S16 Q6, Q6, Q11
1678 VADD.S16 Q5, Q5, Q13
1679 VADD.S16 Q6, Q6, Q13
1680 VMULL.S16 Q9, D22,D1[1]
1681 VMULL.S16 Q11,D23,D1[1]
1682 VMULL.S16 Q15,D26,D1[1]
1683 VMULL.S16 Q13,D27,D1[1]
1684 VMULL.S16 Q2, D20,D1[2]
1685 VMULL.S16 Q12,D21,D1[2]
1686 VSHRN.S32 D18,Q9, #16
1687 VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3]
1688 VSHRN.S32 D30,Q15,#16
1689 VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5]
1690 VSHRN.S32 D4, Q2, #16
1691 VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16)
1692 VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5]
1693 VADD.S16 Q6, Q6, Q15 ; Q6 = t[6]
1694 VSUB.S16 Q2, Q2, Q14
1695 VMULL.S16 Q3, D28,D1[2]
1696 VMULL.S16 Q11,D29,D1[2]
1697 VMULL.S16 Q12,D28,D0[2]
1698 VMULL.S16 Q9, D29,D0[2]
1699 VMULL.S16 Q13,D20,D0[2]
1700 VMULL.S16 Q15,D21,D0[2]
1701 VSHRN.S32 D6, Q3, #16
1702 VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16)
1703 VSHRN.S32 D24,Q12,#16
1704 VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6]
1705 VSHRN.S32 D26,Q13,#16
1706 VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2]
1707 VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5]
1708 VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6]
1709 VADD.S16 Q3, Q3, Q10
1710 VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5]
1711 VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6]
1712 VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2]
1713 VADD.S16 Q3, Q3, Q13 ; Q3 = t[3]
1714 VMULL.S16 Q12,D16,D1[0]
1715 VMULL.S16 Q13,D17,D1[0]
1716 VMULL.S16 Q14,D2, D1[0]
1717 VMULL.S16 Q15,D3, D1[0]
1718 VMULL.S16 Q5, D18,D1[0]
1719 VMULL.S16 Q6, D22,D1[0]
1720 VSHRN.S32 D24,Q12,#16
1721 VSHRN.S32 D25,Q13,#16
1722 VSHRN.S32 D28,Q14,#16
1723 VSHRN.S32 D29,Q15,#16
1724 VMULL.S16 Q13,D19,D1[0]
1725 VMULL.S16 Q15,D23,D1[0]
1726 VADD.S16 Q8, Q8, Q12 ; Q8 = t[0]
1727 VADD.S16 Q1, Q1, Q14 ; Q1 = t[1]
1728 VSHRN.S32 D10,Q5, #16
1729 VSHRN.S32 D12,Q6, #16
1730 VSHRN.S32 D11,Q13,#16
1731 VSHRN.S32 D13,Q15,#16
1732 VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
1733 VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
1734 ; Stage 3
1735 VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3]
1736 VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3]
1737 VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2]
1738 VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]'
1739 VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2]
1740 VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]'
1741 MOV PC, r14
1742 ENDP
1743
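; oc_idct8x8_10_neon covers blocks whose nonzero coefficients all lie in
; the first 10 zig-zag positions, i.e. inside the top-left 4x4 quadrant of
; the (pre-transposed) input. Only the first four coefficients of each of
; the first four rows are used; the rest are taken to be zero, which
; collapses stage 1 to single products (e.g. t[4]=OC_C7S1*x[1]>>16 and
; -t[5]=OC_C5S3*x[3]>>16, since x[5]=x[7]=0).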
1744 oc_idct8x8_10_neon PROC
1745 ADR r3, OC_IDCT_CONSTS_NEON
1746 VLD1.64 {D0,D1}, [r3@128]
1747 MOV r2, r1
1748 ; Row transforms (input is pre-transposed)
1749 ; Stage 1
1750 VLD1.64 {D16,D17,D18,D19},[r2@128]!
1751 MOV r12, #16
1752 VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16)
1753 VLD1.64 {D17}, [r2@64], r12
1754 VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16)
1755 VLD1.64 {D19}, [r2@64]
1756 VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16)
1757 VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16)
1758 VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16)
1759 VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1]
1760 VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2]
1761 VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0]
1762 VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1]
1763 VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2]
1764 VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3]
1765 VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3]
1766 VSHRN.S32 D5, Q12,#16 ; D5 = t[4]
1767 VSHRN.S32 D2, Q1, #16 ; D2 = t[2]
1768 VADD.S16 D4, D4, D18 ; D4 = t[7]
1769 VADD.S16 D6, D6, D19 ; D6 = t[6]
1770 VADD.S16 D7, D7, D19 ; D7 = -t[5]
1771 VADD.S16 Q15,Q15,Q8 ; D30= t[0]
1772 ; D31= t[3]
1773 ; Stages 2 & 3
1774 VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6]
1775 ; D25= t[4]'=t[4]+t[5]
1776 VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6]
1777 ; D27= t[4]-t[5]
1778 VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6])
1779 ; -(t[7]-t[6]<<16)
1780 VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5])
1781 ; -(t[4]-t[5]<<16)
1782 VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3]
1783 VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2]
1784 VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2]
1785 VSHRN.S32 D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16)
1786 ; -(t[7]-t[6])
1787 VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16)
1788 ; -(t[4]-t[5])
1789 VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3]
1790 VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
1791 VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
1792 VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]'
1793 VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]'
1794 ; Stage 4
1795 VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]'
1796 ; D23= y[5]=t[2]'-t[5]''
1797 VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]''
1798 ; D21= y[4]=t[3]'-t[4]'
1799 VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]'
1800 ; D17= y[2]=t[2]'+t[5]''
1801 VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'+t[6]''
1802 ; D19= y[3]=t[3]'+t[4]'
1803 ; 8x4 transpose
1804 VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6
1805 ; Q11= d5d4b5b4 d7d6b7b6
1806 VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0
1807 ; Q9 = d3d2b3b2 d1d0b1b0
1808 VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4
1809 VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4
1810 VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0
1811 ; Q11= d7d6d5d4 d3d2d1d0
1812 VMULL.S16 Q15,D18,D0[1]
1813 VMULL.S16 Q13,D22,D1[1]
1814 VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0
1815 ; Q10= c7c6c5c4 c3c2c1c0
1816 ; Column transforms
1817 ; Stages 1, 2, & 3
1818 VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
1819 VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
1820 VMULL.S16 Q3, D22,D0[3]
1821 VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
1822 VSHRN.S32 D30,Q15,#16
1823 VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1]
1824 VSHRN.S32 D26,Q13,#16
1825 VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3]
1826 VSHRN.S32 D28,Q3, #16
1827 VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3]
1828 VADD.S16 Q15,Q15,Q9 ; Q15= t[7]
1829 VADD.S16 Q13,Q13,Q11 ; Q13= -t[5]
1830 VADD.S16 Q14,Q14,Q11 ; Q14= t[6]
1831 VMULL.S16 Q12,D18,D1[3]
1832 VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1]
1833 VMULL.S16 Q1, D16,D1[0]
1834 VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
1835 VMULL.S16 Q3, D20,D0[2]
1836 VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
1837 VSHRN.S32 D24,Q12,#16
1838 VSHRN.S32 D25,Q2, #16 ; Q12= t[4]
1839 VMULL.S16 Q2, D20,D1[2]
1840 VSHRN.S32 D2, Q1, #16
1841 VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0]
1842 VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2]
1843 VSHRN.S32 D6, Q3, #16
1844 VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2]
1845 VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6]
1846 VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6]
1847 VSHRN.S32 D4, Q2, #16
1848 VSHRN.S32 D5, Q11,#16 ; Q2 = t[2]
1849 VADD.S16 Q1, Q1, Q8 ; Q1 = t[0]
1850 VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5]
1851 VADD.S16 Q3, Q3, Q10 ; Q3 = t[3]
1852 VMULL.S16 Q10,D16,D1[0]
1853 VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5])
1854 ; -(t[4]-t[5]<<16)
1855 VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5]
1856 VMULL.S16 Q14,D18,D1[0]
1857 VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[7]-t[6])
1858 ; -(t[7]-t[6]<<16)
1859 VSHRN.S32 D20,Q10,#16
1860 VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16)
1861 ; -(t[4]-t[5])
1862 VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3]
1863 VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3]
1864 VSHRN.S32 D28,Q14,#16
1865 VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16)
1866 ; -(t[7]-t[6])
1867 VADD.S16 Q10,Q10,Q8 ; Q10=t[5]'
1868 VADD.S16 Q14,Q14,Q9 ; Q14=t[6]'
1869 VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]'
1870 VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]'
1871 VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2]
1872 VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2]
1873 ; Stage 4
1874 CMP r0, r1
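; As in the slow path, skip clearing the input coefficients when the
; output aliases the input (r0 == r1).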
1875 VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]'
1876 VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]''
1877 VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]'
1878 VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]''
1879 VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]''
1880 VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]'
1881 VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]'
1882 VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]''
1883 BEQ oc_idct8x8_10_neon_noclear
1884 VMOV.I8 D2, #0
1885 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
1886 VST1.64 {D2}, [r1@64], r12
1887 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
1888 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
1889 VST1.64 {D2}, [r1@64], r12
1890 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
1891 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
1892 VST1.64 {D2}, [r1@64], r12
1893 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
1894 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
1895 VST1.64 {D2}, [r1@64]
1896 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
1897 VSTMIA r0, {D16-D31}
1898 MOV PC, r14
1899
1900 oc_idct8x8_10_neon_noclear
1901 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
1902 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
1903 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
1904 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
1905 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
1906 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
1907 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
1908 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
1909 VSTMIA r0, {D16-D31}
1910 MOV PC, r14
1911 ENDP
1912 ]
1913
1914 END
