|
1 ;******************************************************************** |
|
2 ;* * |
|
3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
|
4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
|
5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
|
6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
|
7 ;* * |
|
8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * |
|
9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
|
10 ;* * |
|
11 ;******************************************************************** |
|
12 ; Original implementation: |
|
13 ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd |
|
14 ; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $ |
|
15 ;******************************************************************** |
|
16 |
|
17 AREA |.text|, CODE, READONLY |
|
18 |
|
19 ; Explicitly specifying alignment here because some versions of |
|
20 ; gas don't align code correctly. See |
|
21 ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html |
|
22 ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 |
|
23 ALIGN |
|
24 |
|
25 GET armopts.s |
|
26 |
|
27 ; Vanilla ARM v4 versions |
|
28 EXPORT oc_frag_copy_list_arm |
|
29 EXPORT oc_frag_recon_intra_arm |
|
30 EXPORT oc_frag_recon_inter_arm |
|
31 EXPORT oc_frag_recon_inter2_arm |
|
32 |
|
oc_frag_copy_list_arm PROC
        ; Copy an 8-row by 8-byte block from _src_frame to _dst_frame for
        ; every fragment index listed in _fragis.
        ;  r0      = _dst_frame
        ;  r1      = _src_frame
        ;  r2      = _ystride
        ;  r3      = _fragis        (32-bit entries)
        ;  [sp]    = _nfragis
        ;  [sp,#4] = _frag_buf_offs (32-bit entries, indexed by fragment)
        ; Each row is moved as two word loads and two word stores.
        ; NOTE(review): that presumably needs both frames word-aligned at
        ; every fragment offset - confirm against the callers.
        LDR     r12,[sp]                ; r12 = _nfragis
        STMFD   sp!,{r4-r6,r11,lr}      ; 5 words pushed
        SUBS    r12,r12,#1              ; r12 = entries left after this one
        ; Only read _fragis[0] when the list is non-empty, like the guarded
        ; first load in oc_frag_copy_list_edsp.
        LDRGE   r4, [r3],#4             ; r4  = _fragis[fragii]
        LDRGE   r14,[sp,#4*6]           ; r14 = _frag_buf_offs
        BLT     ofcl_arm_done           ; _nfragis<=0: nothing to copy
        SUB     r2, r2, #4              ; r2 = _ystride-4, so that a +4 step
                                        ;  followed by a +r2 step is one row
ofcl_arm_next
        LDR     r11,[r14,r4,LSL #2]     ; r11 = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12,r12,#1              ; flags are consumed by LDRGE/BGE below
        ; Stall (on XScale)
        ADD     r4, r1, r11             ; r4  = _src_frame+frag_buf_off
        LDR     r6, [r4], #4            ; row 0, bytes 0-3
        ADD     r11,r0, r11             ; r11 = _dst_frame+frag_buf_off
        LDR     r5, [r4], r2            ; row 0, bytes 4-7
        ; From here on each group stores row n while loading row n+1.
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 1
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 2
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 3
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 5
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 6
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4            ; row 7
        STR     r5, [r11],r2
        LDR     r5, [r4]                ; last word: no write-back needed
        LDRGE   r4, [r3],#4             ; r4 = next _fragis[fragii], if any
        STR     r6, [r11],#4
        STR     r5, [r11]
        BGE     ofcl_arm_next
ofcl_arm_done
        LDMFD   sp!,{r4-r6,r11,pc}
        ENDP

oc_frag_recon_intra_arm PROC
        ; Write an 8x8 block of clamp(_residue[i]+128, 0, 255) bytes.
        ;  r0 = unsigned char *_dst
        ;  r1 = int _ystride
        ;  r2 = const ogg_int16_t _residue[64]
        ; Clamp idiom used for every pixel (r5 holds 255):
        ;   ADDS  x,x,#128      ; GT when x>0, LT when x<0
        ;   CMPGT r5,x          ; only for x>0: LT when x>255
        ;   EORLT x,r5,x,ASR #32; x<0   -> 255^-1, low byte 0
        ;                       ; x>255 -> 255^0 = 255
        ; STRB then keeps just the low byte.
        STMFD   sp!,{r4,r5,lr}
        MOV     r14,#8                  ; r14 = rows remaining
        MOV     r5, #255                ; r5  = upper clamp bound
        SUB     r1, r1, #7              ; 7 single-byte steps + r1 = one row
ofrintra_arm_row
        LDRSH   r3, [r2], #2            ; residue 0
        LDRSH   r4, [r2], #2            ; residue 1
        LDRSH   r12,[r2], #2            ; residue 2
        ADDS    r3, r3, #128            ; pixel 0
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128            ; pixel 1
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        LDRSH   r3, [r2], #2            ; residue 3
        STRB    r4, [r0], #1
        ADDS    r12,r12,#128            ; pixel 2
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32
        LDRSH   r4, [r2], #2            ; residue 4
        STRB    r12,[r0], #1
        ADDS    r3, r3, #128            ; pixel 3
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        LDRSH   r12,[r2], #2            ; residue 5
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128            ; pixel 4
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        LDRSH   r3, [r2], #2            ; residue 6
        STRB    r4, [r0], #1
        ADDS    r12,r12,#128            ; pixel 5
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32
        LDRSH   r4, [r2], #2            ; residue 7
        STRB    r12,[r0], #1
        ADDS    r3, r3, #128            ; pixel 6
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128            ; pixel 7
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        STRB    r4, [r0], r1            ; advance _dst to the next row
        SUBS    r14,r14,#1
        BGT     ofrintra_arm_row
        LDMFD   sp!,{r4,r5,pc}
        ENDP
|
142 |
|
oc_frag_recon_inter_arm PROC
        ; Write an 8x8 block of clamp(src[x]+residue[i], 0, 255) bytes.
        ;  r0 = unsigned char *dst
        ;  r1 = const unsigned char *src
        ;  r2 = int ystride (applied to both dst and src)
        ;  r3 = const ogg_int16_t residue[64]
        ; Pixels alternate between r12 (+src byte in r14) and r11 (+src byte
        ; in r10).  The next pixel's loads sit between CMPGT and EORLT of the
        ; current one; loads leave the flags untouched.
        STMFD   sp!,{r5,r9-r11,lr}
        MOV     r9, #8                  ; r9 = rows remaining
        MOV     r5, #255                ; r5 = upper clamp bound
        SUB     r2, r2, #7              ; 7 single-byte steps + r2 = one row
ofrinter_arm_row
        LDRSH   r12,[r3], #2            ; residue 0
        LDRB    r14,[r1], #1            ; src 0
        LDRSH   r11,[r3], #2            ; residue 1
        LDRB    r10,[r1], #1            ; src 1
        ADDS    r12,r12,r14             ; pixel 0
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32     ; <0 -> low byte 0, >255 -> 255
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10             ; pixel 1
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2            ; residue 2
        LDRB    r14,[r1], #1            ; src 2
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14             ; pixel 2
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2            ; residue 3
        LDRB    r10,[r1], #1            ; src 3
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10             ; pixel 3
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2            ; residue 4
        LDRB    r14,[r1], #1            ; src 4
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14             ; pixel 4
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2            ; residue 5
        LDRB    r10,[r1], #1            ; src 5
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10             ; pixel 5
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2            ; residue 6
        LDRB    r14,[r1], #1            ; src 6
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14             ; pixel 6
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2            ; residue 7
        LDRB    r10,[r1], r2            ; src 7; src moves to the next row
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10             ; pixel 7
        CMPGT   r5, r11
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], r2            ; dst moves to the next row
        SUBS    r9, r9, #1
        BGT     ofrinter_arm_row
        LDMFD   sp!,{r5,r9-r11,pc}
        ENDP
|
205 |
|
oc_frag_recon_inter2_arm PROC
        ; Write an 8x8 block of
        ;   clamp(((src1[x]+src2[x])>>1)+residue[i], 0, 255) bytes.
        ;  r0   = unsigned char *dst
        ;  r1   = const unsigned char *src1
        ;  r2   = const unsigned char *src2
        ;  r3   = int ystride (applied to dst, src1 and src2)
        ;  [sp] = const ogg_int16_t residue[64]
        ; Pixels alternate between r5 and r7; r6 carries the src2 byte and r4
        ; the residue.  The next pixel's loads sit between CMPGT and EORLT.
        LDR     r12,[sp]                ; r12 = residue
        STMFD   sp!,{r4-r8,lr}
        MOV     r14,#8                  ; r14 = rows remaining
        MOV     r8, #255                ; r8  = upper clamp bound
        SUB     r3, r3, #7              ; 7 single-byte steps + r3 = one row
ofrinter2_arm_row
        LDRB    r5, [r1], #1            ; src1 0
        LDRB    r6, [r2], #1            ; src2 0
        LDRSH   r4, [r12],#2            ; residue 0
        LDRB    r7, [r1], #1            ; src1 1
        ADD     r5, r5, r6              ; pixel 0
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r6, [r2], #1            ; src2 1
        LDRSH   r4, [r12],#2            ; residue 1
        EORLT   r5, r8, r5, ASR #32     ; <0 -> low byte 0, >255 -> 255
        STRB    r5, [r0], #1
        ADD     r7, r7, r6              ; pixel 1
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1            ; src1 2
        LDRB    r6, [r2], #1            ; src2 2
        LDRSH   r4, [r12],#2            ; residue 2
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6              ; pixel 2
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], #1            ; src1 3
        LDRB    r6, [r2], #1            ; src2 3
        LDRSH   r4, [r12],#2            ; residue 3
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6              ; pixel 3
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1            ; src1 4
        LDRB    r6, [r2], #1            ; src2 4
        LDRSH   r4, [r12],#2            ; residue 4
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6              ; pixel 4
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], #1            ; src1 5
        LDRB    r6, [r2], #1            ; src2 5
        LDRSH   r4, [r12],#2            ; residue 5
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6              ; pixel 5
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1            ; src1 6
        LDRB    r6, [r2], #1            ; src2 6
        LDRSH   r4, [r12],#2            ; residue 6
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6              ; pixel 6
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], r3            ; src1 7; src1 moves to the next row
        LDRB    r6, [r2], r3            ; src2 7; src2 moves to the next row
        LDRSH   r4, [r12],#2            ; residue 7
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6              ; pixel 7
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], r3            ; dst moves to the next row
        SUBS    r14,r14,#1
        BGT     ofrinter2_arm_row
        LDMFD   sp!,{r4-r8,pc}
        ENDP
|
286 |
|
287 [ OC_ARM_ASM_EDSP |
|
288 EXPORT oc_frag_copy_list_edsp |
|
289 |
|
oc_frag_copy_list_edsp PROC
        ; LDRD/STRD variant of the fragment list copy: one doubleword (8
        ; bytes) per row, 8 rows per fragment.
        ;  r0      = _dst_frame
        ;  r1      = _src_frame
        ;  r2      = _ystride
        ;  r3      = _fragis        (32-bit entries)
        ;  [sp]    = _nfragis
        ;  [sp,#4] = _frag_buf_offs (32-bit entries, indexed by fragment)
        LDR     r12,[sp]                ; r12 = _nfragis
        STMFD   sp!,{r4-r11,lr}         ; 9 words pushed
        SUBS    r12,r12,#1              ; r12 = entries left after this one
        LDRGE   r5, [r3],#4             ; r5  = _fragis[fragii]
        LDRGE   r14,[sp,#4*10]          ; r14 = _frag_buf_offs
        BLT     ofcl_edsp_done          ; _nfragis<=0: nothing to copy
ofcl_edsp_next
        MOV     r4, r1                  ; r4 = _src_frame
        LDR     r5, [r14,r5, LSL #2]    ; r5 = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12,r12,#1              ; flags are consumed by LDRGE/BGE below
        ; Stall (on XScale)
        LDRD    r6, [r4, r5]!           ; row 0; r4 = _src_frame+frag_buf_off
        LDRD    r8, [r4, r2]!           ; row 1
        ; Stall
        STRD    r6, [r5, r0]!           ; row 0; r5 = _dst_frame+frag_buf_off
        STRD    r8, [r5, r2]!           ; row 1
        ; Stall
        ; On XScale at least, three consecutive loads cause a stall, but
        ; that is no worse than doing two and needing one more LDRD/STRD
        ; pair later on.
        LDRD    r6, [r4, r2]!           ; row 2
        LDRD    r8, [r4, r2]!           ; row 3
        LDRD    r10,[r4, r2]!           ; row 4
        ; Stall
        STRD    r6, [r5, r2]!
        STRD    r8, [r5, r2]!
        STRD    r10,[r5, r2]!
        LDRD    r6, [r4, r2]!           ; row 5
        LDRD    r8, [r4, r2]!           ; row 6
        LDRD    r10,[r4, r2]!           ; row 7
        STRD    r6, [r5, r2]!
        STRD    r8, [r5, r2]!
        STRD    r10,[r5, r2]!
        LDRGE   r5, [r3],#4             ; r5 = next _fragis[fragii], if any
        BGE     ofcl_edsp_next
ofcl_edsp_done
        LDMFD   sp!,{r4-r11,pc}
        ENDP
|
333 ] |
|
334 |
|
335 [ OC_ARM_ASM_MEDIA |
|
336 EXPORT oc_frag_recon_intra_v6 |
|
337 EXPORT oc_frag_recon_inter_v6 |
|
338 EXPORT oc_frag_recon_inter2_v6 |
|
339 |
|
oc_frag_recon_intra_v6 PROC
        ; ARMv6 media variant: write an 8x8 block of residue+128 saturated
        ; to 0..255, one 8-byte row per iteration.
        ;  r0 = unsigned char *_dst
        ;  r1 = int _ystride
        ;  r2 = const ogg_int16_t _residue[64]
        ; Register pictures below list halfword/byte lanes from most to
        ; least significant; digits are the pixel index within the row.
        STMFD   sp!,{r4-r6,lr}
        MOV     r14,#8                  ; r14 = rows remaining
        MOV     r12,r2                  ; r12 = _residue (frees r2/r3 for LDRD)
        LDR     r6, =0x00800080         ; +128 in each halfword lane
ofrintra_v6_row
        LDRD    r2, [r12],#8            ; r2 = 11110000  r3 = 33332222
        LDRD    r4, [r12],#8            ; r4 = 55554444  r5 = 77776666
        SUBS    r14,r14,#1              ; flags are consumed by the BGT below
        QADD16  r2, r2, r6              ; add the bias with signed saturation
        QADD16  r3, r3, r6
        QADD16  r4, r4, r6
        QADD16  r5, r5, r6
        USAT16  r2, #8, r2              ; r2 = __11__00 (each lane 0..255)
        USAT16  r3, #8, r3              ; r3 = __33__22
        USAT16  r4, #8, r4              ; r4 = __55__44
        USAT16  r5, #8, r5              ; r5 = __77__66
        ORR     r2, r2, r2, LSR #8      ; r2 = __111100
        ORR     r3, r3, r3, LSR #8      ; r3 = __333322
        ORR     r4, r4, r4, LSR #8      ; r4 = __555544
        ORR     r5, r5, r5, LSR #8      ; r5 = __777766
        PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
        PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
        STRD    r2, [r0], r1            ; store the row, step _dst by _ystride
        BGT     ofrintra_v6_row
        LDMFD   sp!,{r4-r6,pc}
        ENDP
|
370 |
|
oc_frag_recon_inter_v6 PROC
        ; ARMv6 media variant: write an 8x8 block of _src+_residue saturated
        ; to 0..255, one 8-byte row per iteration.
        ;  r0 = unsigned char *_dst
        ;  r1 = const unsigned char *_src
        ;  r2 = int _ystride (applied to both _dst and _src)
        ;  r3 = const ogg_int16_t _residue[64]
        ; Register pictures list lanes from most to least significant;
        ; digits are the pixel index within the row.
        STMFD   sp!,{r4-r7,lr}
        MOV     r14,#8                  ; r14 = rows remaining
ofrinter_v6_row
        LDRD    r6, [r3], #8            ; r6 = 11110000  r7 = 33332222
        SUBS    r14,r14,#1              ; flags are consumed by the BGT below
        [ OC_ARM_CAN_UNALIGN_LDRD
        LDRD    r4, [r1], r2            ; Unaligned: r4 = 33221100  r5 = 77665544
        |
        ; Fallback: fetch the same 8 source bytes with two word loads.
        LDR     r5, [r1, #4]            ; r5 = 77665544
        LDR     r4, [r1], r2            ; r4 = 33221100; _src to the next row
        ]
        ; Pixels 0-3: split even/odd lanes so residues and bytes line up.
        PKHBT   r12,r6, r7, LSL #16     ; r12= 22220000
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
        UXTB16  r6, r4                  ; r6 = __22__00
        UXTB16  r4, r4, ROR #8          ; r4 = __33__11
        QADD16  r12,r12,r6              ; r12= xx22xx00
        QADD16  r4, r7, r4              ; r4 = xx33xx11
        LDRD    r6, [r3], #8            ; r6 = 55554444  r7 = 77776666
        USAT16  r4, #8, r4              ; r4 = __33__11
        USAT16  r12,#8, r12             ; r12= __22__00
        ORR     r4, r12,r4, LSL #8      ; r4 = 33221100
        ; Pixels 4-7: same steps on the second source word.
        PKHBT   r12,r6, r7, LSL #16     ; r12= 66664444
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 77775555
        UXTB16  r6, r5                  ; r6 = __66__44
        UXTB16  r5, r5, ROR #8          ; r5 = __77__55
        QADD16  r12,r12,r6              ; r12= xx66xx44
        QADD16  r5, r7, r5              ; r5 = xx77xx55
        USAT16  r12,#8, r12             ; r12= __66__44
        USAT16  r5, #8, r5              ; r5 = __77__55
        ORR     r5, r12,r5, LSL #8      ; r5 = 77665544
        STRD    r4, [r0], r2            ; store the row, step _dst by _ystride
        BGT     ofrinter_v6_row
        LDMFD   sp!,{r4-r7,pc}
        ENDP
|
410 |
|
oc_frag_recon_inter2_v6 PROC
        ; ARMv6 media variant: write an 8x8 block of
        ;   ((_src1+_src2)>>1)+_residue saturated to 0..255,
        ; one 8-byte row per iteration.  The upper four pixels of a row are
        ; computed first, then the lower four.
        ;  r0   = unsigned char *_dst
        ;  r1   = const unsigned char *_src1
        ;  r2   = const unsigned char *_src2
        ;  r3   = int _ystride (applied to _dst, _src1 and _src2)
        ;  [sp] = const ogg_int16_t _residue[64]
        ; Register pictures list lanes from most to least significant;
        ; digits are the pixel index within the row.
        LDR     r12,[sp]                ; r12 = _residue
        STMFD   sp!,{r4-r9,lr}
        MOV     r14,#8                  ; r14 = rows remaining
ofrinter2_v6_row
        LDRD    r6, [r12,#8]            ; r6 = 55554444  r7 = 77776666
        SUBS    r14,r14,#1              ; flags are consumed by the BGT below
        LDR     r4, [r1, #4]            ; Unaligned: r4 = src1[1] = 77665544
        LDR     r5, [r2, #4]            ; Unaligned: r5 = src2[1] = 77665544
        PKHBT   r8, r6, r7, LSL #16     ; r8 = 66664444
        PKHTB   r9, r7, r6, ASR #16     ; r9 = 77775555
        UHADD8  r4, r4, r5              ; r4 = (src1[7,6,5,4]+src2[7,6,5,4])>>1
        UXTB16  r5, r4                  ; r5 = __66__44
        UXTB16  r4, r4, ROR #8          ; r4 = __77__55
        QADD16  r8, r8, r5              ; r8 = xx66xx44
        QADD16  r9, r9, r4              ; r9 = xx77xx55
        LDRD    r6, [r12],#16           ; r6 = 11110000  r7 = 33332222;
                                        ;  _residue moves to the next row
        USAT16  r8, #8, r8              ; r8 = __66__44
        LDR     r4, [r1], r3            ; Unaligned: r4 = src1[0] = 33221100
        USAT16  r9, #8, r9              ; r9 = __77__55
        LDR     r5, [r2], r3            ; Unaligned: r5 = src2[0] = 33221100
        ORR     r9, r8, r9, LSL #8      ; r9 = 77665544
        PKHBT   r8, r6, r7, LSL #16     ; r8 = 22220000
        UHADD8  r4, r4, r5              ; r4 = (src1[3,2,1,0]+src2[3,2,1,0])>>1
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
        UXTB16  r5, r4                  ; r5 = __22__00
        UXTB16  r4, r4, ROR #8          ; r4 = __33__11
        QADD16  r8, r8, r5              ; r8 = xx22xx00
        QADD16  r7, r7, r4              ; r7 = xx33xx11
        USAT16  r8, #8, r8              ; r8 = __22__00
        USAT16  r7, #8, r7              ; r7 = __33__11
        ORR     r8, r8, r7, LSL #8      ; r8 = 33221100
        STRD    r8, [r0], r3            ; store r8:r9, step _dst by _ystride
        BGT     ofrinter2_v6_row
        LDMFD   sp!,{r4-r9,pc}
        ENDP
|
452 ] |
|
453 |
|
454 [ OC_ARM_ASM_NEON |
|
455 EXPORT oc_frag_copy_list_neon |
|
456 EXPORT oc_frag_recon_intra_neon |
|
457 EXPORT oc_frag_recon_inter_neon |
|
458 EXPORT oc_frag_recon_inter2_neon |
|
459 |
|
oc_frag_copy_list_neon PROC
        ; NEON variant of the fragment list copy: eight 64-bit rows are
        ; loaded into D0-D7 and then stored, while PLD hints are issued for
        ; the next fragment's source rows.
        ;  r0      = _dst_frame
        ;  r1      = _src_frame
        ;  r2      = _ystride
        ;  r3      = _fragis        (32-bit entries)
        ;  [sp]    = _nfragis
        ;  [sp,#4] = _frag_buf_offs (32-bit entries, indexed by fragment)
        ; All vector accesses carry a @64 alignment qualifier.
        LDR     r12,[sp]                ; r12 = _nfragis
        STMFD   sp!,{r4-r7,lr}          ; 5 words pushed
        CMP     r12, #1
        LDRGE   r6, [r3]                ; r6  = _fragis[fragii]
        LDRGE   r14,[sp,#4*6]           ; r14 = _frag_buf_offs
        BLT     ofcl_neon_done          ; _nfragis<1: nothing to copy
        ; Stall (2 on Xscale)
        LDR     r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
        ; Stall (on XScale)
        ; Seed r7 so the PLDs have a defined operand even when no further
        ; fragment follows.
        ; NOTE(review): r7 holds an offset here rather than an address; PLD
        ; is only a hint, so this looks harmless - confirm.
        MOV     r7, r6
ofcl_neon_next
        ADD     r4, r1, r6              ; r4 = source of this fragment
        VLD1.64 {D0}, [r4@64], r2
        ADD     r5, r0, r6              ; r5 = destination of this fragment
        VLD1.64 {D1}, [r4@64], r2
        SUBS    r12, r12, #1            ; GT while more fragments remain
        VLD1.64 {D2}, [r4@64], r2
        LDRGT   r6, [r3,#4]!            ; r6 = next _fragis[fragii]
        VLD1.64 {D3}, [r4@64], r2
        LDRGT   r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
        VLD1.64 {D4}, [r4@64], r2
        ADDGT   r7, r1, r6              ; r7 = source of the next fragment
        VLD1.64 {D5}, [r4@64], r2
        PLD     [r7]                    ; next fragment, row 0
        VLD1.64 {D6}, [r4@64], r2
        PLD     [r7, r2]                ; row 1
        VLD1.64 {D7}, [r4@64]
        PLD     [r7, r2, LSL #1]        ; row 2
        VST1.64 {D0}, [r5@64], r2
        ADDGT   r7, r7, r2, LSL #2      ; r7 += 4 rows
        VST1.64 {D1}, [r5@64], r2
        PLD     [r7, -r2]               ; row 3
        VST1.64 {D2}, [r5@64], r2
        PLD     [r7]                    ; row 4
        VST1.64 {D3}, [r5@64], r2
        PLD     [r7, r2]                ; row 5
        VST1.64 {D4}, [r5@64], r2
        PLD     [r7, r2, LSL #1]        ; row 6
        VST1.64 {D5}, [r5@64], r2
        ADDGT   r7, r7, r2, LSL #2      ; r7 += 4 rows
        VST1.64 {D6}, [r5@64], r2
        PLD     [r7, -r2]               ; row 7
        VST1.64 {D7}, [r5@64]
        BGT     ofcl_neon_next
ofcl_neon_done
        LDMFD   sp!,{r4-r7,pc}
        ENDP
|
514 |
|
oc_frag_recon_intra_neon PROC
        ; NEON variant: write an 8x8 block of _residue+128 saturated to
        ; 0..255.  Straight-line code, no stack use.
        ;  r0 = unsigned char *_dst
        ;  r1 = int _ystride
        ;  r2 = const ogg_int16_t _residue[64]
        ; Uses r3, Q0 and Q8-Q15 as scratch.
        MOV     r3, #128
        VDUP.S16        Q0, r3          ; Q0 = 128 in every 16-bit lane
        VLDMIA  r2, {D16-D31}           ; all 64 residues, one row per Q reg
                                        ;  (Q8 = row 0 ... Q15 = row 7)
        VQADD.S16       Q8, Q8, Q0      ; add the bias, signed saturating
        VQADD.S16       Q9, Q9, Q0
        VQADD.S16       Q10,Q10,Q0
        VQADD.S16       Q11,Q11,Q0
        VQADD.S16       Q12,Q12,Q0
        VQADD.S16       Q13,Q13,Q0
        VQADD.S16       Q14,Q14,Q0
        VQADD.S16       Q15,Q15,Q0
        ; Narrow each row to unsigned bytes with saturation; the stores are
        ; interleaved with the remaining narrowing steps.
        VQMOVUN.S16     D16,Q8          ; D16 = row 0
        VQMOVUN.S16     D17,Q9          ; D17 = row 1
        VQMOVUN.S16     D18,Q10         ; D18 = row 2
        VST1.64 {D16},[r0@64], r1
        VQMOVUN.S16     D19,Q11         ; D19 = row 3
        VST1.64 {D17},[r0@64], r1
        VQMOVUN.S16     D20,Q12         ; D20 = row 4
        VST1.64 {D18},[r0@64], r1
        VQMOVUN.S16     D21,Q13         ; D21 = row 5
        VST1.64 {D19},[r0@64], r1
        VQMOVUN.S16     D22,Q14         ; D22 = row 6
        VST1.64 {D20},[r0@64], r1
        VQMOVUN.S16     D23,Q15         ; D23 = row 7
        VST1.64 {D21},[r0@64], r1
        VST1.64 {D22},[r0@64], r1
        VST1.64 {D23},[r0@64], r1
        MOV     pc, lr
        ENDP
|
548 |
|
oc_frag_recon_inter_neon PROC
        ; NEON variant: write an 8x8 block of _src+_residue saturated to
        ; 0..255.  Straight-line code, no stack use.
        ;  r0 = unsigned char *_dst
        ;  r1 = const unsigned char *_src
        ;  r2 = int _ystride (applied to both _dst and _src)
        ;  r3 = const ogg_int16_t _residue[64]
        ; Uses Q0-Q3 and Q8-Q15 as scratch.  Source rows are loaded without
        ; an alignment qualifier; destination stores use @64.
        VLDMIA  r3, {D16-D31}           ; all 64 residues, one row per Q reg
                                        ;  (Q8 = row 0 ... Q15 = row 7)
        ; Rows 0-3: load 8 source bytes each and widen them to 16 bits.
        VLD1.64 {D0}, [r1], r2          ; src row 0
        VLD1.64 {D2}, [r1], r2          ; src row 1
        VMOVL.U8        Q0, D0          ; Q0 = row 0 as 16-bit lanes
        VLD1.64 {D4}, [r1], r2          ; src row 2
        VMOVL.U8        Q1, D2          ; Q1 = row 1
        VLD1.64 {D6}, [r1], r2          ; src row 3
        VMOVL.U8        Q2, D4          ; Q2 = row 2
        VMOVL.U8        Q3, D6          ; Q3 = row 3
        ; Add rows 0-3 while the loads for rows 4-7 are issued.
        VQADD.S16       Q8, Q8, Q0
        VLD1.64 {D0}, [r1], r2          ; src row 4
        VQADD.S16       Q9, Q9, Q1
        VLD1.64 {D2}, [r1], r2          ; src row 5
        VQADD.S16       Q10,Q10,Q2
        VLD1.64 {D4}, [r1], r2          ; src row 6
        VQADD.S16       Q11,Q11,Q3
        VLD1.64 {D6}, [r1], r2          ; src row 7
        VMOVL.U8        Q0, D0
        VMOVL.U8        Q1, D2
        VMOVL.U8        Q2, D4
        VMOVL.U8        Q3, D6
        VQADD.S16       Q12,Q12,Q0
        VQADD.S16       Q13,Q13,Q1
        VQADD.S16       Q14,Q14,Q2
        VQADD.S16       Q15,Q15,Q3
        ; Narrow each row to unsigned bytes with saturation and store it.
        VQMOVUN.S16     D16,Q8          ; D16 = row 0
        VQMOVUN.S16     D17,Q9          ; D17 = row 1
        VQMOVUN.S16     D18,Q10         ; D18 = row 2
        VST1.64 {D16},[r0@64], r2
        VQMOVUN.S16     D19,Q11         ; D19 = row 3
        VST1.64 {D17},[r0@64], r2
        VQMOVUN.S16     D20,Q12         ; D20 = row 4
        VST1.64 {D18},[r0@64], r2
        VQMOVUN.S16     D21,Q13         ; D21 = row 5
        VST1.64 {D19},[r0@64], r2
        VQMOVUN.S16     D22,Q14         ; D22 = row 6
        VST1.64 {D20},[r0@64], r2
        VQMOVUN.S16     D23,Q15         ; D23 = row 7
        VST1.64 {D21},[r0@64], r2
        VST1.64 {D22},[r0@64], r2
        VST1.64 {D23},[r0@64], r2
        MOV     pc, lr
        ENDP
|
597 |
|
oc_frag_recon_inter2_neon PROC
        ; NEON variant: write an 8x8 block of
        ;   ((_src1+_src2)>>1)+_residue saturated to 0..255.
        ; Straight-line code; the only stack access is the argument load.
        ;  r0   = unsigned char *_dst
        ;  r1   = const unsigned char *_src1
        ;  r2   = const unsigned char *_src2
        ;  r3   = int _ystride (applied to _dst, _src1 and _src2)
        ;  [sp] = const ogg_int16_t _residue[64]
        ; Uses r12, Q0-Q3 and Q8-Q15 as scratch.  Rows are averaged two at
        ; a time: _src1 rows go to Q0/Q1, _src2 rows to Q2/Q3.
        LDR     r12,[sp]                ; r12 = _residue
        VLDMIA  r12,{D16-D31}           ; all 64 residues, one row per Q reg
                                        ;  (Q8 = row 0 ... Q15 = row 7)
        VLD1.64 {D0}, [r1], r3          ; src1 row 0
        VLD1.64 {D4}, [r2], r3          ; src2 row 0
        VLD1.64 {D1}, [r1], r3          ; src1 row 1
        VLD1.64 {D5}, [r2], r3          ; src2 row 1
        VHADD.U8        Q2, Q0, Q2      ; Q2 = byte averages of rows 0 and 1
        VLD1.64 {D2}, [r1], r3          ; src1 row 2
        VLD1.64 {D6}, [r2], r3          ; src2 row 2
        VMOVL.U8        Q0, D4          ; Q0 = row 0 average, 16-bit lanes
        VLD1.64 {D3}, [r1], r3          ; src1 row 3
        VMOVL.U8        Q2, D5          ; Q2 = row 1 average, 16-bit lanes
        VLD1.64 {D7}, [r2], r3          ; src2 row 3
        VHADD.U8        Q3, Q1, Q3      ; Q3 = byte averages of rows 2 and 3
        VQADD.S16       Q8, Q8, Q0      ; row 0 += prediction
        VQADD.S16       Q9, Q9, Q2      ; row 1 += prediction
        VLD1.64 {D0}, [r1], r3          ; src1 row 4
        VMOVL.U8        Q1, D6          ; Q1 = row 2 average, 16-bit lanes
        VLD1.64 {D4}, [r2], r3          ; src2 row 4
        VMOVL.U8        Q3, D7          ; Q3 = row 3 average, 16-bit lanes
        VLD1.64 {D1}, [r1], r3          ; src1 row 5
        VQADD.S16       Q10,Q10,Q1      ; row 2 += prediction
        VLD1.64 {D5}, [r2], r3          ; src2 row 5
        VQADD.S16       Q11,Q11,Q3      ; row 3 += prediction
        VLD1.64 {D2}, [r1], r3          ; src1 row 6
        VHADD.U8        Q2, Q0, Q2      ; Q2 = byte averages of rows 4 and 5
        VLD1.64 {D6}, [r2], r3          ; src2 row 6
        VLD1.64 {D3}, [r1], r3          ; src1 row 7
        VMOVL.U8        Q0, D4          ; Q0 = row 4 average, 16-bit lanes
        VLD1.64 {D7}, [r2], r3          ; src2 row 7
        VMOVL.U8        Q2, D5          ; Q2 = row 5 average, 16-bit lanes
        VHADD.U8        Q3, Q1, Q3      ; Q3 = byte averages of rows 6 and 7
        VQADD.S16       Q12,Q12,Q0      ; row 4 += prediction
        VQADD.S16       Q13,Q13,Q2      ; row 5 += prediction
        VMOVL.U8        Q1, D6          ; Q1 = row 6 average, 16-bit lanes
        VMOVL.U8        Q3, D7          ; Q3 = row 7 average, 16-bit lanes
        VQADD.S16       Q14,Q14,Q1      ; row 6 += prediction
        VQADD.S16       Q15,Q15,Q3      ; row 7 += prediction
        ; Narrow each row to unsigned bytes with saturation and store it.
        VQMOVUN.S16     D16,Q8          ; D16 = row 0
        VQMOVUN.S16     D17,Q9          ; D17 = row 1
        VQMOVUN.S16     D18,Q10         ; D18 = row 2
        VST1.64 {D16},[r0@64], r3
        VQMOVUN.S16     D19,Q11         ; D19 = row 3
        VST1.64 {D17},[r0@64], r3
        VQMOVUN.S16     D20,Q12         ; D20 = row 4
        VST1.64 {D18},[r0@64], r3
        VQMOVUN.S16     D21,Q13         ; D21 = row 5
        VST1.64 {D19},[r0@64], r3
        VQMOVUN.S16     D22,Q14         ; D22 = row 6
        VST1.64 {D20},[r0@64], r3
        VQMOVUN.S16     D23,Q15         ; D23 = row 7
        VST1.64 {D21},[r0@64], r3
        VST1.64 {D22},[r0@64], r3
        VST1.64 {D23},[r0@64], r3
        MOV     pc, lr
        ENDP
|
660 ] |
|
661 |
|
662 END |