media/libtheora/lib/arm/armfrag.s

;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

        AREA    |.text|, CODE, READONLY

; Explicitly specifying alignment here because some versions of
;  gas don't align code correctly. See
;  http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
;  https://bugzilla.mozilla.org/show_bug.cgi?id=920992
        ALIGN

        GET     armopts.s

; Vanilla ARM v4 versions
        EXPORT  oc_frag_copy_list_arm
        EXPORT  oc_frag_recon_intra_arm
        EXPORT  oc_frag_recon_inter_arm
        EXPORT  oc_frag_recon_inter2_arm
oc_frag_copy_list_arm PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
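        ; Copies the 8x8 fragments whose indices are listed in
        ;  _fragis[0..._nfragis) from _src_frame to _dst_frame: each
        ;  fragment starts at byte offset _frag_buf_offs[_fragis[fragii]]
        ;  in both frames and is copied as eight rows of eight bytes,
        ;  _ystride apart. Rows are moved as pairs of word loads/stores,
        ;  so fragment rows are assumed to be at least 4-byte aligned.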
        LDR     r12,[r13]               ; r12 = _nfragis
        STMFD   r13!,{r4-r6,r11,r14}
        SUBS    r12, r12, #1
        LDR     r4,[r3],#4              ; r4 = _fragis[fragii]
        LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
        BLT     ofcl_arm_end
        SUB     r2, r2, #4
ofcl_arm_lp
        LDR     r11,[r14,r4,LSL #2]     ; r11 = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12, r12, #1
        ; Stall (on XScale)
        ADD     r4, r1, r11             ; r4 = _src_frame+frag_buf_off
        LDR     r6, [r4], #4
        ADD     r11,r0, r11             ; r11 = _dst_frame+frag_buf_off
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4], r2
        STR     r6, [r11],#4
        LDR     r6, [r4], #4
        STR     r5, [r11],r2
        LDR     r5, [r4]
        LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii]
        STR     r6, [r11],#4
        STR     r5, [r11]
        BGE     ofcl_arm_lp
ofcl_arm_end
        LDMFD   r13!,{r4-r6,r11,PC}
oc_frag_recon_intra_arm
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
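        ; Reconstructs one intra-coded 8x8 fragment:
        ;  _dst[i] = clamp(_residue[i]+128, 0, 255), eight pixels per row,
        ;  with successive rows _ystride bytes apart.
        ; The ADDS/CMPGT/EORLT idiom below is a branchless clamp to
        ;  [0,255]: when the biased value is negative or greater than 255,
        ;  LT is left set and EORLT rewrites it as 255 EOR (value ASR #32),
        ;  which is 255 on overflow and a word with a zero low byte when
        ;  negative; STRB then stores only that low byte.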
        STMFD   r13!,{r4,r5,r14}
        MOV     r14,#8
        MOV     r5, #255
        SUB     r1, r1, #7
ofrintra_lp_arm
        LDRSH   r3, [r2], #2
        LDRSH   r4, [r2], #2
        LDRSH   r12,[r2], #2
        ADDS    r3, r3, #128
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        LDRSH   r3, [r2], #2
        STRB    r4, [r0], #1
        ADDS    r12,r12,#128
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32
        LDRSH   r4, [r2], #2
        STRB    r12,[r0], #1
        ADDS    r3, r3, #128
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        LDRSH   r12,[r2], #2
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        LDRSH   r3, [r2], #2
        STRB    r4, [r0], #1
        ADDS    r12,r12,#128
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32
        LDRSH   r4, [r2], #2
        STRB    r12,[r0], #1
        ADDS    r3, r3, #128
        CMPGT   r5, r3
        EORLT   r3, r5, r3, ASR #32
        STRB    r3, [r0], #1
        ADDS    r4, r4, #128
        CMPGT   r5, r4
        EORLT   r4, r5, r4, ASR #32
        STRB    r4, [r0], r1
        SUBS    r14,r14,#1
        BGT     ofrintra_lp_arm
        LDMFD   r13!,{r4,r5,PC}
        ENDP

oc_frag_recon_inter_arm PROC
        ; r0 = unsigned char *dst
        ; r1 = const unsigned char *src
        ; r2 = int ystride
        ; r3 = const ogg_int16_t residue[64]
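        ; Reconstructs one inter-coded 8x8 fragment from a single
        ;  predictor: dst[i] = clamp(src[i]+residue[i], 0, 255), using the
        ;  same branchless clamp as oc_frag_recon_intra_arm above.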
        STMFD   r13!,{r5,r9-r11,r14}
        MOV     r9, #8
        MOV     r5, #255
        SUB     r2, r2, #7
ofrinter_lp_arm
        LDRSH   r12,[r3], #2
        LDRB    r14,[r1], #1
        LDRSH   r11,[r3], #2
        LDRB    r10,[r1], #1
        ADDS    r12,r12,r14
        CMPGT   r5, r12
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2
        LDRB    r14,[r1], #1
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2
        LDRB    r10,[r1], #1
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2
        LDRB    r14,[r1], #1
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2
        LDRB    r10,[r1], #1
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10
        CMPGT   r5, r11
        LDRSH   r12,[r3], #2
        LDRB    r14,[r1], #1
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], #1
        ADDS    r12,r12,r14
        CMPGT   r5, r12
        LDRSH   r11,[r3], #2
        LDRB    r10,[r1], r2
        EORLT   r12,r5, r12,ASR #32
        STRB    r12,[r0], #1
        ADDS    r11,r11,r10
        CMPGT   r5, r11
        EORLT   r11,r5, r11,ASR #32
        STRB    r11,[r0], r2
        SUBS    r9, r9, #1
        BGT     ofrinter_lp_arm
        LDMFD   r13!,{r5,r9-r11,PC}
        ENDP

oc_frag_recon_inter2_arm PROC
        ; r0 = unsigned char *dst
        ; r1 = const unsigned char *src1
        ; r2 = const unsigned char *src2
        ; r3 = int ystride
        LDR     r12,[r13]
        ; r12= const ogg_int16_t residue[64]
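        ; Reconstructs one inter-coded 8x8 fragment from two predictors:
        ;  dst[i] = clamp(((src1[i]+src2[i])>>1)+residue[i], 0, 255).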
        STMFD   r13!,{r4-r8,r14}
        MOV     r14,#8
        MOV     r8, #255
        SUB     r3, r3, #7
ofrinter2_lp_arm
        LDRB    r5, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        LDRB    r7, [r1], #1
        ADD     r5, r5, r6
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        LDRB    r5, [r1], #1
        LDRB    r6, [r2], #1
        LDRSH   r4, [r12],#2
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], #1
        ADD     r5, r5, r6
        ADDS    r5, r4, r5, LSR #1
        CMPGT   r8, r5
        LDRB    r7, [r1], r3
        LDRB    r6, [r2], r3
        LDRSH   r4, [r12],#2
        EORLT   r5, r8, r5, ASR #32
        STRB    r5, [r0], #1
        ADD     r7, r7, r6
        ADDS    r7, r4, r7, LSR #1
        CMPGT   r8, r7
        EORLT   r7, r8, r7, ASR #32
        STRB    r7, [r0], r3
        SUBS    r14,r14,#1
        BGT     ofrinter2_lp_arm
        LDMFD   r13!,{r4-r8,PC}
        ENDP

 [ OC_ARM_ASM_EDSP
        EXPORT  oc_frag_copy_list_edsp

oc_frag_copy_list_edsp PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
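        ; Same operation as oc_frag_copy_list_arm, but each row is moved
        ;  with a single LDRD/STRD pair, which assumes doubleword-aligned
        ;  fragment rows.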
        LDR     r12,[r13]               ; r12 = _nfragis
        STMFD   r13!,{r4-r11,r14}
        SUBS    r12, r12, #1
        LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
        LDRGE   r14,[r13,#4*10]         ; r14 = _frag_buf_offs
        BLT     ofcl_edsp_end
ofcl_edsp_lp
        MOV     r4, r1
        LDR     r5, [r14,r5, LSL #2]    ; r5 = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12, r12, #1
        ; Stall (on XScale)
        LDRD    r6, [r4, r5]!           ; r4 = _src_frame+frag_buf_off
        LDRD    r8, [r4, r2]!
        ; Stall
        STRD    r6, [r5, r0]!           ; r5 = _dst_frame+frag_buf_off
        STRD    r8, [r5, r2]!
        ; Stall
        LDRD    r6, [r4, r2]!           ; On Xscale at least, doing 3 consecutive
        LDRD    r8, [r4, r2]!           ; loads causes a stall, but that's no worse
        LDRD    r10,[r4, r2]!           ; than us only doing 2, and having to do
                                        ; another pair of LDRD/STRD later on.
        ; Stall
        STRD    r6, [r5, r2]!
        STRD    r8, [r5, r2]!
        STRD    r10,[r5, r2]!
        LDRD    r6, [r4, r2]!
        LDRD    r8, [r4, r2]!
        LDRD    r10,[r4, r2]!
        STRD    r6, [r5, r2]!
        STRD    r8, [r5, r2]!
        STRD    r10,[r5, r2]!
        LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
        BGE     ofcl_edsp_lp
ofcl_edsp_end
        LDMFD   r13!,{r4-r11,PC}
        ENDP
 ]

 [ OC_ARM_ASM_MEDIA
        EXPORT  oc_frag_recon_intra_v6
        EXPORT  oc_frag_recon_inter_v6
        EXPORT  oc_frag_recon_inter2_v6

oc_frag_recon_intra_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
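        ; ARMv6 (media instruction) version of oc_frag_recon_intra: QADD16
        ;  adds the +128 bias to the residues a halfword pair at a time
        ;  and USAT16 saturates each halfword to [0,255]; each loop
        ;  iteration packs and stores one full 8-pixel row.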
        STMFD   r13!,{r4-r6,r14}
        MOV     r14,#8
        MOV     r12,r2
        LDR     r6, =0x00800080
ofrintra_v6_lp
        LDRD    r2, [r12],#8            ; r2 = 11110000 r3 = 33332222
        LDRD    r4, [r12],#8            ; r4 = 55554444 r5 = 77776666
        SUBS    r14,r14,#1
        QADD16  r2, r2, r6
        QADD16  r3, r3, r6
        QADD16  r4, r4, r6
        QADD16  r5, r5, r6
        USAT16  r2, #8, r2              ; r2 = __11__00
        USAT16  r3, #8, r3              ; r3 = __33__22
        USAT16  r4, #8, r4              ; r4 = __55__44
        USAT16  r5, #8, r5              ; r5 = __77__66
        ORR     r2, r2, r2, LSR #8      ; r2 = __111100
        ORR     r3, r3, r3, LSR #8      ; r3 = __333322
        ORR     r4, r4, r4, LSR #8      ; r4 = __555544
        ORR     r5, r5, r5, LSR #8      ; r5 = __777766
        PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
        PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
        STRD    r2, [r0], r1
        BGT     ofrintra_v6_lp
        LDMFD   r13!,{r4-r6,PC}
        ENDP

oc_frag_recon_inter_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src
        ; r2 = int _ystride
        ; r3 = const ogg_int16_t _residue[64]
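        ; ARMv6 version of oc_frag_recon_inter: the predictor row is
        ;  unpacked to halfwords with UXTB16, added to the residues with
        ;  QADD16 and saturated to [0,255] with USAT16; one 8-pixel row is
        ;  reconstructed per loop iteration.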
        STMFD   r13!,{r4-r7,r14}
        MOV     r14,#8
ofrinter_v6_lp
        LDRD    r6, [r3], #8            ; r6 = 11110000 r7 = 33332222
        SUBS    r14,r14,#1
 [ OC_ARM_CAN_UNALIGN_LDRD
        LDRD    r4, [r1], r2            ; Unaligned ; r4 = 33221100 r5 = 77665544
 |
        LDR     r5, [r1, #4]
        LDR     r4, [r1], r2
 ]
        PKHBT   r12,r6, r7, LSL #16     ; r12= 22220000
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
        UXTB16  r6,r4                   ; r6 = __22__00
        UXTB16  r4,r4, ROR #8           ; r4 = __33__11
        QADD16  r12,r12,r6              ; r12= xx22xx00
        QADD16  r4, r7, r4              ; r4 = xx33xx11
        LDRD    r6, [r3], #8            ; r6 = 55554444 r7 = 77776666
        USAT16  r4, #8, r4              ; r4 = __33__11
        USAT16  r12,#8,r12              ; r12= __22__00
        ORR     r4, r12,r4, LSL #8      ; r4 = 33221100
        PKHBT   r12,r6, r7, LSL #16     ; r12= 66664444
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 77775555
        UXTB16  r6,r5                   ; r6 = __66__44
        UXTB16  r5,r5, ROR #8           ; r5 = __77__55
        QADD16  r12,r12,r6              ; r12= xx66xx44
        QADD16  r5, r7, r5              ; r5 = xx77xx55
        USAT16  r12,#8, r12             ; r12= __66__44
        USAT16  r5, #8, r5              ; r5 = __77__55
        ORR     r5, r12,r5, LSL #8      ; r5 = 77665544
        STRD    r4, [r0], r2
        BGT     ofrinter_v6_lp
        LDMFD   r13!,{r4-r7,PC}
        ENDP

oc_frag_recon_inter2_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src1
        ; r2 = const unsigned char *_src2
        ; r3 = int _ystride
        LDR     r12,[r13]
        ; r12= const ogg_int16_t _residue[64]
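        ; ARMv6 version of oc_frag_recon_inter2: UHADD8 averages the two
        ;  predictor rows a word at a time, and the averages are added to
        ;  the residues and saturated to bytes as in
        ;  oc_frag_recon_inter_v6.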
        STMFD   r13!,{r4-r9,r14}
        MOV     r14,#8
ofrinter2_v6_lp
        LDRD    r6, [r12,#8]            ; r6 = 55554444 r7 = 77776666
        SUBS    r14,r14,#1
        LDR     r4, [r1, #4]            ; Unaligned ; r4 = src1[1] = 77665544
        LDR     r5, [r2, #4]            ; Unaligned ; r5 = src2[1] = 77665544
        PKHBT   r8, r6, r7, LSL #16     ; r8 = 66664444
        PKHTB   r9, r7, r6, ASR #16     ; r9 = 77775555
        UHADD8  r4, r4, r5              ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
        UXTB16  r5, r4                  ; r5 = __66__44
        UXTB16  r4, r4, ROR #8          ; r4 = __77__55
        QADD16  r8, r8, r5              ; r8 = xx66xx44
        QADD16  r9, r9, r4              ; r9 = xx77xx55
        LDRD    r6,[r12],#16            ; r6 = 11110000 r7 = 33332222
        USAT16  r8, #8, r8              ; r8 = __66__44
        LDR     r4, [r1], r3            ; Unaligned ; r4 = src1[0] = 33221100
        USAT16  r9, #8, r9              ; r9 = __77__55
        LDR     r5, [r2], r3            ; Unaligned ; r5 = src2[0] = 33221100
        ORR     r9, r8, r9, LSL #8      ; r9 = 77665544
        PKHBT   r8, r6, r7, LSL #16     ; r8 = 22220000
        UHADD8  r4, r4, r5              ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
        UXTB16  r5, r4                  ; r5 = __22__00
        UXTB16  r4, r4, ROR #8          ; r4 = __33__11
        QADD16  r8, r8, r5              ; r8 = xx22xx00
        QADD16  r7, r7, r4              ; r7 = xx33xx11
        USAT16  r8, #8, r8              ; r8 = __22__00
        USAT16  r7, #8, r7              ; r7 = __33__11
        ORR     r8, r8, r7, LSL #8      ; r8 = 33221100
        STRD    r8, [r0], r3
        BGT     ofrinter2_v6_lp
        LDMFD   r13!,{r4-r9,PC}
        ENDP
 ]

 [ OC_ARM_ASM_NEON
        EXPORT  oc_frag_copy_list_neon
        EXPORT  oc_frag_recon_intra_neon
        EXPORT  oc_frag_recon_inter_neon
        EXPORT  oc_frag_recon_inter2_neon

oc_frag_copy_list_neon PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
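        ; NEON version of oc_frag_copy_list: each fragment row is moved as
        ;  a single 64-bit D-register load/store (the @64 hints assume
        ;  8-byte-aligned rows), while PLD prefetches the next fragment.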
        LDR     r12,[r13]               ; r12 = _nfragis
        STMFD   r13!,{r4-r7,r14}
        CMP     r12, #1
        LDRGE   r6, [r3]                ; r6 = _fragis[fragii]
        LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
        BLT     ofcl_neon_end
        ; Stall (2 on Xscale)
        LDR     r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
        ; Stall (on XScale)
        MOV     r7, r6                  ; Guarantee PLD points somewhere valid.
ofcl_neon_lp
        ADD     r4, r1, r6
        VLD1.64 {D0}, [r4@64], r2
        ADD     r5, r0, r6
        VLD1.64 {D1}, [r4@64], r2
        SUBS    r12, r12, #1
        VLD1.64 {D2}, [r4@64], r2
        LDRGT   r6, [r3,#4]!            ; r6 = _fragis[fragii]
        VLD1.64 {D3}, [r4@64], r2
        LDRGT   r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
        VLD1.64 {D4}, [r4@64], r2
        ADDGT   r7, r1, r6
        VLD1.64 {D5}, [r4@64], r2
        PLD     [r7]
        VLD1.64 {D6}, [r4@64], r2
        PLD     [r7, r2]
        VLD1.64 {D7}, [r4@64]
        PLD     [r7, r2, LSL #1]
        VST1.64 {D0}, [r5@64], r2
        ADDGT   r7, r7, r2, LSL #2
        VST1.64 {D1}, [r5@64], r2
        PLD     [r7, -r2]
        VST1.64 {D2}, [r5@64], r2
        PLD     [r7]
        VST1.64 {D3}, [r5@64], r2
        PLD     [r7, r2]
        VST1.64 {D4}, [r5@64], r2
        PLD     [r7, r2, LSL #1]
        VST1.64 {D5}, [r5@64], r2
        ADDGT   r7, r7, r2, LSL #2
        VST1.64 {D6}, [r5@64], r2
        PLD     [r7, -r2]
        VST1.64 {D7}, [r5@64]
        BGT     ofcl_neon_lp
ofcl_neon_end
        LDMFD   r13!,{r4-r7,PC}
        ENDP

oc_frag_recon_intra_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
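        ; NEON version of oc_frag_recon_intra: the whole 64-entry residue
        ;  block is loaded into Q8-Q15, biased by +128 with saturating
        ;  adds, then narrowed to unsigned bytes with VQMOVUN and stored a
        ;  row at a time.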
        MOV     r3, #128
        VDUP.S16 Q0, r3
        VLDMIA  r2, {D16-D31}           ; D16= 3333222211110000 etc ; 9(8) cycles
        VQADD.S16 Q8, Q8, Q0
        VQADD.S16 Q9, Q9, Q0
        VQADD.S16 Q10,Q10,Q0
        VQADD.S16 Q11,Q11,Q0
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q0
        VQADD.S16 Q14,Q14,Q0
        VQADD.S16 Q15,Q15,Q0
        VQMOVUN.S16 D16,Q8              ; D16= 7766554433221100 ; 1 cycle
        VQMOVUN.S16 D17,Q9              ; D17= FFEEDDCCBBAA9988 ; 1 cycle
        VQMOVUN.S16 D18,Q10             ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
        VST1.64 {D16},[r0@64], r1
        VQMOVUN.S16 D19,Q11             ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
        VST1.64 {D17},[r0@64], r1
        VQMOVUN.S16 D20,Q12             ; D20= ddccbbaaZZYYXXWW ; 1 cycle
        VST1.64 {D18},[r0@64], r1
        VQMOVUN.S16 D21,Q13             ; D21= llkkjjiihhggffee ; 1 cycle
        VST1.64 {D19},[r0@64], r1
        VQMOVUN.S16 D22,Q14             ; D22= ttssrrqqppoonnmm ; 1 cycle
        VST1.64 {D20},[r0@64], r1
        VQMOVUN.S16 D23,Q15             ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
        VST1.64 {D21},[r0@64], r1
        VST1.64 {D22},[r0@64], r1
        VST1.64 {D23},[r0@64], r1
        MOV     PC,R14
        ENDP

oc_frag_recon_inter_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src
        ; r2 = int _ystride
        ; r3 = const ogg_int16_t _residue[64]
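        ; NEON version of oc_frag_recon_inter: predictor rows are widened
        ;  to 16 bits with VMOVL.U8, added to the residues with saturation
        ;  and narrowed back to unsigned bytes with VQMOVUN.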
        VLDMIA  r3, {D16-D31}           ; D16= 3333222211110000 etc ; 9(8) cycles
        VLD1.64 {D0}, [r1], r2
        VLD1.64 {D2}, [r1], r2
        VMOVL.U8 Q0, D0                 ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64 {D4}, [r1], r2
        VMOVL.U8 Q1, D2                 ; etc
        VLD1.64 {D6}, [r1], r2
        VMOVL.U8 Q2, D4
        VMOVL.U8 Q3, D6
        VQADD.S16 Q8, Q8, Q0
        VLD1.64 {D0}, [r1], r2
        VQADD.S16 Q9, Q9, Q1
        VLD1.64 {D2}, [r1], r2
        VQADD.S16 Q10,Q10,Q2
        VLD1.64 {D4}, [r1], r2
        VQADD.S16 Q11,Q11,Q3
        VLD1.64 {D6}, [r1], r2
        VMOVL.U8 Q0, D0
        VMOVL.U8 Q1, D2
        VMOVL.U8 Q2, D4
        VMOVL.U8 Q3, D6
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q1
        VQADD.S16 Q14,Q14,Q2
        VQADD.S16 Q15,Q15,Q3
        VQMOVUN.S16 D16,Q8
        VQMOVUN.S16 D17,Q9
        VQMOVUN.S16 D18,Q10
        VST1.64 {D16},[r0@64], r2
        VQMOVUN.S16 D19,Q11
        VST1.64 {D17},[r0@64], r2
        VQMOVUN.S16 D20,Q12
        VST1.64 {D18},[r0@64], r2
        VQMOVUN.S16 D21,Q13
        VST1.64 {D19},[r0@64], r2
        VQMOVUN.S16 D22,Q14
        VST1.64 {D20},[r0@64], r2
        VQMOVUN.S16 D23,Q15
        VST1.64 {D21},[r0@64], r2
        VST1.64 {D22},[r0@64], r2
        VST1.64 {D23},[r0@64], r2
        MOV     PC,R14
        ENDP

oc_frag_recon_inter2_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src1
        ; r2 = const unsigned char *_src2
        ; r3 = int _ystride
        LDR     r12,[r13]
        ; r12= const ogg_int16_t _residue[64]
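        ; NEON version of oc_frag_recon_inter2: VHADD.U8 averages the two
        ;  predictors two rows at a time; the averages are then widened,
        ;  added to the residues and narrowed back to bytes as in
        ;  oc_frag_recon_inter_neon.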
        VLDMIA  r12,{D16-D31}
        VLD1.64 {D0}, [r1], r3
        VLD1.64 {D4}, [r2], r3
        VLD1.64 {D1}, [r1], r3
        VLD1.64 {D5}, [r2], r3
        VHADD.U8 Q2, Q0, Q2             ; Q2 = FFEEDDCCBBAA99887766554433221100
        VLD1.64 {D2}, [r1], r3
        VLD1.64 {D6}, [r2], r3
        VMOVL.U8 Q0, D4                 ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64 {D3}, [r1], r3
        VMOVL.U8 Q2, D5                 ; etc
        VLD1.64 {D7}, [r2], r3
        VHADD.U8 Q3, Q1, Q3
        VQADD.S16 Q8, Q8, Q0
        VQADD.S16 Q9, Q9, Q2
        VLD1.64 {D0}, [r1], r3
        VMOVL.U8 Q1, D6
        VLD1.64 {D4}, [r2], r3
        VMOVL.U8 Q3, D7
        VLD1.64 {D1}, [r1], r3
        VQADD.S16 Q10,Q10,Q1
        VLD1.64 {D5}, [r2], r3
        VQADD.S16 Q11,Q11,Q3
        VLD1.64 {D2}, [r1], r3
        VHADD.U8 Q2, Q0, Q2
        VLD1.64 {D6}, [r2], r3
        VLD1.64 {D3}, [r1], r3
        VMOVL.U8 Q0, D4
        VLD1.64 {D7}, [r2], r3
        VMOVL.U8 Q2, D5
        VHADD.U8 Q3, Q1, Q3
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q2
        VMOVL.U8 Q1, D6
        VMOVL.U8 Q3, D7
        VQADD.S16 Q14,Q14,Q1
        VQADD.S16 Q15,Q15,Q3
        VQMOVUN.S16 D16,Q8
        VQMOVUN.S16 D17,Q9
        VQMOVUN.S16 D18,Q10
        VST1.64 {D16},[r0@64], r3
        VQMOVUN.S16 D19,Q11
        VST1.64 {D17},[r0@64], r3
        VQMOVUN.S16 D20,Q12
        VST1.64 {D18},[r0@64], r3
        VQMOVUN.S16 D21,Q13
        VST1.64 {D19},[r0@64], r3
        VQMOVUN.S16 D22,Q14
        VST1.64 {D20},[r0@64], r3
        VQMOVUN.S16 D23,Q15
        VST1.64 {D21},[r0@64], r3
        VST1.64 {D22},[r0@64], r3
        VST1.64 {D23},[r0@64], r3
        MOV     PC,R14
        ENDP
 ]

        END
