--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libtheora/lib/arm/armfrag.s	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,662 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+        AREA    |.text|, CODE, READONLY
+
+        ; Explicitly specifying alignment here because some versions of
+        ; gas don't align code correctly. See
+        ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
+        ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
+        ALIGN
+
+        GET     armopts.s
+
+; Vanilla ARM v4 versions
+        EXPORT  oc_frag_copy_list_arm
+        EXPORT  oc_frag_recon_intra_arm
+        EXPORT  oc_frag_recon_inter_arm
+        EXPORT  oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm PROC              ; Copies each listed 8x8 fragment (8 rows of two words) src->dst.
+        ; r0 = _dst_frame
+        ; r1 = _src_frame
+        ; r2 = _ystride
+        ; r3 = _fragis
+        ; <> = _nfragis
+        ; <> = _frag_buf_offs
+        LDR     r12,[r13]               ; r12 = _nfragis
+        STMFD   r13!,{r4-r6,r11,r14}
+        SUBS    r12, r12, #1            ; fragii = _nfragis-1; LT here means the list is empty
+        LDR     r4,[r3],#4              ; r4 = _fragis[fragii]
+        LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
+        BLT     ofcl_arm_end
+        SUB     r2, r2, #4              ; stride pre-biased: each row already post-increments by 4
+ofcl_arm_lp
+        LDR     r11,[r14,r4,LSL #2]     ; r11 = _frag_buf_offs[_fragis[fragii]]
+        SUBS    r12, r12, #1            ; flags from this count-down steer the LDRGE/BGE below
+        ; Stall (on XScale)
+        ADD     r4, r1, r11             ; r4 = _src_frame+frag_buf_off
+        LDR     r6, [r4], #4
+        ADD     r11,r0, r11             ; r11 = _dst_frame+frag_buf_off
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4], r2
+        STR     r6, [r11],#4
+        LDR     r6, [r4], #4
+        STR     r5, [r11],r2
+        LDR     r5, [r4]
+        LDRGE   r4,[r3],#4              ; r4 = _fragis[fragii]
+        STR     r6, [r11],#4
+        STR     r5, [r11]
+        BGE     ofcl_arm_lp
+ofcl_arm_end
+        LDMFD   r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm                 ; dst[i] = clamp(residue[i]+128) for an 8x8 fragment
+        ; r0 = unsigned char *_dst
+        ; r1 = int _ystride
+        ; r2 = const ogg_int16_t _residue[64]
+        STMFD   r13!,{r4,r5,r14}
+        MOV     r14,#8                  ; 8 rows
+        MOV     r5, #255
+        SUB     r1, r1, #7              ; stride pre-biased: 7 bytes already advanced per row
+ofrintra_lp_arm
+        LDRSH   r3, [r2], #2
+        LDRSH   r4, [r2], #2
+        LDRSH   r12,[r2], #2
+        ADDS    r3, r3, #128
+        CMPGT   r5, r3
+        EORLT   r3, r5, r3, ASR #32     ; clamp: ASR #32 gives 0 or -1, so 255^it is 255 or ~255 (low byte 0)
+        STRB    r3, [r0], #1
+        ADDS    r4, r4, #128
+        CMPGT   r5, r4
+        EORLT   r4, r5, r4, ASR #32
+        LDRSH   r3, [r2], #2
+        STRB    r4, [r0], #1
+        ADDS    r12,r12,#128
+        CMPGT   r5, r12
+        EORLT   r12,r5, r12,ASR #32
+        LDRSH   r4, [r2], #2
+        STRB    r12,[r0], #1
+        ADDS    r3, r3, #128
+        CMPGT   r5, r3
+        EORLT   r3, r5, r3, ASR #32
+        LDRSH   r12,[r2], #2
+        STRB    r3, [r0], #1
+        ADDS    r4, r4, #128
+        CMPGT   r5, r4
+        EORLT   r4, r5, r4, ASR #32
+        LDRSH   r3, [r2], #2
+        STRB    r4, [r0], #1
+        ADDS    r12,r12,#128
+        CMPGT   r5, r12
+        EORLT   r12,r5, r12,ASR #32
+        LDRSH   r4, [r2], #2
+        STRB    r12,[r0], #1
+        ADDS    r3, r3, #128
+        CMPGT   r5, r3
+        EORLT   r3, r5, r3, ASR #32
+        STRB    r3, [r0], #1
+        ADDS    r4, r4, #128
+        CMPGT   r5, r4
+        EORLT   r4, r5, r4, ASR #32
+        STRB    r4, [r0], r1            ; last pixel of the row advances dst to the next row
+        SUBS    r14,r14,#1
+        BGT     ofrintra_lp_arm
+        LDMFD   r13!,{r4,r5,PC}
+        ENDP
+
+oc_frag_recon_inter_arm PROC            ; dst[i] = clamp(src[i]+residue[i]) for an 8x8 fragment
+        ; r0 = unsigned char *dst
+        ; r1 = const unsigned char *src
+        ; r2 = int ystride
+        ; r3 = const ogg_int16_t residue[64]
+        STMFD   r13!,{r5,r9-r11,r14}
+        MOV     r9, #8                  ; 8 rows
+        MOV     r5, #255
+        SUB     r2, r2, #7
+ofrinter_lp_arm
+        LDRSH   r12,[r3], #2
+        LDRB    r14,[r1], #1
+        LDRSH   r11,[r3], #2
+        LDRB    r10,[r1], #1
+        ADDS    r12,r12,r14
+        CMPGT   r5, r12
+        EORLT   r12,r5, r12,ASR #32     ; same 0/255 clamp trick as the intra version
+        STRB    r12,[r0], #1
+        ADDS    r11,r11,r10
+        CMPGT   r5, r11
+        LDRSH   r12,[r3], #2
+        LDRB    r14,[r1], #1
+        EORLT   r11,r5, r11,ASR #32
+        STRB    r11,[r0], #1
+        ADDS    r12,r12,r14
+        CMPGT   r5, r12
+        LDRSH   r11,[r3], #2
+        LDRB    r10,[r1], #1
+        EORLT   r12,r5, r12,ASR #32
+        STRB    r12,[r0], #1
+        ADDS    r11,r11,r10
+        CMPGT   r5, r11
+        LDRSH   r12,[r3], #2
+        LDRB    r14,[r1], #1
+        EORLT   r11,r5, r11,ASR #32
+        STRB    r11,[r0], #1
+        ADDS    r12,r12,r14
+        CMPGT   r5, r12
+        LDRSH   r11,[r3], #2
+        LDRB    r10,[r1], #1
+        EORLT   r12,r5, r12,ASR #32
+        STRB    r12,[r0], #1
+        ADDS    r11,r11,r10
+        CMPGT   r5, r11
+        LDRSH   r12,[r3], #2
+        LDRB    r14,[r1], #1
+        EORLT   r11,r5, r11,ASR #32
+        STRB    r11,[r0], #1
+        ADDS    r12,r12,r14
+        CMPGT   r5, r12
+        LDRSH   r11,[r3], #2
+        LDRB    r10,[r1], r2            ; last src byte of the row: advance src to next row
+        EORLT   r12,r5, r12,ASR #32
+        STRB    r12,[r0], #1
+        ADDS    r11,r11,r10
+        CMPGT   r5, r11
+        EORLT   r11,r5, r11,ASR #32
+        STRB    r11,[r0], r2
+        SUBS    r9, r9, #1
+        BGT     ofrinter_lp_arm
+        LDMFD   r13!,{r5,r9-r11,PC}
+        ENDP
+
+oc_frag_recon_inter2_arm PROC           ; dst[i] = clamp(residue[i]+((src1[i]+src2[i])>>1))
+        ; r0 = unsigned char *dst
+        ; r1 = const unsigned char *src1
+        ; r2 = const unsigned char *src2
+        ; r3 = int ystride
+        LDR     r12,[r13]
+        ; r12= const ogg_int16_t residue[64]
+        STMFD   r13!,{r4-r8,r14}
+        MOV     r14,#8                  ; 8 rows
+        MOV     r8, #255
+        SUB     r3, r3, #7
+ofrinter2_lp_arm
+        LDRB    r5, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        LDRB    r7, [r1], #1
+        ADD     r5, r5, r6
+        ADDS    r5, r4, r5, LSR #1      ; residue + average of the two predictors
+        CMPGT   r8, r5
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r5, r8, r5, ASR #32     ; 0/255 clamp trick
+        STRB    r5, [r0], #1
+        ADD     r7, r7, r6
+        ADDS    r7, r4, r7, LSR #1
+        CMPGT   r8, r7
+        LDRB    r5, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r7, r8, r7, ASR #32
+        STRB    r7, [r0], #1
+        ADD     r5, r5, r6
+        ADDS    r5, r4, r5, LSR #1
+        CMPGT   r8, r5
+        LDRB    r7, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r5, r8, r5, ASR #32
+        STRB    r5, [r0], #1
+        ADD     r7, r7, r6
+        ADDS    r7, r4, r7, LSR #1
+        CMPGT   r8, r7
+        LDRB    r5, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r7, r8, r7, ASR #32
+        STRB    r7, [r0], #1
+        ADD     r5, r5, r6
+        ADDS    r5, r4, r5, LSR #1
+        CMPGT   r8, r5
+        LDRB    r7, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r5, r8, r5, ASR #32
+        STRB    r5, [r0], #1
+        ADD     r7, r7, r6
+        ADDS    r7, r4, r7, LSR #1
+        CMPGT   r8, r7
+        LDRB    r5, [r1], #1
+        LDRB    r6, [r2], #1
+        LDRSH   r4, [r12],#2
+        EORLT   r7, r8, r7, ASR #32
+        STRB    r7, [r0], #1
+        ADD     r5, r5, r6
+        ADDS    r5, r4, r5, LSR #1
+        CMPGT   r8, r5
+        LDRB    r7, [r1], r3            ; last byte of row: advance src1 to next row
+        LDRB    r6, [r2], r3            ; ...and src2
+        LDRSH   r4, [r12],#2
+        EORLT   r5, r8, r5, ASR #32
+        STRB    r5, [r0], #1
+        ADD     r7, r7, r6
+        ADDS    r7, r4, r7, LSR #1
+        CMPGT   r8, r7
+        EORLT   r7, r8, r7, ASR #32
+        STRB    r7, [r0], r3
+        SUBS    r14,r14,#1
+        BGT     ofrinter2_lp_arm
+        LDMFD   r13!,{r4-r8,PC}
+        ENDP
+
+ [ OC_ARM_ASM_EDSP
+        EXPORT  oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp PROC             ; EDSP variant: LDRD/STRD move 8 bytes (one row) at a time
+        ; r0 = _dst_frame
+        ; r1 = _src_frame
+        ; r2 = _ystride
+        ; r3 = _fragis
+        ; <> = _nfragis
+        ; <> = _frag_buf_offs
+        LDR     r12,[r13]               ; r12 = _nfragis
+        STMFD   r13!,{r4-r11,r14}
+        SUBS    r12, r12, #1
+        LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
+        LDRGE   r14,[r13,#4*10]         ; r14 = _frag_buf_offs
+        BLT     ofcl_edsp_end
+ofcl_edsp_lp
+        MOV     r4, r1
+        LDR     r5, [r14,r5, LSL #2]    ; r5 = _frag_buf_offs[_fragis[fragii]]
+        SUBS    r12, r12, #1
+        ; Stall (on XScale)
+        LDRD    r6, [r4, r5]!           ; r4 = _src_frame+frag_buf_off
+        LDRD    r8, [r4, r2]!
+        ; Stall
+        STRD    r6, [r5, r0]!           ; r5 = _dst_frame+frag_buf_off
+        STRD    r8, [r5, r2]!
+        ; Stall
+        LDRD    r6, [r4, r2]!           ; On Xscale at least, doing 3 consecutive
+        LDRD    r8, [r4, r2]!           ; loads causes a stall, but that's no worse
+        LDRD    r10,[r4, r2]!           ; than us only doing 2, and having to do
+                                        ; another pair of LDRD/STRD later on.
+        ; Stall
+        STRD    r6, [r5, r2]!
+        STRD    r8, [r5, r2]!
+        STRD    r10,[r5, r2]!
+        LDRD    r6, [r4, r2]!
+        LDRD    r8, [r4, r2]!
+        LDRD    r10,[r4, r2]!
+        STRD    r6, [r5, r2]!
+        STRD    r8, [r5, r2]!
+        STRD    r10,[r5, r2]!
+        LDRGE   r5, [r3],#4             ; r5 = _fragis[fragii]
+        BGE     ofcl_edsp_lp
+ofcl_edsp_end
+        LDMFD   r13!,{r4-r11,PC}
+        ENDP
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+        EXPORT  oc_frag_recon_intra_v6
+        EXPORT  oc_frag_recon_inter_v6
+        EXPORT  oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6 PROC             ; ARMv6 SIMD: QADD16 +128 bias, USAT16 clamp, 8 pixels/row
+        ; r0 = unsigned char *_dst
+        ; r1 = int _ystride
+        ; r2 = const ogg_int16_t _residue[64]
+        STMFD   r13!,{r4-r6,r14}
+        MOV     r14,#8                  ; 8 rows
+        MOV     r12,r2
+        LDR     r6, =0x00800080         ; two packed 16-bit +128 biases
+ofrintra_v6_lp
+        LDRD    r2, [r12],#8            ; r2 = 11110000 r3 = 33332222
+        LDRD    r4, [r12],#8            ; r4 = 55554444 r5 = 77776666
+        SUBS    r14,r14,#1
+        QADD16  r2, r2, r6
+        QADD16  r3, r3, r6
+        QADD16  r4, r4, r6
+        QADD16  r5, r5, r6
+        USAT16  r2, #8, r2              ; r2 = __11__00
+        USAT16  r3, #8, r3              ; r3 = __33__22
+        USAT16  r4, #8, r4              ; r4 = __55__44
+        USAT16  r5, #8, r5              ; r5 = __77__66
+        ORR     r2, r2, r2, LSR #8      ; r2 = __111100
+        ORR     r3, r3, r3, LSR #8      ; r3 = __333322
+        ORR     r4, r4, r4, LSR #8      ; r4 = __555544
+        ORR     r5, r5, r5, LSR #8      ; r5 = __777766
+        PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+        PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+        STRD    r2, [r0], r1
+        BGT     ofrintra_v6_lp
+        LDMFD   r13!,{r4-r6,PC}
+        ENDP
+
+oc_frag_recon_inter_v6 PROC            ; ARMv6 SIMD: dst = clamp(src + residue), 8 pixels per iteration
+        ; r0 = unsigned char *_dst
+        ; r1 = const unsigned char *_src
+        ; r2 = int _ystride
+        ; r3 = const ogg_int16_t _residue[64]
+        STMFD   r13!,{r4-r7,r14}
+        MOV     r14,#8                  ; 8 rows
+ofrinter_v6_lp
+        LDRD    r6, [r3], #8            ; r6 = 11110000 r7 = 33332222
+        SUBS    r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+        LDRD    r4, [r1], r2            ; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+        LDR     r5, [r1, #4]
+        LDR     r4, [r1], r2
+ ]
+        PKHBT   r12,r6, r7, LSL #16     ; r12= 22220000
+        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
+        UXTB16  r6,r4                   ; r6 = __22__00
+        UXTB16  r4,r4, ROR #8           ; r4 = __33__11
+        QADD16  r12,r12,r6              ; r12= xx22xx00
+        QADD16  r4, r7, r4              ; r4 = xx33xx11
+        LDRD    r6, [r3], #8            ; r6 = 55554444 r7 = 77776666
+        USAT16  r4, #8, r4              ; r4 = __33__11
+        USAT16  r12,#8,r12              ; r12= __22__00
+        ORR     r4, r12,r4, LSL #8      ; r4 = 33221100
+        PKHBT   r12,r6, r7, LSL #16     ; r12= 66664444
+        PKHTB   r7, r7, r6, ASR #16     ; r7 = 77775555
+        UXTB16  r6,r5                   ; r6 = __66__44
+        UXTB16  r5,r5, ROR #8           ; r5 = __77__55
+        QADD16  r12,r12,r6              ; r12= xx66xx44
+        QADD16  r5, r7, r5              ; r5 = xx77xx55
+        USAT16  r12,#8, r12             ; r12= __66__44
+        USAT16  r5, #8, r5              ; r5 = __77__55
+        ORR     r5, r12,r5, LSL #8      ; r5 = 77665544
+        STRD    r4, [r0], r2
+        BGT     ofrinter_v6_lp
+        LDMFD   r13!,{r4-r7,PC}
+        ENDP
+
+oc_frag_recon_inter2_v6 PROC            ; ARMv6 SIMD: dst = clamp(residue + UHADD8(src1,src2))
+        ; r0 = unsigned char *_dst
+        ; r1 = const unsigned char *_src1
+        ; r2 = const unsigned char *_src2
+        ; r3 = int _ystride
+        LDR     r12,[r13]
+        ; r12= const ogg_int16_t _residue[64]
+        STMFD   r13!,{r4-r9,r14}
+        MOV     r14,#8                  ; 8 rows
+ofrinter2_v6_lp
+        LDRD    r6, [r12,#8]            ; r6 = 55554444 r7 = 77776666
+        SUBS    r14,r14,#1
+        LDR     r4, [r1, #4]            ; Unaligned ; r4 = src1[1] = 77665544
+        LDR     r5, [r2, #4]            ; Unaligned ; r5 = src2[1] = 77665544
+        PKHBT   r8, r6, r7, LSL #16     ; r8 = 66664444
+        PKHTB   r9, r7, r6, ASR #16     ; r9 = 77775555
+        UHADD8  r4, r4, r5              ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+        UXTB16  r5, r4                  ; r5 = __66__44
+        UXTB16  r4, r4, ROR #8          ; r4 = __77__55
+        QADD16  r8, r8, r5              ; r8 = xx66xx44
+        QADD16  r9, r9, r4              ; r9 = xx77xx55
+        LDRD    r6,[r12],#16            ; r6 = 33332222 r7 = 11110000
+        USAT16  r8, #8, r8              ; r8 = __66__44
+        LDR     r4, [r1], r3            ; Unaligned ; r4 = src1[0] = 33221100
+        USAT16  r9, #8, r9              ; r9 = __77__55
+        LDR     r5, [r2], r3            ; Unaligned ; r5 = src2[0] = 33221100
+        ORR     r9, r8, r9, LSL #8      ; r9 = 77665544
+        PKHBT   r8, r6, r7, LSL #16     ; r8 = 22220000
+        UHADD8  r4, r4, r5              ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+        PKHTB   r7, r7, r6, ASR #16     ; r7 = 33331111
+        UXTB16  r5, r4                  ; r5 = __22__00
+        UXTB16  r4, r4, ROR #8          ; r4 = __33__11
+        QADD16  r8, r8, r5              ; r8 = xx22xx00
+        QADD16  r7, r7, r4              ; r7 = xx33xx11
+        USAT16  r8, #8, r8              ; r8 = __22__00
+        USAT16  r7, #8, r7              ; r7 = __33__11
+        ORR     r8, r8, r7, LSL #8      ; r8 = 33221100
+        STRD    r8, [r0], r3
+        BGT     ofrinter2_v6_lp
+        LDMFD   r13!,{r4-r9,PC}
+        ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+        EXPORT  oc_frag_copy_list_neon
+        EXPORT  oc_frag_recon_intra_neon
+        EXPORT  oc_frag_recon_inter_neon
+        EXPORT  oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon PROC             ; NEON: one D register per row, prefetching the next fragment
+        ; r0 = _dst_frame
+        ; r1 = _src_frame
+        ; r2 = _ystride
+        ; r3 = _fragis
+        ; <> = _nfragis
+        ; <> = _frag_buf_offs
+        LDR     r12,[r13]               ; r12 = _nfragis
+        STMFD   r13!,{r4-r7,r14}
+        CMP     r12, #1
+        LDRGE   r6, [r3]                ; r6 = _fragis[fragii]
+        LDRGE   r14,[r13,#4*6]          ; r14 = _frag_buf_offs
+        BLT     ofcl_neon_end
+        ; Stall (2 on Xscale)
+        LDR     r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
+        ; Stall (on XScale)
+        MOV     r7, r6                  ; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+        ADD     r4, r1, r6
+        VLD1.64 {D0}, [r4@64], r2
+        ADD     r5, r0, r6
+        VLD1.64 {D1}, [r4@64], r2
+        SUBS    r12, r12, #1
+        VLD1.64 {D2}, [r4@64], r2
+        LDRGT   r6, [r3,#4]!            ; r6 = _fragis[fragii]
+        VLD1.64 {D3}, [r4@64], r2
+        LDRGT   r6, [r14,r6, LSL #2]    ; r6 = _frag_buf_offs[_fragis[fragii]]
+        VLD1.64 {D4}, [r4@64], r2
+        ADDGT   r7, r1, r6              ; r7 = next fragment's source, for the PLDs below
+        VLD1.64 {D5}, [r4@64], r2
+        PLD     [r7]
+        VLD1.64 {D6}, [r4@64], r2
+        PLD     [r7, r2]
+        VLD1.64 {D7}, [r4@64]
+        PLD     [r7, r2, LSL #1]
+        VST1.64 {D0}, [r5@64], r2
+        ADDGT   r7, r7, r2, LSL #2
+        VST1.64 {D1}, [r5@64], r2
+        PLD     [r7, -r2]
+        VST1.64 {D2}, [r5@64], r2
+        PLD     [r7]
+        VST1.64 {D3}, [r5@64], r2
+        PLD     [r7, r2]
+        VST1.64 {D4}, [r5@64], r2
+        PLD     [r7, r2, LSL #1]
+        VST1.64 {D5}, [r5@64], r2
+        ADDGT   r7, r7, r2, LSL #2
+        VST1.64 {D6}, [r5@64], r2
+        PLD     [r7, -r2]
+        VST1.64 {D7}, [r5@64]
+        BGT     ofcl_neon_lp
+ofcl_neon_end
+        LDMFD   r13!,{r4-r7,PC}
+        ENDP
+
+oc_frag_recon_intra_neon PROC           ; NEON: whole 8x8 residue in Q8-Q15, saturating +128 bias
+        ; r0 = unsigned char *_dst
+        ; r1 = int _ystride
+        ; r2 = const ogg_int16_t _residue[64]
+        MOV     r3, #128
+        VDUP.S16        Q0, r3
+        VLDMIA  r2,  {D16-D31}          ; D16= 3333222211110000 etc ; 9(8) cycles
+        VQADD.S16       Q8, Q8, Q0
+        VQADD.S16       Q9, Q9, Q0
+        VQADD.S16       Q10,Q10,Q0
+        VQADD.S16       Q11,Q11,Q0
+        VQADD.S16       Q12,Q12,Q0
+        VQADD.S16       Q13,Q13,Q0
+        VQADD.S16       Q14,Q14,Q0
+        VQADD.S16       Q15,Q15,Q0
+        VQMOVUN.S16     D16,Q8          ; D16= 7766554433221100 ; 1 cycle
+        VQMOVUN.S16     D17,Q9          ; D17= FFEEDDCCBBAA9988 ; 1 cycle
+        VQMOVUN.S16     D18,Q10         ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
+        VST1.64 {D16},[r0@64], r1
+        VQMOVUN.S16     D19,Q11         ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
+        VST1.64 {D17},[r0@64], r1
+        VQMOVUN.S16     D20,Q12         ; D20= ddccbbaaZZYYXXWW ; 1 cycle
+        VST1.64 {D18},[r0@64], r1
+        VQMOVUN.S16     D21,Q13         ; D21= llkkjjiihhggffee ; 1 cycle
+        VST1.64 {D19},[r0@64], r1
+        VQMOVUN.S16     D22,Q14         ; D22= ttssrrqqppoonnmm ; 1 cycle
+        VST1.64 {D20},[r0@64], r1
+        VQMOVUN.S16     D23,Q15         ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
+        VST1.64 {D21},[r0@64], r1
+        VST1.64 {D22},[r0@64], r1
+        VST1.64 {D23},[r0@64], r1
+        MOV     PC,R14
+        ENDP
+
+oc_frag_recon_inter_neon PROC           ; NEON: dst = clamp(src + residue)
+        ; r0 = unsigned char *_dst
+        ; r1 = const unsigned char *_src
+        ; r2 = int _ystride
+        ; r3 = const ogg_int16_t _residue[64]
+        VLDMIA  r3, {D16-D31}           ; D16= 3333222211110000 etc ; 9(8) cycles
+        VLD1.64 {D0}, [r1], r2
+        VLD1.64 {D2}, [r1], r2
+        VMOVL.U8        Q0, D0          ; Q0 = __77__66__55__44__33__22__11__00
+        VLD1.64 {D4}, [r1], r2
+        VMOVL.U8        Q1, D2          ; etc
+        VLD1.64 {D6}, [r1], r2
+        VMOVL.U8        Q2, D4
+        VMOVL.U8        Q3, D6
+        VQADD.S16       Q8, Q8, Q0
+        VLD1.64 {D0}, [r1], r2
+        VQADD.S16       Q9, Q9, Q1
+        VLD1.64 {D2}, [r1], r2
+        VQADD.S16       Q10,Q10,Q2
+        VLD1.64 {D4}, [r1], r2
+        VQADD.S16       Q11,Q11,Q3
+        VLD1.64 {D6}, [r1], r2
+        VMOVL.U8        Q0, D0
+        VMOVL.U8        Q1, D2
+        VMOVL.U8        Q2, D4
+        VMOVL.U8        Q3, D6
+        VQADD.S16       Q12,Q12,Q0
+        VQADD.S16       Q13,Q13,Q1
+        VQADD.S16       Q14,Q14,Q2
+        VQADD.S16       Q15,Q15,Q3
+        VQMOVUN.S16     D16,Q8
+        VQMOVUN.S16     D17,Q9
+        VQMOVUN.S16     D18,Q10
+        VST1.64 {D16},[r0@64], r2
+        VQMOVUN.S16     D19,Q11
+        VST1.64 {D17},[r0@64], r2
+        VQMOVUN.S16     D20,Q12
+        VST1.64 {D18},[r0@64], r2
+        VQMOVUN.S16     D21,Q13
+        VST1.64 {D19},[r0@64], r2
+        VQMOVUN.S16     D22,Q14
+        VST1.64 {D20},[r0@64], r2
+        VQMOVUN.S16     D23,Q15
+        VST1.64 {D21},[r0@64], r2
+        VST1.64 {D22},[r0@64], r2
+        VST1.64 {D23},[r0@64], r2
+        MOV     PC,R14
+        ENDP
+
+oc_frag_recon_inter2_neon PROC          ; NEON: dst = clamp(residue + VHADD(src1,src2))
+        ; r0 = unsigned char *_dst
+        ; r1 = const unsigned char *_src1
+        ; r2 = const unsigned char *_src2
+        ; r3 = int _ystride
+        LDR     r12,[r13]
+        ; r12= const ogg_int16_t _residue[64]
+        VLDMIA  r12,{D16-D31}
+        VLD1.64 {D0}, [r1], r3
+        VLD1.64 {D4}, [r2], r3
+        VLD1.64 {D1}, [r1], r3
+        VLD1.64 {D5}, [r2], r3
+        VHADD.U8        Q2, Q0, Q2      ; Q2 = FFEEDDCCBBAA99887766554433221100
+        VLD1.64 {D2}, [r1], r3
+        VLD1.64 {D6}, [r2], r3
+        VMOVL.U8        Q0, D4          ; Q0 = __77__66__55__44__33__22__11__00
+        VLD1.64 {D3}, [r1], r3
+        VMOVL.U8        Q2, D5          ; etc
+        VLD1.64 {D7}, [r2], r3
+        VHADD.U8        Q3, Q1, Q3
+        VQADD.S16       Q8, Q8, Q0
+        VQADD.S16       Q9, Q9, Q2
+        VLD1.64 {D0}, [r1], r3
+        VMOVL.U8        Q1, D6
+        VLD1.64 {D4}, [r2], r3
+        VMOVL.U8        Q3, D7
+        VLD1.64 {D1}, [r1], r3
+        VQADD.S16       Q10,Q10,Q1
+        VLD1.64 {D5}, [r2], r3
+        VQADD.S16       Q11,Q11,Q3
+        VLD1.64 {D2}, [r1], r3
+        VHADD.U8        Q2, Q0, Q2
+        VLD1.64 {D6}, [r2], r3
+        VLD1.64 {D3}, [r1], r3
+        VMOVL.U8        Q0, D4
+        VLD1.64 {D7}, [r2], r3
+        VMOVL.U8        Q2, D5
+        VHADD.U8        Q3, Q1, Q3
+        VQADD.S16       Q12,Q12,Q0
+        VQADD.S16       Q13,Q13,Q2
+        VMOVL.U8        Q1, D6
+        VMOVL.U8        Q3, D7
+        VQADD.S16       Q14,Q14,Q1
+        VQADD.S16       Q15,Q15,Q3
+        VQMOVUN.S16     D16,Q8
+        VQMOVUN.S16     D17,Q9
+        VQMOVUN.S16     D18,Q10
+        VST1.64 {D16},[r0@64], r3
+        VQMOVUN.S16     D19,Q11
+        VST1.64 {D17},[r0@64], r3
+        VQMOVUN.S16     D20,Q12
+        VST1.64 {D18},[r0@64], r3
+        VQMOVUN.S16     D21,Q13
+        VST1.64 {D19},[r0@64], r3
+        VQMOVUN.S16     D22,Q14
+        VST1.64 {D20},[r0@64], r3
+        VQMOVUN.S16     D23,Q15
+        VST1.64 {D21},[r0@64], r3
+        VST1.64 {D22},[r0@64], r3
+        VST1.64 {D23},[r0@64], r3
+        MOV     PC,R14
+        ENDP
+ ]
+
+        END