media/libtheora/lib/arm/armfrag.s

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

        AREA |.text|, CODE, READONLY

; Explicitly specifying alignment here because some versions of
; gas don't align code correctly. See
; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
        ALIGN

        GET armopts.s

; Vanilla ARM v4 versions
        EXPORT oc_frag_copy_list_arm
        EXPORT oc_frag_recon_intra_arm
        EXPORT oc_frag_recon_inter_arm
        EXPORT oc_frag_recon_inter2_arm

oc_frag_copy_list_arm PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
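        ; Copies the 8x8 fragments listed in _fragis from the source frame
        ; to the destination frame, eight rows of eight bytes each, moved
        ; as pairs of 32-bit words. A plausible C-level view of this entry
        ; point, inferred from the register comments above (assumption;
        ; the authoritative declaration in the library's C headers may
        ; differ):
        ;   void oc_frag_copy_list_arm(unsigned char *_dst_frame,
        ;    const unsigned char *_src_frame,int _ystride,
        ;    const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
        ;    const ptrdiff_t *_frag_buf_offs);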
        LDR r12,[r13] ; r12 = _nfragis
        STMFD r13!,{r4-r6,r11,r14}
        SUBS r12, r12, #1
        LDR r4,[r3],#4 ; r4 = _fragis[fragii]
        LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs
        BLT ofcl_arm_end
        SUB r2, r2, #4
ofcl_arm_lp
        LDR r11,[r14,r4,LSL #2] ; r11 = _frag_buf_offs[_fragis[fragii]]
        SUBS r12, r12, #1
        ; Stall (on XScale)
        ADD r4, r1, r11 ; r4 = _src_frame+frag_buf_off
        LDR r6, [r4], #4
        ADD r11,r0, r11 ; r11 = _dst_frame+frag_buf_off
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4], r2
        STR r6, [r11],#4
        LDR r6, [r4], #4
        STR r5, [r11],r2
        LDR r5, [r4]
        LDRGE r4,[r3],#4 ; r4 = _fragis[fragii]
        STR r6, [r11],#4
        STR r5, [r11]
        BGE ofcl_arm_lp
ofcl_arm_end
        LDMFD r13!,{r4-r6,r11,PC}
oc_frag_recon_intra_arm
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
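        ; Reconstructs an intra-coded 8x8 fragment: each residual gets a
        ; +128 bias and is clamped to [0,255] before being stored. A rough
        ; C equivalent of the loop below (a sketch inferred from the code,
        ; not the canonical reference implementation):
        ;   for(i=0;i<8;i++){
        ;     for(j=0;j<8;j++){
        ;       int p=_residue[i*8+j]+128;
        ;       _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
        ;     }
        ;     _dst+=_ystride;
        ;   }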
        STMFD r13!,{r4,r5,r14}
        MOV r14,#8
        MOV r5, #255
        SUB r1, r1, #7
ofrintra_lp_arm
        LDRSH r3, [r2], #2
        LDRSH r4, [r2], #2
        LDRSH r12,[r2], #2
        ADDS r3, r3, #128
        CMPGT r5, r3
        EORLT r3, r5, r3, ASR #32
        STRB r3, [r0], #1
        ADDS r4, r4, #128
        CMPGT r5, r4
        EORLT r4, r5, r4, ASR #32
        LDRSH r3, [r2], #2
        STRB r4, [r0], #1
        ADDS r12,r12,#128
        CMPGT r5, r12
        EORLT r12,r5, r12,ASR #32
        LDRSH r4, [r2], #2
        STRB r12,[r0], #1
        ADDS r3, r3, #128
        CMPGT r5, r3
        EORLT r3, r5, r3, ASR #32
        LDRSH r12,[r2], #2
        STRB r3, [r0], #1
        ADDS r4, r4, #128
        CMPGT r5, r4
        EORLT r4, r5, r4, ASR #32
        LDRSH r3, [r2], #2
        STRB r4, [r0], #1
        ADDS r12,r12,#128
        CMPGT r5, r12
        EORLT r12,r5, r12,ASR #32
        LDRSH r4, [r2], #2
        STRB r12,[r0], #1
        ADDS r3, r3, #128
        CMPGT r5, r3
        EORLT r3, r5, r3, ASR #32
        STRB r3, [r0], #1
        ADDS r4, r4, #128
        CMPGT r5, r4
        EORLT r4, r5, r4, ASR #32
        STRB r4, [r0], r1
        SUBS r14,r14,#1
        BGT ofrintra_lp_arm
        LDMFD r13!,{r4,r5,PC}
        ENDP

oc_frag_recon_inter_arm PROC
        ; r0 = unsigned char *dst
        ; r1 = const unsigned char *src
        ; r2 = int ystride
        ; r3 = const ogg_int16_t residue[64]
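        ; Reconstructs an inter-coded fragment from a single predictor:
        ; dst = clamp(src + residue). Roughly, per pixel (a sketch
        ; inferred from the code below):
        ;   int p=src[j]+residue[i*8+j];
        ;   dst[j]=(unsigned char)(p<0?0:p>255?255:p);
        ; with src and dst stepping by ystride from one row to the next.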
        STMFD r13!,{r5,r9-r11,r14}
        MOV r9, #8
        MOV r5, #255
        SUB r2, r2, #7
ofrinter_lp_arm
        LDRSH r12,[r3], #2
        LDRB r14,[r1], #1
        LDRSH r11,[r3], #2
        LDRB r10,[r1], #1
        ADDS r12,r12,r14
        CMPGT r5, r12
        EORLT r12,r5, r12,ASR #32
        STRB r12,[r0], #1
        ADDS r11,r11,r10
        CMPGT r5, r11
        LDRSH r12,[r3], #2
        LDRB r14,[r1], #1
        EORLT r11,r5, r11,ASR #32
        STRB r11,[r0], #1
        ADDS r12,r12,r14
        CMPGT r5, r12
        LDRSH r11,[r3], #2
        LDRB r10,[r1], #1
        EORLT r12,r5, r12,ASR #32
        STRB r12,[r0], #1
        ADDS r11,r11,r10
        CMPGT r5, r11
        LDRSH r12,[r3], #2
        LDRB r14,[r1], #1
        EORLT r11,r5, r11,ASR #32
        STRB r11,[r0], #1
        ADDS r12,r12,r14
        CMPGT r5, r12
        LDRSH r11,[r3], #2
        LDRB r10,[r1], #1
        EORLT r12,r5, r12,ASR #32
        STRB r12,[r0], #1
        ADDS r11,r11,r10
        CMPGT r5, r11
        LDRSH r12,[r3], #2
        LDRB r14,[r1], #1
        EORLT r11,r5, r11,ASR #32
        STRB r11,[r0], #1
        ADDS r12,r12,r14
        CMPGT r5, r12
        LDRSH r11,[r3], #2
        LDRB r10,[r1], r2
        EORLT r12,r5, r12,ASR #32
        STRB r12,[r0], #1
        ADDS r11,r11,r10
        CMPGT r5, r11
        EORLT r11,r5, r11,ASR #32
        STRB r11,[r0], r2
        SUBS r9, r9, #1
        BGT ofrinter_lp_arm
        LDMFD r13!,{r5,r9-r11,PC}
        ENDP

oc_frag_recon_inter2_arm PROC
        ; r0 = unsigned char *dst
        ; r1 = const unsigned char *src1
        ; r2 = const unsigned char *src2
        ; r3 = int ystride
        LDR r12,[r13]
        ; r12= const ogg_int16_t residue[64]
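        ; Reconstructs an inter-coded fragment from two predictors: the
        ; two source pixels are averaged with truncation before the
        ; residual is added and the sum is clamped. Roughly, per pixel (a
        ; sketch inferred from the code below):
        ;   int p=((src1[j]+src2[j])>>1)+residue[i*8+j];
        ;   dst[j]=(unsigned char)(p<0?0:p>255?255:p);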
        STMFD r13!,{r4-r8,r14}
        MOV r14,#8
        MOV r8, #255
        SUB r3, r3, #7
ofrinter2_lp_arm
        LDRB r5, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        LDRB r7, [r1], #1
        ADD r5, r5, r6
        ADDS r5, r4, r5, LSR #1
        CMPGT r8, r5
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r5, r8, r5, ASR #32
        STRB r5, [r0], #1
        ADD r7, r7, r6
        ADDS r7, r4, r7, LSR #1
        CMPGT r8, r7
        LDRB r5, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r7, r8, r7, ASR #32
        STRB r7, [r0], #1
        ADD r5, r5, r6
        ADDS r5, r4, r5, LSR #1
        CMPGT r8, r5
        LDRB r7, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r5, r8, r5, ASR #32
        STRB r5, [r0], #1
        ADD r7, r7, r6
        ADDS r7, r4, r7, LSR #1
        CMPGT r8, r7
        LDRB r5, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r7, r8, r7, ASR #32
        STRB r7, [r0], #1
        ADD r5, r5, r6
        ADDS r5, r4, r5, LSR #1
        CMPGT r8, r5
        LDRB r7, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r5, r8, r5, ASR #32
        STRB r5, [r0], #1
        ADD r7, r7, r6
        ADDS r7, r4, r7, LSR #1
        CMPGT r8, r7
        LDRB r5, [r1], #1
        LDRB r6, [r2], #1
        LDRSH r4, [r12],#2
        EORLT r7, r8, r7, ASR #32
        STRB r7, [r0], #1
        ADD r5, r5, r6
        ADDS r5, r4, r5, LSR #1
        CMPGT r8, r5
        LDRB r7, [r1], r3
        LDRB r6, [r2], r3
        LDRSH r4, [r12],#2
        EORLT r5, r8, r5, ASR #32
        STRB r5, [r0], #1
        ADD r7, r7, r6
        ADDS r7, r4, r7, LSR #1
        CMPGT r8, r7
        EORLT r7, r8, r7, ASR #32
        STRB r7, [r0], r3
        SUBS r14,r14,#1
        BGT ofrinter2_lp_arm
        LDMFD r13!,{r4-r8,PC}
        ENDP

 [ OC_ARM_ASM_EDSP
        EXPORT oc_frag_copy_list_edsp

oc_frag_copy_list_edsp PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
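        ; Same interface as oc_frag_copy_list_arm above. This ARMv5TE
        ; variant moves each 8-byte fragment row as a single LDRD/STRD
        ; pair instead of two 32-bit loads and stores.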
        LDR r12,[r13] ; r12 = _nfragis
        STMFD r13!,{r4-r11,r14}
        SUBS r12, r12, #1
        LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
        LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs
        BLT ofcl_edsp_end
ofcl_edsp_lp
        MOV r4, r1
        LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]]
        SUBS r12, r12, #1
        ; Stall (on XScale)
        LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off
        LDRD r8, [r4, r2]!
        ; Stall
        STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off
        STRD r8, [r5, r2]!
        ; Stall
        LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive
        LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse
        LDRD r10,[r4, r2]! ; than us only doing 2, and having to do
        ; another pair of LDRD/STRD later on.
        ; Stall
        STRD r6, [r5, r2]!
        STRD r8, [r5, r2]!
        STRD r10,[r5, r2]!
        LDRD r6, [r4, r2]!
        LDRD r8, [r4, r2]!
        LDRD r10,[r4, r2]!
        STRD r6, [r5, r2]!
        STRD r8, [r5, r2]!
        STRD r10,[r5, r2]!
        LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
        BGE ofcl_edsp_lp
ofcl_edsp_end
        LDMFD r13!,{r4-r11,PC}
        ENDP
 ]

 [ OC_ARM_ASM_MEDIA
        EXPORT oc_frag_recon_intra_v6
        EXPORT oc_frag_recon_inter_v6
        EXPORT oc_frag_recon_inter2_v6

oc_frag_recon_intra_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
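        ; ARMv6 SIMD version of the intra reconstruction above: each LDRD
        ; brings in four 16-bit residuals, QADD16 adds the +128 bias with
        ; signed saturation, USAT16 clamps both halfwords to [0,255], and
        ; the ORR/PKHBT pair packs eight clamped bytes into two words for
        ; a single STRD per row.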
        STMFD r13!,{r4-r6,r14}
        MOV r14,#8
        MOV r12,r2
        LDR r6, =0x00800080
ofrintra_v6_lp
        LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222
        LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666
        SUBS r14,r14,#1
        QADD16 r2, r2, r6
        QADD16 r3, r3, r6
        QADD16 r4, r4, r6
        QADD16 r5, r5, r6
        USAT16 r2, #8, r2 ; r2 = __11__00
        USAT16 r3, #8, r3 ; r3 = __33__22
        USAT16 r4, #8, r4 ; r4 = __55__44
        USAT16 r5, #8, r5 ; r5 = __77__66
        ORR r2, r2, r2, LSR #8 ; r2 = __111100
        ORR r3, r3, r3, LSR #8 ; r3 = __333322
        ORR r4, r4, r4, LSR #8 ; r4 = __555544
        ORR r5, r5, r5, LSR #8 ; r5 = __777766
        PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100
        PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544
        STRD r2, [r0], r1
        BGT ofrintra_v6_lp
        LDMFD r13!,{r4-r6,PC}
        ENDP

oc_frag_recon_inter_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src
        ; r2 = int _ystride
        ; r3 = const ogg_int16_t _residue[64]
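        ; ARMv6 SIMD version of the inter reconstruction: each row of
        ; source bytes is widened to 16 bits with UXTB16, added to the
        ; residuals with saturating QADD16, clamped with USAT16, and
        ; repacked to eight bytes for a single STRD.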
        STMFD r13!,{r4-r7,r14}
        MOV r14,#8
ofrinter_v6_lp
        LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222
        SUBS r14,r14,#1
 [ OC_ARM_CAN_UNALIGN_LDRD
        LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544
 |
        LDR r5, [r1, #4]
        LDR r4, [r1], r2
 ]
        PKHBT r12,r6, r7, LSL #16 ; r12= 22220000
        PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
        UXTB16 r6,r4 ; r6 = __22__00
        UXTB16 r4,r4, ROR #8 ; r4 = __33__11
        QADD16 r12,r12,r6 ; r12= xx22xx00
        QADD16 r4, r7, r4 ; r4 = xx33xx11
        LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666
        USAT16 r4, #8, r4 ; r4 = __33__11
        USAT16 r12,#8,r12 ; r12= __22__00
        ORR r4, r12,r4, LSL #8 ; r4 = 33221100
        PKHBT r12,r6, r7, LSL #16 ; r12= 66664444
        PKHTB r7, r7, r6, ASR #16 ; r7 = 77775555
        UXTB16 r6,r5 ; r6 = __66__44
        UXTB16 r5,r5, ROR #8 ; r5 = __77__55
        QADD16 r12,r12,r6 ; r12= xx66xx44
        QADD16 r5, r7, r5 ; r5 = xx77xx55
        USAT16 r12,#8, r12 ; r12= __66__44
        USAT16 r5, #8, r5 ; r5 = __77__55
        ORR r5, r12,r5, LSL #8 ; r5 = 77665544
        STRD r4, [r0], r2
        BGT ofrinter_v6_lp
        LDMFD r13!,{r4-r7,PC}
        ENDP

oc_frag_recon_inter2_v6 PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src1
        ; r2 = const unsigned char *_src2
        ; r3 = int _ystride
        LDR r12,[r13]
        ; r12= const ogg_int16_t _residue[64]
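        ; ARMv6 SIMD version of the two-predictor reconstruction: UHADD8
        ; averages the two source rows bytewise (truncating), then the
        ; result is widened, added to the residuals with QADD16, clamped
        ; with USAT16, and repacked as in the single-predictor case.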
        STMFD r13!,{r4-r9,r14}
        MOV r14,#8
ofrinter2_v6_lp
        LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666
        SUBS r14,r14,#1
        LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544
        LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544
        PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444
        PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555
        UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
        UXTB16 r5, r4 ; r5 = __66__44
        UXTB16 r4, r4, ROR #8 ; r4 = __77__55
        QADD16 r8, r8, r5 ; r8 = xx66xx44
        QADD16 r9, r9, r4 ; r9 = xx77xx55
        LDRD r6,[r12],#16 ; r6 = 11110000 r7 = 33332222
        USAT16 r8, #8, r8 ; r8 = __66__44
        LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100
        USAT16 r9, #8, r9 ; r9 = __77__55
        LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100
        ORR r9, r8, r9, LSL #8 ; r9 = 77665544
        PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000
        UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
        PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
        UXTB16 r5, r4 ; r5 = __22__00
        UXTB16 r4, r4, ROR #8 ; r4 = __33__11
        QADD16 r8, r8, r5 ; r8 = xx22xx00
        QADD16 r7, r7, r4 ; r7 = xx33xx11
        USAT16 r8, #8, r8 ; r8 = __22__00
        USAT16 r7, #8, r7 ; r7 = __33__11
        ORR r8, r8, r7, LSL #8 ; r8 = 33221100
        STRD r8, [r0], r3
        BGT ofrinter2_v6_lp
        LDMFD r13!,{r4-r9,PC}
        ENDP
 ]

 [ OC_ARM_ASM_NEON
        EXPORT oc_frag_copy_list_neon
        EXPORT oc_frag_recon_intra_neon
        EXPORT oc_frag_recon_inter_neon
        EXPORT oc_frag_recon_inter2_neon

oc_frag_copy_list_neon PROC
        ; r0 = _dst_frame
        ; r1 = _src_frame
        ; r2 = _ystride
        ; r3 = _fragis
        ; <> = _nfragis
        ; <> = _frag_buf_offs
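        ; Same interface as oc_frag_copy_list_arm above. Each fragment is
        ; copied with eight 64-bit NEON loads and stores; the PLD
        ; instructions interleaved with the stores prefetch the next
        ; fragment's source rows while the current one is written out.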
        LDR r12,[r13] ; r12 = _nfragis
        STMFD r13!,{r4-r7,r14}
        CMP r12, #1
        LDRGE r6, [r3] ; r6 = _fragis[fragii]
        LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs
        BLT ofcl_neon_end
        ; Stall (2 on Xscale)
        LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
        ; Stall (on XScale)
        MOV r7, r6 ; Guarantee PLD points somewhere valid.
ofcl_neon_lp
        ADD r4, r1, r6
        VLD1.64 {D0}, [r4@64], r2
        ADD r5, r0, r6
        VLD1.64 {D1}, [r4@64], r2
        SUBS r12, r12, #1
        VLD1.64 {D2}, [r4@64], r2
        LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii]
        VLD1.64 {D3}, [r4@64], r2
        LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
        VLD1.64 {D4}, [r4@64], r2
        ADDGT r7, r1, r6
        VLD1.64 {D5}, [r4@64], r2
        PLD [r7]
        VLD1.64 {D6}, [r4@64], r2
        PLD [r7, r2]
        VLD1.64 {D7}, [r4@64]
        PLD [r7, r2, LSL #1]
        VST1.64 {D0}, [r5@64], r2
        ADDGT r7, r7, r2, LSL #2
        VST1.64 {D1}, [r5@64], r2
        PLD [r7, -r2]
        VST1.64 {D2}, [r5@64], r2
        PLD [r7]
        VST1.64 {D3}, [r5@64], r2
        PLD [r7, r2]
        VST1.64 {D4}, [r5@64], r2
        PLD [r7, r2, LSL #1]
        VST1.64 {D5}, [r5@64], r2
        ADDGT r7, r7, r2, LSL #2
        VST1.64 {D6}, [r5@64], r2
        PLD [r7, -r2]
        VST1.64 {D7}, [r5@64]
        BGT ofcl_neon_lp
ofcl_neon_end
        LDMFD r13!,{r4-r7,PC}
        ENDP

oc_frag_recon_intra_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = int _ystride
        ; r2 = const ogg_int16_t _residue[64]
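        ; NEON version of the intra reconstruction: all 64 residuals are
        ; loaded into Q8-Q15, biased by +128 with saturating VQADD.S16,
        ; narrowed to unsigned bytes with VQMOVUN.S16 (which also clamps
        ; to [0,255]), and stored one 8-byte row at a time.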
        MOV r3, #128
        VDUP.S16 Q0, r3
        VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
        VQADD.S16 Q8, Q8, Q0
        VQADD.S16 Q9, Q9, Q0
        VQADD.S16 Q10,Q10,Q0
        VQADD.S16 Q11,Q11,Q0
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q0
        VQADD.S16 Q14,Q14,Q0
        VQADD.S16 Q15,Q15,Q0
        VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle
        VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle
        VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
        VST1.64 {D16},[r0@64], r1
        VQMOVUN.S16 D19,Q11 ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
        VST1.64 {D17},[r0@64], r1
        VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle
        VST1.64 {D18},[r0@64], r1
        VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle
        VST1.64 {D19},[r0@64], r1
        VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle
        VST1.64 {D20},[r0@64], r1
        VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
        VST1.64 {D21},[r0@64], r1
        VST1.64 {D22},[r0@64], r1
        VST1.64 {D23},[r0@64], r1
        MOV PC,R14
        ENDP

oc_frag_recon_inter_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src
        ; r2 = int _ystride
        ; r3 = const ogg_int16_t _residue[64]
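        ; NEON version of the inter reconstruction: source rows are
        ; widened to 16 bits with VMOVL.U8, added to the residuals with
        ; saturating VQADD.S16, and narrowed back to clamped bytes with
        ; VQMOVUN.S16.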
        VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
        VLD1.64 {D0}, [r1], r2
        VLD1.64 {D2}, [r1], r2
        VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64 {D4}, [r1], r2
        VMOVL.U8 Q1, D2 ; etc
        VLD1.64 {D6}, [r1], r2
        VMOVL.U8 Q2, D4
        VMOVL.U8 Q3, D6
        VQADD.S16 Q8, Q8, Q0
        VLD1.64 {D0}, [r1], r2
        VQADD.S16 Q9, Q9, Q1
        VLD1.64 {D2}, [r1], r2
        VQADD.S16 Q10,Q10,Q2
        VLD1.64 {D4}, [r1], r2
        VQADD.S16 Q11,Q11,Q3
        VLD1.64 {D6}, [r1], r2
        VMOVL.U8 Q0, D0
        VMOVL.U8 Q1, D2
        VMOVL.U8 Q2, D4
        VMOVL.U8 Q3, D6
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q1
        VQADD.S16 Q14,Q14,Q2
        VQADD.S16 Q15,Q15,Q3
        VQMOVUN.S16 D16,Q8
        VQMOVUN.S16 D17,Q9
        VQMOVUN.S16 D18,Q10
        VST1.64 {D16},[r0@64], r2
        VQMOVUN.S16 D19,Q11
        VST1.64 {D17},[r0@64], r2
        VQMOVUN.S16 D20,Q12
        VST1.64 {D18},[r0@64], r2
        VQMOVUN.S16 D21,Q13
        VST1.64 {D19},[r0@64], r2
        VQMOVUN.S16 D22,Q14
        VST1.64 {D20},[r0@64], r2
        VQMOVUN.S16 D23,Q15
        VST1.64 {D21},[r0@64], r2
        VST1.64 {D22},[r0@64], r2
        VST1.64 {D23},[r0@64], r2
        MOV PC,R14
        ENDP

oc_frag_recon_inter2_neon PROC
        ; r0 = unsigned char *_dst
        ; r1 = const unsigned char *_src1
        ; r2 = const unsigned char *_src2
        ; r3 = int _ystride
        LDR r12,[r13]
        ; r12= const ogg_int16_t _residue[64]
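        ; NEON version of the two-predictor reconstruction: VHADD.U8
        ; averages the two source rows (truncating), then the result is
        ; widened, added to the residuals, and narrowed back to clamped
        ; bytes exactly as in the single-predictor case above.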
        VLDMIA r12,{D16-D31}
        VLD1.64 {D0}, [r1], r3
        VLD1.64 {D4}, [r2], r3
        VLD1.64 {D1}, [r1], r3
        VLD1.64 {D5}, [r2], r3
        VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100
        VLD1.64 {D2}, [r1], r3
        VLD1.64 {D6}, [r2], r3
        VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64 {D3}, [r1], r3
        VMOVL.U8 Q2, D5 ; etc
        VLD1.64 {D7}, [r2], r3
        VHADD.U8 Q3, Q1, Q3
        VQADD.S16 Q8, Q8, Q0
        VQADD.S16 Q9, Q9, Q2
        VLD1.64 {D0}, [r1], r3
        VMOVL.U8 Q1, D6
        VLD1.64 {D4}, [r2], r3
        VMOVL.U8 Q3, D7
        VLD1.64 {D1}, [r1], r3
        VQADD.S16 Q10,Q10,Q1
        VLD1.64 {D5}, [r2], r3
        VQADD.S16 Q11,Q11,Q3
        VLD1.64 {D2}, [r1], r3
        VHADD.U8 Q2, Q0, Q2
        VLD1.64 {D6}, [r2], r3
        VLD1.64 {D3}, [r1], r3
        VMOVL.U8 Q0, D4
        VLD1.64 {D7}, [r2], r3
        VMOVL.U8 Q2, D5
        VHADD.U8 Q3, Q1, Q3
        VQADD.S16 Q12,Q12,Q0
        VQADD.S16 Q13,Q13,Q2
        VMOVL.U8 Q1, D6
        VMOVL.U8 Q3, D7
        VQADD.S16 Q14,Q14,Q1
        VQADD.S16 Q15,Q15,Q3
        VQMOVUN.S16 D16,Q8
        VQMOVUN.S16 D17,Q9
        VQMOVUN.S16 D18,Q10
        VST1.64 {D16},[r0@64], r3
        VQMOVUN.S16 D19,Q11
        VST1.64 {D17},[r0@64], r3
        VQMOVUN.S16 D20,Q12
        VST1.64 {D18},[r0@64], r3
        VQMOVUN.S16 D21,Q13
        VST1.64 {D19},[r0@64], r3
        VQMOVUN.S16 D22,Q14
        VST1.64 {D20},[r0@64], r3
        VQMOVUN.S16 D23,Q15
        VST1.64 {D21},[r0@64], r3
        VST1.64 {D22},[r0@64], r3
        VST1.64 {D23},[r0@64], r3
        MOV PC,R14
        ENDP
 ]

        END
