media/libtheora/lib/arm/armloop.s

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 ;********************************************************************
michael@0 2 ;* *
michael@0 3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 ;* *
michael@0 8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
michael@0 9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 ;* *
michael@0 11 ;********************************************************************
michael@0 12 ; Original implementation:
michael@0 13 ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
michael@0 14 ; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
michael@0 15 ;********************************************************************
michael@0 16
michael@0 17 AREA |.text|, CODE, READONLY
michael@0 18
michael@0 19 ; Explicitly specifying alignment here because some versions of
michael@0 20 ; gas don't align code correctly. See
michael@0 21 ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
michael@0 22 ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
michael@0 23 ALIGN
michael@0 24
michael@0 25 GET armopts.s
michael@0 26
michael@0 27 EXPORT oc_loop_filter_frag_rows_arm
michael@0 28
michael@0 29 ; Which bit this is depends on the order of packing within a bitfield.
michael@0 30 ; Hopefully that doesn't change among any of the relevant compilers.
michael@0 31 OC_FRAG_CODED_FLAG * 1
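;
; A hedged C illustration of that assumption (field names are illustrative;
; the code below simply tests bit 0 of the first word of each oc_fragment):
;   /* struct oc_fragment{unsigned coded:1;unsigned invalid:1; ... }; */
;   coded=(*(const int *)&_frags[fragi])&OC_FRAG_CODED_FLAG;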
michael@0 32
michael@0 33 ; Vanilla ARM v4 version
michael@0 34 loop_filter_h_arm PROC
michael@0 35 ; r0 = unsigned char *_pix
michael@0 36 ; r1 = int _ystride
michael@0 37 ; r2 = int *_bv
michael@0 38 ; preserves r0-r3
michael@0 39 STMFD r13!,{r3-r6,r14}
michael@0 40 MOV r14,#8
michael@0 41 MOV r6, #255
michael@0 42 lfh_arm_lp
michael@0 43 LDRB r3, [r0, #-2] ; r3 = _pix[0]
michael@0 44 LDRB r12,[r0, #1] ; r12= _pix[3]
michael@0 45 LDRB r4, [r0, #-1] ; r4 = _pix[1]
michael@0 46 LDRB r5, [r0] ; r5 = _pix[2]
michael@0 47 SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
michael@0 48 ADD r3, r3, #4
michael@0 49 SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
michael@0 50 ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
michael@0 51 ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
michael@0 52 MOV r12,r12,ASR #3
michael@0 53 LDRSB r12,[r2, r12]
michael@0 54 ; Stall (2 on Xscale)
michael@0 55 ADDS r4, r4, r12
michael@0 56 CMPGT r6, r4
michael@0 57 EORLT r4, r6, r4, ASR #32
michael@0 58 SUBS r5, r5, r12
michael@0 59 CMPGT r6, r5
michael@0 60 EORLT r5, r6, r5, ASR #32
michael@0 61 STRB r4, [r0, #-1]
michael@0 62 STRB r5, [r0], r1
michael@0 63 SUBS r14,r14,#1
michael@0 64 BGT lfh_arm_lp
michael@0 65 SUB r0, r0, r1, LSL #3
michael@0 66 LDMFD r13!,{r3-r6,PC}
michael@0 67 ENDP
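;
; A hedged C sketch of what loop_filter_h_arm computes (it mirrors the
; register comments above; the prototype and OC_CLAMP255() are assumptions,
; and _bv has already been offset by +127 by the caller so that it can be
; indexed directly by the signed filter value):
;   static void loop_filter_h_c(unsigned char *_pix,int _ystride,
;    const signed char *_bv){
;     int y;
;     _pix-=2;
;     for(y=0;y<8;y++){
;       int f;
;       f=_bv[(_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4)>>3];
;       _pix[1]=OC_CLAMP255(_pix[1]+f);
;       _pix[2]=OC_CLAMP255(_pix[2]-f);
;       _pix+=_ystride;
;     }
;   }
; loop_filter_v_arm below applies the same filter across a horizontal edge:
; the four taps are read down a column (offsets in units of _ystride) and
; the loop steps one pixel to the right per iteration.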
michael@0 68
michael@0 69 loop_filter_v_arm PROC
michael@0 70 ; r0 = unsigned char *_pix
michael@0 71 ; r1 = int _ystride
michael@0 72 ; r2 = int *_bv
michael@0 73 ; preserves r0-r3
michael@0 74 STMFD r13!,{r3-r6,r14}
michael@0 75 MOV r14,#8
michael@0 76 MOV r6, #255
michael@0 77 lfv_arm_lp
michael@0 78 LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0]
michael@0 79 LDRB r12,[r0, r1] ; r12= _pix[3]
michael@0 80 LDRB r4, [r0, -r1] ; r4 = _pix[1]
michael@0 81 LDRB r5, [r0] ; r5 = _pix[2]
michael@0 82 SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
michael@0 83 ADD r3, r3, #4
michael@0 84 SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
michael@0 85 ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
michael@0 86 ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
michael@0 87 MOV r12,r12,ASR #3
michael@0 88 LDRSB r12,[r2, r12]
michael@0 89 ; Stall (2 on Xscale)
michael@0 90 ADDS r4, r4, r12
michael@0 91 CMPGT r6, r4
michael@0 92 EORLT r4, r6, r4, ASR #32
michael@0 93 SUBS r5, r5, r12
michael@0 94 CMPGT r6, r5
michael@0 95 EORLT r5, r6, r5, ASR #32
michael@0 96 STRB r4, [r0, -r1]
michael@0 97 STRB r5, [r0], #1
michael@0 98 SUBS r14,r14,#1
michael@0 99 BGT lfv_arm_lp
michael@0 100 SUB r0, r0, #8
michael@0 101 LDMFD r13!,{r3-r6,PC}
michael@0 102 ENDP
michael@0 103
michael@0 104 oc_loop_filter_frag_rows_arm PROC
michael@0 105 ; r0 = _ref_frame_data
michael@0 106 ; r1 = _ystride
michael@0 107 ; r2 = _bv
michael@0 108 ; r3 = _frags
michael@0 109 ; r4 = _fragi0
michael@0 110 ; r5 = _fragi0_end
michael@0 111 ; r6 = _fragi_top
michael@0 112 ; r7 = _fragi_bot
michael@0 113 ; r8 = _frag_buf_offs
michael@0 114 ; r9 = _nhfrags
michael@0 115 MOV r12,r13
michael@0 116 STMFD r13!,{r0,r4-r11,r14}
michael@0 117 LDMFD r12,{r4-r9}
michael@0 118 ADD r2, r2, #127 ; _bv += 127
michael@0 119 CMP r4, r5 ; if(_fragi0>=_fragi0_end)
michael@0 120 BGE oslffri_arm_end ; bail
michael@0 121 SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
michael@0 122 BLE oslffri_arm_end ; bail
michael@0 123 ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
michael@0 124 ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
michael@0 125 SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
michael@0 126 oslffri_arm_lp1
michael@0 127 MOV r10,r4 ; r10= fragi = _fragi0
michael@0 128 ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
michael@0 129 oslffri_arm_lp2
michael@0 130 LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
michael@0 131 LDR r0, [r13] ; r0 = _ref_frame_data
michael@0 132 LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
michael@0 133 TST r14,#OC_FRAG_CODED_FLAG
michael@0 134 BEQ oslffri_arm_uncoded
michael@0 135 CMP r10,r4 ; if (fragi>_fragi0)
michael@0 136 ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
michael@0 137 BLGT loop_filter_h_arm
michael@0 138 CMP r4, r6 ; if (_fragi0>_fragi_top)
michael@0 139 BLGT loop_filter_v_arm
michael@0 140 CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
michael@0 141 LDRLT r12,[r3] ; r12 = _frags[fragi+1]
michael@0 142 ADD r0, r0, #8
michael@0 143 ADD r10,r10,#1 ; r10 = fragi+1;
michael@0 144 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 145 CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
michael@0 146 BLLT loop_filter_h_arm
michael@0 147 CMP r10,r7 ; if (fragi<_fragi_bot)
michael@0 148 LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
michael@0 149 SUB r0, r0, #8
michael@0 150 ADD r0, r0, r1, LSL #3
michael@0 151 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 152 CMPLT r12,#OC_FRAG_CODED_FLAG
michael@0 153 BLLT loop_filter_v_arm
michael@0 154 CMP r10,r11 ; while(fragi<=fragi_end-1)
michael@0 155 BLE oslffri_arm_lp2
michael@0 156 MOV r4, r10 ; r4 = fragi0 += _nhfrags
michael@0 157 CMP r4, r5
michael@0 158 BLT oslffri_arm_lp1
michael@0 159 oslffri_arm_end
michael@0 160 LDMFD r13!,{r0,r4-r11,PC}
michael@0 161 oslffri_arm_uncoded
michael@0 162 ADD r10,r10,#1
michael@0 163 CMP r10,r11
michael@0 164 BLE oslffri_arm_lp2
michael@0 165 MOV r4, r10 ; r4 = _fragi0 += _nhfrags
michael@0 166 CMP r4, r5
michael@0 167 BLT oslffri_arm_lp1
michael@0 168 LDMFD r13!,{r0,r4-r11,PC}
michael@0 169 ENDP
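;
; A hedged pseudo-C summary of the traversal above (the v6 and NEON variants
; below use the same logic). _frags[] is shown as an array of flag words, the
; way the assembly reads it, and the helper names are assumptions:
;   for(fragi0=_fragi0;fragi0<_fragi0_end;fragi0+=_nhfrags){
;     for(fragi=fragi0;fragi<fragi0+_nhfrags;fragi++){
;       if(_frags[fragi]&OC_FRAG_CODED_FLAG){
;         unsigned char *pix=_ref_frame_data+_frag_buf_offs[fragi];
;         /*Left edge, unless this is the first fragment in the row.*/
;         if(fragi>fragi0)loop_filter_h(pix,_ystride,_bv);
;         /*Top edge, unless this is the top row of the region.*/
;         if(fragi0>_fragi_top)loop_filter_v(pix,_ystride,_bv);
;         /*Right edge now, if the right neighbor exists but is not coded
;            (a coded neighbor filters that edge as its own left edge).*/
;         if(fragi+1<fragi0+_nhfrags&&!(_frags[fragi+1]&OC_FRAG_CODED_FLAG))
;           loop_filter_h(pix+8,_ystride,_bv);
;         /*Bottom edge now, if the fragment below exists but is not coded.*/
;         if(fragi+_nhfrags<_fragi_bot&&
;          !(_frags[fragi+_nhfrags]&OC_FRAG_CODED_FLAG))
;           loop_filter_v(pix+(_ystride<<3),_ystride,_bv);
;       }
;     }
;   }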
michael@0 170
michael@0 171 [ OC_ARM_ASM_MEDIA
michael@0 172 EXPORT oc_loop_filter_init_v6
michael@0 173 EXPORT oc_loop_filter_frag_rows_v6
michael@0 174
michael@0 175 oc_loop_filter_init_v6 PROC
michael@0 176 ; r0 = _bv
michael@0 177 ; r1 = _flimit (=L from the spec)
michael@0 178 MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
michael@0 179 AND r1, r1, #255 ; r1 = ll=r1&0xFF
michael@0 180 ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
michael@0 181 PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
michael@0 182 STR r1, [r0]
michael@0 183 MOV PC,r14
michael@0 184 ENDP
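;
; A hedged C sketch of the value stored above (ogg_uint32_t from os_types.h):
;   ogg_uint32_t ll;
;   ll=~((ogg_uint32_t)_flimit<<1)&0xFF;   /* ll = 255-2*L                */
;   *(ogg_uint32_t *)_bv=ll*0x01010101U;   /* replicated into all 4 bytes */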
michael@0 185
michael@0 186 ; We could use the same strategy as the v filter below, but that would require
michael@0 187 ; 40 instructions to load the data and transpose it into columns and another
michael@0 188 ; 32 to write out the results at the end, plus the 52 instructions to do the
michael@0 189 ; filtering itself.
michael@0 190 ; This is slightly fewer instructions, and less code, even assuming we could
michael@0 191 ; have shared the 52 instructions in the middle with the other function.
michael@0 192 ; It executes slightly fewer instructions than the ARMv6 approach David Conrad
michael@0 193 ; proposed for FFmpeg, but not by much:
michael@0 194 ; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
michael@0 195 ; His is a lot less code, though, because it only does two rows at once instead
michael@0 196 ; of four.
michael@0 197 loop_filter_h_v6 PROC
michael@0 198 ; r0 = unsigned char *_pix
michael@0 199 ; r1 = int _ystride
michael@0 200 ; r2 = int _ll
michael@0 201 ; preserves r0-r3
michael@0 202 STMFD r13!,{r4-r11,r14}
michael@0 203 LDR r12,=0x10003
michael@0 204 BL loop_filter_h_core_v6
michael@0 205 ADD r0, r0, r1, LSL #2
michael@0 206 BL loop_filter_h_core_v6
michael@0 207 SUB r0, r0, r1, LSL #2
michael@0 208 LDMFD r13!,{r4-r11,PC}
michael@0 209 ENDP
michael@0 210
michael@0 211 loop_filter_h_core_v6 PROC
michael@0 212 ; r0 = unsigned char *_pix
michael@0 213 ; r1 = int _ystride
michael@0 214 ; r2 = int _ll
michael@0 215 ; r12= 0x10003
michael@0 216 ; Preserves r0-r3, r12; Clobbers r4-r11.
michael@0 217 LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
michael@0 218 ; Single issue
michael@0 219 LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
michael@0 220 UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
michael@0 221 UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
michael@0 222 UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
michael@0 223 UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
michael@0 224 PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
michael@0 225 PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
michael@0 226 SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
michael@0 227 SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
michael@0 228 SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
michael@0 229 SMLAD r7, r7, r12,r12 ; r7 = <????|(q3-q0)+3*(q1-q2)+3>
michael@0 230 LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
michael@0 231 MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
michael@0 232 LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
michael@0 233 PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
michael@0 234 UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
michael@0 235 UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
michael@0 236 UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
michael@0 237 UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
michael@0 238 PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
michael@0 239 SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
michael@0 240 UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
michael@0 241 SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r1-r2)+3>
michael@0 242 SSUB16 r7, r5, r7 ; r7 = <s3-s0|s1-s2>
michael@0 243 SMLAD r7, r7, r12,r12 ; r7 = <????|(s3-s0)+3*(s1-s2)+3>
michael@0 244 ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
michael@0 245 MOV r6, r6, ASR #3 ; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3>
michael@0 246 PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
michael@0 247 PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
michael@0 248 ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
michael@0 249 UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
michael@0 250 MOV r10,#0
michael@0 251 ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
michael@0 252 ; Single issue
michael@0 253 ; There's no min, max or abs instruction.
michael@0 254 ; SSUB8 and SEL will work for abs, and we can do all the rest with
michael@0 255 ; unsigned saturated adds, which means the GE flags are still all
michael@0 256 ; set when we're done computing lflim(abs(R_i),L).
michael@0 257 ; This allows us to both add and subtract, and split the results by
michael@0 258 ; the original sign of R_i.
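; A hedged per-byte C sketch of that trick (sat255() is an assumed helper
; clamping to [0,255]; ll=255-2*L is the byte stored by
; oc_loop_filter_init_v6, and R_i is the filter value whose negation is
; currently packed in r6):
;   a=abs(R_i);
;   t=sat255(a+ll);                        /* = 255-max(2*L-a,0)          */
;   f=sat255(a+t)-t;                       /* = min(a,max(2*L-a,0))       */
;   p1=R_i>=0?sat255(p1+f):sat255(p1-f);   /* sign split via SEL on GE    */
;   p2=R_i>=0?sat255(p2-f):sat255(p2+f);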
michael@0 259 SSUB8 r7, r10,r6
michael@0 260 ; Single issue
michael@0 261 SEL r7, r7, r6 ; r7 = abs(R_i)
michael@0 262 ; Single issue
michael@0 263 UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
michael@0 264 ; Single issue
michael@0 265 UQADD8 r7, r7, r4
michael@0 266 ; Single issue
michael@0 267 UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
michael@0 268 ; Single issue
michael@0 269 UQSUB8 r4, r8, r7
michael@0 270 UQADD8 r5, r9, r7
michael@0 271 UQADD8 r8, r8, r7
michael@0 272 UQSUB8 r9, r9, r7
michael@0 273 SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
michael@0 274 SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
michael@0 275 MOV r5, r9, LSR #24 ; r5 = s2
michael@0 276 STRB r5, [r0,#2]!
michael@0 277 MOV r4, r8, LSR #24 ; r4 = s1
michael@0 278 STRB r4, [r0,#-1]
michael@0 279 MOV r5, r9, LSR #8 ; r5 = r2
michael@0 280 STRB r5, [r0,-r1]!
michael@0 281 MOV r4, r8, LSR #8 ; r4 = r1
michael@0 282 STRB r4, [r0,#-1]
michael@0 283 MOV r5, r9, LSR #16 ; r5 = q2
michael@0 284 STRB r5, [r0,-r1]!
michael@0 285 MOV r4, r8, LSR #16 ; r4 = q1
michael@0 286 STRB r4, [r0,#-1]
michael@0 287 ; Single issue
michael@0 288 STRB r9, [r0,-r1]!
michael@0 289 ; Single issue
michael@0 290 STRB r8, [r0,#-1]
michael@0 291 MOV PC,r14
michael@0 292 ENDP
michael@0 293
michael@0 294 ; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
michael@0 295 ; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
michael@0 296 ; This works just as well, with the following procedure for computing the
michael@0 297 ; filter value, f:
michael@0 298 ; u = ~UHADD8(p1,~p2);
michael@0 299 ; v = UHADD8(~p1,p2);
michael@0 300 ; m = v-u;
michael@0 301 ; a = m^UHADD8(m^p0,m^~p3);
michael@0 302 ; f = UHADD8(UHADD8(a,u),v);
michael@0 303 ; where f = 127+R, with R in [-127,128] defined as in the spec.
michael@0 304 ; This is exactly the same amount of arithmetic as the version that uses PAVGB
michael@0 305 ; as the basic operator.
michael@0 306 ; It executes about 2/3 the number of instructions of David Conrad's approach,
michael@0 307 ; but requires more code, because it does all eight columns at once, instead
michael@0 308 ; of four at a time.
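;
; A hedged per-byte C sketch of how f=127+R is split by sign and applied in
; the routine below (sat255() is an assumed helper clamping to [0,255],
; lflim() is the limiter from the spec, and at most one of Rpos/Rneg is
; nonzero):
;   Rpos=sat255(f-127);                    /* = max( R,0) */
;   Rneg=sat255(127-f);                    /* = max(-R,0) */
;   p1=sat255(p1+lflim(Rpos,L)-lflim(Rneg,L));
;   p2=sat255(p2-lflim(Rpos,L)+lflim(Rneg,L));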
michael@0 309 loop_filter_v_v6 PROC
michael@0 310 ; r0 = unsigned char *_pix
michael@0 311 ; r1 = int _ystride
michael@0 312 ; r2 = int _ll
michael@0 313 ; preserves r0-r11
michael@0 314 STMFD r13!,{r4-r11,r14}
michael@0 315 LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
michael@0 316 LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
michael@0 317 LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
michael@0 318 MVN r14,r6 ; r14= ~p1
michael@0 319 LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
michael@0 320 ; Filter the first four columns.
michael@0 321 MVN r12,r8 ; r12= ~p2
michael@0 322 UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
michael@0 323 UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
michael@0 324 MVN r10, r10 ; r10=~p3
michael@0 325 MVN r12,r12 ; r12= u1=~p1+p2+1>>1
michael@0 326 SSUB8 r14,r14,r12 ; r14= m1=v1-u1
michael@0 327 ; Single issue
michael@0 328 EOR r4, r4, r14 ; r4 = m1^p0
michael@0 329 EOR r10,r10,r14 ; r10= m1^~p3
michael@0 330 UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
michael@0 331 ; Single issue
michael@0 332 EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
michael@0 333 SADD8 r14,r14,r12 ; r14= v1=m1+u1
michael@0 334 UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
michael@0 335 MVN r12,r9 ; r12= ~p6
michael@0 336 UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
michael@0 337 ; Filter the second four columns.
michael@0 338 MVN r14,r7 ; r14= ~p5
michael@0 339 UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
michael@0 340 UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
michael@0 341 MVN r12,r12 ; r12= u2=~p5+p6+1>>1
michael@0 342 MVN r11,r11 ; r11=~p7
michael@0 343 SSUB8 r10,r14,r12 ; r10= m2=v2-u2
michael@0 344 ; Single issue
michael@0 345 EOR r5, r5, r10 ; r5 = m2^p4
michael@0 346 EOR r11,r11,r10 ; r11= m2^~p7
michael@0 347 UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
michael@0 348 ; Single issue
michael@0 349 EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
michael@0 350 ; Single issue
michael@0 351 UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
michael@0 352 LDR r12,=0x7F7F7F7F ; r12 = {127}x4
michael@0 353 UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
michael@0 354 ; Now split f[i] by sign.
michael@0 355 ; There's no min or max instruction.
michael@0 356 ; We could use SSUB8 and SEL, but this is just as many instructions and
michael@0 357 ; dual issues more (for v7 without NEON).
michael@0 358 UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
michael@0 359 UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
michael@0 360 UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i>0),0)
michael@0 361 UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i<0),0)
michael@0 362 UQADD8 r10,r10,r11
michael@0 363 UQADD8 r4, r4, r14
michael@0 364 UQSUB8 r10,r10,r11 ; r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
michael@0 365 UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
michael@0 366 UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
michael@0 367 UQADD8 r6, r6, r10
michael@0 368 UQSUB8 r8, r8, r10
michael@0 369 UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
michael@0 370 UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
michael@0 371 UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
michael@0 372 UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i>0),0)
michael@0 373 UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i<0),0)
michael@0 374 UQADD8 r11,r11,r10
michael@0 375 UQADD8 r5, r5, r14
michael@0 376 UQSUB8 r11,r11,r10 ; r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
michael@0 377 UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
michael@0 378 UQADD8 r7, r7, r11
michael@0 379 UQSUB8 r9, r9, r11
michael@0 380 UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
michael@0 381 STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
michael@0 382 UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
michael@0 383 STRD r8, [r0] ; [p6:p2] = [r9: r8]
michael@0 384 LDMFD r13!,{r4-r11,PC}
michael@0 385 ENDP
michael@0 386
michael@0 387 oc_loop_filter_frag_rows_v6 PROC
michael@0 388 ; r0 = _ref_frame_data
michael@0 389 ; r1 = _ystride
michael@0 390 ; r2 = _bv
michael@0 391 ; r3 = _frags
michael@0 392 ; r4 = _fragi0
michael@0 393 ; r5 = _fragi0_end
michael@0 394 ; r6 = _fragi_top
michael@0 395 ; r7 = _fragi_bot
michael@0 396 ; r8 = _frag_buf_offs
michael@0 397 ; r9 = _nhfrags
michael@0 398 MOV r12,r13
michael@0 399 STMFD r13!,{r0,r4-r11,r14}
michael@0 400 LDMFD r12,{r4-r9}
michael@0 401 LDR r2, [r2] ; ll = *(int *)_bv
michael@0 402 CMP r4, r5 ; if(_fragi0>=_fragi0_end)
michael@0 403 BGE oslffri_v6_end ; bail
michael@0 404 SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
michael@0 405 BLE oslffri_v6_end ; bail
michael@0 406 ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
michael@0 407 ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
michael@0 408 SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
michael@0 409 oslffri_v6_lp1
michael@0 410 MOV r10,r4 ; r10= fragi = _fragi0
michael@0 411 ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
michael@0 412 oslffri_v6_lp2
michael@0 413 LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
michael@0 414 LDR r0, [r13] ; r0 = _ref_frame_data
michael@0 415 LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
michael@0 416 TST r14,#OC_FRAG_CODED_FLAG
michael@0 417 BEQ oslffri_v6_uncoded
michael@0 418 CMP r10,r4 ; if (fragi>_fragi0)
michael@0 419 ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
michael@0 420 BLGT loop_filter_h_v6
michael@0 421 CMP r4, r6 ; if (fragi0>_fragi_top)
michael@0 422 BLGT loop_filter_v_v6
michael@0 423 CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
michael@0 424 LDRLT r12,[r3] ; r12 = _frags[fragi+1]
michael@0 425 ADD r0, r0, #8
michael@0 426 ADD r10,r10,#1 ; r10 = fragi+1;
michael@0 427 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 428 CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
michael@0 429 BLLT loop_filter_h_v6
michael@0 430 CMP r10,r7 ; if (fragi<_fragi_bot)
michael@0 431 LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
michael@0 432 SUB r0, r0, #8
michael@0 433 ADD r0, r0, r1, LSL #3
michael@0 434 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 435 CMPLT r12,#OC_FRAG_CODED_FLAG
michael@0 436 BLLT loop_filter_v_v6
michael@0 437 CMP r10,r11 ; while(fragi<=fragi_end-1)
michael@0 438 BLE oslffri_v6_lp2
michael@0 439 MOV r4, r10 ; r4 = fragi0 += nhfrags
michael@0 440 CMP r4, r5
michael@0 441 BLT oslffri_v6_lp1
michael@0 442 oslffri_v6_end
michael@0 443 LDMFD r13!,{r0,r4-r11,PC}
michael@0 444 oslffri_v6_uncoded
michael@0 445 ADD r10,r10,#1
michael@0 446 CMP r10,r11
michael@0 447 BLE oslffri_v6_lp2
michael@0 448 MOV r4, r10 ; r4 = fragi0 += nhfrags
michael@0 449 CMP r4, r5
michael@0 450 BLT oslffri_v6_lp1
michael@0 451 LDMFD r13!,{r0,r4-r11,PC}
michael@0 452 ENDP
michael@0 453 ]
michael@0 454
michael@0 455 [ OC_ARM_ASM_NEON
michael@0 456 EXPORT oc_loop_filter_init_neon
michael@0 457 EXPORT oc_loop_filter_frag_rows_neon
michael@0 458
michael@0 459 oc_loop_filter_init_neon PROC
michael@0 460 ; r0 = _bv
michael@0 461 ; r1 = _flimit (=L from the spec)
michael@0 462 MOV r1, r1, LSL #1 ; r1 = 2*L
michael@0 463 VDUP.S16 Q15, r1 ; Q15= 2L in U16s
michael@0 464 VST1.64 {D30,D31}, [r0@128]
michael@0 465 MOV PC,r14
michael@0 466 ENDP
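;
; A hedged C view of the value stored above: eight 16-bit lanes of 2*L,
; unlike the byte-replicated 255-2*L used by the v6 code:
;   int i;
;   for(i=0;i<8;i++)((ogg_uint16_t *)_bv)[i]=(ogg_uint16_t)(_flimit<<1);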
michael@0 467
michael@0 468 loop_filter_h_neon PROC
michael@0 469 ; r0 = unsigned char *_pix
michael@0 470 ; r1 = int _ystride
michael@0 471 ; r2 = int *_bv
michael@0 472 ; preserves r0-r3
michael@0 473 ; We assume Q15= 2*L in U16s
michael@0 474 ; My best guesses at cycle counts (and latency)--vvv
michael@0 475 SUB r12,r0, #2
michael@0 476 ; Doing a 2-element structure load saves doing two VTRN's below, at the
michael@0 477 ; cost of using two more slower single-lane loads vs. the faster
michael@0 478 ; all-lane loads.
michael@0 479 ; It's less code this way, though, and benches a hair faster, but it
michael@0 480 ; leaves D2 and D4 swapped.
michael@0 481 VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
michael@0 482 ; D2 = ____________3322
michael@0 483 VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
michael@0 484 ; D6 = ____________7766
michael@0 485 VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
michael@0 486 ; D2 = ________BBAA3322
michael@0 487 VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
michael@0 488 ; D6 = ________FFEE7766
michael@0 489 VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____HHGG99881100 3,1
michael@0 490 ; D2 = ____JJIIBBAA3322
michael@0 491 VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____LLKKDDCC5544 3,1
michael@0 492 ; D6 = ____NNMMFFEE7766
michael@0 493 VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOHHGG99881100 3,1
michael@0 494 ; D2 = RRQQJJIIBBAA3322
michael@0 495 VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSLLKKDDCC5544 3,1
michael@0 496 ; D6 = VVUUNNMMFFEE7766
michael@0 497 VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
michael@0 498 VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
michael@0 499 VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
michael@0 500 VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
michael@0 501 ADD r12,r0, #8
michael@0 502 VADD.S16 Q0, Q0, Q8 ; 1,3
michael@0 503 PLD [r12]
michael@0 504 VADD.S16 Q0, Q0, Q8 ; 1,3
michael@0 505 PLD [r12,r1]
michael@0 506 VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
michael@0 507 PLD [r12,r1, LSL #1]
michael@0 508 VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
michael@0 509 ADD r12,r12,r1, LSL #2
michael@0 510 ; We want to do
michael@0 511 ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
michael@0 512 ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
michael@0 513 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
michael@0 514 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
michael@0 515 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
michael@0 516 ; So we've reduced the left and right hand terms to be the same, except
michael@0 517 ; for a negation.
michael@0 518 ; Stall x3
michael@0 519 VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
michael@0 520 PLD [r12,-r1]
michael@0 521 VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
michael@0 522 PLD [r12]
michael@0 523 VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
michael@0 524 PLD [r12,r1]
michael@0 525 VMOVL.U8 Q1, D2 ; Q1 = __UU__QQ__MM__II__EE__AA__66__22 2,3
michael@0 526 PLD [r12,r1,LSL #1]
michael@0 527 VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
michael@0 528 ADD r12,r12,r1, LSL #2
michael@0 529 ; Now we need to correct for the sign of f.
michael@0 530 ; For negative elements of Q0, we want to subtract the appropriate
michael@0 531 ; element of Q9. For positive elements we want to add them. No NEON
michael@0 532 ; instruction exists to do this, so we negate the negative elements
michael@0 533 ; first (via -x = ~(x-1): add the sign mask, -1 or 0, then EOR with it).
michael@0 534 VADD.S16 Q9, Q9, Q0 ; 1,3
michael@0 535 PLD [r12,-r1]
michael@0 536 VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
michael@0 537 ; Bah. No VRSBW.U8
michael@0 538 ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
michael@0 539 VADDW.U8 Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
michael@0 540 VSUB.S16 Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
michael@0 541 VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
michael@0 542 VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
michael@0 543 SUB r12,r0, #1
michael@0 544 VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655 1,1
michael@0 545 VST1.16 {D4[0]}, [r12], r1
michael@0 546 VST1.16 {D2[0]}, [r12], r1
michael@0 547 VST1.16 {D4[1]}, [r12], r1
michael@0 548 VST1.16 {D2[1]}, [r12], r1
michael@0 549 VST1.16 {D4[2]}, [r12], r1
michael@0 550 VST1.16 {D2[2]}, [r12], r1
michael@0 551 VST1.16 {D4[3]}, [r12], r1
michael@0 552 VST1.16 {D2[3]}, [r12], r1
michael@0 553 MOV PC,r14
michael@0 554 ENDP
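;
; A hedged per-column C sketch of the arithmetic in loop_filter_h_neon above
; (loop_filter_v_neon below does the same math on rows loaded directly;
; MIN/MAX/ABS/CLAMP255 are illustrative helpers):
;   f =(p0-p3+3*(p2-p1)+4)>>3;             /* VRSHR.S16 #3 supplies the +4 */
;   m =f>>15;                              /* sign mask: 0 or -1           */
;   a =MIN(ABS(f),MAX(2*L-ABS(f),0));      /* VABS/VQSUB/VMIN              */
;   a =(a+m)^m;                            /* restore the sign of f        */
;   p1=CLAMP255(p1+a);                     /* VADDW + VQMOVUN              */
;   p2=CLAMP255(p2-a);                     /* VSUB  + VQMOVUN              */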
michael@0 555
michael@0 556 loop_filter_v_neon PROC
michael@0 557 ; r0 = unsigned char *_pix
michael@0 558 ; r1 = int _ystride
michael@0 559 ; r2 = int *_bv
michael@0 560 ; preserves r0-r3
michael@0 561 ; We assume Q15= 2*L in U16s
michael@0 562 ; My best guesses at cycle counts (and latency)--vvv
michael@0 563 SUB r12,r0, r1, LSL #1
michael@0 564 VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
michael@0 565 VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
michael@0 566 VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
michael@0 567 VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
michael@0 568 VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
michael@0 569 VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
michael@0 570 ADD r12, #8
michael@0 571 VADD.S16 Q0, Q0, Q8 ; 1,3
michael@0 572 PLD [r12]
michael@0 573 VADD.S16 Q0, Q0, Q8 ; 1,3
michael@0 574 PLD [r12,r1]
michael@0 575 VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
michael@0 576 SUB r12, r0, r1
michael@0 577 VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
michael@0 578 ; We want to do
michael@0 579 ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
michael@0 580 ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
michael@0 581 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
michael@0 582 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
michael@0 583 ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
michael@0 584 ; So we've reduced the left and right hand terms to be the same, except
michael@0 585 ; for a negation.
michael@0 586 ; Stall x3
michael@0 587 VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
michael@0 588 VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
michael@0 589 ; Stall x2
michael@0 590 VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
michael@0 591 VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
michael@0 592 ; Stall x2
michael@0 593 VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
michael@0 594 ; Now we need to correct for the sign of f.
michael@0 595 ; For negative elements of Q0, we want to subtract the appropriate
michael@0 596 ; element of Q9. For positive elements we want to add them. No NEON
michael@0 597 ; instruction exists to do this, so we negate the negative elements
michael@0 598 ; first (via -x = ~(x-1): add the sign mask, -1 or 0, then EOR with it).
michael@0 599 ; Stall x3
michael@0 600 VADD.S16 Q9, Q9, Q0 ; 1,3
michael@0 601 ; Stall x2
michael@0 602 VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
michael@0 603 ; Bah. No VRSBW.U8
michael@0 604 ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
michael@0 605 VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
michael@0 606 VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
michael@0 607 VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
michael@0 608 VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
michael@0 609 VST1.64 {D2}, [r12@64], r1
michael@0 610 VST1.64 {D4}, [r12@64], r1
michael@0 611 MOV PC,r14
michael@0 612 ENDP
michael@0 613
michael@0 614 oc_loop_filter_frag_rows_neon PROC
michael@0 615 ; r0 = _ref_frame_data
michael@0 616 ; r1 = _ystride
michael@0 617 ; r2 = _bv
michael@0 618 ; r3 = _frags
michael@0 619 ; r4 = _fragi0
michael@0 620 ; r5 = _fragi0_end
michael@0 621 ; r6 = _fragi_top
michael@0 622 ; r7 = _fragi_bot
michael@0 623 ; r8 = _frag_buf_offs
michael@0 624 ; r9 = _nhfrags
michael@0 625 MOV r12,r13
michael@0 626 STMFD r13!,{r0,r4-r11,r14}
michael@0 627 LDMFD r12,{r4-r9}
michael@0 628 CMP r4, r5 ; if(_fragi0>=_fragi0_end)
michael@0 629 BGE oslffri_neon_end; bail
michael@0 630 SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
michael@0 631 BLE oslffri_neon_end ; bail
michael@0 632 VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
michael@0 633 ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
michael@0 634 ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
michael@0 635 SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
michael@0 636 oslffri_neon_lp1
michael@0 637 MOV r10,r4 ; r10= fragi = _fragi0
michael@0 638 ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
michael@0 639 oslffri_neon_lp2
michael@0 640 LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
michael@0 641 LDR r0, [r13] ; r0 = _ref_frame_data
michael@0 642 LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
michael@0 643 TST r14,#OC_FRAG_CODED_FLAG
michael@0 644 BEQ oslffri_neon_uncoded
michael@0 645 CMP r10,r4 ; if (fragi>_fragi0)
michael@0 646 ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
michael@0 647 BLGT loop_filter_h_neon
michael@0 648 CMP r4, r6 ; if (_fragi0>_fragi_top)
michael@0 649 BLGT loop_filter_v_neon
michael@0 650 CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
michael@0 651 LDRLT r12,[r3] ; r12 = _frags[fragi+1]
michael@0 652 ADD r0, r0, #8
michael@0 653 ADD r10,r10,#1 ; r10 = fragi+1;
michael@0 654 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 655 CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
michael@0 656 BLLT loop_filter_h_neon
michael@0 657 CMP r10,r7 ; if (fragi<_fragi_bot)
michael@0 658 LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
michael@0 659 SUB r0, r0, #8
michael@0 660 ADD r0, r0, r1, LSL #3
michael@0 661 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
michael@0 662 CMPLT r12,#OC_FRAG_CODED_FLAG
michael@0 663 BLLT loop_filter_v_neon
michael@0 664 CMP r10,r11 ; while(fragi<=fragi_end-1)
michael@0 665 BLE oslffri_neon_lp2
michael@0 666 MOV r4, r10 ; r4 = _fragi0 += _nhfrags
michael@0 667 CMP r4, r5
michael@0 668 BLT oslffri_neon_lp1
michael@0 669 oslffri_neon_end
michael@0 670 LDMFD r13!,{r0,r4-r11,PC}
michael@0 671 oslffri_neon_uncoded
michael@0 672 ADD r10,r10,#1
michael@0 673 CMP r10,r11
michael@0 674 BLE oslffri_neon_lp2
michael@0 675 MOV r4, r10 ; r4 = _fragi0 += _nhfrags
michael@0 676 CMP r4, r5
michael@0 677 BLT oslffri_neon_lp1
michael@0 678 LDMFD r13!,{r0,r4-r11,PC}
michael@0 679 ENDP
michael@0 680 ]
michael@0 681
michael@0 682 END
