--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libtheora/lib/arm/armloop.s	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,682 @@
;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

	AREA	|.text|, CODE, READONLY

	; Explicitly specifying alignment here because some versions of
	; gas don't align code correctly. See
	; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
	; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
	ALIGN

	GET	armopts.s

	EXPORT	oc_loop_filter_frag_rows_arm

; Which bit this is depends on the order of packing within a bitfield.
; Hopefully that doesn't change among any of the relevant compilers.
OC_FRAG_CODED_FLAG	*	1

	; Vanilla ARM v4 version
loop_filter_h_arm PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int           *_bv
	; preserves r0-r3
	STMFD	r13!,{r3-r6,r14}
	MOV	r14,#8
	MOV	r6, #255
lfh_arm_lp
	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
	LDRB	r12,[r0, #1]		; r12= _pix[3]
	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
	LDRB	r5, [r0]		; r5 = _pix[2]
	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
	ADD	r3, r3, #4
	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
	MOV	r12,r12,ASR #3
	LDRSB	r12,[r2, r12]
	; Stall (2 on Xscale)
	ADDS	r4, r4, r12
	CMPGT	r6, r4
	EORLT	r4, r6, r4, ASR #32
	SUBS	r5, r5, r12
	CMPGT	r6, r5
	EORLT	r5, r6, r5, ASR #32
	STRB	r4, [r0, #-1]
	STRB	r5, [r0], r1
	SUBS	r14,r14,#1
	BGT	lfh_arm_lp
	SUB	r0, r0, r1, LSL #3
	LDMFD	r13!,{r3-r6,PC}
	ENDP
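
; The ADDS/CMPGT/EORLT pairs above are a branchless clamp of each filtered
; value to [0,255]. A C sketch of the equivalent logic (this mirrors the
; flag trick rather than transcribing it, and assumes the usual arithmetic
; right shift of negative ints):
;   if(v&~255)v=255^(v>>31);   /* gives 0 when v<0 and 255 when v>255 */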

loop_filter_v_arm PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int           *_bv
	; preserves r0-r3
	STMFD	r13!,{r3-r6,r14}
	MOV	r14,#8
	MOV	r6, #255
lfv_arm_lp
	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
	LDRB	r12,[r0, r1]		; r12= _pix[3]
	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
	LDRB	r5, [r0]		; r5 = _pix[2]
	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
	ADD	r3, r3, #4
	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
	MOV	r12,r12,ASR #3
	LDRSB	r12,[r2, r12]
	; Stall (2 on Xscale)
	ADDS	r4, r4, r12
	CMPGT	r6, r4
	EORLT	r4, r6, r4, ASR #32
	SUBS	r5, r5, r12
	CMPGT	r6, r5
	EORLT	r5, r6, r5, ASR #32
	STRB	r4, [r0, -r1]
	STRB	r5, [r0], #1
	SUBS	r14,r14,#1
	BGT	lfv_arm_lp
	SUB	r0, r0, #8
	LDMFD	r13!,{r3-r6,PC}
	ENDP

oc_loop_filter_frag_rows_arm PROC
	; r0 = _ref_frame_data
	; r1 = _ystride
	; r2 = _bv
	; r3 = _frags
	; r4 = _fragi0
	; r5 = _fragi0_end
	; r6 = _fragi_top
	; r7 = _fragi_bot
	; r8 = _frag_buf_offs
	; r9 = _nhfrags
	MOV	r12,r13
	STMFD	r13!,{r0,r4-r11,r14}
	LDMFD	r12,{r4-r9}
	ADD	r2, r2, #127		; _bv += 127
	CMP	r4, r5			; if(_fragi0>=_fragi0_end)
	BGE	oslffri_arm_end		;   bail
	SUBS	r9, r9, #1		; r9 = _nhfrags-1; if(r9<=0)
	BLE	oslffri_arm_end		;   bail
	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
	SUB	r7, r7, r9		; _fragi_bot -= _nhfrags;
oslffri_arm_lp1
	MOV	r10,r4			; r10= fragi = _fragi0
	ADD	r11,r4, r9		; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_arm_lp2
	LDR	r14,[r3], #4		; r14= _frags[fragi]; _frags++
	LDR	r0, [r13]		; r0 = _ref_frame_data
	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]; _frag_buf_offs++
	TST	r14,#OC_FRAG_CODED_FLAG
	BEQ	oslffri_arm_uncoded
	CMP	r10,r4			; if(fragi>_fragi0)
	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
	BLGT	loop_filter_h_arm
	CMP	r4, r6			; if(_fragi0>_fragi_top)
	BLGT	loop_filter_v_arm
	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
	LDRLT	r12,[r3]		; r12 = _frags[fragi+1]
	ADD	r0, r0, #8
	ADD	r10,r10,#1		; r10 = fragi+1;
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
	BLLT	loop_filter_h_arm
	CMP	r10,r7			; if(fragi<_fragi_bot)
	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
	SUB	r0, r0, #8
	ADD	r0, r0, r1, LSL #3
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG
	BLLT	loop_filter_v_arm
	CMP	r10,r11			; while(fragi<=fragi_end-1)
	BLE	oslffri_arm_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_arm_lp1
oslffri_arm_end
	LDMFD	r13!,{r0,r4-r11,PC}
oslffri_arm_uncoded
	ADD	r10,r10,#1
	CMP	r10,r11
	BLE	oslffri_arm_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_arm_lp1
	LDMFD	r13!,{r0,r4-r11,PC}
	ENDP

 [ OC_ARM_ASM_MEDIA
	EXPORT	oc_loop_filter_init_v6
	EXPORT	oc_loop_filter_frag_rows_v6

oc_loop_filter_init_v6 PROC
	; r0 = _bv
	; r1 = _flimit (=L from the spec)
	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
	AND	r1, r1, #255		; r1 = ll=r1&0xFF
	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
	STR	r1, [r0]
	MOV	PC,r14
	ENDP
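
; What the four instructions above compute, as a C sketch (the packed byte
; value the asm stores, not actual libtheora source):
;   int ll;
;   ll=(255-2*_flimit)&0xFF;     /* MVN+AND: 255-2*L             */
;   *(int *)_bv=ll*0x01010101;   /* ORR+PKHBT replicate it 4x    */
; e.g. _flimit=2 gives ll=0xFB, so *_bv=0xFBFBFBFB.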

; We could use the same strategy as the v filter below, but that would require
; 40 instructions to load the data and transpose it into columns and another
; 32 to write out the results at the end, plus the 52 instructions to do the
; filtering itself.
; This is slightly less, and less code, even assuming we could have shared the
; 52 instructions in the middle with the other function.
; It executes slightly fewer instructions than the ARMv6 approach David Conrad
; proposed for FFmpeg, but not by much:
; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
; His is a lot less code, though, because it only does two rows at once instead
; of four.
loop_filter_h_v6 PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int            _ll
	; preserves r0-r3
	STMFD	r13!,{r4-r11,r14}
	LDR	r12,=0x10003
	BL	loop_filter_h_core_v6
	ADD	r0, r0, r1, LSL #2
	BL	loop_filter_h_core_v6
	SUB	r0, r0, r1, LSL #2
	LDMFD	r13!,{r4-r11,PC}
	ENDP

loop_filter_h_core_v6 PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int            _ll
	; r12= 0x10003
	; Preserves r0-r3, r12; Clobbers r4-r11.
	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
	; Single issue
	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r1-r2)+3>
	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
	MOV	r6, r6, ASR #3		; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3>
	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
	MOV	r10,#0
	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
	; Single issue
	; There's no min, max or abs instruction.
	; SSUB8 and SEL will work for abs, and we can do all the rest with
	; unsigned saturated adds, which means the GE flags are still all
	; set when we're done computing lflim(abs(R_i),L).
	; This allows us to both add and subtract, and split the results by
	; the original sign of R_i.
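	; As a C sketch, the saturating sequence below computes, per byte
	; (ll=255-2*L as packed by oc_loop_filter_init_v6; uqadd/uqsub
	; saturate to [0,255]):
	;   r=abs(R_i);                /* SSUB8+SEL                      */
	;   t=uqadd(r,ll);             /* 255-max(2*L-r,0)               */
	;   r=uqsub(uqadd(r,t),t);     /* min(r,max(2*L-r,0))=lflim(r,L) */
	; e.g. L=10 (ll=235): r=7 -> t=242, lflim=7; r=25 -> t=255, lflim=0.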
	SSUB8	r7, r10,r6
	; Single issue
	SEL	r7, r7, r6		; r7 = abs(R_i)
	; Single issue
	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
	; Single issue
	UQADD8	r7, r7, r4
	; Single issue
	UQSUB8	r7, r7, r4	; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
	; Single issue
	UQSUB8	r4, r8, r7
	UQADD8	r5, r9, r7
	UQADD8	r8, r8, r7
	UQSUB8	r9, r9, r7
	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
	MOV	r5, r9, LSR #24		; r5 = s2
	STRB	r5, [r0,#2]!
	MOV	r4, r8, LSR #24		; r4 = s1
	STRB	r4, [r0,#-1]
	MOV	r5, r9, LSR #8		; r5 = r2
	STRB	r5, [r0,-r1]!
	MOV	r4, r8, LSR #8		; r4 = r1
	STRB	r4, [r0,#-1]
	MOV	r5, r9, LSR #16		; r5 = q2
	STRB	r5, [r0,-r1]!
	MOV	r4, r8, LSR #16		; r4 = q1
	STRB	r4, [r0,#-1]
	; Single issue
	STRB	r9, [r0,-r1]!
	; Single issue
	STRB	r8, [r0,#-1]
	MOV	PC,r14
	ENDP

; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
; This works just as well, with the following procedure for computing the
; filter value, f:
;   u = ~UHADD8(p1,~p2);
;   v = UHADD8(~p1,p2);
;   m = v-u;
;   a = m^UHADD8(m^p0,m^~p3);
;   f = UHADD8(UHADD8(a,u),v);
; where f = 127+R, with R in [-127,128] defined as in the spec.
; This is exactly the same amount of arithmetic as the version that uses PAVGB
; as the basic operator.
; It executes about 2/3 the number of instructions of David Conrad's approach,
; but requires more code, because it does all eight columns at once, instead
; of four at a time.
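
; A scalar sketch of that procedure for one column, in C (an illustration,
; not libtheora source; uhadd() mirrors UHADD8 on a single byte, and the
; 0xFF masks stand in for the implicit byte width):
;   static int uhadd(int a,int b){return (a+b)>>1;}
;   static int filter_value(int p0,int p1,int p2,int p3){
;     int u,v,m,a;
;     u=~uhadd(p1,~p2&0xFF)&0xFF;
;     v=uhadd(~p1&0xFF,p2);
;     m=v-u;                               /* 0 or -1 */
;     a=m^uhadd((m^p0)&0xFF,(m^~p3)&0xFF);
;     return uhadd(uhadd(a&0xFF,u),v);     /* 127+R   */
;   }
; e.g. p0=0,p1=0,p2=8,p3=0 gives R=(0-0+3*8+4)>>3=3 and f=130=127+3.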
loop_filter_v_v6 PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int            _ll
	; preserves r0-r11
	STMFD	r13!,{r4-r11,r14}
	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
	MVN	r14,r6			; r14= ~p1
	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
	; Filter the first four columns.
	MVN	r12,r8			; r12= ~p2
	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
	MVN	r10,r10			; r10= ~p3
	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
	SSUB8	r14,r14,r12		; r14= m1=v1-u1
	; Single issue
	EOR	r4, r4, r14		; r4 = m1^p0
	EOR	r10,r10,r14		; r10= m1^~p3
	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
	; Single issue
	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
	SADD8	r14,r14,r12		; r14= v1=m1+u1
	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
	MVN	r12,r9			; r12= ~p6
	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
	; Filter the second four columns.
	MVN	r14,r7			; r14= ~p5
	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
	MVN	r11,r11			; r11= ~p7
	SSUB8	r10,r14,r12		; r10= m2=v2-u2
	; Single issue
	EOR	r5, r5, r10		; r5 = m2^p4
	EOR	r11,r11,r10		; r11= m2^~p7
	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
	; Single issue
	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
	; Single issue
	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
	; Now split f[i] by sign.
	; There's no min or max instruction.
	; We could use SSUB8 and SEL, but this is just as many instructions and
	; dual issues more (for v7 without NEON).
	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i>0),0)
	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i<0),0)
	UQADD8	r10,r10,r11
	UQADD8	r4, r4, r14
	UQSUB8	r10,r10,r11	; r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
	UQSUB8	r4, r4, r14	; r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
	UQADD8	r6, r6, r10
	UQSUB8	r8, r8, r10
	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i>0),0)
	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i<0),0)
	UQADD8	r11,r11,r10
	UQADD8	r5, r5, r14
	UQSUB8	r11,r11,r10	; r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
	UQSUB8	r5, r5, r14	; r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
	UQADD8	r7, r7, r11
	UQSUB8	r9, r9, r11
	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
	LDMFD	r13!,{r4-r11,PC}
	ENDP
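
; For reference, the edge rules the oc_loop_filter_frag_rows_* drivers
; implement, as C-style pseudocode (a sketch of the control flow, not the
; exact libtheora driver): for each coded fragment fragi at pixel offset pix,
;   if(fragi>_fragi0)                             filter_h(pix);
;   if(_fragi0>_fragi_top)                        filter_v(pix);
;   if(fragi+1<fragi_end&&!coded(fragi+1))        filter_h(pix+8);
;   if(fragi<_fragi_bot&&!coded(fragi+_nhfrags))  filter_v(pix+8*_ystride);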

oc_loop_filter_frag_rows_v6 PROC
	; r0 = _ref_frame_data
	; r1 = _ystride
	; r2 = _bv
	; r3 = _frags
	; r4 = _fragi0
	; r5 = _fragi0_end
	; r6 = _fragi_top
	; r7 = _fragi_bot
	; r8 = _frag_buf_offs
	; r9 = _nhfrags
	MOV	r12,r13
	STMFD	r13!,{r0,r4-r11,r14}
	LDMFD	r12,{r4-r9}
	LDR	r2, [r2]		; ll = *(int *)_bv
	CMP	r4, r5			; if(_fragi0>=_fragi0_end)
	BGE	oslffri_v6_end		;   bail
	SUBS	r9, r9, #1		; r9 = _nhfrags-1; if(r9<=0)
	BLE	oslffri_v6_end		;   bail
	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
	SUB	r7, r7, r9		; _fragi_bot -= _nhfrags;
oslffri_v6_lp1
	MOV	r10,r4			; r10= fragi = _fragi0
	ADD	r11,r4, r9		; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_v6_lp2
	LDR	r14,[r3], #4		; r14= _frags[fragi]; _frags++
	LDR	r0, [r13]		; r0 = _ref_frame_data
	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]; _frag_buf_offs++
	TST	r14,#OC_FRAG_CODED_FLAG
	BEQ	oslffri_v6_uncoded
	CMP	r10,r4			; if(fragi>_fragi0)
	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
	BLGT	loop_filter_h_v6
	CMP	r4, r6			; if(_fragi0>_fragi_top)
	BLGT	loop_filter_v_v6
	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
	LDRLT	r12,[r3]		; r12 = _frags[fragi+1]
	ADD	r0, r0, #8
	ADD	r10,r10,#1		; r10 = fragi+1;
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
	BLLT	loop_filter_h_v6
	CMP	r10,r7			; if(fragi<_fragi_bot)
	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
	SUB	r0, r0, #8
	ADD	r0, r0, r1, LSL #3
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG
	BLLT	loop_filter_v_v6
	CMP	r10,r11			; while(fragi<=fragi_end-1)
	BLE	oslffri_v6_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_v6_lp1
oslffri_v6_end
	LDMFD	r13!,{r0,r4-r11,PC}
oslffri_v6_uncoded
	ADD	r10,r10,#1
	CMP	r10,r11
	BLE	oslffri_v6_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_v6_lp1
	LDMFD	r13!,{r0,r4-r11,PC}
	ENDP
 ]

 [ OC_ARM_ASM_NEON
	EXPORT	oc_loop_filter_init_neon
	EXPORT	oc_loop_filter_frag_rows_neon

oc_loop_filter_init_neon PROC
	; r0 = _bv
	; r1 = _flimit (=L from the spec)
	MOV	r1, r1, LSL #1		; r1 = 2*L
	VDUP.S16	Q15, r1		; Q15= 2L in U16s
	VST1.64	{D30,D31}, [r0@128]
	MOV	PC,r14
	ENDP
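
; Equivalent C with NEON intrinsics (a sketch; note the @128 hint above
; means _bv must be 16-byte aligned here, unlike the other versions):
;   vst1q_u16((uint16_t *)_bv,vdupq_n_u16(2*_flimit));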

loop_filter_h_neon PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int           *_bv
	; preserves r0-r3
	; We assume Q15= 2*L in U16s
	;                  My best guesses at cycle counts (and latency)--vvv
	SUB	r12,r0, #2
	; Doing a 2-element structure load saves doing two VTRN's below, at the
	; cost of using two more slower single-lane loads vs. the faster
	; all-lane loads.
	; It's less code this way, though, and benches a hair faster, but it
	; leaves D2 and D4 swapped.
	VLD2.16	{D0[],D2[]},  [r12], r1	; D0 = ____________1100	2,1
					; D2 = ____________3322
	VLD2.16	{D4[],D6[]},  [r12], r1	; D4 = ____________5544	2,1
					; D6 = ____________7766
	VLD2.16	{D0[1],D2[1]},[r12], r1	; D0 = ________99881100	3,1
					; D2 = ________BBAA3322
	VLD2.16	{D4[1],D6[1]},[r12], r1	; D4 = ________DDCC5544	3,1
					; D6 = ________FFEE7766
	VLD2.16	{D0[2],D2[2]},[r12], r1	; D0 = ____GGHH99881100	3,1
					; D2 = ____JJIIBBAA3322
	VLD2.16	{D4[2],D6[2]},[r12], r1	; D4 = ____KKLLDDCC5544	3,1
					; D6 = ____NNMMFFEE7766
	VLD2.16	{D0[3],D2[3]},[r12], r1	; D0 = PPOOGGHH99881100	3,1
					; D2 = RRQQJJIIBBAA3322
	VLD2.16	{D4[3],D6[3]},[r12], r1	; D4 = TTSSKKLLDDCC5544	3,1
					; D6 = VVUUNNMMFFEE7766
	VTRN.8	D0, D4		; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511	1,1
	VTRN.8	D2, D6		; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733	1,1
	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s		1,3
	ADD	r12,r0, #8
	VADD.S16	Q0, Q0, Q8	;				1,3
	PLD	[r12]
	VADD.S16	Q0, Q0, Q8	;				1,3
	PLD	[r12,r1]
	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
	PLD	[r12,r1, LSL #1]
	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
	ADD	r12,r12,r1, LSL #2
	; We want to do
	;  f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
	;    = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
	; So we've reduced the left and right hand terms to be the same, except
	; for a negation.
	; Stall x3
	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
	PLD	[r12,-r1]
	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
	PLD	[r12]
	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
	PLD	[r12,r1]
	VMOVL.U8	Q1, D2	; Q1 = __UU__QQ__MM__II__EE__AA__66__22	2,3
	PLD	[r12,r1,LSL #1]
	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
	ADD	r12,r12,r1, LSL #2
	; Now we need to correct for the sign of f.
	; For negative elements of Q0, we want to subtract the appropriate
	; element of Q9. For positive elements we want to add them. No NEON
	; instruction exists to do this, so we need to negate the negative
	; elements, and we can then just add them: -a == ~(a-1), so adding
	; the sign mask and then EORing with it negates exactly those lanes.
	VADD.S16	Q9, Q9, Q0	;				1,3
	PLD	[r12,-r1]
	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
	; Bah. No VRSBW.U8
	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
	VADDW.U8	Q2, Q9, D4	; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
	VSUB.S16	Q1, Q1, Q9	; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		1,1
	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		1,1
	SUB	r12,r0, #1
	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655	1,1
	VST1.16	{D4[0]}, [r12], r1
	VST1.16	{D2[0]}, [r12], r1
	VST1.16	{D4[1]}, [r12], r1
	VST1.16	{D2[1]}, [r12], r1
	VST1.16	{D4[2]}, [r12], r1
	VST1.16	{D2[2]}, [r12], r1
	VST1.16	{D4[3]}, [r12], r1
	VST1.16	{D2[3]}, [r12], r1
	MOV	PC,r14
	ENDP
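
; The VADD/VEOR pair used by both NEON filters applies the sign of f to the
; clamped magnitude. A scalar C sketch of the identity (illustration only):
;   int16_t apply_sign(int16_t mag,int16_t f){
;     int16_t s=f>>15;             /* 0 or -1, like VSHR.S16 #15  */
;     return (int16_t)((mag+s)^s); /* mag if s==0, -mag if s==-1  */
;   }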

loop_filter_v_neon PROC
	; r0 = unsigned char *_pix
	; r1 = int            _ystride
	; r2 = int           *_bv
	; preserves r0-r3
	; We assume Q15= 2*L in U16s
	;                  My best guesses at cycle counts (and latency)--vvv
	SUB	r12,r0, r1, LSL #1
	VLD1.64	{D0}, [r12@64], r1	; D0 = SSOOKKGGCC884400		2,1
	VLD1.64	{D2}, [r12@64], r1	; D2 = TTPPLLHHDD995511		2,1
	VLD1.64	{D4}, [r12@64], r1	; D4 = UUQQMMIIEEAA6622		2,1
	VLD1.64	{D6}, [r12@64]		; D6 = VVRRNNJJFFBB7733		2,1
	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s		1,3
	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
	ADD	r12, #8
	VADD.S16	Q0, Q0, Q8	;				1,3
	PLD	[r12]
	VADD.S16	Q0, Q0, Q8	;				1,3
	PLD	[r12,r1]
	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
	SUB	r12, r0, r1
	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
	; We want to do
	;  f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
	;    = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
	;    = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
	; So we've reduced the left and right hand terms to be the same, except
	; for a negation.
	; Stall x3
	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
	; Stall x2
	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
	VMOVL.U8	Q2, D4	; Q2 = __UU__QQ__MM__II__EE__AA__66__22	2,3
	; Stall x2
	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
	; Now we need to correct for the sign of f.
	; For negative elements of Q0, we want to subtract the appropriate
	; element of Q9. For positive elements we want to add them. No NEON
	; instruction exists to do this, so we need to negate the negative
	; elements, and we can then just add them: -a == ~(a-1), so adding
	; the sign mask and then EORing with it negates exactly those lanes.
	; Stall x3
	VADD.S16	Q9, Q9, Q0	;				1,3
	; Stall x2
	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
	; Bah. No VRSBW.U8
	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
	VADDW.U8	Q1, Q9, D2	; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
	VSUB.S16	Q2, Q2, Q9	; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		1,1
	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		1,1
	VST1.64	{D2}, [r12@64], r1
	VST1.64	{D4}, [r12@64], r1
	MOV	PC,r14
	ENDP
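
; A worked instance of the CLAMP identity used by both NEON filters above,
; with L=10 (so 2L=20):
;   f=-25 -> -MIN(25,MAX(20-25,0)) =  0
;   f=-13 -> -MIN(13,MAX(20-13,0)) = -7
;   f=  5 ->  MIN( 5,MAX(20- 5,0)) =  5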

oc_loop_filter_frag_rows_neon PROC
	; r0 = _ref_frame_data
	; r1 = _ystride
	; r2 = _bv
	; r3 = _frags
	; r4 = _fragi0
	; r5 = _fragi0_end
	; r6 = _fragi_top
	; r7 = _fragi_bot
	; r8 = _frag_buf_offs
	; r9 = _nhfrags
	MOV	r12,r13
	STMFD	r13!,{r0,r4-r11,r14}
	LDMFD	r12,{r4-r9}
	CMP	r4, r5			; if(_fragi0>=_fragi0_end)
	BGE	oslffri_neon_end	;   bail
	SUBS	r9, r9, #1		; r9 = _nhfrags-1; if(r9<=0)
	BLE	oslffri_neon_end	;   bail
	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
	SUB	r7, r7, r9		; _fragi_bot -= _nhfrags;
oslffri_neon_lp1
	MOV	r10,r4			; r10= fragi = _fragi0
	ADD	r11,r4, r9		; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_neon_lp2
	LDR	r14,[r3], #4		; r14= _frags[fragi]; _frags++
	LDR	r0, [r13]		; r0 = _ref_frame_data
	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]; _frag_buf_offs++
	TST	r14,#OC_FRAG_CODED_FLAG
	BEQ	oslffri_neon_uncoded
	CMP	r10,r4			; if(fragi>_fragi0)
	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
	BLGT	loop_filter_h_neon
	CMP	r4, r6			; if(_fragi0>_fragi_top)
	BLGT	loop_filter_v_neon
	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
	LDRLT	r12,[r3]		; r12 = _frags[fragi+1]
	ADD	r0, r0, #8
	ADD	r10,r10,#1		; r10 = fragi+1;
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
	BLLT	loop_filter_h_neon
	CMP	r10,r7			; if(fragi<_fragi_bot)
	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
	SUB	r0, r0, #8
	ADD	r0, r0, r1, LSL #3
	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
	CMPLT	r12,#OC_FRAG_CODED_FLAG
	BLLT	loop_filter_v_neon
	CMP	r10,r11			; while(fragi<=fragi_end-1)
	BLE	oslffri_neon_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_neon_lp1
oslffri_neon_end
	LDMFD	r13!,{r0,r4-r11,PC}
oslffri_neon_uncoded
	ADD	r10,r10,#1
	CMP	r10,r11
	BLE	oslffri_neon_lp2
	MOV	r4, r10			; r4 = _fragi0 += _nhfrags
	CMP	r4, r5
	BLT	oslffri_neon_lp1
	LDMFD	r13!,{r0,r4-r11,PC}
	ENDP
 ]

	END