Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

        AREA    |.text|, CODE, READONLY

; Explicitly specifying alignment here because some versions of
; gas don't align code correctly. See
; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
        ALIGN

        GET     armopts.s

        EXPORT  oc_loop_filter_frag_rows_arm

; Which bit this is depends on the order of packing within a bitfield.
; Hopefully that doesn't change among any of the relevant compilers.
OC_FRAG_CODED_FLAG      *       1
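
; In C terms, the coded tests below assume a fragment flag layout along
; these lines (an illustrative sketch; the real definition lives in the
; C sources, and only the position of the coded bit matters here):
;   struct oc_fragment{
;     unsigned coded:1;            /* bit 0: tested via OC_FRAG_CODED_FLAG */
;     /* ...other bitfields... */
;   };
;   /* coded test: _frags[fragi]&OC_FRAG_CODED_FLAG */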
michael@0 | 32 | |
michael@0 | 33 | ; Vanilla ARM v4 version |
michael@0 | 34 | loop_filter_h_arm PROC |
michael@0 | 35 | ; r0 = unsigned char *_pix |
michael@0 | 36 | ; r1 = int _ystride |
michael@0 | 37 | ; r2 = int *_bv |
michael@0 | 38 | ; preserves r0-r3 |
michael@0 | 39 | STMFD r13!,{r3-r6,r14} |
michael@0 | 40 | MOV r14,#8 |
michael@0 | 41 | MOV r6, #255 |
michael@0 | 42 | lfh_arm_lp |
michael@0 | 43 | LDRB r3, [r0, #-2] ; r3 = _pix[0] |
michael@0 | 44 | LDRB r12,[r0, #1] ; r12= _pix[3] |
michael@0 | 45 | LDRB r4, [r0, #-1] ; r4 = _pix[1] |
michael@0 | 46 | LDRB r5, [r0] ; r5 = _pix[2] |
michael@0 | 47 | SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4 |
michael@0 | 48 | ADD r3, r3, #4 |
michael@0 | 49 | SUB r12,r5, r4 ; r12= _pix[2]-_pix[1] |
michael@0 | 50 | ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1]) |
michael@0 | 51 | ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4 |
michael@0 | 52 | MOV r12,r12,ASR #3 |
michael@0 | 53 | LDRSB r12,[r2, r12] |
michael@0 | 54 | ; Stall (2 on Xscale) |
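; The next three instructions clamp the result to [0,255] without a
; branch: ADDS leaves LT set if the sum went negative, CMPGT (only run
; when the sum was positive) sets LT when 255<r4, and EORLT then XORs
; r6=255 with r4,ASR #32, which is all ones for a negative sum (the low
; byte becomes 0) and all zeros for an overflow (the result becomes 255).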
        ADDS    r4, r4, r12
        CMPGT   r6, r4
        EORLT   r4, r6, r4, ASR #32
        SUBS    r5, r5, r12
        CMPGT   r6, r5
        EORLT   r5, r6, r5, ASR #32
        STRB    r4, [r0, #-1]
        STRB    r5, [r0], r1
        SUBS    r14,r14,#1
        BGT     lfh_arm_lp
        SUB     r0, r0, r1, LSL #3
        LDMFD   r13!,{r3-r6,PC}
        ENDP
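
; For reference, a C sketch of what the loop above computes, one row at
; a time (illustrative only; p points at the first of the four pixels
; straddling the edge, i.e. _pix-2, and _bv is the lflim() lookup table
; set up by the loop-filter init code, biased so a signed index works):
;   for(i=0;i<8;i++){
;     int f;
;     f=(p[0]-p[3]+3*(p[2]-p[1])+4)>>3;
;     f=_bv[f];
;     p[1]=clamp255(p[1]+f);
;     p[2]=clamp255(p[2]-f);
;     p+=_ystride;
;   }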

loop_filter_v_arm PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
; preserves r0-r3
        STMFD   r13!,{r3-r6,r14}
        MOV     r14,#8
        MOV     r6, #255
lfv_arm_lp
        LDRB    r3, [r0, -r1, LSL #1]   ; r3 = _pix[0]
        LDRB    r12,[r0, r1]            ; r12= _pix[3]
        LDRB    r4, [r0, -r1]           ; r4 = _pix[1]
        LDRB    r5, [r0]                ; r5 = _pix[2]
        SUB     r3, r3, r12             ; r3 = _pix[0]-_pix[3]+4
        ADD     r3, r3, #4
        SUB     r12,r5, r4              ; r12= _pix[2]-_pix[1]
        ADD     r12,r12,r12,LSL #1      ; r12= 3*(_pix[2]-_pix[1])
        ADD     r12,r12,r3      ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
        MOV     r12,r12,ASR #3
        LDRSB   r12,[r2, r12]
; Stall (2 on Xscale)
        ADDS    r4, r4, r12
        CMPGT   r6, r4
        EORLT   r4, r6, r4, ASR #32
        SUBS    r5, r5, r12
        CMPGT   r6, r5
        EORLT   r5, r6, r5, ASR #32
        STRB    r4, [r0, -r1]
        STRB    r5, [r0], #1
        SUBS    r14,r14,#1
        BGT     lfv_arm_lp
        SUB     r0, r0, #8
        LDMFD   r13!,{r3-r6,PC}
        ENDP

oc_loop_filter_frag_rows_arm PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
; r3 = _frags
; r4 = _fragi0
; r5 = _fragi0_end
; r6 = _fragi_top
; r7 = _fragi_bot
; r8 = _frag_buf_offs
; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        ADD     r2, r2, #127            ; _bv += 127
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_arm_end         ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1   if (r9<=0)
        BLE     oslffri_arm_end         ;   bail
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags;
oslffri_arm_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_arm_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]   _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_arm_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data+_frag_buf_offs[fragi]
        BLGT    loop_filter_h_arm
        CMP     r4, r6                  ; if (_fragi0>_fragi_top)
        BLGT    loop_filter_v_arm
        CMP     r10,r11                 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
        LDRLT   r12,[r3]                ; r12 = _frags[fragi+1]
        ADD     r0, r0, #8
        ADD     r10,r10,#1              ; r10 = fragi+1;
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
        BLLT    loop_filter_h_arm
        CMP     r10,r7                  ; if (fragi<_fragi_bot)
        LDRLT   r12,[r3, r9, LSL #2]    ; r12 = _frags[fragi+1+_nhfrags-1]
        SUB     r0, r0, #8
        ADD     r0, r0, r1, LSL #3
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG
        BLLT    loop_filter_v_arm
        CMP     r10,r11                 ; while(fragi<=fragi_end-1)
        BLE     oslffri_arm_lp2
        MOV     r4, r10                 ; r4 = fragi0 += _nhfrags
        CMP     r4, r5
        BLT     oslffri_arm_lp1
oslffri_arm_end
        LDMFD   r13!,{r0,r4-r11,PC}
oslffri_arm_uncoded
        ADD     r10,r10,#1
        CMP     r10,r11
        BLE     oslffri_arm_lp2
        MOV     r4, r10                 ; r4 = _fragi0 += _nhfrags
        CMP     r4, r5
        BLT     oslffri_arm_lp1
        LDMFD   r13!,{r0,r4-r11,PC}
        ENDP
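
; A C sketch of the driver above (illustrative; each fragment is an 8x8
; block, and coded/uncoded neighbors determine which edges get filtered;
; the assembly pre-biases _fragi_bot, so its compare looks different):
;   while(_fragi0<_fragi0_end){
;     ptrdiff_t fragi_end=_fragi0+_nhfrags;
;     for(fragi=_fragi0;fragi<fragi_end;fragi++){
;       if(_frags[fragi]&OC_FRAG_CODED_FLAG){
;         unsigned char *pix=_ref_frame_data+_frag_buf_offs[fragi];
;         if(fragi>_fragi0)loop_filter_h(pix,_ystride,_bv);
;         if(_fragi0>_fragi_top)loop_filter_v(pix,_ystride,_bv);
;         if(fragi+1<fragi_end&&!(_frags[fragi+1]&OC_FRAG_CODED_FLAG))
;           loop_filter_h(pix+8,_ystride,_bv);
;         if(fragi<_fragi_bot&&!(_frags[fragi+_nhfrags]&OC_FRAG_CODED_FLAG))
;           loop_filter_v(pix+(_ystride<<3),_ystride,_bv);
;       }
;     }
;     _fragi0+=_nhfrags;
;   }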

 [ OC_ARM_ASM_MEDIA
        EXPORT  oc_loop_filter_init_v6
        EXPORT  oc_loop_filter_frag_rows_v6

oc_loop_filter_init_v6 PROC
; r0 = _bv
; r1 = _flimit (=L from the spec)
        MVN     r1, r1, LSL #1          ; r1 = <0xFFFFFF|255-2*L>
        AND     r1, r1, #255            ; r1 = ll=r1&0xFF
        ORR     r1, r1, r1, LSL #8      ; r1 = <ll|ll>
        PKHBT   r1, r1, r1, LSL #16     ; r1 = <ll|ll|ll|ll>
        STR     r1, [r0]
        MOV     PC,r14
        ENDP
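
; In C terms (a sketch): for the legal range of L this stores
;   ll=(255-2*_flimit)&0xFF;
;   *(int *)_bv=ll<<24|ll<<16|ll<<8|ll;
; The 255-2*L bias is what lets the filters below evaluate
; max(2*L-|R|,0) with a single unsigned saturating add (UQADD8).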

; We could use the same strategy as the v filter below, but that would require
; 40 instructions to load the data and transpose it into columns and another
; 32 to write out the results at the end, plus the 52 instructions to do the
; filtering itself.
; This is slightly less, and less code, even assuming we could have shared the
; 52 instructions in the middle with the other function.
; It executes slightly fewer instructions than the ARMv6 approach David Conrad
; proposed for FFmpeg, but not by much:
; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
; His is a lot less code, though, because it only does two rows at once instead
; of four.
loop_filter_h_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
; preserves r0-r3
        STMFD   r13!,{r4-r11,r14}
        LDR     r12,=0x10003
        BL      loop_filter_h_core_v6
        ADD     r0, r0, r1, LSL #2
        BL      loop_filter_h_core_v6
        SUB     r0, r0, r1, LSL #2
        LDMFD   r13!,{r4-r11,PC}
        ENDP

loop_filter_h_core_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
; r12= 0x10003
; Preserves r0-r3, r12; Clobbers r4-r11.
        LDR     r4,[r0, #-2]!           ; r4 = <p3|p2|p1|p0>
; Single issue
        LDR     r5,[r0, r1]!            ; r5 = <q3|q2|q1|q0>
        UXTB16  r6, r4, ROR #16         ; r6 = <p0|p2>
        UXTB16  r4, r4, ROR #8          ; r4 = <p3|p1>
        UXTB16  r7, r5, ROR #16         ; r7 = <q0|q2>
        UXTB16  r5, r5, ROR #8          ; r5 = <q3|q1>
        PKHBT   r8, r4, r5, LSL #16     ; r8 = <__|q1|__|p1>
        PKHBT   r9, r6, r7, LSL #16     ; r9 = <__|q2|__|p2>
        SSUB16  r6, r4, r6              ; r6 = <p3-p0|p1-p2>
        SMLAD   r6, r6, r12,r12         ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
        SSUB16  r7, r5, r7              ; r7 = <q3-q0|q1-q2>
        SMLAD   r7, r7, r12,r12         ; r7 = <????|(q3-q0)+3*(q1-q2)+3>
        LDR     r4,[r0, r1]!            ; r4 = <r3|r2|r1|r0>
        MOV     r6, r6, ASR #3          ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
        LDR     r5,[r0, r1]!            ; r5 = <s3|s2|s1|s0>
        PKHBT   r11,r6, r7, LSL #13     ; r11= <??|-R_q|??|-R_p>
        UXTB16  r6, r4, ROR #16         ; r6 = <r0|r2>
        UXTB16  r11,r11                 ; r11= <__|-R_q|__|-R_p>
        UXTB16  r4, r4, ROR #8          ; r4 = <r3|r1>
        UXTB16  r7, r5, ROR #16         ; r7 = <s0|s2>
        PKHBT   r10,r6, r7, LSL #16     ; r10= <__|s2|__|r2>
        SSUB16  r6, r4, r6              ; r6 = <r3-r0|r1-r2>
        UXTB16  r5, r5, ROR #8          ; r5 = <s3|s1>
        SMLAD   r6, r6, r12,r12         ; r6 = <????|(r3-r0)+3*(r1-r2)+3>
        SSUB16  r7, r5, r7              ; r7 = <s3-s0|s1-s2>
        SMLAD   r7, r7, r12,r12         ; r7 = <????|(s3-s0)+3*(s1-s2)+3>
        ORR     r9, r9, r10, LSL #8     ; r9 = <s2|q2|r2|p2>
        MOV     r6, r6, ASR #3          ; r6 = <??????|(r3-r0)+3*(r1-r2)+3>>3>
        PKHBT   r10,r4, r5, LSL #16     ; r10= <__|s1|__|r1>
        PKHBT   r6, r6, r7, LSL #13     ; r6 = <??|-R_s|??|-R_r>
        ORR     r8, r8, r10, LSL #8     ; r8 = <s1|q1|r1|p1>
        UXTB16  r6, r6                  ; r6 = <__|-R_s|__|-R_r>
        MOV     r10,#0
        ORR     r6, r11,r6, LSL #8      ; r6 = <-R_s|-R_q|-R_r|-R_p>
; Single issue
; There's no min, max or abs instruction.
; SSUB8 and SEL will work for abs, and we can do all the rest with
; unsigned saturated adds, which means the GE flags are still all
; set when we're done computing lflim(abs(R_i),L).
; This allows us to both add and subtract, and split the results by
; the original sign of R_i.
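; In C terms, the next few instructions compute, per byte lane (a
; sketch; _ll is the packed 255-2*L from oc_loop_filter_init_v6, and
; sat255() denotes an unsigned saturating add's clamp at 255):
;   a=abs(R_i);               /* SSUB8+SEL */
;   t=sat255(a+_ll);          /* =255-max(2*L-a,0) */
;   f=sat255(a+t)-t;          /* =min(a,max(2*L-a,0))=lflim(a,L) */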
        SSUB8   r7, r10,r6
; Single issue
        SEL     r7, r7, r6              ; r7 = abs(R_i)
; Single issue
        UQADD8  r4, r7, r2              ; r4 = 255-max(2*L-abs(R_i),0)
; Single issue
        UQADD8  r7, r7, r4
; Single issue
        UQSUB8  r7, r7, r4              ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
; Single issue
        UQSUB8  r4, r8, r7
        UQADD8  r5, r9, r7
        UQADD8  r8, r8, r7
        UQSUB8  r9, r9, r7
        SEL     r8, r8, r4              ; r8 = p1+lflim(R_i,L)
        SEL     r9, r9, r5              ; r9 = p2-lflim(R_i,L)
        MOV     r5, r9, LSR #24         ; r5 = s2
        STRB    r5, [r0,#2]!
        MOV     r4, r8, LSR #24         ; r4 = s1
        STRB    r4, [r0,#-1]
        MOV     r5, r9, LSR #8          ; r5 = r2
        STRB    r5, [r0,-r1]!
        MOV     r4, r8, LSR #8          ; r4 = r1
        STRB    r4, [r0,#-1]
        MOV     r5, r9, LSR #16         ; r5 = q2
        STRB    r5, [r0,-r1]!
        MOV     r4, r8, LSR #16         ; r4 = q1
        STRB    r4, [r0,#-1]
; Single issue
        STRB    r9, [r0,-r1]!
; Single issue
        STRB    r8, [r0,#-1]
        MOV     PC,r14
        ENDP

; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
; This works just as well, with the following procedure for computing the
; filter value, f:
;   u = ~UHADD8(p1,~p2);
;   v = UHADD8(~p1,p2);
;   m = v-u;
;   a = m^UHADD8(m^p0,m^~p3);
;   f = UHADD8(UHADD8(a,u),v);
; where f = 127+R, with R in [-127,128] defined as in the spec.
; This is exactly the same amount of arithmetic as the version that uses PAVGB
; as the basic operator.
; It executes about 2/3 the number of instructions of David Conrad's approach,
; but requires more code, because it does all eight columns at once, instead
; of four at a time.
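; (Note PAVGB's rounding average could itself be recovered as
; ~UHADD8(~a,~b)==(a+b+1>>1), which is why complemented operands show
; up throughout the sequence below.)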
loop_filter_v_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
; preserves r0-r11
        STMFD   r13!,{r4-r11,r14}
        LDRD    r6, [r0, -r1]!          ; r7, r6 = <p5|p1>
        LDRD    r4, [r0, -r1]           ; r5, r4 = <p4|p0>
        LDRD    r8, [r0, r1]!           ; r9, r8 = <p6|p2>
        MVN     r14,r6                  ; r14= ~p1
        LDRD    r10,[r0, r1]            ; r11,r10= <p7|p3>
; Filter the first four columns.
        MVN     r12,r8                  ; r12= ~p2
        UHADD8  r14,r14,r8              ; r14= v1=~p1+p2>>1
        UHADD8  r12,r12,r6              ; r12= p1+~p2>>1
        MVN     r10,r10                 ; r10=~p3
        MVN     r12,r12                 ; r12= u1=~p1+p2+1>>1
        SSUB8   r14,r14,r12             ; r14= m1=v1-u1
; Single issue
        EOR     r4, r4, r14             ; r4 = m1^p0
        EOR     r10,r10,r14             ; r10= m1^~p3
        UHADD8  r4, r4, r10             ; r4 = (m1^p0)+(m1^~p3)>>1
; Single issue
        EOR     r4, r4, r14             ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
        SADD8   r14,r14,r12             ; r14= v1=m1+u1
        UHADD8  r4, r4, r12             ; r4 = a1+u1>>1
        MVN     r12,r9                  ; r12= ~p6
        UHADD8  r4, r4, r14             ; r4 = f1=(a1+u1>>1)+v1>>1
; Filter the second four columns.
        MVN     r14,r7                  ; r14= ~p5
        UHADD8  r12,r12,r7              ; r12= p5+~p6>>1
        UHADD8  r14,r14,r9              ; r14= v2=~p5+p6>>1
        MVN     r12,r12                 ; r12= u2=~p5+p6+1>>1
        MVN     r11,r11                 ; r11=~p7
        SSUB8   r10,r14,r12             ; r10= m2=v2-u2
; Single issue
        EOR     r5, r5, r10             ; r5 = m2^p4
        EOR     r11,r11,r10             ; r11= m2^~p7
        UHADD8  r5, r5, r11             ; r5 = (m2^p4)+(m2^~p7)>>1
; Single issue
        EOR     r5, r5, r10             ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
; Single issue
        UHADD8  r5, r5, r12             ; r5 = a2+u2>>1
        LDR     r12,=0x7F7F7F7F         ; r12 = {127}x4
        UHADD8  r5, r5, r14             ; r5 = f2=(a2+u2>>1)+v2>>1
; Now split f[i] by sign.
; There's no min or max instruction.
; We could use SSUB8 and SEL, but this is just as many instructions and
; dual issues more (for v7 without NEON).
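; In C terms, per byte lane (a sketch; f is the biased value 127+R_i,
; and sat_sub() is an unsigned saturating subtract):
;   p=sat_sub(f,127);   /* R_i>0?R_i:0  */
;   n=sat_sub(127,f);   /* R_i<0?-R_i:0 */
; lflim() is then applied to p and n separately with the same
; UQADD8/UQSUB8 trick as above, and the two halves are added to p1/p5
; and subtracted from p2/p6 with more saturating ops.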
        UQSUB8  r10,r4, r12             ; r10= R_i>0?R_i:0
        UQSUB8  r4, r12,r4              ; r4 = R_i<0?-R_i:0
        UQADD8  r11,r10,r2              ; r11= 255-max(2*L-abs(R_i>0),0)
        UQADD8  r14,r4, r2              ; r14= 255-max(2*L-abs(R_i<0),0)
        UQADD8  r10,r10,r11
        UQADD8  r4, r4, r14
        UQSUB8  r10,r10,r11             ; r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
        UQSUB8  r4, r4, r14             ; r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
        UQSUB8  r11,r5, r12             ; r11= R_i>0?R_i:0
        UQADD8  r6, r6, r10
        UQSUB8  r8, r8, r10
        UQSUB8  r5, r12,r5              ; r5 = R_i<0?-R_i:0
        UQSUB8  r6, r6, r4              ; r6 = p1+lflim(R_i,L)
        UQADD8  r8, r8, r4              ; r8 = p2-lflim(R_i,L)
        UQADD8  r10,r11,r2              ; r10= 255-max(2*L-abs(R_i>0),0)
        UQADD8  r14,r5, r2              ; r14= 255-max(2*L-abs(R_i<0),0)
        UQADD8  r11,r11,r10
        UQADD8  r5, r5, r14
        UQSUB8  r11,r11,r10             ; r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
        UQSUB8  r5, r5, r14             ; r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
        UQADD8  r7, r7, r11
        UQSUB8  r9, r9, r11
        UQSUB8  r7, r7, r5              ; r7 = p5+lflim(R_i,L)
        STRD    r6, [r0, -r1]           ; [p5:p1] = [r7: r6]
        UQADD8  r9, r9, r5              ; r9 = p6-lflim(R_i,L)
        STRD    r8, [r0]                ; [p6:p2] = [r9: r8]
        LDMFD   r13!,{r4-r11,PC}
        ENDP

oc_loop_filter_frag_rows_v6 PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
; r3 = _frags
; r4 = _fragi0
; r5 = _fragi0_end
; r6 = _fragi_top
; r7 = _fragi_bot
; r8 = _frag_buf_offs
; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        LDR     r2, [r2]                ; ll = *(int *)_bv
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_v6_end          ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1   if (r9<=0)
        BLE     oslffri_v6_end          ;   bail
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags;
oslffri_v6_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_v6_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]   _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_v6_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data+_frag_buf_offs[fragi]
        BLGT    loop_filter_h_v6
        CMP     r4, r6                  ; if (_fragi0>_fragi_top)
        BLGT    loop_filter_v_v6
        CMP     r10,r11                 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
        LDRLT   r12,[r3]                ; r12 = _frags[fragi+1]
        ADD     r0, r0, #8
        ADD     r10,r10,#1              ; r10 = fragi+1;
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
        BLLT    loop_filter_h_v6
        CMP     r10,r7                  ; if (fragi<_fragi_bot)
        LDRLT   r12,[r3, r9, LSL #2]    ; r12 = _frags[fragi+1+_nhfrags-1]
        SUB     r0, r0, #8
        ADD     r0, r0, r1, LSL #3
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG
        BLLT    loop_filter_v_v6
        CMP     r10,r11                 ; while(fragi<=fragi_end-1)
        BLE     oslffri_v6_lp2
        MOV     r4, r10                 ; r4 = fragi0 += nhfrags
        CMP     r4, r5
        BLT     oslffri_v6_lp1
oslffri_v6_end
        LDMFD   r13!,{r0,r4-r11,PC}
oslffri_v6_uncoded
        ADD     r10,r10,#1
        CMP     r10,r11
        BLE     oslffri_v6_lp2
        MOV     r4, r10                 ; r4 = fragi0 += nhfrags
        CMP     r4, r5
        BLT     oslffri_v6_lp1
        LDMFD   r13!,{r0,r4-r11,PC}
        ENDP
 ]

 [ OC_ARM_ASM_NEON
        EXPORT  oc_loop_filter_init_neon
        EXPORT  oc_loop_filter_frag_rows_neon

oc_loop_filter_init_neon PROC
; r0 = _bv
; r1 = _flimit (=L from the spec)
        MOV     r1, r1, LSL #1          ; r1 = 2*L
        VDUP.S16        Q15, r1         ; Q15= 2L in U16s
        VST1.64 {D30,D31}, [r0@128]
        MOV     PC,r14
        ENDP
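
; In C terms (a sketch): unlike the v6 version, _bv here just caches
; 2*_flimit broadcast across eight unsigned 16-bit lanes:
;   for(i=0;i<8;i++)((ogg_uint16_t *)_bv)[i]=_flimit<<1;
; The frag-rows driver reloads it into Q15 for the VQSUB.U16 that
; computes max(2*L-|f|,0).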

loop_filter_h_neon PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
; preserves r0-r3
; We assume Q15= 2*L in U16s
; My best guesses at cycle counts (and latency)--vvv
        SUB     r12,r0, #2
; Doing a 2-element structure load saves doing two VTRN's below, at the
; cost of using two more slower single-lane loads vs. the faster
; all-lane loads.
; It's less code this way, though, and benches a hair faster, but it
; leaves D2 and D4 swapped.
        VLD2.16 {D0[],D2[]},  [r12], r1 ; D0 = ____________1100         2,1
                                        ; D2 = ____________3322
        VLD2.16 {D4[],D6[]},  [r12], r1 ; D4 = ____________5544         2,1
                                        ; D6 = ____________7766
        VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100         3,1
                                        ; D2 = ________BBAA3322
        VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544         3,1
                                        ; D6 = ________FFEE7766
        VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100         3,1
                                        ; D2 = ____JJIIBBAA3322
        VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544         3,1
                                        ; D6 = ____NNMMFFEE7766
        VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100         3,1
                                        ; D2 = RRQQJJIIBBAA3322
        VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544         3,1
                                        ; D6 = VVUUNNMMFFEE7766
        VTRN.8  D0, D4  ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511   1,1
        VTRN.8  D2, D6  ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733   1,1
        VSUBL.U8        Q0, D0, D6      ; Q0 = 00 - 33 in S16s          1,3
        VSUBL.U8        Q8, D2, D4      ; Q8 = 22 - 11 in S16s          1,3
        ADD     r12,r0, #8
        VADD.S16        Q0, Q0, Q8      ;                               1,3
        PLD     [r12]
        VADD.S16        Q0, Q0, Q8      ;                               1,3
        PLD     [r12,r1]
        VADD.S16        Q0, Q0, Q8      ; Q0 = [0-3]+3*[2-1]            1,3
        PLD     [r12,r1, LSL #1]
        VRSHR.S16       Q0, Q0, #3      ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
        ADD     r12,r12,r1, LSL #2
; We want to do
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
; So we've reduced the left and right hand terms to be the same, except
; for a negation.
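; A quick numeric check of the identity (a sketch): with 2L=8 and f=-6,
; MAX(2L-|f|,0)=2, so the last line gives -MIN(6,2)=-2; the original
; CLAMP form gives CLAMP(MIN(-8+6,0),-6,MAX(8+6,0))=CLAMP(-2,-6,14)=-2.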
; Stall x3
        VABS.S16        Q9, Q0          ; Q9 = |f| in U16s              1,4
        PLD     [r12,-r1]
        VSHR.S16        Q0, Q0, #15     ; Q0 = -1 or 0 according to sign 1,3
        PLD     [r12]
        VQSUB.U16       Q10,Q15,Q9      ; Q10= MAX(2L-|f|,0) in U16s    1,4
        PLD     [r12,r1]
        VMOVL.U8        Q1, D2          ; Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
        PLD     [r12,r1,LSL #1]
        VMIN.U16        Q9, Q10,Q9      ; Q9 = MIN(|f|,MAX(2L-|f|))     1,4
        ADD     r12,r12,r1, LSL #2
; Now we need to correct for the sign of f.
; For negative elements of Q0, we want to subtract the appropriate
; element of Q9. For positive elements we want to add them. No NEON
; instruction exists to do this, so we need to negate the negative
; elements, and we can then just add them: -b == !b+1 == !(b-1), which
; is what the VADD/VEOR pair below computes on the negative lanes.
        VADD.S16        Q9, Q9, Q0      ;                               1,3
        PLD     [r12,-r1]
        VEOR.S16        Q9, Q9, Q0      ; Q9 = real value of f          1,3
; Bah. No VRSBW.U8
; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
        VADDW.U8        Q2, Q9, D4      ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
        VSUB.S16        Q1, Q1, Q9      ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
        VQMOVUN.S16     D4, Q2          ; D4 = TTPPLLHHDD995511         1,1
        VQMOVUN.S16     D2, Q1          ; D2 = UUQQMMIIEEAA6622         1,1
        SUB     r12,r0, #1
        VTRN.8  D4, D2  ; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655   1,1
        VST1.16 {D4[0]}, [r12], r1
        VST1.16 {D2[0]}, [r12], r1
        VST1.16 {D4[1]}, [r12], r1
        VST1.16 {D2[1]}, [r12], r1
        VST1.16 {D4[2]}, [r12], r1
        VST1.16 {D2[2]}, [r12], r1
        VST1.16 {D4[3]}, [r12], r1
        VST1.16 {D2[3]}, [r12], r1
        MOV     PC,r14
        ENDP

loop_filter_v_neon PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
; preserves r0-r3
; We assume Q15= 2*L in U16s
; My best guesses at cycle counts (and latency)--vvv
        SUB     r12,r0, r1, LSL #1
        VLD1.64 {D0}, [r12@64], r1      ; D0 = SSOOKKGGCC884400         2,1
        VLD1.64 {D2}, [r12@64], r1      ; D2 = TTPPLLHHDD995511         2,1
        VLD1.64 {D4}, [r12@64], r1      ; D4 = UUQQMMIIEEAA6622         2,1
        VLD1.64 {D6}, [r12@64]          ; D6 = VVRRNNJJFFBB7733         2,1
        VSUBL.U8        Q8, D4, D2      ; Q8 = 22 - 11 in S16s          1,3
        VSUBL.U8        Q0, D0, D6      ; Q0 = 00 - 33 in S16s          1,3
        ADD     r12, #8
        VADD.S16        Q0, Q0, Q8      ;                               1,3
        PLD     [r12]
        VADD.S16        Q0, Q0, Q8      ;                               1,3
        PLD     [r12,r1]
        VADD.S16        Q0, Q0, Q8      ; Q0 = [0-3]+3*[2-1]            1,3
        SUB     r12, r0, r1
        VRSHR.S16       Q0, Q0, #3      ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
; We want to do
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
; So we've reduced the left and right hand terms to be the same, except
; for a negation.
; Stall x3
        VABS.S16        Q9, Q0          ; Q9 = |f| in U16s              1,4
        VSHR.S16        Q0, Q0, #15     ; Q0 = -1 or 0 according to sign 1,3
; Stall x2
        VQSUB.U16       Q10,Q15,Q9      ; Q10= MAX(2L-|f|,0) in U16s    1,4
        VMOVL.U8        Q2, D4          ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
; Stall x2
        VMIN.U16        Q9, Q10,Q9      ; Q9 = MIN(|f|,MAX(2L-|f|))     1,4
; Now we need to correct for the sign of f.
; For negative elements of Q0, we want to subtract the appropriate
; element of Q9. For positive elements we want to add them. No NEON
; instruction exists to do this, so we need to negate the negative
; elements, and we can then just add them: -b == !b+1 == !(b-1), which
; is what the VADD/VEOR pair below computes on the negative lanes.
; Stall x3
        VADD.S16        Q9, Q9, Q0      ;                               1,3
; Stall x2
        VEOR.S16        Q9, Q9, Q0      ; Q9 = real value of f          1,3
; Bah. No VRSBW.U8
; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
        VADDW.U8        Q1, Q9, D2      ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
        VSUB.S16        Q2, Q2, Q9      ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
        VQMOVUN.S16     D2, Q1          ; D2 = TTPPLLHHDD995511         1,1
        VQMOVUN.S16     D4, Q2          ; D4 = UUQQMMIIEEAA6622         1,1
        VST1.64 {D2}, [r12@64], r1
        VST1.64 {D4}, [r12@64], r1
        MOV     PC,r14
        ENDP

oc_loop_filter_frag_rows_neon PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
; r3 = _frags
; r4 = _fragi0
; r5 = _fragi0_end
; r6 = _fragi_top
; r7 = _fragi_bot
; r8 = _frag_buf_offs
; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_neon_end        ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1   if (r9<=0)
        BLE     oslffri_neon_end        ;   bail
        VLD1.64 {D30,D31}, [r2@128]     ; Q15= 2L in U16s
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags;
oslffri_neon_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_neon_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]   _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_neon_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data+_frag_buf_offs[fragi]
        BLGT    loop_filter_h_neon
        CMP     r4, r6                  ; if (_fragi0>_fragi_top)
        BLGT    loop_filter_v_neon
        CMP     r10,r11                 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
        LDRLT   r12,[r3]                ; r12 = _frags[fragi+1]
        ADD     r0, r0, #8
        ADD     r10,r10,#1              ; r10 = fragi+1;
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
        BLLT    loop_filter_h_neon
        CMP     r10,r7                  ; if (fragi<_fragi_bot)
        LDRLT   r12,[r3, r9, LSL #2]    ; r12 = _frags[fragi+1+_nhfrags-1]
        SUB     r0, r0, #8
        ADD     r0, r0, r1, LSL #3
        ANDLT   r12,r12,#OC_FRAG_CODED_FLAG
        CMPLT   r12,#OC_FRAG_CODED_FLAG
        BLLT    loop_filter_v_neon
        CMP     r10,r11                 ; while(fragi<=fragi_end-1)
        BLE     oslffri_neon_lp2
        MOV     r4, r10                 ; r4 = _fragi0 += _nhfrags
        CMP     r4, r5
        BLT     oslffri_neon_lp1
oslffri_neon_end
        LDMFD   r13!,{r0,r4-r11,PC}
oslffri_neon_uncoded
        ADD     r10,r10,#1
        CMP     r10,r11
        BLE     oslffri_neon_lp2
        MOV     r4, r10                 ; r4 = _fragi0 += _nhfrags
        CMP     r4, r5
        BLT     oslffri_neon_lp1
        LDMFD   r13!,{r0,r4-r11,PC}
        ENDP
 ]

        END