michael@0: ;******************************************************************** michael@0: ;* * michael@0: ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * michael@0: ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * michael@0: ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * michael@0: ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * michael@0: ;* * michael@0: ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * michael@0: ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * michael@0: ;* * michael@0: ;******************************************************************** michael@0: ; Original implementation: michael@0: ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd michael@0: ; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $ michael@0: ;******************************************************************** michael@0: michael@0: AREA |.text|, CODE, READONLY michael@0: michael@0: ; Explicitly specifying alignment here because some versions of michael@0: ; gas don't align code correctly. See michael@0: ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html michael@0: ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 michael@0: ALIGN michael@0: michael@0: GET armopts.s michael@0: michael@0: EXPORT oc_loop_filter_frag_rows_arm michael@0: michael@0: ; Which bit this is depends on the order of packing within a bitfield. michael@0: ; Hopefully that doesn't change among any of the relevant compilers. 
; Mask for the "coded" bit of a fragment word ('*' is armasm shorthand for EQU).
OC_FRAG_CODED_FLAG      *       1

; Vanilla ARM v4 version

; loop_filter_h_arm
;  For 8 consecutive rows (r0 steps by _ystride) read the four bytes
;  _pix[-2].._pix[1] -- called _pix[0].._pix[3] in the comments below --
;  and rewrite the middle two:
;    f       = (_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4)>>3
;    _pix[1] = clamp(_pix[1]+_bv[f], 0, 255)
;    _pix[2] = clamp(_pix[2]-_bv[f], 0, 255)
loop_filter_h_arm PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = _bv  (the original notes say "int *", but the LDRSB below
        ;            indexes it as a table of signed bytes, and f may be
        ;            negative)
        ; preserves r0-r3; clobbers r12 and the flags
        STMFD   r13!,{r3-r6,r14}
        MOV     r14,#8                  ; r14= rows left
        MOV     r6, #255                ; r6 = upper clamp bound
lfh_arm_lp
        LDRB    r3, [r0, #-2]           ; r3 = _pix[0]
        LDRB    r12,[r0, #1]            ; r12= _pix[3]
        LDRB    r4, [r0, #-1]           ; r4 = _pix[1]
        LDRB    r5, [r0]                ; r5 = _pix[2]
        SUB     r3, r3, r12             ; r3 = _pix[0]-_pix[3]
        ADD     r3, r3, #4              ; r3 = _pix[0]-_pix[3]+4
        SUB     r12,r5, r4              ; r12= _pix[2]-_pix[1]
        ADD     r12,r12,r12,LSL #1      ; r12= 3*(_pix[2]-_pix[1])
        ADD     r12,r12,r3              ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
        MOV     r12,r12,ASR #3          ; r12= f
        LDRSB   r12,[r2, r12]           ; r12= _bv[f], sign-extended byte
        ; Stall (2 on Xscale)
        ; Clamp to 0..255 without branches:
        ;  result<0   -> LT from ADDS/SUBS, x ASR #32 = -1, 255^-1 has low byte 0
        ;  result>255 -> LT from CMPGT,     x ASR #32 =  0, 255^0  = 255
        ; Only the low byte is stored, so the upper bits do not matter.
        ADDS    r4, r4, r12
        CMPGT   r6, r4
        EORLT   r4, r6, r4, ASR #32
        SUBS    r5, r5, r12
        CMPGT   r6, r5
        EORLT   r5, r6, r5, ASR #32
        STRB    r4, [r0, #-1]
        STRB    r5, [r0], r1            ; store, then step to the next row
        SUBS    r14,r14,#1
        BGT     lfh_arm_lp
        SUB     r0, r0, r1, LSL #3      ; undo the 8 row steps: r0 = _pix again
        LDMFD   r13!,{r3-r6,PC}
        ENDP

; loop_filter_v_arm
;  Same filter as loop_filter_h_arm, but the four taps are taken from the
;  rows _pix-2*_ystride, _pix-_ystride, _pix and _pix+_ystride, and the loop
;  walks 8 consecutive columns instead of 8 rows.
loop_filter_v_arm PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = _bv  (read with LDRSB as signed bytes, see loop_filter_h_arm)
        ; preserves r0-r3; clobbers r12 and the flags
        STMFD   r13!,{r3-r6,r14}
        MOV     r14,#8                  ; r14= columns left
        MOV     r6, #255                ; r6 = upper clamp bound
lfv_arm_lp
        LDRB    r3, [r0, -r1, LSL #1]   ; r3 = _pix[0]
        LDRB    r12,[r0, r1]            ; r12= _pix[3]
        LDRB    r4, [r0, -r1]           ; r4 = _pix[1]
        LDRB    r5, [r0]                ; r5 = _pix[2]
        SUB     r3, r3, r12             ; r3 = _pix[0]-_pix[3]
        ADD     r3, r3, #4              ; r3 = _pix[0]-_pix[3]+4
        SUB     r12,r5, r4              ; r12= _pix[2]-_pix[1]
        ADD     r12,r12,r12,LSL #1      ; r12= 3*(_pix[2]-_pix[1])
        ADD     r12,r12,r3              ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
        MOV     r12,r12,ASR #3          ; r12= f
        LDRSB   r12,[r2, r12]           ; r12= _bv[f], sign-extended byte
        ; Stall (2 on Xscale)
        ; Branch-free clamp to 0..255, as in loop_filter_h_arm.
        ADDS    r4, r4, r12
        CMPGT   r6, r4
        EORLT   r4, r6, r4, ASR #32
        SUBS    r5, r5, r12
        CMPGT   r6, r5
        EORLT   r5, r6, r5, ASR #32
        STRB    r4, [r0, -r1]
        STRB    r5, [r0], #1            ; store, then step to the next column
        SUBS    r14,r14,#1
        BGT     lfv_arm_lp
        SUB     r0, r0, #8              ; undo the 8 column steps
        LDMFD   r13!,{r3-r6,PC}
        ENDP

; oc_loop_filter_frag_rows_arm
;  Walks the fragments and calls loop_filter_h_arm / loop_filter_v_arm on the
;  coded ones.  r0-r3 arrive in registers; the other six arguments are read
;  from the caller's stack into r4-r9 (r12 keeps the entry stack pointer for
;  that purpose).  After the STMFD, [r13] holds the saved _ref_frame_data.
oc_loop_filter_frag_rows_arm PROC
        ; r0 = _ref_frame_data
        ; r1 = _ystride
        ; r2 = _bv
        ; r3 = _frags
        ; r4 = _fragi0
        ; r5 = _fragi0_end
        ; r6 = _fragi_top
        ; r7 = _fragi_bot
        ; r8 = _frag_buf_offs
        ; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        ADD     r2, r2, #127            ; _bv += 127
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_arm_end         ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1       if (r9<=0)
        BLE     oslffri_arm_end         ;                         bail
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags-1 (r9 was
                                        ;  already decremented above)
oslffri_arm_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_arm_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]    _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_arm_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
        BLGT    loop_filter_h_arm
        CMP     r4, r6                  ; if (_fragi0>_fragi_top)
        BLGT    loop_filter_v_arm
        CMP     r10,r11                 ; if(fragi+1 ...
; NOTE(review): this copy of the file is damaged at this point.  Everything
; between the comment on the CMP above and the AND below is missing: the rest
; of this routine (including the oslffri_arm_uncoded / oslffri_arm_end labels
; it branches to and its ENDP) and the PROC line of the routine the following
; instructions belong to.  Restore it from the upstream source; nothing has
; been guessed here.
        AND     r1, r1, #255            ; r1 = ll=r1&0xFF
; NOTE(review): the next instruction is cut by a stray line break; its shifted
; operand continues on the following physical line.
        ORR     r1, r1, r1,
LSL #8                                  ; r1 = ll in each of the low two bytes
; NOTE(review): the line above is the tail of "ORR r1, r1, r1, LSL #8", whose
; head sits at the end of the previous physical line (stray line break).  The
; routine these instructions close has lost its PROC line and opening code in
; this copy, and the original text of the two "r1 =" comments is gone too --
; restore from the upstream source.
        PKHBT   r1, r1, r1, LSL #16     ; r1 = ll in all four bytes
        STR     r1, [r0]
        MOV     PC,r14
        ENDP

; We could use the same strategy as the v filter below, but that would require
;  40 instructions to load the data and transpose it into columns and another
;  32 to write out the results at the end, plus the 52 instructions to do the
;  filtering itself.
; This is slightly less, and less code, even assuming we could have shared the
;  52 instructions in the middle with the other function.
; It executes slightly fewer instructions than the ARMv6 approach David Conrad
;  proposed for FFmpeg, but not by much:
;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
; His is a lot less code, though, because it only does two rows at once instead
;  of four.

; loop_filter_h_v6
;  Runs loop_filter_h_core_v6 twice, 4*_ystride apart, i.e. on rows 0-3 and
;  rows 4-7, with r12 preloaded with the 0x10003 multiplier the core expects.
loop_filter_h_v6 PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = int _ll
        ; preserves r0-r3; clobbers r12
        STMFD   r13!,{r4-r11,r14}
        LDR     r12,=0x10003            ; SMLAD weights <1|3> and bias
        BL      loop_filter_h_core_v6   ; rows 0-3
        ADD     r0, r0, r1, LSL #2
        BL      loop_filter_h_core_v6   ; rows 4-7
        SUB     r0, r0, r1, LSL #2      ; r0 = _pix again
        LDMFD   r13!,{r4-r11,PC}
        ENDP

; loop_filter_h_core_v6
;  Filters four rows at once.  The rows are called p, q, r, s below and the
;  four bytes loaded from _pix-2 in each row are numbered 0..3, byte 0 being
;  the lowest byte of the loaded word (this matches the surviving original
;  comments and implies little-endian loads).  Bytes 1 and 2 of every row are
;  rewritten; bytes 0 and 3 are only read.
;  The per-byte limit arithmetic matches the formulas in the comments if each
;  byte of _ll holds 255-2*L -- the code that builds _ll is not visible in
;  this copy, so treat that as an assumption to confirm.
loop_filter_h_core_v6 PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = int _ll
        ; r12= 0x10003
        ; Preserves r0-r3, r12; Clobbers r4-r11.
        LDR     r4,[r0, #-2]!           ; r4 = <p3|p2|p1|p0>    r0 = _pix-2
        ; Single issue
        LDR     r5,[r0, r1]!            ; r5 = <q3|q2|q1|q0>
        UXTB16  r6, r4, ROR #16         ; r6 = <__|p0|__|p2>
        UXTB16  r4, r4, ROR #8          ; r4 = <__|p3|__|p1>
        UXTB16  r7, r5, ROR #16         ; r7 = <__|q0|__|q2>
        UXTB16  r5, r5, ROR #8          ; r5 = <__|q3|__|q1>
        PKHBT   r8, r4, r5, LSL #16     ; r8 = <__|q1|__|p1>
        PKHBT   r9, r6, r7, LSL #16     ; r9 = <__|q2|__|p2>
        SSUB16  r6, r4, r6              ; r6 = <p3-p0|p1-p2>
        SMLAD   r6, r6, r12,r12         ; r6 = (p3-p0)+3*(p1-p2)+0x10003
        SSUB16  r7, r5, r7              ; r7 = <q3-q0|q1-q2>
        SMLAD   r7, r7, r12,r12         ; r7 = (q3-q0)+3*(q1-q2)+0x10003
        LDR     r4,[r0, r1]!            ; r4 = <r3|r2|r1|r0>
        MOV     r6, r6, ASR #3          ; r6 = p sum>>3; low byte is -R_p
        LDR     r5,[r0, r1]!            ; r5 = <s3|s2|s1|s0>
        PKHBT   r11,r6, r7, LSL #13     ; r11= <??|-R_q|??|-R_p>
                                        ;  (LSL #13 puts r7>>3 in the top half)
        UXTB16  r6, r4, ROR #16         ; r6 = <__|r0|__|r2>
        UXTB16  r11,r11                 ; r11= <__|-R_q|__|-R_p>
        UXTB16  r4, r4, ROR #8          ; r4 = <__|r3|__|r1>
        UXTB16  r7, r5, ROR #16         ; r7 = <__|s0|__|s2>
        PKHBT   r10,r6, r7, LSL #16     ; r10= <__|s2|__|r2>
        SSUB16  r6, r4, r6              ; r6 = <r3-r0|r1-r2>
        UXTB16  r5, r5, ROR #8          ; r5 = <__|s3|__|s1>
        SMLAD   r6, r6, r12,r12         ; r6 = (r3-r0)+3*(r1-r2)+0x10003
        SSUB16  r7, r5, r7              ; r7 = <s3-s0|s1-s2>
        SMLAD   r7, r7, r12,r12         ; r7 = (s3-s0)+3*(s1-s2)+0x10003
        ORR     r9, r9, r10, LSL #8     ; r9 = <s2|q2|r2|p2>
        MOV     r6, r6, ASR #3          ; r6 = r sum>>3; low byte is -R_r
        PKHBT   r10,r4, r5, LSL #16     ; r10= <__|s1|__|r1>
        PKHBT   r6, r6, r7, LSL #13     ; r6 = <??|-R_s|??|-R_r>
        ORR     r8, r8, r10, LSL #8     ; r8 = <s1|q1|r1|p1>
        UXTB16  r6, r6                  ; r6 = <__|-R_s|__|-R_r>
        MOV     r10,#0
        ORR     r6, r11,r6, LSL #8      ; r6 = <-R_s|-R_q|-R_r|-R_p>
        ; Single issue
        ; There's no min, max or abs instruction.
        ; SSUB8 and SEL will work for abs, and we can do all the rest with
        ;  unsigned saturated adds, which means the GE flags are still all
        ;  set when we're done computing lflim(abs(R_i),L).
        ; This allows us to both add and subtract, and split the results by
        ;  the original sign of R_i.
        SSUB8   r7, r10,r6              ; r7 = 0-(-R_i) = R_i, GE set per byte
                                        ;  where R_i>=0
        ; Single issue
        SEL     r7, r7, r6              ; r7 = abs(R_i)
        ; Single issue
        UQADD8  r4, r7, r2              ; r4 = 255-max(2*L-abs(R_i),0)
        ; Single issue
        UQADD8  r7, r7, r4
        ; Single issue
        UQSUB8  r7, r7, r4              ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
        ; Single issue
        UQSUB8  r4, r8, r7              ; r4 = byte1-lflim, used where R_i<0
        UQADD8  r5, r9, r7              ; r5 = byte2+lflim, used where R_i<0
        UQADD8  r8, r8, r7              ; r8 = byte1+lflim, used where R_i>=0
        UQSUB8  r9, r9, r7              ; r9 = byte2-lflim, used where R_i>=0
        SEL     r8, r8, r4              ; r8 = p1+lflim(R_i,L)
        SEL     r9, r9, r5              ; r9 = p2-lflim(R_i,L)
        ; Write the results back bottom row first; r0 walks from row s back up
        ;  to row p and ends at the incoming _pix.
        MOV     r5, r9, LSR #24         ; r5 = s2
        STRB    r5, [r0,#2]!
        MOV     r4, r8, LSR #24         ; r4 = s1
        STRB    r4, [r0,#-1]
        MOV     r5, r9, LSR #8          ; r5 = r2
        STRB    r5, [r0,-r1]!
        MOV     r4, r8, LSR #8          ; r4 = r1
        STRB    r4, [r0,#-1]
        MOV     r5, r9, LSR #16         ; r5 = q2
        STRB    r5, [r0,-r1]!
        MOV     r4, r8, LSR #16         ; r4 = q1
        STRB    r4, [r0,#-1]
        ; Single issue
        STRB    r9, [r0,-r1]!           ; p2 (low byte of r9)
        ; Single issue
        STRB    r8, [r0,#-1]            ; p1 (low byte of r8)
        MOV     PC,r14
        ENDP

; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
; This works just as well, with the following procedure for computing the
;  filter value, f:
;   u = ~UHADD8(p1,~p2);
;   v = UHADD8(~p1,p2);
;   m = v-u;
;   a = m^UHADD8(m^p0,m^~p3);
;   f = UHADD8(UHADD8(a,u),v);
;  where f = 127+R, with R in [-127,128] defined as in the spec.
; This is exactly the same amount of arithmetic as the version that uses PAVGB
;  as the basic operator.
; It executes about 2/3 the number of instructions of David Conrad's approach,
;  but requires more code, because it does all eight columns at once, instead
;  of four at a time.

; loop_filter_v_v6
;  Filters eight columns at once.  p0..p3 are the four rows (from
;  _pix-2*_ystride down to _pix+_ystride) of columns 0-3; p4..p7 are the same
;  four rows of columns 4-7.  Rows _pix-_ystride and _pix are rewritten.
;  The limit arithmetic matches the formulas in the comments if each byte of
;  _ll holds 255-2*L -- the code that builds _ll is not visible in this copy,
;  so treat that as an assumption to confirm.
loop_filter_v_v6 PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = int _ll
        ; preserves r0-r11; clobbers r12
        STMFD   r13!,{r4-r11,r14}
        LDRD    r6, [r0, -r1]!          ; r7, r6 = <p5|p1>      r0 = _pix-_ystride
        LDRD    r4, [r0, -r1]           ; r5, r4 = <p4|p0>
        LDRD    r8, [r0, r1]!           ; r9, r8 = <p6|p2>      r0 = _pix again
        MVN     r14,r6                  ; r14= ~p1
        LDRD    r10,[r0, r1]            ; r11,r10= <p7|p3>
        ; Filter the first four columns.
        MVN     r12,r8                  ; r12= ~p2
        UHADD8  r14,r14,r8              ; r14= v1=~p1+p2>>1
        UHADD8  r12,r12,r6              ; r12= p1+~p2>>1
        MVN     r10, r10                ; r10=~p3
        MVN     r12,r12                 ; r12= u1=~p1+p2+1>>1
        SSUB8   r14,r14,r12             ; r14= m1=v1-u1
        ; Single issue
        EOR     r4, r4, r14             ; r4 = m1^p0
        EOR     r10,r10,r14             ; r10= m1^~p3
        UHADD8  r4, r4, r10             ; r4 = (m1^p0)+(m1^~p3)>>1
        ; Single issue
        EOR     r4, r4, r14             ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
        SADD8   r14,r14,r12             ; r14= v1=m1+u1
        UHADD8  r4, r4, r12             ; r4 = a1+u1>>1
        MVN     r12,r9                  ; r12= ~p6
        UHADD8  r4, r4, r14             ; r4 = f1=(a1+u1>>1)+v1>>1
        ; Filter the second four columns.
        MVN     r14,r7                  ; r14= ~p5
        UHADD8  r12,r12,r7              ; r12= p5+~p6>>1
        UHADD8  r14,r14,r9              ; r14= v2=~p5+p6>>1
        MVN     r12,r12                 ; r12= u2=~p5+p6+1>>1
        MVN     r11,r11                 ; r11=~p7
        SSUB8   r10,r14,r12             ; r10= m2=v2-u2
        ; Single issue
        EOR     r5, r5, r10             ; r5 = m2^p4
        EOR     r11,r11,r10             ; r11= m2^~p7
        UHADD8  r5, r5, r11             ; r5 = (m2^p4)+(m2^~p7)>>1
        ; Single issue
        EOR     r5, r5, r10             ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
        ; Single issue
        UHADD8  r5, r5, r12             ; r5 = a2+u2>>1
        LDR     r12,=0x7F7F7F7F         ; r12 = {127}x4
        UHADD8  r5, r5, r14             ; r5 = f2=(a2+u2>>1)+v2>>1
        ; Now split f[i] by sign.
        ; There's no min or max instruction.
        ; We could use SSUB8 and SEL, but this is just as many instructions and
        ;  dual issues more (for v7 without NEON).
        ; Below, R+ = max(R_i,0) and R- = max(-R_i,0); at most one of the two
        ;  is non-zero in any byte, so adding one limited value and subtracting
        ;  the other applies lflim(R_i,L) with the right sign.
        UQSUB8  r10,r4, r12             ; r10= R+ = R_i>0?R_i:0
        UQSUB8  r4, r12,r4              ; r4 = R- = R_i<0?-R_i:0
        UQADD8  r11,r10,r2              ; r11= 255-max(2*L-R+,0)
        UQADD8  r14,r4, r2              ; r14= 255-max(2*L-R-,0)
        UQADD8  r10,r10,r11
        UQADD8  r4, r4, r14
        UQSUB8  r10,r10,r11             ; r10= min(R+,max(2*L-R+,0))
        UQSUB8  r4, r4, r14             ; r4 = min(R-,max(2*L-R-,0))
        UQSUB8  r11,r5, r12             ; r11= R+ = R_i>0?R_i:0  (columns 4-7)
        UQADD8  r6, r6, r10
        UQSUB8  r8, r8, r10
        UQSUB8  r5, r12,r5              ; r5 = R- = R_i<0?-R_i:0 (columns 4-7)
        UQSUB8  r6, r6, r4              ; r6 = p1+lflim(R_i,L)
        UQADD8  r8, r8, r4              ; r8 = p2-lflim(R_i,L)
        UQADD8  r10,r11,r2              ; r10= 255-max(2*L-R+,0)
        UQADD8  r14,r5, r2              ; r14= 255-max(2*L-R-,0)
        UQADD8  r11,r11,r10
        UQADD8  r5, r5, r14
        UQSUB8  r11,r11,r10             ; r11= min(R+,max(2*L-R+,0))
        UQSUB8  r5, r5, r14             ; r5 = min(R-,max(2*L-R-,0))
        UQADD8  r7, r7, r11
        UQSUB8  r9, r9, r11
        UQSUB8  r7, r7, r5              ; r7 = p5+lflim(R_i,L)
        STRD    r6, [r0, -r1]           ; [p5:p1] = [r7: r6]
        UQADD8  r9, r9, r5              ; r9 = p6-lflim(R_i,L)
        STRD    r8, [r0]                ; [p6:p2] = [r9: r8]
        LDMFD   r13!,{r4-r11,PC}
        ENDP

; oc_loop_filter_frag_rows_v6
;  Walks the fragments and calls loop_filter_h_v6 / loop_filter_v_v6 on the
;  coded ones.  r0-r3 arrive in registers; the other six arguments are read
;  from the caller's stack into r4-r9 (r12 keeps the entry stack pointer for
;  that purpose).  After the STMFD, [r13] holds the saved _ref_frame_data.
oc_loop_filter_frag_rows_v6 PROC
        ; r0 = _ref_frame_data
        ; r1 = _ystride
        ; r2 = _bv
        ; r3 = _frags
        ; r4 = _fragi0
        ; r5 = _fragi0_end
        ; r6 = _fragi_top
        ; r7 = _fragi_bot
        ; r8 = _frag_buf_offs
        ; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        LDR     r2, [r2]                ; ll = *(int *)_bv
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_v6_end          ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1       if (r9<=0)
        BLE     oslffri_v6_end          ;                         bail
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags-1 (r9 was
                                        ;  already decremented above)
oslffri_v6_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_v6_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]    _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_v6_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
        BLGT    loop_filter_h_v6
        CMP     r4, r6                  ; if (fragi0>_fragi_top)
        BLGT    loop_filter_v_v6
        CMP     r10,r11                 ; if(fragi+1>3 1,4
; NOTE(review): this copy of the file is damaged at this point.  The comment
; on the CMP above is two unrelated comment fragments fused together;
; everything that stood between them is missing: the rest of this routine
; (including the oslffri_v6_uncoded / oslffri_v6_end labels it branches to and
; its ENDP), whatever followed it, and the PROC line and opening instructions
; of the NEON routine that the code below belongs to.  Restore it from the
; upstream source; nothing has been guessed here.
        ADD     r12,r12,r1, LSL #2
        ; We want to do
        ; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
        ;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
        ; NOTE(review): the next comment line is cut by a stray line break and
        ;  continues on the following physical line.
        ;   = ((f >= 0) ?
MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0))) michael@0: ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0))) michael@0: ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0))) michael@0: ; So we've reduced the left and right hand terms to be the same, except michael@0: ; for a negation. michael@0: ; Stall x3 michael@0: VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4 michael@0: PLD [r12,-r1] michael@0: VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3 michael@0: PLD [r12] michael@0: VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4 michael@0: PLD [r12,r1] michael@0: VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3 michael@0: PLD [r12,r1,LSL #1] michael@0: VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4 michael@0: ADD r12,r12,r1, LSL #2 michael@0: ; Now we need to correct for the sign of f. michael@0: ; For negative elements of Q0, we want to subtract the appropriate michael@0: ; element of Q9. For positive elements we want to add them. No NEON michael@0: ; instruction exists to do this, so we need to negate the negative michael@0: ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b michael@0: VADD.S16 Q9, Q9, Q0 ; 1,3 michael@0: PLD [r12,-r1] michael@0: VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3 michael@0: ; Bah. No VRSBW.U8 michael@0: ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.) 
; NOTE(review): the instructions down to the first ENDP below are the tail of
; a NEON routine whose PROC line and opening instructions are missing from this
; copy of the file (the calls further down suggest it is loop_filter_h_neon --
; confirm against the upstream source).
        VADDW.U8 Q2, Q9, D4     ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
        VSUB.S16 Q1, Q1, Q9     ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
        VQMOVUN.S16 D4, Q2      ; D4 = TTPPLLHHDD995511                 1,1
        VQMOVUN.S16 D2, Q1      ; D2 = UUQQMMIIEEAA6622                 1,1
        SUB     r12,r0, #1
        ; Interleave the two result columns so each 16-bit lane is one row's
        ;  pair of output bytes.
        VTRN.8  D4, D2          ; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655 1,1
        ; Store one byte pair per row at _pix-1, stepping r12 by _ystride.
        VST1.16 {D4[0]}, [r12], r1
        VST1.16 {D2[0]}, [r12], r1
        VST1.16 {D4[1]}, [r12], r1
        VST1.16 {D2[1]}, [r12], r1
        VST1.16 {D4[2]}, [r12], r1
        VST1.16 {D2[2]}, [r12], r1
        VST1.16 {D4[3]}, [r12], r1
        VST1.16 {D2[3]}, [r12], r1
        MOV     PC,r14
        ENDP

; loop_filter_v_neon
;  Loads the four 8-byte rows _pix-2*_ystride .. _pix+_ystride (rows 0..3 in
;  the comments), computes f = ([0-3]+3*[2-1]+4)>>3 per column, limits it
;  against the 2*L values the caller left in Q15, and rewrites rows 1 and 2.
;  Every load and store carries a 64-bit alignment qualifier (@64).
loop_filter_v_neon PROC
        ; r0 = unsigned char *_pix
        ; r1 = int _ystride
        ; r2 = int *_bv  (not read here)
        ; preserves r0-r3; clobbers r12, Q0-Q3, Q8-Q10
        ; We assume Q15= 2*L in U16s
        ;                    My best guesses at cycle counts (and latency)--vvv
        SUB     r12,r0, r1, LSL #1
        VLD1.64 {D0}, [r12@64], r1              ; D0 = SSOOKKGGCC884400 2,1
        VLD1.64 {D2}, [r12@64], r1              ; D2 = TTPPLLHHDD995511 2,1
        VLD1.64 {D4}, [r12@64], r1              ; D4 = UUQQMMIIEEAA6622 2,1
        VLD1.64 {D6}, [r12@64]                  ; D6 = VVRRNNJJFFBB7733 2,1
        VSUBL.U8 Q8, D4, D2     ; Q8 = 22 - 11 in S16s                  1,3
        VSUBL.U8 Q0, D0, D6     ; Q0 = 00 - 33 in S16s                  1,3
        ADD     r12, #8                 ; r12 = row 3 + 8, used for the PLDs
        VADD.S16 Q0, Q0, Q8     ;                                       1,3
        PLD     [r12]
        VADD.S16 Q0, Q0, Q8     ;                                       1,3
        PLD     [r12,r1]
        VADD.S16 Q0, Q0, Q8     ; Q0 = [0-3]+3*[2-1]                    1,3
        SUB     r12, r0, r1             ; r12 = row 1, the first row stored
        VRSHR.S16 Q0, Q0, #3    ; Q0 = f = ([0-3]+3*[2-1]+4)>>3         1,4
        ; We want to do
        ; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
        ;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
        ;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
        ;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
        ;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
        ; So we've reduced the left and right hand terms to be the same, except
        ; for a negation.
        ; Stall x3
        VABS.S16 Q9, Q0         ; Q9 = |f| in U16s                      1,4
        VSHR.S16 Q0, Q0, #15    ; Q0 = -1 or 0 according to sign        1,3
        ; Stall x2
        VQSUB.U16 Q10,Q15,Q9    ; Q10= MAX(2L-|f|,0) in U16s            1,4
        VMOVL.U8 Q2, D4    ; Q2 = __UU__QQ__MM__II__EE__AA__66__22      2,3
        ; Stall x2
        VMIN.U16 Q9, Q10,Q9     ; Q9 = MIN(|f|,MAX(2L-|f|))             1,4
        ; Now we need to correct for the sign of f.
        ; For negative elements of Q0, we want to subtract the appropriate
        ; element of Q9. For positive elements we want to add them. No NEON
        ; instruction exists to do this, so we negate the elements whose f was
        ; negative and then just add.  With Q0 = -1 or 0 per lane:
        ;   (b + -1) ^ -1 = ~(b-1) = -b      and      (b + 0) ^ 0 = b
        ; Stall x3
        VADD.S16 Q9, Q9, Q0     ;                                       1,3
        ; Stall x2
        VEOR.S16 Q9, Q9, Q0     ; Q9 = real value of f                  1,3
        ; Bah. No VRSBW.U8
        ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
        VADDW.U8 Q1, Q9, D2     ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
        VSUB.S16 Q2, Q2, Q9     ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
        VQMOVUN.S16 D2, Q1      ; D2 = TTPPLLHHDD995511                 1,1
        VQMOVUN.S16 D4, Q2      ; D4 = UUQQMMIIEEAA6622                 1,1
        VST1.64 {D2}, [r12@64], r1
        VST1.64 {D4}, [r12@64], r1
        MOV     PC,r14
        ENDP

; oc_loop_filter_frag_rows_neon
;  Walks the fragments and calls loop_filter_h_neon / loop_filter_v_neon on the
;  coded ones.  r0-r3 arrive in registers; the other six arguments are read
;  from the caller's stack into r4-r9 (r12 keeps the entry stack pointer for
;  that purpose).  After the STMFD, [r13] holds the saved _ref_frame_data.
;  Q15 is loaded once from _bv (with a 128-bit alignment qualifier) and left
;  in place for the two filter routines.
oc_loop_filter_frag_rows_neon PROC
        ; r0 = _ref_frame_data
        ; r1 = _ystride
        ; r2 = _bv
        ; r3 = _frags
        ; r4 = _fragi0
        ; r5 = _fragi0_end
        ; r6 = _fragi_top
        ; r7 = _fragi_bot
        ; r8 = _frag_buf_offs
        ; r9 = _nhfrags
        MOV     r12,r13
        STMFD   r13!,{r0,r4-r11,r14}
        LDMFD   r12,{r4-r9}
        CMP     r4, r5                  ; if(_fragi0>=_fragi0_end)
        BGE     oslffri_neon_end        ;   bail
        SUBS    r9, r9, #1              ; r9 = _nhfrags-1       if (r9<=0)
        BLE     oslffri_neon_end        ;                         bail
        VLD1.64 {D30,D31}, [r2@128]     ; Q15= 2L in U16s
        ADD     r3, r3, r4, LSL #2      ; r3 = &_frags[fragi]
        ADD     r8, r8, r4, LSL #2      ; r8 = &_frag_buf_offs[fragi]
        SUB     r7, r7, r9              ; _fragi_bot -= _nhfrags-1 (r9 was
                                        ;  already decremented above)
oslffri_neon_lp1
        MOV     r10,r4                  ; r10= fragi = _fragi0
        ADD     r11,r4, r9              ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_neon_lp2
        LDR     r14,[r3], #4            ; r14= _frags[fragi]    _frags++
        LDR     r0, [r13]               ; r0 = _ref_frame_data
        LDR     r12,[r8], #4            ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
        TST     r14,#OC_FRAG_CODED_FLAG
        BEQ     oslffri_neon_uncoded
        CMP     r10,r4                  ; if (fragi>_fragi0)
        ADD     r0, r0, r12             ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
        BLGT    loop_filter_h_neon
        CMP     r4, r6                  ; if (_fragi0>_fragi_top)
        BLGT    loop_filter_v_neon
        CMP     r10,r11                 ; if(fragi+1