;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

    AREA |.text|, CODE, READONLY

; Explicitly specifying alignment here because some versions of
; gas don't align code correctly. See
; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
    ALIGN

    GET armopts.s

    EXPORT oc_loop_filter_frag_rows_arm

; Which bit this is depends on the order of packing within a bitfield.
; Hopefully that doesn't change among any of the relevant compilers.
OC_FRAG_CODED_FLAG * 1

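; For reference, a rough C sketch of the 1D filter that every routine in
; this file applies across a single 8-pixel edge (based on the Theora
; specification; the names here are illustrative, not the libtheora source):
;   R = (pix[0]-pix[3]+3*(pix[2]-pix[1])+4)>>3;
;   pix[1] = clamp255(pix[1]+lflim(R,L));
;   pix[2] = clamp255(pix[2]-lflim(R,L));
; where lflim() tapers the correction off to zero as |R| approaches 2*L:
;   static int lflim(int _r,int _l){
;     int m;
;     m = 2*_l-abs(_r);
;     if(m<0)m = 0;
;     if(m>abs(_r))m = abs(_r);
;     return _r<0?-m:m;
;   }
; The ARMv4 routines below read lflim(R,L) from the signed byte table passed
; in _bv (the row loop adds the +127 bias before calling them); the ARMv6 and
; NEON routines instead compute it directly from a packed 2*L (or 255-2*L)
; value.
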
; Vanilla ARM v4 version
loop_filter_h_arm PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int *_bv
    ; preserves r0-r3
    STMFD r13!,{r3-r6,r14}
    MOV r14,#8
    MOV r6, #255
lfh_arm_lp
    LDRB r3, [r0, #-2] ; r3 = _pix[0]
    LDRB r12,[r0, #1] ; r12= _pix[3]
    LDRB r4, [r0, #-1] ; r4 = _pix[1]
    LDRB r5, [r0] ; r5 = _pix[2]
    SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
    ADD r3, r3, #4
    SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
    ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
    ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
    MOV r12,r12,ASR #3
    LDRSB r12,[r2, r12]
    ; Stall (2 on Xscale)
    ADDS r4, r4, r12
    CMPGT r6, r4
    EORLT r4, r6, r4, ASR #32
    SUBS r5, r5, r12
    CMPGT r6, r5
    EORLT r5, r6, r5, ASR #32
    STRB r4, [r0, #-1]
    STRB r5, [r0], r1
    SUBS r14,r14,#1
    BGT lfh_arm_lp
    SUB r0, r0, r1, LSL #3
    LDMFD r13!,{r3-r6,PC}
    ENDP
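
; The ADDS/SUBS, CMPGT, EORLT triples above (and in loop_filter_v_arm below)
; clamp the updated pixel into [0,255] without branching: if the add/subtract
; went negative, x ASR #32 is all ones and 255^~0 leaves 0 in the low byte;
; if the result exceeded 255, the CMPGT sets LT and 255^0 gives 255.
; A hedged C equivalent (clamp255 is an illustrative name, not a libtheora
; function):
;   static int clamp255(int _x){
;     return _x<0?0:_x>255?255:_x;
;   }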

loop_filter_v_arm PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int *_bv
    ; preserves r0-r3
    STMFD r13!,{r3-r6,r14}
    MOV r14,#8
    MOV r6, #255
lfv_arm_lp
    LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0]
    LDRB r12,[r0, r1] ; r12= _pix[3]
    LDRB r4, [r0, -r1] ; r4 = _pix[1]
    LDRB r5, [r0] ; r5 = _pix[2]
    SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
    ADD r3, r3, #4
    SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
    ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
    ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
    MOV r12,r12,ASR #3
    LDRSB r12,[r2, r12]
    ; Stall (2 on Xscale)
    ADDS r4, r4, r12
    CMPGT r6, r4
    EORLT r4, r6, r4, ASR #32
    SUBS r5, r5, r12
    CMPGT r6, r5
    EORLT r5, r6, r5, ASR #32
    STRB r4, [r0, -r1]
    STRB r5, [r0], #1
    SUBS r14,r14,#1
    BGT lfv_arm_lp
    SUB r0, r0, #8
    LDMFD r13!,{r3-r6,PC}
    ENDP

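; For orientation, the fragment-row walk below follows this rough C outline
; (paraphrased from the generic C row loop; not the exact libtheora source):
;   while(_fragi0<_fragi0_end){
;     fragi_end=_fragi0+_nhfrags;
;     for(fragi=_fragi0;fragi<fragi_end;fragi++){
;       if(_frags[fragi]&OC_FRAG_CODED_FLAG){
;         ref=_ref_frame_data+_frag_buf_offs[fragi];
;         if(fragi>_fragi0)loop_filter_h(ref,_ystride,_bv);
;         if(_fragi0>_fragi_top)loop_filter_v(ref,_ystride,_bv);
;         if(fragi+1<fragi_end&&!(_frags[fragi+1]&OC_FRAG_CODED_FLAG))
;           loop_filter_h(ref+8,_ystride,_bv);
;         if(fragi+_nhfrags<_fragi_bot&&!(_frags[fragi+_nhfrags]&OC_FRAG_CODED_FLAG))
;           loop_filter_v(ref+(_ystride<<3),_ystride,_bv);
;       }
;     }
;     _fragi0+=_nhfrags;
;   }
; The assembly pre-biases _fragi_bot, _frags and _frag_buf_offs so each test
; above costs only a compare against a live register.
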
oc_loop_filter_frag_rows_arm PROC
    ; r0 = _ref_frame_data
    ; r1 = _ystride
    ; r2 = _bv
    ; r3 = _frags
    ; r4 = _fragi0
    ; r5 = _fragi0_end
    ; r6 = _fragi_top
    ; r7 = _fragi_bot
    ; r8 = _frag_buf_offs
    ; r9 = _nhfrags
    MOV r12,r13
    STMFD r13!,{r0,r4-r11,r14}
    LDMFD r12,{r4-r9}
    ADD r2, r2, #127 ; _bv += 127
    CMP r4, r5 ; if(_fragi0>=_fragi0_end)
    BGE oslffri_arm_end ; bail
    SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
    BLE oslffri_arm_end ; bail
    ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
    ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
    SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
oslffri_arm_lp1
    MOV r10,r4 ; r10= fragi = _fragi0
    ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_arm_lp2
    LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
    LDR r0, [r13] ; r0 = _ref_frame_data
    LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
    TST r14,#OC_FRAG_CODED_FLAG
    BEQ oslffri_arm_uncoded
    CMP r10,r4 ; if (fragi>_fragi0)
    ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
    BLGT loop_filter_h_arm
    CMP r4, r6 ; if (_fragi0>_fragi_top)
    BLGT loop_filter_v_arm
    CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
    LDRLT r12,[r3] ; r12 = _frags[fragi+1]
    ADD r0, r0, #8
    ADD r10,r10,#1 ; r10 = fragi+1;
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
    BLLT loop_filter_h_arm
    CMP r10,r7 ; if (fragi<_fragi_bot)
    LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
    SUB r0, r0, #8
    ADD r0, r0, r1, LSL #3
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG
    BLLT loop_filter_v_arm
    CMP r10,r11 ; while(fragi<=fragi_end-1)
    BLE oslffri_arm_lp2
    MOV r4, r10 ; r4 = fragi0 += _nhfrags
    CMP r4, r5
    BLT oslffri_arm_lp1
oslffri_arm_end
    LDMFD r13!,{r0,r4-r11,PC}
oslffri_arm_uncoded
    ADD r10,r10,#1
    CMP r10,r11
    BLE oslffri_arm_lp2
    MOV r4, r10 ; r4 = _fragi0 += _nhfrags
    CMP r4, r5
    BLT oslffri_arm_lp1
    LDMFD r13!,{r0,r4-r11,PC}
    ENDP

    [ OC_ARM_ASM_MEDIA
    EXPORT oc_loop_filter_init_v6
    EXPORT oc_loop_filter_frag_rows_v6

oc_loop_filter_init_v6 PROC
    ; r0 = _bv
    ; r1 = _flimit (=L from the spec)
    MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
    AND r1, r1, #255 ; r1 = ll=r1&0xFF
    ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
    PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
    STR r1, [r0]
    MOV PC,r14
    ENDP
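
; A hedged C model of how the packed ll value stored above gets used (the
; helper names are illustrative, not library functions): since each byte of
; ll is 255-2*L, one saturating add plus a saturating add/subtract pair per
; byte produces the magnitude of lflim(R,L):
;   static unsigned char uqadd8(unsigned _a,unsigned _b){
;     return (unsigned char)(_a+_b>255?255:_a+_b);
;   }
;   static unsigned char uqsub8(unsigned _a,unsigned _b){
;     return (unsigned char)(_a<_b?0:_a-_b);
;   }
;   static unsigned char lflim_mag(unsigned char _absr,unsigned char _ll){
;     unsigned char t;
;     t = uqadd8(_absr,_ll);            /* 255-max(2*L-|R|,0) */
;     return uqsub8(uqadd8(_absr,t),t); /* min(|R|,max(2*L-|R|,0)) */
;   }
; The sign of R is handled separately (SSUB8/SEL in the h filter, the
; split-by-sign UQSUB8 pairs in the v filter).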

; We could use the same strategy as the v filter below, but that would require
; 40 instructions to load the data and transpose it into columns and another
; 32 to write out the results at the end, plus the 52 instructions to do the
; filtering itself.
; This is slightly less, and less code, even assuming we could have shared the
; 52 instructions in the middle with the other function.
; It executes slightly fewer instructions than the ARMv6 approach David Conrad
; proposed for FFmpeg, but not by much:
; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
; His is a lot less code, though, because it only does two rows at once instead
; of four.
loop_filter_h_v6 PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int _ll
    ; preserves r0-r3
    STMFD r13!,{r4-r11,r14}
    LDR r12,=0x10003
    BL loop_filter_h_core_v6
    ADD r0, r0, r1, LSL #2
    BL loop_filter_h_core_v6
    SUB r0, r0, r1, LSL #2
    LDMFD r13!,{r4-r11,PC}
    ENDP

loop_filter_h_core_v6 PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int _ll
    ; r12= 0x10003
    ; Preserves r0-r3, r12; Clobbers r4-r11.
    LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
    ; Single issue
    LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
    UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
    UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
    UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
    UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
    PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
    PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
    SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
    SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
    SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
    SMLAD r7, r7, r12,r12 ; r7 = <????|(q0-q3)+3*(q2-q1)+4>
    LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
    MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
    LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
    PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
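    ; (Added note: the LSL #13 in the PKHBT above folds the final >>3 into the
    ;  pack, since the top halfword receives bits [18:3] of r7; the q-row value
    ;  thus arrives already shifted, matching the explicit ASR #3 used for the
    ;  p row above.)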
    UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
    UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
    UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
    UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
    PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
    SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
    UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
    SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r2-r1)+3>
    SSUB16 r7, r5, r7 ; r7 = <s3-s0|s1-s2>
    SMLAD r7, r7, r12,r12 ; r7 = <????|(s0-s3)+3*(s2-s1)+4>
    ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
    MOV r6, r6, ASR #3 ; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
    PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
    PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
    ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
    UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
    MOV r10,#0
    ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
    ; Single issue
    ; There's no min, max or abs instruction.
    ; SSUB8 and SEL will work for abs, and we can do all the rest with
    ; unsigned saturated adds, which means the GE flags are still all
    ; set when we're done computing lflim(abs(R_i),L).
    ; This allows us to both add and subtract, and split the results by
    ; the original sign of R_i.
    SSUB8 r7, r10,r6
    ; Single issue
    SEL r7, r7, r6 ; r7 = abs(R_i)
    ; Single issue
    UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
    ; Single issue
    UQADD8 r7, r7, r4
    ; Single issue
    UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
    ; Single issue
    UQSUB8 r4, r8, r7
    UQADD8 r5, r9, r7
    UQADD8 r8, r8, r7
    UQSUB8 r9, r9, r7
    SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
    SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
    MOV r5, r9, LSR #24 ; r5 = s2
    STRB r5, [r0,#2]!
    MOV r4, r8, LSR #24 ; r4 = s1
    STRB r4, [r0,#-1]
    MOV r5, r9, LSR #8 ; r5 = r2
    STRB r5, [r0,-r1]!
    MOV r4, r8, LSR #8 ; r4 = r1
    STRB r4, [r0,#-1]
    MOV r5, r9, LSR #16 ; r5 = q2
    STRB r5, [r0,-r1]!
    MOV r4, r8, LSR #16 ; r4 = q1
    STRB r4, [r0,#-1]
    ; Single issue
    STRB r9, [r0,-r1]!
    ; Single issue
    STRB r8, [r0,#-1]
    MOV PC,r14
    ENDP

; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
; This works just as well, with the following procedure for computing the
; filter value, f:
;   u = ~UHADD8(p1,~p2);
;   v = UHADD8(~p1,p2);
;   m = v-u;
;   a = m^UHADD8(m^p0,m^~p3);
;   f = UHADD8(UHADD8(a,u),v);
; where f = 127+R, with R in [-127,128] defined as in the spec.
; This is exactly the same amount of arithmetic as the version that uses PAVGB
; as the basic operator.
; It executes about 2/3 the number of instructions of David Conrad's approach,
; but requires more code, because it does all eight columns at once, instead
; of four at a time.
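; (Added note: the complements in the recipe above lean on the byte identity
;  ~((~a+~b)>>1) == (a+b+1)>>1, i.e. the rounded average PAVGB computes can be
;  recovered from the truncating UHADD8 by complementing inputs and output.
;  A small hedged C model, where uhadd8 just mimics the instruction and is not
;  a library function:
;    static unsigned char uhadd8(unsigned _a,unsigned _b){
;      return (unsigned char)((_a+_b)>>1);
;    }
;  For any bytes a and b, (unsigned char)~uhadd8(255-a,255-b) == (a+b+1)>>1.)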
loop_filter_v_v6 PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int _ll
    ; preserves r0-r11
    STMFD r13!,{r4-r11,r14}
    LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
    LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
    LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
    MVN r14,r6 ; r14= ~p1
    LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
    ; Filter the first four columns.
    MVN r12,r8 ; r12= ~p2
    UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
    UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
    MVN r10, r10 ; r10=~p3
    MVN r12,r12 ; r12= u1=~p1+p2+1>>1
    SSUB8 r14,r14,r12 ; r14= m1=v1-u1
    ; Single issue
    EOR r4, r4, r14 ; r4 = m1^p0
    EOR r10,r10,r14 ; r10= m1^~p3
    UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
    ; Single issue
    EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
    SADD8 r14,r14,r12 ; r14= v1=m1+u1
    UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
    MVN r12,r9 ; r12= ~p6
    UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
    ; Filter the second four columns.
    MVN r14,r7 ; r14= ~p5
    UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
    UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
    MVN r12,r12 ; r12= u2=~p5+p6+1>>1
    MVN r11,r11 ; r11=~p7
    SSUB8 r10,r14,r12 ; r10= m2=v2-u2
    ; Single issue
    EOR r5, r5, r10 ; r5 = m2^p4
    EOR r11,r11,r10 ; r11= m2^~p7
    UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
    ; Single issue
    EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
    ; Single issue
    UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
    LDR r12,=0x7F7F7F7F ; r12 = {127}x4
    UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
    ; Now split f[i] by sign.
    ; There's no min or max instruction.
    ; We could use SSUB8 and SEL, but this is just as many instructions and
    ; dual issues more (for v7 without NEON).
    UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
    UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
    UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0)
    UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
    UQADD8 r10,r10,r11
    UQADD8 r4, r4, r14
    UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
    UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
    UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
    UQADD8 r6, r6, r10
    UQSUB8 r8, r8, r10
    UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
    UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
    UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
    UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0)
    UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
    UQADD8 r11,r11,r10
    UQADD8 r5, r5, r14
    UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
    UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
    UQADD8 r7, r7, r11
    UQSUB8 r9, r9, r11
    UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
    STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
    UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
    STRD r8, [r0] ; [p6:p2] = [r9: r8]
    LDMFD r13!,{r4-r11,PC}
    ENDP

oc_loop_filter_frag_rows_v6 PROC
    ; r0 = _ref_frame_data
    ; r1 = _ystride
    ; r2 = _bv
    ; r3 = _frags
    ; r4 = _fragi0
    ; r5 = _fragi0_end
    ; r6 = _fragi_top
    ; r7 = _fragi_bot
    ; r8 = _frag_buf_offs
    ; r9 = _nhfrags
    MOV r12,r13
    STMFD r13!,{r0,r4-r11,r14}
    LDMFD r12,{r4-r9}
    LDR r2, [r2] ; ll = *(int *)_bv
    CMP r4, r5 ; if(_fragi0>=_fragi0_end)
    BGE oslffri_v6_end ; bail
    SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
    BLE oslffri_v6_end ; bail
    ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
    ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
    SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
oslffri_v6_lp1
    MOV r10,r4 ; r10= fragi = _fragi0
    ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_v6_lp2
    LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
    LDR r0, [r13] ; r0 = _ref_frame_data
    LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
    TST r14,#OC_FRAG_CODED_FLAG
    BEQ oslffri_v6_uncoded
    CMP r10,r4 ; if (fragi>_fragi0)
    ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
    BLGT loop_filter_h_v6
    CMP r4, r6 ; if (fragi0>_fragi_top)
    BLGT loop_filter_v_v6
    CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
    LDRLT r12,[r3] ; r12 = _frags[fragi+1]
    ADD r0, r0, #8
    ADD r10,r10,#1 ; r10 = fragi+1;
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
    BLLT loop_filter_h_v6
    CMP r10,r7 ; if (fragi<_fragi_bot)
    LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
    SUB r0, r0, #8
    ADD r0, r0, r1, LSL #3
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG
    BLLT loop_filter_v_v6
    CMP r10,r11 ; while(fragi<=fragi_end-1)
    BLE oslffri_v6_lp2
    MOV r4, r10 ; r4 = fragi0 += nhfrags
    CMP r4, r5
    BLT oslffri_v6_lp1
oslffri_v6_end
    LDMFD r13!,{r0,r4-r11,PC}
oslffri_v6_uncoded
    ADD r10,r10,#1
    CMP r10,r11
    BLE oslffri_v6_lp2
    MOV r4, r10 ; r4 = fragi0 += nhfrags
    CMP r4, r5
    BLT oslffri_v6_lp1
    LDMFD r13!,{r0,r4-r11,PC}
    ENDP
    ]

    [ OC_ARM_ASM_NEON
    EXPORT oc_loop_filter_init_neon
    EXPORT oc_loop_filter_frag_rows_neon

oc_loop_filter_init_neon PROC
    ; r0 = _bv
    ; r1 = _flimit (=L from the spec)
    MOV r1, r1, LSL #1 ; r1 = 2*L
    VDUP.S16 Q15, r1 ; Q15= 2L in U16s
    VST1.64 {D30,D31}, [r0@128]
    MOV PC,r14
    ENDP
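
; A hedged C-with-intrinsics sketch of the store above (illustrative names,
; not the libtheora prototype):
;   #include <arm_neon.h>
;   void loop_filter_init_neon_c(unsigned char *_bv,int _flimit){
;     /* Write 2*L into all eight u16 lanes of the 16-byte _bv buffer. */
;     vst1q_u16((uint16_t *)_bv,vdupq_n_u16((uint16_t)(_flimit<<1)));
;   }
; oc_loop_filter_frag_rows_neon reloads this into Q15 with VLD1.64 below.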

loop_filter_h_neon PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int *_bv
    ; preserves r0-r3
    ; We assume Q15= 2*L in U16s
    ; My best guesses at cycle counts (and latency)--vvv
    SUB r12,r0, #2
    ; Doing a 2-element structure load saves doing two VTRN's below, at the
    ; cost of using two more slower single-lane loads vs. the faster
    ; all-lane loads.
    ; It's less code this way, though, and benches a hair faster, but it
    ; leaves D2 and D4 swapped.
    VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
    ; D2 = ____________3322
    VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
    ; D6 = ____________7766
    VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
    ; D2 = ________BBAA3322
    VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
    ; D6 = ________FFEE7766
    VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1
    ; D2 = ____JJIIBBAA3322
    VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1
    ; D6 = ____NNMMFFEE7766
    VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1
    ; D2 = RRQQJJIIBBAA3322
    VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1
    ; D6 = VVUUNNMMFFEE7766
    VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
    VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
    VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
    VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
    ADD r12,r0, #8
    VADD.S16 Q0, Q0, Q8 ; 1,3
    PLD [r12]
    VADD.S16 Q0, Q0, Q8 ; 1,3
    PLD [r12,r1]
    VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
    PLD [r12,r1, LSL #1]
    VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
    ADD r12,r12,r1, LSL #2
    ; We want to do
    ;   f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
    ;     = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
    ; So we've reduced the left and right hand terms to be the same, except
    ; for a negation.
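    ; (A quick numeric check of that reduction, added for illustration: with
    ;  2L=4 and f=-6, CLAMP(MIN(-4+6,0),-6,MAX(4+6,0)) = CLAMP(0,-6,10) = 0 and
    ;  -MIN(|-6|,MAX(4-6,0)) = -MIN(6,0) = 0; with f=1,
    ;  CLAMP(MIN(-5,0),1,MAX(3,0)) = 1 and MIN(1,MAX(3,0)) = 1.)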
    ; Stall x3
    VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
    PLD [r12,-r1]
    VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
    PLD [r12]
    VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
    PLD [r12,r1]
    VMOVL.U8 Q1, D2 ; Q1 = __UU__QQ__MM__II__EE__AA__66__22 2,3
    PLD [r12,r1,LSL #1]
    VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
    ADD r12,r12,r1, LSL #2
    ; Now we need to correct for the sign of f.
    ; For negative elements of Q0, we want to subtract the appropriate
    ; element of Q9. For positive elements we want to add them. No NEON
    ; instruction exists to do this, so we need to negate the negative
    ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
    VADD.S16 Q9, Q9, Q0 ; 1,3
    PLD [r12,-r1]
    VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
    ; Bah. No VRSBW.U8
    ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
    VADDW.U8 Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
    VSUB.S16 Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
    VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
    VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
    SUB r12,r0, #1
    VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655 1,1
    VST1.16 {D4[0]}, [r12], r1
    VST1.16 {D2[0]}, [r12], r1
    VST1.16 {D4[1]}, [r12], r1
    VST1.16 {D2[1]}, [r12], r1
    VST1.16 {D4[2]}, [r12], r1
    VST1.16 {D2[2]}, [r12], r1
    VST1.16 {D4[3]}, [r12], r1
    VST1.16 {D2[3]}, [r12], r1
    MOV PC,r14
    ENDP

loop_filter_v_neon PROC
    ; r0 = unsigned char *_pix
    ; r1 = int _ystride
    ; r2 = int *_bv
    ; preserves r0-r3
    ; We assume Q15= 2*L in U16s
    ; My best guesses at cycle counts (and latency)--vvv
    SUB r12,r0, r1, LSL #1
    VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
    VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
    VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
    VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
    VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
    VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
    ADD r12, #8
    VADD.S16 Q0, Q0, Q8 ; 1,3
    PLD [r12]
    VADD.S16 Q0, Q0, Q8 ; 1,3
    PLD [r12,r1]
    VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
    SUB r12, r0, r1
    VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
    ; We want to do
    ;   f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
    ;     = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
    ;     = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
    ; So we've reduced the left and right hand terms to be the same, except
    ; for a negation.
    ; Stall x3
    VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
    VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
    ; Stall x2
    VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
    VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
    ; Stall x2
    VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
    ; Now we need to correct for the sign of f.
    ; For negative elements of Q0, we want to subtract the appropriate
    ; element of Q9. For positive elements we want to add them. No NEON
    ; instruction exists to do this, so we need to negate the negative
    ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
    ; Stall x3
    VADD.S16 Q9, Q9, Q0 ; 1,3
    ; Stall x2
    VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
    ; Bah. No VRSBW.U8
    ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
    VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
    VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
    VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
    VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
    VST1.64 {D2}, [r12@64], r1
    VST1.64 {D4}, [r12@64], r1
    MOV PC,r14
    ENDP

oc_loop_filter_frag_rows_neon PROC
    ; r0 = _ref_frame_data
    ; r1 = _ystride
    ; r2 = _bv
    ; r3 = _frags
    ; r4 = _fragi0
    ; r5 = _fragi0_end
    ; r6 = _fragi_top
    ; r7 = _fragi_bot
    ; r8 = _frag_buf_offs
    ; r9 = _nhfrags
    MOV r12,r13
    STMFD r13!,{r0,r4-r11,r14}
    LDMFD r12,{r4-r9}
    CMP r4, r5 ; if(_fragi0>=_fragi0_end)
    BGE oslffri_neon_end ; bail
    SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
    BLE oslffri_neon_end ; bail
    VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
    ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
    ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
    SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
oslffri_neon_lp1
    MOV r10,r4 ; r10= fragi = _fragi0
    ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
oslffri_neon_lp2
    LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
    LDR r0, [r13] ; r0 = _ref_frame_data
    LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
    TST r14,#OC_FRAG_CODED_FLAG
    BEQ oslffri_neon_uncoded
    CMP r10,r4 ; if (fragi>_fragi0)
    ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
    BLGT loop_filter_h_neon
    CMP r4, r6 ; if (_fragi0>_fragi_top)
    BLGT loop_filter_v_neon
    CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
    LDRLT r12,[r3] ; r12 = _frags[fragi+1]
    ADD r0, r0, #8
    ADD r10,r10,#1 ; r10 = fragi+1;
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
    BLLT loop_filter_h_neon
    CMP r10,r7 ; if (fragi<_fragi_bot)
    LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
    SUB r0, r0, #8
    ADD r0, r0, r1, LSL #3
    ANDLT r12,r12,#OC_FRAG_CODED_FLAG
    CMPLT r12,#OC_FRAG_CODED_FLAG
    BLLT loop_filter_v_neon
    CMP r10,r11 ; while(fragi<=fragi_end-1)
    BLE oslffri_neon_lp2
    MOV r4, r10 ; r4 = _fragi0 += _nhfrags
    CMP r4, r5
    BLT oslffri_neon_lp1
oslffri_neon_end
    LDMFD r13!,{r0,r4-r11,PC}
oslffri_neon_uncoded
    ADD r10,r10,#1
    CMP r10,r11
    BLE oslffri_neon_lp2
    MOV r4, r10 ; r4 = _fragi0 += _nhfrags
    CMP r4, r5
    BLT oslffri_neon_lp1
    LDMFD r13!,{r0,r4-r11,PC}
    ENDP
    ]

    END