/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

    .arch armv7-a
    .fpu neon
/* Allow building on targets that do not support NEON, and force the object
 * file architecture so that this file does not bump the architecture
 * requirement of the final binary. */
    .object_arch armv4t
    .text
    .align

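@ Per-dither-row constant tables for the conversion below. A sketch of the
@ layout, as read back by the VLD1/VDUP in the function that follows:
@  - three pairs of .short values: the (even pixel, odd pixel) biases for R,
@    G, and B, with this row's ordered-dither offset folded in. The base
@    biases fold in the Y'-16 and chroma-128 offsets, e.g.
@    bias_R = 74*-16 + 102*-128 = -14240.
@  - four .byte values mirroring the conversion coefficients the loop
@    materializes with VMOV.I8: 102 (Cr->R), 25 (Cb->G), 52 (Cr->G), and
@    129 (Cb->B), i.e. the BT.601 coefficients 409, 100, 208, 516 over 4.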
    .balign 64
YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
    .short -14240
    .short -14240+384
    .short 8672
    .short 8672+192
    .short -17696
    .short -17696+384
    .byte 102
    .byte 25
    .byte 52
    .byte 129
YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
    .short -14240+128
    .short -14240+256
    .short 8672+64
    .short 8672+128
    .short -17696+128
    .short -17696+256
    .byte 102
    .byte 25
    .byte 52
    .byte 129
YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
    .short -14240+256
    .short -14240+128
    .short 8672+128
    .short 8672+64
    .short -17696+256
    .short -17696+128
    .byte 102
    .byte 25
    .byte 52
    .byte 129
YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
    .short -14240+384
    .short -14240
    .short 8672+192
    .short 8672
    .short -17696+384
    .short -17696
    .byte 102
    .byte 25
    .byte 52
    .byte 129

@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
@
@ ctx = {
@   uint16_t *rgb_row;       /*r0*/
@   const uint8_t *y_row;    /*r1*/
@   const uint8_t *u_row;    /*r2*/
@   const uint8_t *v_row;    /*r3*/
@   int y_yweight;           /*r4*/
@   int y_pitch;             /*r5*/
@   int width;               /*r6*/
@   int source_x0_q16;       /*r7*/
@   int source_dx_q16;       /*r8*/
@   int source_uv_xoffs_q16; /*r9*/
@ };
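@ (The /*rN*/ annotations above give the register each field occupies after
@ the LDMIA below; the *_q16 fields are 16.16 fixed point.)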
    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
    .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
    .balign 64
    .fnstart
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
    STMFD r13!,{r4-r9,r14}    @ 8 words.
    ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
    VPUSH {Q4-Q7}             @ 16 words.
    ADD r14,r14,r1, LSL #4    @ Select the dither table to use
    LDMIA r0, {r0-r9}
    @ Set up image index registers.
    ADD r12,r8, r8
    VMOV.I32 D16,#0           @ Q8 = < 2| 2| 0| 0>*source_dx_q16
    VDUP.32 D17,r12
    ADD r12,r12,r12
    VTRN.32 D16,D17           @ Q8 = < 2| 0| 2| 0>*source_dx_q16
    VDUP.32 D19,r12           @ Q9 = < 4| 4| ?| ?>*source_dx_q16
    ADD r12,r12,r12
    VDUP.32 Q0, r7            @ Q0 = < 1| 1| 1| 1>*source_x0_q16
    VADD.I32 D17,D17,D19      @ Q8 = < 6| 4| 2| 0>*source_dx_q16
    CMP r8, #0                @ If source_dx_q16 is negative...
    VDUP.32 Q9, r12           @ Q9 = < 8| 8| 8| 8>*source_dx_q16
    ADDLT r7, r7, r8, LSL #4  @ Make r7 point to the end of the block
    VADD.I32 Q0, Q0, Q8       @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
    SUBLT r7, r7, r8          @ (i.e., the lowest address we'll use)
    VADD.I32 Q1, Q0, Q9       @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
    VDUP.I32 Q9, r8           @ Q9 = < 1| 1| 1| 1>*source_dx_q16
    VADD.I32 Q2, Q0, Q9       @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
    VADD.I32 Q3, Q1, Q9       @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
    VLD1.64 {D30,D31},[r14,:128]  @ Load some constants
    VMOV.I8 D28,#52
    VMOV.I8 D29,#129
    @ The basic idea here is to do aligned loads of a block of data and then
    @ index into it using VTBL to extract the data from the source X
    @ coordinate corresponding to each destination pixel.
    @ This is significantly less code and significantly fewer cycles than doing
    @ a series of single-lane loads, but it means that the X step between
    @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
    @ that we could read 8 pixels from a single aligned 32-byte block of data.
    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
    @ separated into even pixels and odd pixels to make extracting offsets and
    @ weights easier.
    @ We then pull out two bytes from the middle of each coordinate: the top
    @ byte corresponds to the integer part of the X coordinate, and the bottom
    @ byte corresponds to the weight to use for bilinear blending.
    @ These are separated out into different registers with VTRN.
    @ Then by subtracting the integer X coordinate of the first pixel in the
    @ data block we loaded, we produce an index register suitable for use by
    @ VTBL.
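    @ For example (an illustrative value, not taken from real data): a 16.16
    @ coordinate of 0x00031C00 (3 + 0x1C00/65536) is narrowed by VRSHRN.S32 #8
    @ to 0x031C; VTRN.8 then yields 0x03 as the integer X and 0x1C as the
    @ 8-bit blend weight (28/256).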
s42xbily_neon_loop:
    @ Load the Y' data.
    MOV r12,r7, ASR #16
    VRSHRN.S32 D16,Q0, #8
    AND r12,r12,#~15          @ Read 16-byte aligned blocks
    VDUP.I8 D20,r12
    ADD r12,r1, r12           @ r12 = y_row+(source_x&~15)
    VRSHRN.S32 D17,Q1, #8
    PLD [r12,#64]
    VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
    ADD r14,r7, r8, LSL #3
    VRSHRN.S32 D18,Q2, #8
    MOV r14,r14,ASR #16
    VRSHRN.S32 D19,Q3, #8
    AND r14,r14,#~15          @ Read 16-byte aligned blocks
    VLD1.64 {D12,D13,D14,D15},[r12,:128]    @ Load Y' bottom row
    PLD [r12,#64]
    VDUP.I8 D21,r14
    ADD r14,r1, r14           @ r14 = y_row+(source_x&~15)
    VMOV.I8 Q13,#1
    PLD [r14,#64]
    VTRN.8 Q8, Q9             @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
                              @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
    VSUB.S8 Q9, Q9, Q10       @ Make offsets relative to the data we loaded.
    @ First 8 Y' pixels
    VTBL.8 D20,{D8, D9, D10,D11},D18        @ Index top row at source_x
    VTBL.8 D24,{D12,D13,D14,D15},D18        @ Index bottom row at source_x
    VADD.S8 Q13,Q9, Q13                     @ Add 1 to source_x
    VTBL.8 D22,{D8, D9, D10,D11},D26        @ Index top row at source_x+1
    VTBL.8 D26,{D12,D13,D14,D15},D26        @ Index bottom row at source_x+1
    @ Next 8 Y' pixels
    VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
    VLD1.64 {D12,D13,D14,D15},[r14,:128]    @ Load Y' bottom row
    PLD [r14,#64]
    VTBL.8 D21,{D8, D9, D10,D11},D19        @ Index top row at source_x
    VTBL.8 D25,{D12,D13,D14,D15},D19        @ Index bottom row at source_x
    VTBL.8 D23,{D8, D9, D10,D11},D27        @ Index top row at source_x+1
    VTBL.8 D27,{D12,D13,D14,D15},D27        @ Index bottom row at source_x+1
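    @ In the blend that follows: a = top row at source_x, b = top row at
    @ source_x+1, c = bottom row at source_x, d = bottom row at source_x+1
    @ (Q10, Q11, Q12, and Q13, respectively).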
    @ Blend Y'.
    VDUP.I16 Q9, r4           @ Load the y weights.
    VSUBL.U8 Q4, D24,D20      @ Q5:Q4 = c-a
    VSUBL.U8 Q5, D25,D21
    VSUBL.U8 Q6, D26,D22      @ Q7:Q6 = d-b
    VSUBL.U8 Q7, D27,D23
    VMUL.S16 Q4, Q4, Q9       @ Q5:Q4 = (c-a)*yweight
    VMUL.S16 Q5, Q5, Q9
    VMUL.S16 Q6, Q6, Q9       @ Q7:Q6 = (d-b)*yweight
    VMUL.S16 Q7, Q7, Q9
    VMOVL.U8 Q12,D16          @ Promote the x weights to 16 bits.
    VMOVL.U8 Q13,D17          @ Sadly, there's no VMULW.
    VRSHRN.S16 D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
    VRSHRN.S16 D9, Q5, #8
    VRSHRN.S16 D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
    VRSHRN.S16 D13,Q7, #8
    VADD.I8 Q10,Q10,Q4        @ Q10 = a+((c-a)*yweight+128>>8)
    VADD.I8 Q11,Q11,Q6        @ Q11 = b+((d-b)*yweight+128>>8)
    VSUBL.U8 Q4, D22,D20      @ Q5:Q4 = b-a
    VSUBL.U8 Q5, D23,D21
    VMUL.S16 Q4, Q4, Q12      @ Q5:Q4 = (b-a)*xweight
    VMUL.S16 Q5, Q5, Q13
    VRSHRN.S16 D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
    ADD r12,r7, r9
    VRSHRN.S16 D9, Q5, #8
    MOV r12,r12,ASR #17
    VADD.I8 Q8, Q10,Q4        @ Q8 = a+((b-a)*xweight+128>>8)
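    @ Q8 now holds all 16 bilinearly sampled Y' values: a vertical lerp by
    @ yweight followed by a horizontal lerp by xweight.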
    @ Start extracting the chroma x coordinates, and load Cb and Cr.
    AND r12,r12,#~15          @ Read 16-byte aligned blocks
    VDUP.I32 Q9, r9           @ Q9 = source_uv_xoffs_q16 x 4
    ADD r14,r2, r12
    VADD.I32 Q10,Q0, Q9
    VLD1.64 {D8, D9, D10,D11},[r14,:128]    @ Load Cb
    PLD [r14,#64]
    VADD.I32 Q11,Q1, Q9
    ADD r14,r3, r12
    VADD.I32 Q12,Q2, Q9
    VLD1.64 {D12,D13,D14,D15},[r14,:128]    @ Load Cr
    PLD [r14,#64]
    VADD.I32 Q13,Q3, Q9
    VRSHRN.S32 D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
    VRSHRN.S32 D21,Q11,#9
    VDUP.I8 Q9, r12
    VRSHRN.S32 D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
    VRSHRN.S32 D23,Q13,#9
    @ We don't actually need the x weights, but we get them for free.
    @ Free ALU slot
    VTRN.8 Q10,Q11            @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
    @ Free ALU slot           @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
    VSUB.S8 Q11,Q11,Q9        @ Make offsets relative to the data we loaded.
    VTBL.8 D18,{D8, D9, D10,D11},D22        @ Index Cb at source_x
    VMOV.I8 D24,#74
    VTBL.8 D19,{D8, D9, D10,D11},D23
    VMOV.I8 D26,#102
    VTBL.8 D20,{D12,D13,D14,D15},D22        @ Index Cr at source_x
    VMOV.I8 D27,#25
    VTBL.8 D21,{D12,D13,D14,D15},D23
    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
    @ We use VDUP to expand constants, because it's a permute instruction, so
    @ it can dual issue on the A8.
    SUBS r6, r6, #16          @ width -= 16
    VMULL.U8 Q4, D16,D24      @ Q5:Q4 = Y'*74
    VDUP.32 Q6, D30[1]        @ Q7:Q6 = bias_G
    VMULL.U8 Q5, D17,D24
    VDUP.32 Q7, D30[1]
    VMLSL.U8 Q6, D18,D27      @ Q7:Q6 = -25*Cb+bias_G
    VDUP.32 Q11,D30[0]        @ Q12:Q11 = bias_R
    VMLSL.U8 Q7, D19,D27
    VDUP.32 Q12,D30[0]
    VMLAL.U8 Q11,D20,D26      @ Q12:Q11 = 102*Cr+bias_R
    VDUP.32 Q8, D31[0]        @ Q13:Q8 = bias_B
    VMLAL.U8 Q12,D21,D26
    VDUP.32 Q13,D31[0]
    VMLAL.U8 Q8, D18,D29      @ Q13:Q8 = 129*Cb+bias_B
    VMLAL.U8 Q13,D19,D29
    VMLSL.U8 Q6, D20,D28      @ Q7:Q6 = -25*Cb-52*Cr+bias_G
    VMLSL.U8 Q7, D21,D28
    VADD.S16 Q11,Q4, Q11      @ Q12:Q11 = 74*Y'+102*Cr+bias_R
    VADD.S16 Q12,Q5, Q12
    VQADD.S16 Q8, Q4, Q8      @ Q13:Q8 = 74*Y'+129*Cb+bias_B
    VQADD.S16 Q13,Q5, Q13
    VADD.S16 Q6, Q4, Q6       @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
    VADD.S16 Q7, Q5, Q7
    @ Push each value to the top of its word and saturate it.
    VQSHLU.S16 Q11,Q11,#2
    VQSHLU.S16 Q12,Q12,#2
    VQSHLU.S16 Q6, Q6, #2
    VQSHLU.S16 Q7, Q7, #2
    VQSHLU.S16 Q8, Q8, #2
    VQSHLU.S16 Q13,Q13,#2
    @ Merge G and B into R.
    VSRI.U16 Q11,Q6, #5
    VSRI.U16 Q12,Q7, #5
    VSRI.U16 Q11,Q8, #11
    MOV r14,r8, LSL #4
    VSRI.U16 Q12,Q13,#11
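    @ Q12:Q11 now hold 16 RGB565 pixels: R in bits 15-11, G in bits 10-5,
    @ and B in bits 4-0.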
    BLT s42xbily_neon_tail
    VDUP.I32 Q13,r14
    @ Store the result.
    VST1.16 {D22,D23,D24,D25},[r0]!
    BEQ s42xbily_neon_done
    @ Advance the x coordinates.
    VADD.I32 Q0, Q0, Q13
    VADD.I32 Q1, Q1, Q13
    ADD r7, r14
    VADD.I32 Q2, Q2, Q13
    VADD.I32 Q3, Q3, Q13
    B s42xbily_neon_loop
s42xbily_neon_tail:
    @ We have between 1 and 15 pixels left to write.
    @ -r6 == the number of pixels we need to skip writing.
    @ Adjust r0 to point to the last one we need to write, because we're going
    @ to write them in reverse order.
    ADD r0, r0, r6, LSL #1
    MOV r14,#-2
    ADD r0, r0, #30
    @ Skip past the ones we don't need to write.
    SUB PC, PC, r6, LSL #2
    ORR r0, r0, r0
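    @ (In ARM state, reading PC yields the address of the current instruction
    @ plus 8, which, thanks to the no-op ORR pad above, is the first VST1
    @ below; subtracting r6<<2 thus skips one 4-byte store per pixel we must
    @ not write.)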
    VST1.16 {D25[3]},[r0,:16],r14
    VST1.16 {D25[2]},[r0,:16],r14
    VST1.16 {D25[1]},[r0,:16],r14
    VST1.16 {D25[0]},[r0,:16],r14
    VST1.16 {D24[3]},[r0,:16],r14
    VST1.16 {D24[2]},[r0,:16],r14
    VST1.16 {D24[1]},[r0,:16],r14
    VST1.16 {D24[0]},[r0,:16],r14
    VST1.16 {D23[3]},[r0,:16],r14
    VST1.16 {D23[2]},[r0,:16],r14
    VST1.16 {D23[1]},[r0,:16],r14
    VST1.16 {D23[0]},[r0,:16],r14
    VST1.16 {D22[3]},[r0,:16],r14
    VST1.16 {D22[2]},[r0,:16],r14
    VST1.16 {D22[1]},[r0,:16],r14
    VST1.16 {D22[0]},[r0,:16]
s42xbily_neon_done:
    VPOP {Q4-Q7}              @ 16 words.
    LDMFD r13!,{r4-r9,PC}     @ 8 words.
    .fnend
    .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

#if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
#endif