/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

  .arch armv7-a
  .fpu neon
/* Allow to build on targets not supporting neon, and force the object file
 * target to avoid bumping the final binary target */
  .object_arch armv4t
  .text
  .align

@ Per-dither-phase constant tables for the YCbCr -> RGB565 kernel below.
@ Each table is exactly 16 bytes (6 .shorts + 4 .bytes): the kernel selects
@ a table with "ADR r14,...DITHER03...; ADD r14,r14,r1,LSL #4", where r1 is
@ the dither argument, so the 16-byte stride between labels is load-bearing.
@ Short layout: { bias_R, bias_R', bias_G, bias_G', bias_B, bias_B' }.
@ Each pair of shorts is replicated with VDUP.32 in the kernel, so the two
@ values of a pair alternate between even and odd output pixels; the four
@ tables rotate the +384/+192 offsets, giving the phases of an ordered
@ dither (names 03/12/21/30 presumably encode the even/odd phase pair --
@ TODO confirm against the caller's row parity).
@ The trailing 4 bytes repeat the color coefficients (102, 25, 52, 129);
@ the kernel materializes those with VMOV.I8 immediates and never VDUPs
@ from them, so here they mainly pad each table to the 16-byte stride.
  .balign 64
YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
  .short -14240
  .short -14240+384
  .short 8672
  .short 8672+192
  .short -17696
  .short -17696+384
  .byte 102
  .byte 25
  .byte 52
  .byte 129
YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
  .short -14240+128
  .short -14240+256
  .short 8672+64
  .short 8672+128
  .short -17696+128
  .short -17696+256
  .byte 102
  .byte 25
  .byte 52
  .byte 129
YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
  .short -14240+256
  .short -14240+128
  .short 8672+128
  .short 8672+64
  .short -17696+256
  .short -17696+128
  .byte 102
  .byte 25
  .byte 52
  .byte 129
YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
  .short -14240+384
  .short -14240
  .short 8672+192
  .short 8672
  .short -17696+384
  .short -17696
  .byte 102
  .byte 25
  .byte 52
  .byte 129

@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
@
@ Converts one row of 4:2:x YCbCr to RGB565, scaling horizontally by a 16.16
@ fixed-point step and blending two Y' rows vertically by y_yweight
@ (bilinear in Y' only; chroma is point-sampled).
@ The ctx fields are loaded straight into r0-r9 with a single LDMIA, in
@ declaration order:
@ ctx = {
@  uint16_t *rgb_row;           /*r0*/
@  const uint8_t *y_row;        /*r1*/
@  const uint8_t *u_row;        /*r2*/
@  const uint8_t *v_row;        /*r3*/
@  int y_yweight;               /*r4*/
@  int y_pitch;                 /*r5*/
@  int width;                   /*r6*/
@  int source_x0_q16;           /*r7*/
@  int source_dx_q16;           /*r8*/
@  int source_uv_xoffs_q16;     /*r9*/
@ };
  .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
  .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
  .balign 64
  .fnstart
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
  STMFD r13!,{r4-r9,r14}          @ 8 words.
  @ r1 still holds the 'dither' argument here (ctx is not unpacked yet), so
  @ this selects one of the four 16-byte constant tables above.
  ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
  VPUSH {Q4-Q7}                   @ 16 words.
  ADD r14,r14,r1, LSL #4          @ Select the dither table to use
  LDMIA r0, {r0-r9}               @ Unpack ctx into r0-r9 (see layout above).
  @ Set up image index registers.
  ADD r12,r8, r8
  VMOV.I32 D16,#0                 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
  VDUP.32 D17,r12
  ADD r12,r12,r12
  VTRN.32 D16,D17                 @ Q8 = < 2| 0| 2| 0>*source_dx_q16
  VDUP.32 D19,r12                 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
  ADD r12,r12,r12
  VDUP.32 Q0, r7                  @ Q0 = < 1| 1| 1| 1>*source_x0_q16
  VADD.I32 D17,D17,D19            @ Q8 = < 6| 4| 2| 0>*source_dx_q16
  CMP r8, #0                      @ If source_dx_q16 is negative...
  VDUP.32 Q9, r12                 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
  ADDLT r7, r7, r8, LSL #4        @ Make r7 point to the end of the block
  VADD.I32 Q0, Q0, Q8             @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
  SUBLT r7, r7, r8                @ (i.e., the lowest address we'll use)
  VADD.I32 Q1, Q0, Q9             @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
  VDUP.I32 Q9, r8                 @ Q9 = < 1| 1| 1| 1>*source_dx_q16
  VADD.I32 Q2, Q0, Q9             @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
  VADD.I32 Q3, Q1, Q9             @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
  VLD1.64 {D30,D31},[r14,:128]    @ Load some constants: D30 = {bias_R, bias_G}
                                  @ pairs, D31[0] = bias_B pair (dithered).
  VMOV.I8 D28,#52                 @ Cr coefficient for G.
  VMOV.I8 D29,#129                @ Cb coefficient for B.
  @ The basic idea here is to do aligned loads of a block of data and then
  @ index into it using VTBL to extract the data from the source X
  @ coordinate corresponding to each destination pixel.
  @ This is significantly less code and significantly fewer cycles than doing
  @ a series of single-lane loads, but it means that the X step between
  @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
  @ that we could read 8 pixels from a single aligned 32-byte block of data.
  @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
  @ separated into even pixels and odd pixels to make extracting offsets and
  @ weights easier.
  @ We then pull out two bytes from the middle of each coordinate: the top
  @ byte corresponds to the integer part of the X coordinate, and the bottom
  @ byte corresponds to the weight to use for bilinear blending.
  @ These are separated out into different registers with VTRN.
  @ Then by subtracting the integer X coordinate of the first pixel in the
  @ data block we loaded, we produce an index register suitable for use by
  @ VTBL.
s42xbily_neon_loop:
  @ Load the Y' data.
  MOV r12,r7, ASR #16
  VRSHRN.S32 D16,Q0, #8
  AND r12,r12,#~15                @ Read 16-byte aligned blocks
  VDUP.I8 D20,r12                 @ Low byte of the aligned base X, for
                                  @ rebasing the VTBL indices below.
  ADD r12,r1, r12                 @ r12 = y_row+(source_x&~15)
  VRSHRN.S32 D17,Q1, #8
  PLD [r12,#64]
  VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
  ADD r14,r7, r8, LSL #3
  VRSHRN.S32 D18,Q2, #8
  MOV r14,r14,ASR #16
  VRSHRN.S32 D19,Q3, #8
  AND r14,r14,#~15                @ Read 16-byte aligned blocks
  @ r12 was post-incremented by y_pitch, so this is the row below.
  VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
  PLD [r12,#64]
  VDUP.I8 D21,r14
  ADD r14,r1, r14                 @ r14 = y_row+(source_x&~15)
  VMOV.I8 Q13,#1
  PLD [r14,#64]
  VTRN.8 Q8, Q9                   @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
                                  @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
  VSUB.S8 Q9, Q9, Q10             @ Make offsets relative to the data we loaded.
  @ First 8 Y' pixels
  VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
  VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
  VADD.S8 Q13,Q9, Q13              @ Add 1 to source_x
  VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
  VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
  @ Next 8 Y' pixels
  VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
  VLD1.64 {D12,D13,D14,D15},[r14,:128]    @ Load Y' bottom row
  PLD [r14,#64]
  VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
  VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
  VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
  VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
  @ Blend Y' vertically: a/b = top row at x/x+1, c/d = bottom row at x/x+1.
  VDUP.I16 Q9, r4                 @ Load the y weights.
  VSUBL.U8 Q4, D24,D20            @ Q5:Q4 = c-a
  VSUBL.U8 Q5, D25,D21
  VSUBL.U8 Q6, D26,D22            @ Q7:Q6 = d-b
  VSUBL.U8 Q7, D27,D23
  VMUL.S16 Q4, Q4, Q9             @ Q5:Q4 = (c-a)*yweight
  VMUL.S16 Q5, Q5, Q9
  VMUL.S16 Q6, Q6, Q9             @ Q7:Q6 = (d-b)*yweight
  VMUL.S16 Q7, Q7, Q9
  VMOVL.U8 Q12,D16                @ Promote the x weights to 16 bits.
  VMOVL.U8 Q13,D17                @ Sadly, there's no VMULW.
  VRSHRN.S16 D8, Q4, #8           @ Q4 = (c-a)*yweight+128>>8
  VRSHRN.S16 D9, Q5, #8
  VRSHRN.S16 D12,Q6, #8           @ Q6 = (d-b)*yweight+128>>8
  VRSHRN.S16 D13,Q7, #8
  VADD.I8 Q10,Q10,Q4              @ Q10 = a+((c-a)*yweight+128>>8)
  VADD.I8 Q11,Q11,Q6              @ Q11 = b+((d-b)*yweight+128>>8)
  @ Now blend horizontally between x and x+1.
  VSUBL.U8 Q4, D22,D20            @ Q5:Q4 = b-a
  VSUBL.U8 Q5, D23,D21
  VMUL.S16 Q4, Q4, Q12            @ Q5:Q4 = (b-a)*xweight
  VMUL.S16 Q5, Q5, Q13
  VRSHRN.S16 D8, Q4, #8           @ Q4 = (b-a)*xweight+128>>8
  ADD r12,r7, r9                  @ Chroma X = luma X + source_uv_xoffs_q16,
  VRSHRN.S16 D9, Q5, #8
  MOV r12,r12,ASR #17             @ ...at half resolution (>>17, 4:2:x).
  VADD.I8 Q8, Q10,Q4              @ Q8 = a+((b-a)*xweight+128>>8)
  @ Start extracting the chroma x coordinates, and load Cb and Cr.
  AND r12,r12,#~15                @ Read 16-byte aligned blocks
  VDUP.I32 Q9, r9                 @ Q9 = source_uv_xoffs_q16 x 4
  ADD r14,r2, r12
  VADD.I32 Q10,Q0, Q9
  VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
  PLD [r14,#64]
  VADD.I32 Q11,Q1, Q9
  ADD r14,r3, r12
  VADD.I32 Q12,Q2, Q9
  VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
  PLD [r14,#64]
  VADD.I32 Q13,Q3, Q9
  VRSHRN.S32 D20,Q10,#9           @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
  VRSHRN.S32 D21,Q11,#9
  VDUP.I8 Q9, r12
  VRSHRN.S32 D22,Q12,#9           @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
  VRSHRN.S32 D23,Q13,#9
  @ We don't actually need the x weights, but we get them for free.
  @ Free ALU slot
  VTRN.8 Q10,Q11                  @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
  @ Free ALU slot                 @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
  VSUB.S8 Q11,Q11,Q9              @ Make offsets relative to the data we loaded.
  VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
  VMOV.I8 D24,#74                  @ Y' coefficient.
  VTBL.8 D19,{D8, D9, D10,D11},D23
  VMOV.I8 D26,#102                 @ Cr coefficient for R.
  VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
  VMOV.I8 D27,#25                  @ Cb coefficient for G.
  VTBL.8 D21,{D12,D13,D14,D15},D23
  @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
  @ We use VDUP to expand constants, because it's a permute instruction, so
  @ it can dual issue on the A8.
  SUBS r6, r6, #16                @ width -= 16
  VMULL.U8 Q4, D16,D24            @ Q5:Q4 = Y'*74
  VDUP.32 Q6, D30[1]              @ Q7:Q6 = bias_G
  VMULL.U8 Q5, D17,D24
  VDUP.32 Q7, D30[1]
  VMLSL.U8 Q6, D18,D27            @ Q7:Q6 = -25*Cb+bias_G
  VDUP.32 Q11,D30[0]              @ Q12:Q11 = bias_R
  VMLSL.U8 Q7, D19,D27
  VDUP.32 Q12,D30[0]
  VMLAL.U8 Q11,D20,D26            @ Q12:Q11 = 102*Cr+bias_R
  VDUP.32 Q8, D31[0]              @ Q13:Q8 = bias_B
  VMLAL.U8 Q12,D21,D26
  VDUP.32 Q13,D31[0]
  VMLAL.U8 Q8, D18,D29            @ Q13:Q8 = 129*Cb+bias_B
  VMLAL.U8 Q13,D19,D29
  VMLSL.U8 Q6, D20,D28            @ Q7:Q6 = -25*Cb-52*Cr+bias_G
  VMLSL.U8 Q7, D21,D28
  VADD.S16 Q11,Q4, Q11            @ Q12:Q11 = 74*Y'+102*Cr+bias_R
  VADD.S16 Q12,Q5, Q12
  @ B is the only channel whose maximum (74*255+129*255 = 51765) can exceed
  @ the signed 16-bit range, so only it needs a saturating add here; the
  @ VQSHLU below clamps everything else.
  VQADD.S16 Q8, Q4, Q8            @ Q13:Q8 = 74*Y'+129*Cb+bias_B
  VQADD.S16 Q13,Q5, Q13
  VADD.S16 Q6, Q4, Q6             @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
  VADD.S16 Q7, Q5, Q7
  @ Push each value to the top of its word and saturate it.
  VQSHLU.S16 Q11,Q11,#2
  VQSHLU.S16 Q12,Q12,#2
  VQSHLU.S16 Q6, Q6, #2
  VQSHLU.S16 Q7, Q7, #2
  VQSHLU.S16 Q8, Q8, #2
  VQSHLU.S16 Q13,Q13,#2
  @ Merge G and B into R, forming 5:6:5.
  VSRI.U16 Q11,Q6, #5
  VSRI.U16 Q12,Q7, #5
  VSRI.U16 Q11,Q8, #11
  MOV r14,r8, LSL #4              @ r14 = 16*source_dx_q16 (one iteration).
  VSRI.U16 Q12,Q13,#11
  BLT s42xbily_neon_tail          @ Fewer than 16 pixels remained.
  VDUP.I32 Q13,r14
  @ Store the result.
  VST1.16 {D22,D23,D24,D25},[r0]!
  BEQ s42xbily_neon_done
  @ Advance the x coordinates.
  VADD.I32 Q0, Q0, Q13
  VADD.I32 Q1, Q1, Q13
  ADD r7, r14
  VADD.I32 Q2, Q2, Q13
  VADD.I32 Q3, Q3, Q13
  B s42xbily_neon_loop
s42xbily_neon_tail:
  @ We have between 1 and 15 pixels left to write.
  @ -r6 == the number of pixels we need to skip writing.
  @ Adjust r0 to point to the last one we need to write, because we're going
  @ to write them in reverse order.
  ADD r0, r0, r6, LSL #1
  MOV r14,#-2
  ADD r0, r0, #30
  @ Skip past the ones we don't need to write.
  @ Computed branch: in ARM state PC reads as '.+8', so subtracting
  @ r6<<2 (r6 is negative) jumps forward over -r6 of the 4-byte VST1
  @ instructions below. The ORR is a padding word so that '.+8' lands
  @ exactly on the first store when r6 is 0 mod 16's remainder bound.
  SUB PC, PC, r6, LSL #2
  ORR r0, r0, r0                  @ Padding only; never skipped into.
  VST1.16 {D25[3]},[r0,:16],r14
  VST1.16 {D25[2]},[r0,:16],r14
  VST1.16 {D25[1]},[r0,:16],r14
  VST1.16 {D25[0]},[r0,:16],r14
  VST1.16 {D24[3]},[r0,:16],r14
  VST1.16 {D24[2]},[r0,:16],r14
  VST1.16 {D24[1]},[r0,:16],r14
  VST1.16 {D24[0]},[r0,:16],r14
  VST1.16 {D23[3]},[r0,:16],r14
  VST1.16 {D23[2]},[r0,:16],r14
  VST1.16 {D23[1]},[r0,:16],r14
  VST1.16 {D23[0]},[r0,:16],r14
  VST1.16 {D22[3]},[r0,:16],r14
  VST1.16 {D22[2]},[r0,:16],r14
  VST1.16 {D22[1]},[r0,:16],r14
  VST1.16 {D22[0]},[r0,:16]
s42xbily_neon_done:
  VPOP {Q4-Q7}                    @ 16 words.
  LDMFD r13!,{r4-r9,PC}           @ 8 words.
  .fnend
  .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

@ NOTE(review): cpp conditional in a .s file -- presumably this file is run
@ through the C preprocessor by the build; confirm (otherwise use .ifdef).
#if defined(__ELF__)&&defined(__linux__)
  .section .note.GNU-stack,"",%progbits
#endif