gfx/ycbcr/yuv_row_arm.s

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/ycbcr/yuv_row_arm.s	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,304 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+    .arch   armv7-a
+    .fpu    neon
+/* Allow building on targets that do not support NEON, and force the object
+ * file's architecture so it does not bump the final binary's target. */
+    .object_arch armv4t
+    .text
+    .align
+
+    .balign 64
+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
+    .short -14240
+    .short -14240+384
+    .short   8672
+    .short   8672+192
+    .short -17696
+    .short -17696+384
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
+    .short -14240+128
+    .short -14240+256
+    .short   8672+64
+    .short   8672+128
+    .short -17696+128
+    .short -17696+256
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
+    .short -14240+256
+    .short -14240+128
+    .short   8672+128
+    .short   8672+64
+    .short -17696+256
+    .short -17696+128
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
+    .short -14240+384
+    .short -14240
+    .short   8672+192
+    .short   8672
+    .short -17696+384
+    .short -17696
+    .byte 102
+    .byte  25
+    .byte  52
+    .byte 129
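+@ Each 16-byte table above appears to hold three pairs of dithered bias
+@  values for R, G, and B (one bias per pixel parity), followed by the four
+@  8-bit coefficients (102, 25, 52, 129) used in the color conversion below.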
+
+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
+@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
+@
+@ ctx = {
+@   uint16_t *rgb_row;       /*r0*/
+@   const uint8_t *y_row;    /*r1*/
+@   const uint8_t *u_row;    /*r2*/
+@   const uint8_t *v_row;    /*r3*/
+@   int y_yweight;           /*r4*/
+@   int y_pitch;             /*r5*/
+@   int width;               /*r6*/
+@   int source_x0_q16;       /*r7*/
+@   int source_dx_q16;       /*r8*/
+@   int source_uv_xoffs_q16; /*r9*/
+@ };
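+@ (Note: the whole struct is loaded into r0-r9 with a single LDMIA below, so
+@  the layout must match this register assignment exactly; the dither
+@  argument arrives in r1 and selects one of the four 16-byte tables above.)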
+    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+    .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+    .balign 64
+    .fnstart
+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
+    STMFD       r13!,{r4-r9,r14}       @ 8 words.
+    ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
+    VPUSH       {Q4-Q7}                @ 16 words.
+    ADD         r14,r14,r1, LSL #4     @ Select the dither table to use
+    LDMIA       r0, {r0-r9}
+    @ Set up image index registers.
+    ADD         r12,r8, r8
+    VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
+    VDUP.32     D17,r12
+    ADD         r12,r12,r12
+    VTRN.32     D16,D17        @ Q8 = < 2| 0| 2| 0>*source_dx_q16
+    VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
+    ADD         r12,r12,r12
+    VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
+    VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
+    CMP         r8, #0                 @ If source_dx_q16 is negative...
+    VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
+    ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
+    VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
+    SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
+    VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
+    VDUP.I32    Q9, r8         @ Q9 = < 1| 1| 1| 1>*source_dx_q16
+    VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
+    VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
+    VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
+    VMOV.I8     D28,#52
+    VMOV.I8     D29,#129
+    @ The basic idea here is to do aligned loads of a block of data and then
+    @  index into it using VTBL to extract the data from the source X
+    @  coordinate corresponding to each destination pixel.
+    @ This is significantly less code and significantly fewer cycles than doing
+    @  a series of single-lane loads, but it means that the X step between
+    @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
+    @  that we could read 8 pixels from a single aligned 32-byte block of data.
+    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
+    @  separated into even pixels and odd pixels to make extracting offsets and
+    @  weights easier.
+    @ We then pull out two bytes from the middle of each coordinate: the top
+    @  byte corresponds to the integer part of the X coordinate, and the bottom
+    @  byte corresponds to the weight to use for bilinear blending.
+    @ These are separated out into different registers with VTRN.
+    @ Then by subtracting the integer X coordinate of the first pixel in the
+    @  data block we loaded, we produce an index register suitable for use by
+    @  VTBL.
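+    @ In scalar terms, for destination pixel i this computes (sketch;
+    @  block_base is the integer X of the aligned block we load):
+    @   x_q16  = source_x0_q16 + i*source_dx_q16;
+    @   offset = (x_q16>>16) - block_base;    /* byte index for VTBL */
+    @   weight = (x_q16>>8)&0xFF;             /* bilinear x fraction */
+    @  The VRSHRN #8 below extracts both middle bytes at once, and VTRN.8
+    @  then separates the weights (Q8) from the X offsets (Q9).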
+s42xbily_neon_loop:
+    @ Load the Y' data.
+    MOV         r12,r7, ASR #16
+    VRSHRN.S32  D16,Q0, #8
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I8     D20,r12
+    ADD         r12,r1, r12    @ r12 = y_row+(source_x&~15)
+    VRSHRN.S32  D17,Q1, #8
+    PLD         [r12,#64]
+    VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
+    ADD         r14,r7, r8, LSL #3
+    VRSHRN.S32  D18,Q2, #8
+    MOV         r14,r14,ASR #16
+    VRSHRN.S32  D19,Q3, #8
+    AND         r14,r14,#~15   @ Read 16-byte aligned blocks
+    VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
+    PLD         [r12,#64]
+    VDUP.I8     D21,r14
+    ADD         r14,r1, r14    @ r14 = y_row+(source_x&~15)
+    VMOV.I8     Q13,#1
+    PLD         [r14,#64]
+    VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+                               @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
+    @ First 8 Y' pixels
+    VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
+    VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
+    VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
+    VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
+    VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
+    @ Next 8 Y' pixels
+    VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
+    PLD         [r14,#64]
+    VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
+    VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
+    VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
+    VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
+    @ Blend Y'.
+    VDUP.I16    Q9, r4         @ Load the y weights.
+    VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
+    VSUBL.U8    Q5, D25,D21
+    VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
+    VSUBL.U8    Q7, D27,D23
+    VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
+    VMUL.S16    Q5, Q5, Q9
+    VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
+    VMUL.S16    Q7, Q7, Q9
+    VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
+    VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
+    VRSHRN.S16  D9, Q5, #8
+    VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
+    VRSHRN.S16  D13,Q7, #8
+    VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
+    VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
+    VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
+    VSUBL.U8    Q5, D23,D21
+    VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
+    VMUL.S16    Q5, Q5, Q13
+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
+    ADD         r12,r7, r9
+    VRSHRN.S16  D9, Q5, #8
+    MOV         r12,r12,ASR #17
+    VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
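+    @ Altogether the blend above is, per pixel (scalar sketch; a,b = top row
+    @  at source_x and source_x+1, c,d = bottom row):
+    @   t  = a + (((c-a)*yweight + 128) >> 8);
+    @   u  = b + (((d-b)*yweight + 128) >> 8);
+    @   Y' = t + (((u-t)*xweight + 128) >> 8);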
+    @ Start extracting the chroma x coordinates, and load Cb and Cr.
+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
+    VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
+    ADD         r14,r2, r12
+    VADD.I32    Q10,Q0, Q9
+    VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
+    PLD         [r14,#64]
+    VADD.I32    Q11,Q1, Q9
+    ADD         r14,r3, r12
+    VADD.I32    Q12,Q2, Q9
+    VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
+    PLD         [r14,#64]
+    VADD.I32    Q13,Q3, Q9
+    VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
+    VRSHRN.S32  D21,Q11,#9
+    VDUP.I8     Q9, r12
+    VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
+    VRSHRN.S32  D23,Q13,#9
+    @ We don't actually need the x weights, but we get them for free.
+    @ Free ALU slot
+    VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
+    @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
+    VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
+    VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
+    VMOV.I8     D24,#74
+    VTBL.8      D19,{D8, D9, D10,D11},D23
+    VMOV.I8     D26,#102
+    VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
+    VMOV.I8     D27,#25
+    VTBL.8      D21,{D12,D13,D14,D15},D23
+    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
+    @ We use VDUP to expand constants, because it's a permute instruction, so
+    @  it can dual issue on the A8.
+    SUBS        r6, r6, #16    @ width -= 16
+    VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
+    VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
+    VMULL.U8    Q5, D17,D24
+    VDUP.32     Q7, D30[1]
+    VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
+    VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
+    VMLSL.U8    Q7, D19,D27
+    VDUP.32     Q12,D30[0]
+    VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
+    VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B
+    VMLAL.U8    Q12,D21,D26
+    VDUP.32     Q13,D31[0]
+    VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B
+    VMLAL.U8    Q13,D19,D29
+    VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G
+    VMLSL.U8    Q7, D21,D28
+    VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R
+    VADD.S16    Q12,Q5, Q12
+    VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B
+    VQADD.S16   Q13,Q5, Q13
+    VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G
+    VADD.S16    Q7, Q5, Q7
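+    @ With the bias tables above, each channel is now approximately (the
+    @  biases also fold in the -16/-128 offsets and the dither):
+    @   R = 74*Y' + 102*Cr         - 14240  ~ 64*(1.164*(Y-16)+1.596*(Cr-128))
+    @   G = 74*Y' -  25*Cb - 52*Cr +  8672  ~ 64*(1.164*(Y-16)-0.391*(Cb-128)
+    @                                             -0.813*(Cr-128))
+    @   B = 74*Y' + 129*Cb         - 17696  ~ 64*(1.164*(Y-16)+2.018*(Cb-128))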
+    @ Push each value to the top of its word and saturate it.
+    VQSHLU.S16 Q11,Q11,#2
+    VQSHLU.S16 Q12,Q12,#2
+    VQSHLU.S16 Q6, Q6, #2
+    VQSHLU.S16 Q7, Q7, #2
+    VQSHLU.S16 Q8, Q8, #2
+    VQSHLU.S16 Q13,Q13,#2
+    @ Merge G and B into R.
+    VSRI.U16   Q11,Q6, #5
+    VSRI.U16   Q12,Q7, #5
+    VSRI.U16   Q11,Q8, #11
+    MOV         r14,r8, LSL #4
+    VSRI.U16   Q12,Q13,#11
+    BLT s42xbily_neon_tail
+    VDUP.I32    Q13,r14
+    @ Store the result.
+    VST1.16     {D22,D23,D24,D25},[r0]!
+    BEQ s42xbily_neon_done
+    @ Advance the x coordinates.
+    VADD.I32    Q0, Q0, Q13
+    VADD.I32    Q1, Q1, Q13
+    ADD         r7, r14
+    VADD.I32    Q2, Q2, Q13
+    VADD.I32    Q3, Q3, Q13
+    B s42xbily_neon_loop
+s42xbily_neon_tail:
+    @ We have between 1 and 15 pixels left to write.
+    @ -r6 == the number of pixels we need to skip writing.
+    @ Adjust r0 to point to the last one we need to write, because we're going
+    @  to write them in reverse order.
+    ADD         r0, r0, r6, LSL #1
+    MOV         r14,#-2
+    ADD         r0, r0, #30
+    @ Skip past the ones we don't need to write.
+    SUB         PC, PC, r6, LSL #2
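+    @ (In ARM state, reading the PC yields the address of the current
+    @  instruction plus 8, and every instruction below encodes to 4 bytes,
+    @  so this subtraction lands on the first store we actually need; the
+    @  ORR is a do-nothing pad that keeps the arithmetic aligned and is
+    @  itself never executed.)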
+    ORR         r0, r0, r0
+    VST1.16     {D25[3]},[r0,:16],r14
+    VST1.16     {D25[2]},[r0,:16],r14
+    VST1.16     {D25[1]},[r0,:16],r14
+    VST1.16     {D25[0]},[r0,:16],r14
+    VST1.16     {D24[3]},[r0,:16],r14
+    VST1.16     {D24[2]},[r0,:16],r14
+    VST1.16     {D24[1]},[r0,:16],r14
+    VST1.16     {D24[0]},[r0,:16],r14
+    VST1.16     {D23[3]},[r0,:16],r14
+    VST1.16     {D23[2]},[r0,:16],r14
+    VST1.16     {D23[1]},[r0,:16],r14
+    VST1.16     {D23[0]},[r0,:16],r14
+    VST1.16     {D22[3]},[r0,:16],r14
+    VST1.16     {D22[2]},[r0,:16],r14
+    VST1.16     {D22[1]},[r0,:16],r14
+    VST1.16     {D22[0]},[r0,:16]
+s42xbily_neon_done:
+    VPOP        {Q4-Q7}                @ 16 words.
+    LDMFD       r13!,{r4-r9,PC}        @ 8 words.
+    .fnend
+    .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+
+#if defined(__ELF__)&&defined(__linux__)
+    .section .note.GNU-stack,"",%progbits
+#endif
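
For reference, a scalar C model of the row this NEON loop produces (a sketch
under the struct layout documented above; the helper names are hypothetical,
the dither bias offsets and negative-dx/tail handling are omitted, and the
chroma rounding is simplified):

    #include <stdint.h>

    typedef struct {
        uint16_t *rgb_row;
        const uint8_t *y_row;
        const uint8_t *u_row;
        const uint8_t *v_row;
        int y_yweight;
        int y_pitch;
        int width;
        int source_x0_q16;
        int source_dx_q16;
        int source_uv_xoffs_q16;
    } yuv2rgb565_row_scale_bilinear_ctx;

    static int clamp_u16(int v) { return v < 0 ? 0 : v > 0xFFFF ? 0xFFFF : v; }

    static void scale_row_c(const yuv2rgb565_row_scale_bilinear_ctx *ctx) {
        for (int i = 0; i < ctx->width; i++) {
            int x_q16 = ctx->source_x0_q16 + i*ctx->source_dx_q16;
            int xi = x_q16 >> 16;             /* integer source x */
            int xw = (x_q16 >> 8) & 0xFF;     /* 8-bit bilinear x weight */
            const uint8_t *top = ctx->y_row;
            const uint8_t *bot = ctx->y_row + ctx->y_pitch;
            /* Bilinear Y': blend the two rows, then the two columns. */
            int a = top[xi]   + (((bot[xi]   - top[xi]  )*ctx->y_yweight + 128) >> 8);
            int b = top[xi+1] + (((bot[xi+1] - top[xi+1])*ctx->y_yweight + 128) >> 8);
            int yy = a + (((b - a)*xw + 128) >> 8);
            /* Chroma is at half horizontal resolution: nearest sample. */
            int ci = (x_q16 + ctx->source_uv_xoffs_q16) >> 17;
            int cb = ctx->u_row[ci], cr = ctx->v_row[ci];
            /* Fixed-point conversion with the biases from the tables. */
            int r  = 74*yy + 102*cr - 14240;
            int g  = 74*yy -  25*cb -  52*cr + 8672;
            int bl = 74*yy + 129*cb - 17696;
            /* VQSHLU #2 then VSRI #5/#11: saturate and pack as RGB565. */
            int r16 = clamp_u16(r << 2);
            int g16 = clamp_u16(g << 2);
            int b16 = clamp_u16(bl << 2);
            ctx->rgb_row[i] = (uint16_t)((r16 & 0xF800) |
                                         ((g16 >> 5) & 0x07E0) |
                                         (b16 >> 11));
        }
    }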
