gfx/ycbcr/yuv_row_arm.s

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4
michael@0 5 .arch armv7-a @ Let the assembler accept ARMv7-A instructions
michael@0 6 .fpu neon @ Let the assembler accept the NEON (V*) mnemonics used below
michael@0 7 /* Allow building on targets that do not support NEON, and force the object
michael@0 8 * file's recorded target so that linking it does not bump the final binary's target */
michael@0 9 .object_arch armv4t
michael@0 10 .text @ Code and the constant tables below both go in .text; the tables are reached with a PC-relative ADR
michael@0 11 .align
michael@0 12
michael@0 13 .balign 64 @ Start the dither tables on a 64-byte boundary (they are loaded with a :128 alignment hint)
michael@0 14 YCbCr42xToRGB565_DITHER03_CONSTS_NEON: @ Table 0 of 4 (16 bytes each, picked by dither<<4): dither offsets of 0/4 and 3/4 of an output step
michael@0 15 .short -14240 @ bias_R for even-numbered output pixels
michael@0 16 .short -14240+384 @ bias_R for odd-numbered output pixels (384 = 3/4 of 512, the weight of the lowest kept R bit)
michael@0 17 .short 8672 @ bias_G for even-numbered output pixels
michael@0 18 .short 8672+192 @ bias_G for odd-numbered output pixels (192 = 3/4 of 256, the weight of the lowest kept G bit)
michael@0 19 .short -17696 @ bias_B for even-numbered output pixels
michael@0 20 .short -17696+384 @ bias_B for odd-numbered output pixels (384 = 3/4 of 512, the weight of the lowest kept B bit)
michael@0 21 .byte 102 @ Same value as the Cr->R multiplier; lands in the top half of D31, which the routine below does not read
michael@0 22 .byte 25 @ Same value as the Cb->G multiplier (the routine below uses a VMOV.I8 immediate instead)
michael@0 23 .byte 52 @ Same value as the Cr->G multiplier (the routine below uses a VMOV.I8 immediate instead)
michael@0 24 .byte 129 @ Same value as the Cb->B multiplier (the routine below uses a VMOV.I8 immediate instead)
michael@0 25 YCbCr42xToRGB565_DITHER12_CONSTS_NEON: @ Table 1: same layout, dither offsets of 1/4 (even pixels) and 2/4 (odd pixels) of an output step
michael@0 26 .short -14240+128 @ bias_R, even pixels
michael@0 27 .short -14240+256 @ bias_R, odd pixels
michael@0 28 .short 8672+64 @ bias_G, even pixels
michael@0 29 .short 8672+128 @ bias_G, odd pixels
michael@0 30 .short -17696+128 @ bias_B, even pixels
michael@0 31 .short -17696+256 @ bias_B, odd pixels
michael@0 32 .byte 102 @ Four trailing bytes are identical in every table
michael@0 33 .byte 25
michael@0 34 .byte 52
michael@0 35 .byte 129
michael@0 36 YCbCr42xToRGB565_DITHER21_CONSTS_NEON: @ Table 2: same layout, dither offsets of 2/4 (even pixels) and 1/4 (odd pixels) of an output step
michael@0 37 .short -14240+256 @ bias_R, even pixels
michael@0 38 .short -14240+128 @ bias_R, odd pixels
michael@0 39 .short 8672+128 @ bias_G, even pixels
michael@0 40 .short 8672+64 @ bias_G, odd pixels
michael@0 41 .short -17696+256 @ bias_B, even pixels
michael@0 42 .short -17696+128 @ bias_B, odd pixels
michael@0 43 .byte 102 @ Four trailing bytes are identical in every table
michael@0 44 .byte 25
michael@0 45 .byte 52
michael@0 46 .byte 129
michael@0 47 YCbCr42xToRGB565_DITHER30_CONSTS_NEON: @ Table 3: same layout, dither offsets of 3/4 (even pixels) and 0/4 (odd pixels) of an output step
michael@0 48 .short -14240+384 @ bias_R, even pixels
michael@0 49 .short -14240 @ bias_R, odd pixels
michael@0 50 .short 8672+192 @ bias_G, even pixels
michael@0 51 .short 8672 @ bias_G, odd pixels
michael@0 52 .short -17696+384 @ bias_B, even pixels
michael@0 53 .short -17696 @ bias_B, odd pixels
michael@0 54 .byte 102 @ Four trailing bytes are identical in every table
michael@0 55 .byte 25
michael@0 56 .byte 52
michael@0 57 .byte 129
michael@0 58
michael@0 59 @ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
michael@0 60 @ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
michael@0 61 @ Writes width RGB565 pixels to rgb_row, 16 per loop pass: Y' is bilinearly blended from y_row and the row y_pitch bytes after it, Cb/Cr are picked without blending, and dither (expected 0..3, not checked here) selects one of the four bias tables.
michael@0 62 @ ctx = {
michael@0 63 @ uint16_t *rgb_row; /*r0*/
michael@0 64 @ const uint8_t *y_row; /*r1*/
michael@0 65 @ const uint8_t *u_row; /*r2*/
michael@0 66 @ const uint8_t *v_row; /*r3*/
michael@0 67 @ int y_yweight; /*r4*/
michael@0 68 @ int y_pitch; /*r5*/
michael@0 69 @ int width; /*r6*/
michael@0 70 @ int source_x0_q16; /*r7*/
michael@0 71 @ int source_dx_q16; /*r8*/
michael@0 72 @ int source_uv_xoffs_q16; /*r9*/
michael@0 73 @ };
michael@0 74 .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
michael@0 75 .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
michael@0 76 .balign 64
michael@0 77 .fnstart
michael@0 78 ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
michael@0 79 STMFD r13!,{r4-r9,r14} @ 7 words (r4-r9 and r14).
michael@0 80 ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
michael@0 81 VPUSH {Q4-Q7} @ 16 words.
michael@0 82 ADD r14,r14,r1, LSL #4 @ Select the dither table to use (r1 = dither, 16 bytes per table)
michael@0 83 LDMIA r0, {r0-r9} @ Load the ten ctx fields; from here r0/r1 hold rgb_row/y_row, not ctx/dither
michael@0 84 @ Set up image index registers.
michael@0 85 ADD r12,r8, r8 @ r12 = 2*source_dx_q16
michael@0 86 VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
michael@0 87 VDUP.32 D17,r12
michael@0 88 ADD r12,r12,r12 @ r12 = 4*source_dx_q16
michael@0 89 VTRN.32 D16,D17 @ Q8 = < 2| 0| 2| 0>*source_dx_q16
michael@0 90 VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
michael@0 91 ADD r12,r12,r12 @ r12 = 8*source_dx_q16
michael@0 92 VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16
michael@0 93 VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16
michael@0 94 CMP r8, #0 @ If source_dx_q16 is negative...
michael@0 95 VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
michael@0 96 ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block
michael@0 97 VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
michael@0 98 SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use)
michael@0 99 VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
michael@0 100 VDUP.I32 Q9, r8 @ Q9 = < 1| 1| 1| 1>*source_dx_q16
michael@0 101 VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
michael@0 102 VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
michael@0 103 VLD1.64 {D30,D31},[r14,:128] @ Load the selected dither table: D30 = bias_R and bias_G pairs, D31[0] = bias_B pair
michael@0 104 VMOV.I8 D28,#52 @ D28 = 52, the Cr multiplier subtracted for G
michael@0 105 VMOV.I8 D29,#129 @ D29 = 129, the Cb multiplier added for B
michael@0 106 @ The basic idea here is to do aligned loads of a block of data and then
michael@0 107 @ index into it using VTBL to extract the data from the source X
michael@0 108 @ coordinate corresponding to each destination pixel.
michael@0 109 @ This is significantly less code and significantly fewer cycles than doing
michael@0 110 @ a series of single-lane loads, but it means that the X step between
michael@0 111 @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
michael@0 112 @ that we could read 8 pixels from a single aligned 32-byte block of data.
michael@0 113 @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
michael@0 114 @ separated into even pixels and odd pixels to make extracting offsets and
michael@0 115 @ weights easier.
michael@0 116 @ We then pull out two bytes from the middle of each coordinate: the top
michael@0 117 @ byte corresponds to the integer part of the X coordinate, and the bottom
michael@0 118 @ byte corresponds to the weight to use for bilinear blending.
michael@0 119 @ These are separated out into different registers with VTRN.
michael@0 120 @ Then by subtracting the integer X coordinate of the first pixel in the
michael@0 121 @ data block we loaded, we produce an index register suitable for use by
michael@0 122 @ VTBL.
michael@0 123 s42xbily_neon_loop:
michael@0 124 @ Load the Y' data.
michael@0 125 MOV r12,r7, ASR #16
michael@0 126 VRSHRN.S32 D16,Q0, #8
michael@0 127 AND r12,r12,#~15 @ Read 16-byte aligned blocks
michael@0 128 VDUP.I8 D20,r12
michael@0 129 ADD r12,r1, r12 @ r12 = y_row+(source_x&~15)
michael@0 130 VRSHRN.S32 D17,Q1, #8
michael@0 131 PLD [r12,#64]
michael@0 132 VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
michael@0 133 ADD r14,r7, r8, LSL #3
michael@0 134 VRSHRN.S32 D18,Q2, #8
michael@0 135 MOV r14,r14,ASR #16
michael@0 136 VRSHRN.S32 D19,Q3, #8
michael@0 137 AND r14,r14,#~15 @ Read 16-byte aligned blocks
michael@0 138 VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
michael@0 139 PLD [r12,#64]
michael@0 140 VDUP.I8 D21,r14
michael@0 141 ADD r14,r1, r14 @ r14 = y_row+(source_x&~15) for the second group of 8 pixels
michael@0 142 VMOV.I8 Q13,#1
michael@0 143 PLD [r14,#64]
michael@0 144 VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
michael@0 145 @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
michael@0 146 VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded.
michael@0 147 @ First 8 Y' pixels
michael@0 148 VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
michael@0 149 VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
michael@0 150 VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x
michael@0 151 VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
michael@0 152 VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
michael@0 153 @ Next 8 Y' pixels
michael@0 154 VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
michael@0 155 VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row
michael@0 156 PLD [r14,#64]
michael@0 157 VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
michael@0 158 VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
michael@0 159 VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
michael@0 160 VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
michael@0 161 @ Blend Y'.
michael@0 162 VDUP.I16 Q9, r4 @ Load the y weights.
michael@0 163 VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a
michael@0 164 VSUBL.U8 Q5, D25,D21
michael@0 165 VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b
michael@0 166 VSUBL.U8 Q7, D27,D23
michael@0 167 VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight
michael@0 168 VMUL.S16 Q5, Q5, Q9
michael@0 169 VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight
michael@0 170 VMUL.S16 Q7, Q7, Q9
michael@0 171 VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits.
michael@0 172 VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW.
michael@0 173 VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8
michael@0 174 VRSHRN.S16 D9, Q5, #8
michael@0 175 VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8
michael@0 176 VRSHRN.S16 D13,Q7, #8
michael@0 177 VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8)
michael@0 178 VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8)
michael@0 179 VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a
michael@0 180 VSUBL.U8 Q5, D23,D21
michael@0 181 VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight
michael@0 182 VMUL.S16 Q5, Q5, Q13
michael@0 183 VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8
michael@0 184 ADD r12,r7, r9
michael@0 185 VRSHRN.S16 D9, Q5, #8
michael@0 186 MOV r12,r12,ASR #17 @ Chroma block index = (x+source_uv_xoffs_q16)>>17
michael@0 187 VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8)
michael@0 188 @ Start extracting the chroma x coordinates, and load Cb and Cr.
michael@0 189 AND r12,r12,#~15 @ Read 16-byte aligned blocks
michael@0 190 VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4
michael@0 191 ADD r14,r2, r12
michael@0 192 VADD.I32 Q10,Q0, Q9
michael@0 193 VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
michael@0 194 PLD [r14,#64]
michael@0 195 VADD.I32 Q11,Q1, Q9
michael@0 196 ADD r14,r3, r12
michael@0 197 VADD.I32 Q12,Q2, Q9
michael@0 198 VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
michael@0 199 PLD [r14,#64]
michael@0 200 VADD.I32 Q13,Q3, Q9
michael@0 201 VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
michael@0 202 VRSHRN.S32 D21,Q11,#9
michael@0 203 VDUP.I8 Q9, r12
michael@0 204 VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
michael@0 205 VRSHRN.S32 D23,Q13,#9
michael@0 206 @ We don't actually need the x weights, but we get them for free.
michael@0 207 @ Free ALU slot
michael@0 208 VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
michael@0 209 @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
michael@0 210 VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded.
michael@0 211 VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
michael@0 212 VMOV.I8 D24,#74
michael@0 213 VTBL.8 D19,{D8, D9, D10,D11},D23
michael@0 214 VMOV.I8 D26,#102
michael@0 215 VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
michael@0 216 VMOV.I8 D27,#25
michael@0 217 VTBL.8 D21,{D12,D13,D14,D15},D23
michael@0 218 @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
michael@0 219 @ We use VDUP to expand constants, because it's a permute instruction, so
michael@0 220 @ it can dual issue on the A8.
michael@0 221 SUBS r6, r6, #16 @ width -= 16
michael@0 222 VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74
michael@0 223 VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G
michael@0 224 VMULL.U8 Q5, D17,D24
michael@0 225 VDUP.32 Q7, D30[1]
michael@0 226 VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G
michael@0 227 VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R
michael@0 228 VMLSL.U8 Q7, D19,D27
michael@0 229 VDUP.32 Q12,D30[0]
michael@0 230 VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R
michael@0 231 VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B
michael@0 232 VMLAL.U8 Q12,D21,D26
michael@0 233 VDUP.32 Q13,D31[0]
michael@0 234 VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B
michael@0 235 VMLAL.U8 Q13,D19,D29
michael@0 236 VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G
michael@0 237 VMLSL.U8 Q7, D21,D28
michael@0 238 VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R
michael@0 239 VADD.S16 Q12,Q5, Q12
michael@0 240 VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cb+bias_B
michael@0 241 VQADD.S16 Q13,Q5, Q13
michael@0 242 VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
michael@0 243 VADD.S16 Q7, Q5, Q7
michael@0 244 @ Push each value to the top of its word and saturate it.
michael@0 245 VQSHLU.S16 Q11,Q11,#2
michael@0 246 VQSHLU.S16 Q12,Q12,#2
michael@0 247 VQSHLU.S16 Q6, Q6, #2
michael@0 248 VQSHLU.S16 Q7, Q7, #2
michael@0 249 VQSHLU.S16 Q8, Q8, #2
michael@0 250 VQSHLU.S16 Q13,Q13,#2
michael@0 251 @ Merge G and B into R.
michael@0 252 VSRI.U16 Q11,Q6, #5
michael@0 253 VSRI.U16 Q12,Q7, #5
michael@0 254 VSRI.U16 Q11,Q8, #11
michael@0 255 MOV r14,r8, LSL #4 @ r14 = 16*source_dx_q16, the x advance per loop pass
michael@0 256 VSRI.U16 Q12,Q13,#11
michael@0 257 BLT s42xbily_neon_tail @ Flags are still those of the SUBS above: fewer than 16 pixels were left
michael@0 258 VDUP.I32 Q13,r14
michael@0 259 @ Store the result.
michael@0 260 VST1.16 {D22,D23,D24,D25},[r0]!
michael@0 261 BEQ s42xbily_neon_done @ Flags are still those of the SUBS above: those were the last 16 pixels
michael@0 262 @ Advance the x coordinates.
michael@0 263 VADD.I32 Q0, Q0, Q13
michael@0 264 VADD.I32 Q1, Q1, Q13
michael@0 265 ADD r7, r14
michael@0 266 VADD.I32 Q2, Q2, Q13
michael@0 267 VADD.I32 Q3, Q3, Q13
michael@0 268 B s42xbily_neon_loop
michael@0 269 s42xbily_neon_tail:
michael@0 270 @ We have between 1 and 15 pixels left to write.
michael@0 271 @ -r6 == the number of pixels we need to skip writing.
michael@0 272 @ Adjust r0 to point to the last one we need to write, because we're going
michael@0 273 @ to write them in reverse order.
michael@0 274 ADD r0, r0, r6, LSL #1
michael@0 275 MOV r14,#-2 @ Post-index step for the stores below: back up one 16-bit pixel each time
michael@0 276 ADD r0, r0, #30
michael@0 277 @ Skip past the ones we don't need to write.
michael@0 278 SUB PC, PC, r6, LSL #2 @ Jump forward 4 bytes per skipped pixel; assumes ARM (not Thumb) state, where PC reads as this instruction's address+8
michael@0 279 ORR r0, r0, r0 @ No-op filler so that the PC value read above lines up with the first store
michael@0 280 VST1.16 {D25[3]},[r0,:16],r14
michael@0 281 VST1.16 {D25[2]},[r0,:16],r14
michael@0 282 VST1.16 {D25[1]},[r0,:16],r14
michael@0 283 VST1.16 {D25[0]},[r0,:16],r14
michael@0 284 VST1.16 {D24[3]},[r0,:16],r14
michael@0 285 VST1.16 {D24[2]},[r0,:16],r14
michael@0 286 VST1.16 {D24[1]},[r0,:16],r14
michael@0 287 VST1.16 {D24[0]},[r0,:16],r14
michael@0 288 VST1.16 {D23[3]},[r0,:16],r14
michael@0 289 VST1.16 {D23[2]},[r0,:16],r14
michael@0 290 VST1.16 {D23[1]},[r0,:16],r14
michael@0 291 VST1.16 {D23[0]},[r0,:16],r14
michael@0 292 VST1.16 {D22[3]},[r0,:16],r14
michael@0 293 VST1.16 {D22[2]},[r0,:16],r14
michael@0 294 VST1.16 {D22[1]},[r0,:16],r14
michael@0 295 VST1.16 {D22[0]},[r0,:16]
michael@0 296 s42xbily_neon_done:
michael@0 297 VPOP {Q4-Q7} @ 16 words.
michael@0 298 LDMFD r13!,{r4-r9,PC} @ 7 words (r4-r9 and the return address into PC).
michael@0 299 .fnend
michael@0 300 .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
michael@0 301
michael@0 302 #if defined(__ELF__)&&defined(__linux__)
michael@0 303 .section .note.GNU-stack,"",%progbits @ Empty marker section: by GNU toolchain convention, tells the linker this object does not need an executable stack
michael@0 304 #endif

mercurial