gfx/cairo/libpixman/src/pixman-arm-neon-asm.h

changeset 6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1204 @@
     1.4 +/*
     1.5 + * Copyright © 2009 Nokia Corporation
     1.6 + *
     1.7 + * Permission is hereby granted, free of charge, to any person obtaining a
     1.8 + * copy of this software and associated documentation files (the "Software"),
     1.9 + * to deal in the Software without restriction, including without limitation
    1.10 + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.11 + * and/or sell copies of the Software, and to permit persons to whom the
    1.12 + * Software is furnished to do so, subject to the following conditions:
    1.13 + *
    1.14 + * The above copyright notice and this permission notice (including the next
    1.15 + * paragraph) shall be included in all copies or substantial portions of the
    1.16 + * Software.
    1.17 + *
    1.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    1.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.24 + * DEALINGS IN THE SOFTWARE.
    1.25 + *
    1.26 + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    1.27 + */
    1.28 +
    1.29 +/*
    1.30 + * This file contains a macro ('generate_composite_function') which can
    1.31 + * construct 2D image processing functions, based on a common template.
     1.33 + * Any combination of source, destination and mask images with 8bpp,
     1.34 + * 16bpp, 24bpp or 32bpp color formats is supported.
    1.34 + *
    1.35 + * This macro takes care of:
    1.36 + *  - handling of leading and trailing unaligned pixels
    1.37 + *  - doing most of the work related to L2 cache preload
     1.38 + *  - encouraging the use of software pipelining for better instruction
     1.39 + *    scheduling
    1.40 + *
    1.41 + * The user of this macro has to provide some configuration parameters
    1.42 + * (bit depths for the images, prefetch distance, etc.) and a set of
     1.43 + * macros which implement the basic code chunks responsible for
     1.44 + * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
     1.45 + * examples.
    1.46 + *
    1.47 + * TODO:
    1.48 + *  - try overlapped pixel method (from Ian Rickards) when processing
    1.49 + *    exactly two blocks of pixels
    1.50 + *  - maybe add an option to do reverse scanline processing
    1.51 + */
    1.52 +
    1.53 +/*
     1.54 + * Bit flags for the 'generate_composite_function' macro which are used
     1.55 + * to tune the behavior of the generated functions.
    1.56 + */
    1.57 +.set FLAG_DST_WRITEONLY,       0
    1.58 +.set FLAG_DST_READWRITE,       1
    1.59 +.set FLAG_DEINTERLEAVE_32BPP,  2
    1.60 +
    1.61 +/*
     1.62 + * Offset on the stack at which the mask and source pointer/stride can be
     1.63 + * accessed from the 'init' macro. This is useful for special handling of a solid mask.
    1.64 + */
    1.65 +.set ARGS_STACK_OFFSET,        40
    1.66 +
    1.67 +/*
     1.68 + * Constants for selecting the preferred prefetch type.
    1.69 + */
    1.70 +.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
    1.71 +.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
    1.72 +.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
    1.73 +
    1.74 +/*
    1.75 + * Definitions of supplementary pixld/pixst macros (for partial load/store of
    1.76 + * pixel data).
    1.77 + */
    1.78 +
    1.79 +.macro pixldst1 op, elem_size, reg1, mem_operand, abits
    1.80 +.if abits > 0
    1.81 +    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
    1.82 +.else
    1.83 +    op&.&elem_size {d&reg1}, [&mem_operand&]!
    1.84 +.endif
    1.85 +.endm
    1.86 +
    1.87 +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
    1.88 +.if abits > 0
    1.89 +    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
    1.90 +.else
    1.91 +    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
    1.92 +.endif
    1.93 +.endm
    1.94 +
    1.95 +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
    1.96 +.if abits > 0
    1.97 +    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
    1.98 +.else
    1.99 +    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
   1.100 +.endif
   1.101 +.endm
   1.102 +
   1.103 +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
   1.104 +    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
   1.105 +.endm
   1.106 +
   1.107 +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
   1.108 +    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
   1.109 +.endm
   1.110 +
   1.111 +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
   1.112 +    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
   1.113 +.endm
   1.114 +
   1.115 +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
   1.116 +.if numbytes == 32
   1.117 +    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
   1.118 +                              %(basereg+6), %(basereg+7), mem_operand, abits
   1.119 +.elseif numbytes == 16
   1.120 +    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
   1.121 +.elseif numbytes == 8
   1.122 +    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
   1.123 +.elseif numbytes == 4
   1.124 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
   1.125 +        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
   1.126 +    .elseif elem_size == 16
   1.127 +        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
   1.128 +        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
   1.129 +    .else
   1.130 +        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
   1.131 +        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
   1.132 +        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
   1.133 +        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
   1.134 +    .endif
   1.135 +.elseif numbytes == 2
   1.136 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
   1.137 +        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
   1.138 +    .else
   1.139 +        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
   1.140 +        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
   1.141 +    .endif
   1.142 +.elseif numbytes == 1
   1.143 +    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
   1.144 +.else
   1.145 +    .error "unsupported size: numbytes"
   1.146 +.endif
   1.147 +.endm
   1.148 +
   1.149 +.macro pixld numpix, bpp, basereg, mem_operand, abits=0
   1.150 +.if bpp > 0
   1.151 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   1.152 +    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
   1.153 +                      %(basereg+6), %(basereg+7), mem_operand, abits
   1.154 +.elseif (bpp == 24) && (numpix == 8)
   1.155 +    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   1.156 +.elseif (bpp == 24) && (numpix == 4)
   1.157 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   1.158 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   1.159 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   1.160 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   1.161 +.elseif (bpp == 24) && (numpix == 2)
   1.162 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   1.163 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   1.164 +.elseif (bpp == 24) && (numpix == 1)
   1.165 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   1.166 +.else
   1.167 +    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
   1.168 +.endif
   1.169 +.endif
   1.170 +.endm
   1.171 +
   1.172 +.macro pixst numpix, bpp, basereg, mem_operand, abits=0
   1.173 +.if bpp > 0
   1.174 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   1.175 +    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
   1.176 +                      %(basereg+6), %(basereg+7), mem_operand, abits
   1.177 +.elseif (bpp == 24) && (numpix == 8)
   1.178 +    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   1.179 +.elseif (bpp == 24) && (numpix == 4)
   1.180 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   1.181 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   1.182 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   1.183 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   1.184 +.elseif (bpp == 24) && (numpix == 2)
   1.185 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   1.186 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   1.187 +.elseif (bpp == 24) && (numpix == 1)
   1.188 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   1.189 +.else
   1.190 +    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
   1.191 +.endif
   1.192 +.endif
   1.193 +.endm
   1.194 +
   1.195 +.macro pixld_a numpix, bpp, basereg, mem_operand
   1.196 +.if (bpp * numpix) <= 128
   1.197 +    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   1.198 +.else
   1.199 +    pixld numpix, bpp, basereg, mem_operand, 128
   1.200 +.endif
   1.201 +.endm
   1.202 +
   1.203 +.macro pixst_a numpix, bpp, basereg, mem_operand
   1.204 +.if (bpp * numpix) <= 128
   1.205 +    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   1.206 +.else
   1.207 +    pixst numpix, bpp, basereg, mem_operand, 128
   1.208 +.endif
   1.209 +.endm
   1.210 +
   1.211 +/*
   1.212 + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
   1.213 + * aliases to be defined)
   1.214 + */
   1.215 +.macro pixld1_s elem_size, reg1, mem_operand
   1.216 +.if elem_size == 16
   1.217 +    mov     TMP1, VX, asr #16
   1.218 +    adds    VX, VX, UNIT_X
   1.219 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.220 +    bpl     5b
   1.221 +    add     TMP1, mem_operand, TMP1, asl #1
   1.222 +    mov     TMP2, VX, asr #16
   1.223 +    adds    VX, VX, UNIT_X
   1.224 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.225 +    bpl     5b
   1.226 +    add     TMP2, mem_operand, TMP2, asl #1
   1.227 +    vld1.16 {d&reg1&[0]}, [TMP1, :16]
   1.228 +    mov     TMP1, VX, asr #16
   1.229 +    adds    VX, VX, UNIT_X
   1.230 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.231 +    bpl     5b
   1.232 +    add     TMP1, mem_operand, TMP1, asl #1
   1.233 +    vld1.16 {d&reg1&[1]}, [TMP2, :16]
   1.234 +    mov     TMP2, VX, asr #16
   1.235 +    adds    VX, VX, UNIT_X
   1.236 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.237 +    bpl     5b
   1.238 +    add     TMP2, mem_operand, TMP2, asl #1
   1.239 +    vld1.16 {d&reg1&[2]}, [TMP1, :16]
   1.240 +    vld1.16 {d&reg1&[3]}, [TMP2, :16]
   1.241 +.elseif elem_size == 32
   1.242 +    mov     TMP1, VX, asr #16
   1.243 +    adds    VX, VX, UNIT_X
   1.244 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.245 +    bpl     5b
   1.246 +    add     TMP1, mem_operand, TMP1, asl #2
   1.247 +    mov     TMP2, VX, asr #16
   1.248 +    adds    VX, VX, UNIT_X
   1.249 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.250 +    bpl     5b
   1.251 +    add     TMP2, mem_operand, TMP2, asl #2
   1.252 +    vld1.32 {d&reg1&[0]}, [TMP1, :32]
   1.253 +    vld1.32 {d&reg1&[1]}, [TMP2, :32]
   1.254 +.else
   1.255 +    .error "unsupported"
   1.256 +.endif
   1.257 +.endm
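
The mov/adds/subpls/bpl sequences above all perform the same basic step. A rough
scalar C sketch of one such fetch step (names mirror the register aliases; this is
an illustration, not a translation of the whole macro):

    #include <stdint.h>

    /* One nearest-neighbour fetch step in 16.16 fixed point: take the
     * integer part of VX as the source pixel index, advance VX by UNIT_X,
     * then wrap it by SRC_WIDTH_FIXED while it stays non-negative
     * (the subpls/bpl loop). */
    static int32_t nearest_fetch_index (int32_t *vx, int32_t unit_x,
                                        int32_t src_width_fixed)
    {
        int32_t idx = *vx >> 16;        /* mov TMP, VX, asr #16 */
        *vx += unit_x;                  /* adds VX, VX, UNIT_X  */
        while (*vx >= 0)                /* subpls ... / bpl     */
            *vx -= src_width_fixed;
        return idx;
    }

The returned index is then scaled by the element size (the 'asl #1' / 'asl #2'
when forming TMP1/TMP2) to address the 16bpp or 32bpp source pixel.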
   1.258 +
   1.259 +.macro pixld2_s elem_size, reg1, reg2, mem_operand
   1.260 +.if 0 /* elem_size == 32 */
   1.261 +    mov     TMP1, VX, asr #16
   1.262 +    add     VX, VX, UNIT_X, asl #1
   1.263 +    add     TMP1, mem_operand, TMP1, asl #2
   1.264 +    mov     TMP2, VX, asr #16
   1.265 +    sub     VX, VX, UNIT_X
   1.266 +    add     TMP2, mem_operand, TMP2, asl #2
   1.267 +    vld1.32 {d&reg1&[0]}, [TMP1, :32]
   1.268 +    mov     TMP1, VX, asr #16
   1.269 +    add     VX, VX, UNIT_X, asl #1
   1.270 +    add     TMP1, mem_operand, TMP1, asl #2
   1.271 +    vld1.32 {d&reg2&[0]}, [TMP2, :32]
   1.272 +    mov     TMP2, VX, asr #16
   1.273 +    add     VX, VX, UNIT_X
   1.274 +    add     TMP2, mem_operand, TMP2, asl #2
   1.275 +    vld1.32 {d&reg1&[1]}, [TMP1, :32]
   1.276 +    vld1.32 {d&reg2&[1]}, [TMP2, :32]
   1.277 +.else
   1.278 +    pixld1_s elem_size, reg1, mem_operand
   1.279 +    pixld1_s elem_size, reg2, mem_operand
   1.280 +.endif
   1.281 +.endm
   1.282 +
   1.283 +.macro pixld0_s elem_size, reg1, idx, mem_operand
   1.284 +.if elem_size == 16
   1.285 +    mov     TMP1, VX, asr #16
   1.286 +    adds    VX, VX, UNIT_X
   1.287 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.288 +    bpl     5b
   1.289 +    add     TMP1, mem_operand, TMP1, asl #1
   1.290 +    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
   1.291 +.elseif elem_size == 32
   1.292 +    mov     TMP1, VX, asr #16
   1.293 +    adds    VX, VX, UNIT_X
   1.294 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   1.295 +    bpl     5b
   1.296 +    add     TMP1, mem_operand, TMP1, asl #2
   1.297 +    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
   1.298 +.endif
   1.299 +.endm
   1.300 +
   1.301 +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
   1.302 +.if numbytes == 32
   1.303 +    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
   1.304 +    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
   1.305 +    pixdeinterleave elem_size, %(basereg+4)
   1.306 +.elseif numbytes == 16
   1.307 +    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
   1.308 +.elseif numbytes == 8
   1.309 +    pixld1_s elem_size, %(basereg+1), mem_operand
   1.310 +.elseif numbytes == 4
   1.311 +    .if elem_size == 32
   1.312 +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
   1.313 +    .elseif elem_size == 16
   1.314 +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
   1.315 +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
   1.316 +    .else
   1.317 +        pixld0_s elem_size, %(basereg+0), 4, mem_operand
   1.318 +        pixld0_s elem_size, %(basereg+0), 5, mem_operand
   1.319 +        pixld0_s elem_size, %(basereg+0), 6, mem_operand
   1.320 +        pixld0_s elem_size, %(basereg+0), 7, mem_operand
   1.321 +    .endif
   1.322 +.elseif numbytes == 2
   1.323 +    .if elem_size == 16
   1.324 +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
   1.325 +    .else
   1.326 +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
   1.327 +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
   1.328 +    .endif
   1.329 +.elseif numbytes == 1
   1.330 +    pixld0_s elem_size, %(basereg+0), 1, mem_operand
   1.331 +.else
   1.332 +    .error "unsupported size: numbytes"
   1.333 +.endif
   1.334 +.endm
   1.335 +
   1.336 +.macro pixld_s numpix, bpp, basereg, mem_operand
   1.337 +.if bpp > 0
   1.338 +    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
   1.339 +.endif
   1.340 +.endm
   1.341 +
   1.342 +.macro vuzp8 reg1, reg2
   1.343 +    vuzp.8 d&reg1, d&reg2
   1.344 +.endm
   1.345 +
   1.346 +.macro vzip8 reg1, reg2
   1.347 +    vzip.8 d&reg1, d&reg2
   1.348 +.endm
   1.349 +
   1.350 +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   1.351 +.macro pixdeinterleave bpp, basereg
   1.352 +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   1.353 +    vuzp8 %(basereg+0), %(basereg+1)
   1.354 +    vuzp8 %(basereg+2), %(basereg+3)
   1.355 +    vuzp8 %(basereg+1), %(basereg+3)
   1.356 +    vuzp8 %(basereg+0), %(basereg+2)
   1.357 +.endif
   1.358 +.endm
   1.359 +
   1.360 +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   1.361 +.macro pixinterleave bpp, basereg
   1.362 +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   1.363 +    vzip8 %(basereg+0), %(basereg+2)
   1.364 +    vzip8 %(basereg+1), %(basereg+3)
   1.365 +    vzip8 %(basereg+2), %(basereg+3)
   1.366 +    vzip8 %(basereg+0), %(basereg+1)
   1.367 +.endif
   1.368 +.endm
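
A scalar view of what this reshuffle does, assuming the little-endian B, G, R, A
byte order named in the comments above (illustration only; the actual NEON code
performs it with vuzp/vzip on registers):

    #include <stdint.h>

    /* Split eight packed 32bpp pixels (array of structs) into four
     * per-channel planes (struct of arrays); pixinterleave performs the
     * inverse shuffle before the data is written back. */
    static void deinterleave_8_pixels (const uint8_t src[32],
                                       uint8_t b[8], uint8_t g[8],
                                       uint8_t r[8], uint8_t a[8])
    {
        for (int i = 0; i < 8; i++) {
            b[i] = src[4 * i + 0];
            g[i] = src[4 * i + 1];
            r[i] = src[4 * i + 2];
            a[i] = src[4 * i + 3];
        }
    }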
   1.369 +
   1.370 +/*
   1.371 + * This is a macro for implementing cache preload. The main idea is that
    1.372 + * cache preload logic is mostly independent of the rest of the pixel
   1.373 + * processing code. It starts at the top left pixel and moves forward
   1.374 + * across pixels and can jump across scanlines. Prefetch distance is
   1.375 + * handled in an 'incremental' way: it starts from 0 and advances to the
   1.376 + * optimal distance over time. After reaching optimal prefetch distance,
   1.377 + * it is kept constant. There are some checks which prevent prefetching
    1.378 + * unneeded pixel lines below the image (it can still prefetch a bit of
    1.379 + * extra data past the right edge of the image - not a big issue, and it
    1.380 + * may actually be helpful when rendering text glyphs). An additional trick
    1.381 + * is the use of an LDR instruction for the prefetch instead of PLD when
    1.382 + * moving to the next line: there is a high chance of a TLB miss in this
    1.383 + * case, and PLD would be useless.
   1.384 + *
   1.385 + * This sounds like it may introduce a noticeable overhead (when working with
   1.386 + * fully cached data). But in reality, due to having a separate pipeline and
    1.387 + * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
   1.388 + * execute simultaneously with NEON and be completely shadowed by it. Thus
   1.389 + * we get no performance overhead at all (*). This looks like a very nice
   1.390 + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
    1.391 + * but can still implement some rather advanced prefetch logic in software
   1.392 + * for almost zero cost!
   1.393 + *
   1.394 + * (*) The overhead of the prefetcher is visible when running some trivial
    1.395 + * pixel processing like a simple copy. Anyway, having prefetch is a must
    1.396 + * when working with graphics data.
   1.397 + */
   1.398 +.macro PF a, x:vararg
   1.399 +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
   1.400 +    a x
   1.401 +.endif
   1.402 +.endm
   1.403 +
   1.404 +.macro cache_preload std_increment, boost_increment
   1.405 +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
   1.406 +.if regs_shortage
   1.407 +    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
   1.408 +.endif
   1.409 +.if std_increment != 0
   1.410 +    PF add PF_X, PF_X, #std_increment
   1.411 +.endif
   1.412 +    PF tst PF_CTL, #0xF
   1.413 +    PF addne PF_X, PF_X, #boost_increment
   1.414 +    PF subne PF_CTL, PF_CTL, #1
   1.415 +    PF cmp PF_X, ORIG_W
   1.416 +.if src_bpp_shift >= 0
   1.417 +    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.418 +.endif
   1.419 +.if dst_r_bpp != 0
   1.420 +    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.421 +.endif
   1.422 +.if mask_bpp_shift >= 0
   1.423 +    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1.424 +.endif
   1.425 +    PF subge PF_X, PF_X, ORIG_W
   1.426 +    PF subges PF_CTL, PF_CTL, #0x10
   1.427 +.if src_bpp_shift >= 0
   1.428 +    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.429 +.endif
   1.430 +.if dst_r_bpp != 0
   1.431 +    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.432 +.endif
   1.433 +.if mask_bpp_shift >= 0
   1.434 +    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1.435 +.endif
   1.436 +.endif
   1.437 +.endm
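
A rough scalar model of one cache_preload step for the source image (the
destination and mask get the same treatment). Variable names mirror the register
aliases, __builtin_prefetch stands in for PLD, and the final pointer bump is what
the ldrgeb with writeback does; this is an approximation for readability, not a
line-for-line translation:

    #include <stdint.h>

    static void cache_preload_step (int32_t *pf_x, int32_t *pf_ctl,
                                    const uint8_t **pf_src,
                                    int32_t src_stride, int32_t orig_w,
                                    int std_inc, int boost_inc, int bpp_shift)
    {
        *pf_x += std_inc;                  /* keep pace with the main loop     */
        if (*pf_ctl & 0xf) {               /* still ramping up the distance    */
            *pf_x += boost_inc;
            *pf_ctl -= 1;
        }
        /* PLD on the current line (may run slightly past the right edge) */
        __builtin_prefetch (*pf_src + ((intptr_t) *pf_x << bpp_shift));
        if (*pf_x >= orig_w) {             /* crossed the end of the scanline  */
            *pf_x   -= orig_w;
            *pf_ctl -= 0x10;               /* one less scanline to prefetch    */
            if (*pf_ctl >= 0)              /* don't run below the image        */
                *pf_src += src_stride << bpp_shift;  /* done with LDR in the
                                                        asm to force a TLB fill */
        }
    }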
   1.438 +
   1.439 +.macro cache_preload_simple
   1.440 +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
   1.441 +.if src_bpp > 0
   1.442 +    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
   1.443 +.endif
   1.444 +.if dst_r_bpp > 0
   1.445 +    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
   1.446 +.endif
   1.447 +.if mask_bpp > 0
   1.448 +    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
   1.449 +.endif
   1.450 +.endif
   1.451 +.endm
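
For comparison, the SIMPLE variant is just a fixed-distance PLD ahead of each
read pointer, issued once per pixel block; in C terms (sketch):

    /* GCC's __builtin_prefetch stands in for PLD; the distance is in pixels. */
    static inline void simple_prefetch (const unsigned char *p,
                                        int distance_pixels, int bpp)
    {
        __builtin_prefetch (p + distance_pixels * bpp / 8);
    }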
   1.452 +
   1.453 +.macro fetch_mask_pixblock
   1.454 +    pixld       pixblock_size, mask_bpp, \
   1.455 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1.456 +.endm
   1.457 +
   1.458 +/*
    1.459 + * Macro which is used to process leading pixels until the destination
    1.460 + * pointer is properly aligned (at a 16-byte boundary). When the destination
    1.461 + * buffer uses a 16bpp format this brings little benefit, and for 24bpp
    1.462 + * destinations the step is skipped entirely (note the dst_w_bpp != 24 guard below).
   1.462 + */
   1.463 +.macro ensure_destination_ptr_alignment process_pixblock_head, \
   1.464 +                                        process_pixblock_tail, \
   1.465 +                                        process_pixblock_tail_head
   1.466 +.if dst_w_bpp != 24
   1.467 +    tst         DST_R, #0xF
   1.468 +    beq         2f
   1.469 +
   1.470 +.irp lowbit, 1, 2, 4, 8, 16
   1.471 +local skip1
   1.472 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   1.473 +.if lowbit < 16 /* we don't need more than 16-byte alignment */
   1.474 +    tst         DST_R, #lowbit
   1.475 +    beq         1f
   1.476 +.endif
   1.477 +    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
   1.478 +    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
   1.479 +.if dst_r_bpp > 0
   1.480 +    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
   1.481 +.else
   1.482 +    add         DST_R, DST_R, #lowbit
   1.483 +.endif
   1.484 +    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
   1.485 +    sub         W, W, #(lowbit * 8 / dst_w_bpp)
   1.486 +1:
   1.487 +.endif
   1.488 +.endr
   1.489 +    pixdeinterleave src_bpp, src_basereg
   1.490 +    pixdeinterleave mask_bpp, mask_basereg
   1.491 +    pixdeinterleave dst_r_bpp, dst_r_basereg
   1.492 +
   1.493 +    process_pixblock_head
   1.494 +    cache_preload 0, pixblock_size
   1.495 +    cache_preload_simple
   1.496 +    process_pixblock_tail
   1.497 +
   1.498 +    pixinterleave dst_w_bpp, dst_w_basereg
   1.499 +.irp lowbit, 1, 2, 4, 8, 16
   1.500 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   1.501 +.if lowbit < 16 /* we don't need more than 16-byte alignment */
   1.502 +    tst         DST_W, #lowbit
   1.503 +    beq         1f
   1.504 +.endif
   1.505 +    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
   1.506 +1:
   1.507 +.endif
   1.508 +.endr
   1.509 +.endif
   1.510 +2:
   1.511 +.endm
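
The core of the first .irp block above, as a scalar C sketch: each set low bit of
the destination address is cleared by processing the matching number of pixels,
after which the pointer is 16-byte aligned. composite_pixels() is a hypothetical
stand-in for "load, process and store that many pixels":

    #include <stdint.h>

    extern void composite_pixels (uint8_t *dst, int npixels);  /* hypothetical */

    static void align_leading_pixels (uint8_t **dst, int *w, int dst_w_bpp)
    {
        for (int lowbit = 1; lowbit < 16; lowbit <<= 1) {
            if (dst_w_bpp > lowbit * 8)
                continue;                          /* chunk smaller than a pixel */
            if (((uintptr_t) *dst & lowbit) == 0)
                continue;                          /* this bit is already clear  */
            int npixels = lowbit * 8 / dst_w_bpp;
            composite_pixels (*dst, npixels);
            *dst += lowbit;
            *w   -= npixels;
        }
    }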
   1.512 +
   1.513 +/*
   1.514 + * Special code for processing up to (pixblock_size - 1) remaining
   1.515 + * trailing pixels. As SIMD processing performs operation on
   1.516 + * pixblock_size pixels, anything smaller than this has to be loaded
   1.517 + * and stored in a special way. Loading and storing of pixel data is
   1.518 + * performed in such a way that we fill some 'slots' in the NEON
   1.519 + * registers (some slots naturally are unused), then perform compositing
   1.520 + * operation as usual. In the end, the data is taken from these 'slots'
   1.521 + * and saved to memory.
   1.522 + *
    1.523 + * cache_preload_flag - prefetch is suppressed when this
    1.524 + *                      is set to 0
    1.525 + * dst_aligned_flag   - selects whether the destination buffer
    1.526 + *                      is treated as aligned
   1.527 + */
   1.528 +.macro process_trailing_pixels cache_preload_flag, \
   1.529 +                               dst_aligned_flag, \
   1.530 +                               process_pixblock_head, \
   1.531 +                               process_pixblock_tail, \
   1.532 +                               process_pixblock_tail_head
   1.533 +    tst         W, #(pixblock_size - 1)
   1.534 +    beq         2f
   1.535 +.irp chunk_size, 16, 8, 4, 2, 1
   1.536 +.if pixblock_size > chunk_size
   1.537 +    tst         W, #chunk_size
   1.538 +    beq         1f
   1.539 +    pixld_src   chunk_size, src_bpp, src_basereg, SRC
   1.540 +    pixld       chunk_size, mask_bpp, mask_basereg, MASK
   1.541 +.if dst_aligned_flag != 0
   1.542 +    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   1.543 +.else
   1.544 +    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   1.545 +.endif
   1.546 +.if cache_preload_flag != 0
   1.547 +    PF add      PF_X, PF_X, #chunk_size
   1.548 +.endif
   1.549 +1:
   1.550 +.endif
   1.551 +.endr
   1.552 +    pixdeinterleave src_bpp, src_basereg
   1.553 +    pixdeinterleave mask_bpp, mask_basereg
   1.554 +    pixdeinterleave dst_r_bpp, dst_r_basereg
   1.555 +
   1.556 +    process_pixblock_head
   1.557 +.if cache_preload_flag != 0
   1.558 +    cache_preload 0, pixblock_size
   1.559 +    cache_preload_simple
   1.560 +.endif
   1.561 +    process_pixblock_tail
   1.562 +    pixinterleave dst_w_bpp, dst_w_basereg
   1.563 +.irp chunk_size, 16, 8, 4, 2, 1
   1.564 +.if pixblock_size > chunk_size
   1.565 +    tst         W, #chunk_size
   1.566 +    beq         1f
   1.567 +.if dst_aligned_flag != 0
   1.568 +    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   1.569 +.else
   1.570 +    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   1.571 +.endif
   1.572 +1:
   1.573 +.endif
   1.574 +.endr
   1.575 +2:
   1.576 +.endm
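
In scalar terms, the two .irp blocks above consume the remaining
w % pixblock_size pixels in power-of-two chunks, largest first, so that each
chunk maps onto one of the partial pixld/pixst variants. A sketch, with
composite_pixels() as a hypothetical helper as before:

    #include <stdint.h>

    extern void composite_pixels (uint8_t *dst, int npixels);  /* hypothetical */

    static void process_trailing (uint8_t *dst, int w,
                                  int pixblock_size, int dst_w_bpp)
    {
        int remaining = w & (pixblock_size - 1);
        for (int chunk = pixblock_size / 2; chunk >= 1; chunk /= 2) {
            if (remaining & chunk) {
                composite_pixels (dst, chunk);     /* only 'chunk' register
                                                      slots are actually used */
                dst += chunk * dst_w_bpp / 8;
            }
        }
    }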
   1.577 +
   1.578 +/*
    1.579 + * Macro which performs all the operations needed to switch to the next
    1.580 + * scanline and start the next loop iteration, unless all the scanlines
    1.581 + * have already been processed.
   1.582 + */
   1.583 +.macro advance_to_next_scanline start_of_loop_label
   1.584 +.if regs_shortage
   1.585 +    ldrd        W, [sp] /* load W and H (width and height) from stack */
   1.586 +.else
   1.587 +    mov         W, ORIG_W
   1.588 +.endif
   1.589 +    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
   1.590 +.if src_bpp != 0
   1.591 +    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
   1.592 +.endif
   1.593 +.if mask_bpp != 0
   1.594 +    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
   1.595 +.endif
   1.596 +.if (dst_w_bpp != 24)
   1.597 +    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
   1.598 +.endif
   1.599 +.if (src_bpp != 24) && (src_bpp != 0)
   1.600 +    sub         SRC, SRC, W, lsl #src_bpp_shift
   1.601 +.endif
   1.602 +.if (mask_bpp != 24) && (mask_bpp != 0)
   1.603 +    sub         MASK, MASK, W, lsl #mask_bpp_shift
   1.604 +.endif
   1.605 +    subs        H, H, #1
   1.606 +    mov         DST_R, DST_W
   1.607 +.if regs_shortage
   1.608 +    str         H, [sp, #4] /* save updated height to stack */
   1.609 +.endif
   1.610 +    bge         start_of_loop_label
   1.611 +.endm
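
For the non-24bpp formats the bookkeeping above boils down to the following
(strides are in pixels here, because the asm scales them by the bpp shift). A
sketch for the destination pointer; SRC and MASK are handled the same way when
present:

    #include <stdint.h>

    /* Returns nonzero while there are scanlines left to process. */
    static int next_scanline (uint32_t **dst, int32_t dst_stride,
                              int32_t *w, int32_t orig_w, int32_t *h)
    {
        *w    = orig_w;                 /* restore the width counter         */
        *dst += dst_stride - orig_w;    /* dst already advanced by w pixels,
                                           so this lands on the next row     */
        return --(*h) >= 0;
    }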
   1.612 +
   1.613 +/*
   1.614 + * Registers are allocated in the following way by default:
   1.615 + * d0, d1, d2, d3     - reserved for loading source pixel data
   1.616 + * d4, d5, d6, d7     - reserved for loading destination pixel data
   1.617 + * d24, d25, d26, d27 - reserved for loading mask pixel data
   1.618 + * d28, d29, d30, d31 - final destination pixel data for writeback to memory
   1.619 + */
   1.620 +.macro generate_composite_function fname, \
   1.621 +                                   src_bpp_, \
   1.622 +                                   mask_bpp_, \
   1.623 +                                   dst_w_bpp_, \
   1.624 +                                   flags, \
   1.625 +                                   pixblock_size_, \
   1.626 +                                   prefetch_distance, \
   1.627 +                                   init, \
   1.628 +                                   cleanup, \
   1.629 +                                   process_pixblock_head, \
   1.630 +                                   process_pixblock_tail, \
   1.631 +                                   process_pixblock_tail_head, \
   1.632 +                                   dst_w_basereg_ = 28, \
   1.633 +                                   dst_r_basereg_ = 4, \
   1.634 +                                   src_basereg_   = 0, \
   1.635 +                                   mask_basereg_  = 24
   1.636 +
   1.637 +    .func fname
   1.638 +    .global fname
   1.639 +    /* For ELF format also set function visibility to hidden */
   1.640 +#ifdef __ELF__
   1.641 +    .hidden fname
   1.642 +    .type fname, %function
   1.643 +#endif
   1.644 +fname:
   1.645 +    .fnstart
   1.646 +    .save       {r4-r12, lr}
   1.647 +    push        {r4-r12, lr}        /* save all registers */
   1.648 +
   1.649 +/*
    1.650 + * Select the prefetch type for this function. If the prefetch distance is
    1.651 + * set to 0, prefetch is disabled entirely; if one of the color formats
    1.652 + * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
   1.653 + */
   1.654 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
   1.655 +.if prefetch_distance == 0
   1.656 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   1.657 +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
   1.658 +        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
   1.659 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
   1.660 +.endif
   1.661 +
   1.662 +/*
   1.663 + * Make some macro arguments globally visible and accessible
   1.664 + * from other macros
   1.665 + */
   1.666 +    .set src_bpp, src_bpp_
   1.667 +    .set mask_bpp, mask_bpp_
   1.668 +    .set dst_w_bpp, dst_w_bpp_
   1.669 +    .set pixblock_size, pixblock_size_
   1.670 +    .set dst_w_basereg, dst_w_basereg_
   1.671 +    .set dst_r_basereg, dst_r_basereg_
   1.672 +    .set src_basereg, src_basereg_
   1.673 +    .set mask_basereg, mask_basereg_
   1.674 +
   1.675 +    .macro pixld_src x:vararg
   1.676 +        pixld x
   1.677 +    .endm
   1.678 +    .macro fetch_src_pixblock
   1.679 +        pixld_src   pixblock_size, src_bpp, \
   1.680 +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
   1.681 +    .endm
   1.682 +/*
   1.683 + * Assign symbolic names to registers
   1.684 + */
   1.685 +    W           .req        r0      /* width (is updated during processing) */
   1.686 +    H           .req        r1      /* height (is updated during processing) */
   1.687 +    DST_W       .req        r2      /* destination buffer pointer for writes */
   1.688 +    DST_STRIDE  .req        r3      /* destination image stride */
   1.689 +    SRC         .req        r4      /* source buffer pointer */
   1.690 +    SRC_STRIDE  .req        r5      /* source image stride */
   1.691 +    DST_R       .req        r6      /* destination buffer pointer for reads */
   1.692 +
   1.693 +    MASK        .req        r7      /* mask pointer */
   1.694 +    MASK_STRIDE .req        r8      /* mask stride */
   1.695 +
   1.696 +    PF_CTL      .req        r9      /* combined lines counter and prefetch */
   1.697 +                                    /* distance increment counter */
   1.698 +    PF_X        .req        r10     /* pixel index in a scanline for current */
    1.699 +                                    /* prefetch position */
   1.700 +    PF_SRC      .req        r11     /* pointer to source scanline start */
   1.701 +                                    /* for prefetch purposes */
   1.702 +    PF_DST      .req        r12     /* pointer to destination scanline start */
   1.703 +                                    /* for prefetch purposes */
   1.704 +    PF_MASK     .req        r14     /* pointer to mask scanline start */
   1.705 +                                    /* for prefetch purposes */
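
Given this register/stack layout (r0-r3 for the first four arguments, the rest at
[sp, #40]..[sp, #52] once {r4-r12, lr} have been pushed), a generated composite
function is effectively called with a C prototype along these lines; the name is a
placeholder, pointer types vary with the configured bpp, the src/mask arguments
are only meaningful when the corresponding bpp is nonzero, and strides are in
elements:

    #include <stdint.h>

    void pixman_composite_xxxx_asm_neon (int32_t   w,            /* r0        */
                                         int32_t   h,            /* r1        */
                                         uint32_t *dst,          /* r2        */
                                         int32_t   dst_stride,   /* r3        */
                                         uint32_t *src,          /* [sp, #40] */
                                         int32_t   src_stride,   /* [sp, #44] */
                                         uint32_t *mask,         /* [sp, #48] */
                                         int32_t   mask_stride); /* [sp, #52] */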
   1.706 +/*
   1.707 + * Check whether we have enough registers for all the local variables.
    1.708 + * If we don't have enough registers, the original width and height are
    1.709 + * kept on top of the stack (and the 'regs_shortage' variable is set to
    1.710 + * indicate this for the rest of the code). Even if there are enough
    1.711 + * registers, the allocation scheme may differ a bit depending on whether
    1.712 + * the source or mask is unused.
   1.713 + */
   1.714 +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
   1.715 +    ORIG_W      .req        r10     /* saved original width */
   1.716 +    DUMMY       .req        r12     /* temporary register */
   1.717 +    .set        regs_shortage, 0
   1.718 +.elseif mask_bpp == 0
   1.719 +    ORIG_W      .req        r7      /* saved original width */
   1.720 +    DUMMY       .req        r8      /* temporary register */
   1.721 +    .set        regs_shortage, 0
   1.722 +.elseif src_bpp == 0
   1.723 +    ORIG_W      .req        r4      /* saved original width */
   1.724 +    DUMMY       .req        r5      /* temporary register */
   1.725 +    .set        regs_shortage, 0
   1.726 +.else
   1.727 +    ORIG_W      .req        r1      /* saved original width */
   1.728 +    DUMMY       .req        r1      /* temporary register */
   1.729 +    .set        regs_shortage, 1
   1.730 +.endif
   1.731 +
   1.732 +    .set mask_bpp_shift, -1
   1.733 +.if src_bpp == 32
   1.734 +    .set src_bpp_shift, 2
   1.735 +.elseif src_bpp == 24
   1.736 +    .set src_bpp_shift, 0
   1.737 +.elseif src_bpp == 16
   1.738 +    .set src_bpp_shift, 1
   1.739 +.elseif src_bpp == 8
   1.740 +    .set src_bpp_shift, 0
   1.741 +.elseif src_bpp == 0
   1.742 +    .set src_bpp_shift, -1
   1.743 +.else
   1.744 +    .error "requested src bpp (src_bpp) is not supported"
   1.745 +.endif
   1.746 +.if mask_bpp == 32
   1.747 +    .set mask_bpp_shift, 2
   1.748 +.elseif mask_bpp == 24
   1.749 +    .set mask_bpp_shift, 0
   1.750 +.elseif mask_bpp == 8
   1.751 +    .set mask_bpp_shift, 0
   1.752 +.elseif mask_bpp == 0
   1.753 +    .set mask_bpp_shift, -1
   1.754 +.else
   1.755 +    .error "requested mask bpp (mask_bpp) is not supported"
   1.756 +.endif
   1.757 +.if dst_w_bpp == 32
   1.758 +    .set dst_bpp_shift, 2
   1.759 +.elseif dst_w_bpp == 24
   1.760 +    .set dst_bpp_shift, 0
   1.761 +.elseif dst_w_bpp == 16
   1.762 +    .set dst_bpp_shift, 1
   1.763 +.elseif dst_w_bpp == 8
   1.764 +    .set dst_bpp_shift, 0
   1.765 +.else
   1.766 +    .error "requested dst bpp (dst_w_bpp) is not supported"
   1.767 +.endif
   1.768 +
   1.769 +.if (((flags) & FLAG_DST_READWRITE) != 0)
   1.770 +    .set dst_r_bpp, dst_w_bpp
   1.771 +.else
   1.772 +    .set dst_r_bpp, 0
   1.773 +.endif
   1.774 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   1.775 +    .set DEINTERLEAVE_32BPP_ENABLED, 1
   1.776 +.else
   1.777 +    .set DEINTERLEAVE_32BPP_ENABLED, 0
   1.778 +.endif
   1.779 +
   1.780 +.if prefetch_distance < 0 || prefetch_distance > 15
   1.781 +    .error "invalid prefetch distance (prefetch_distance)"
   1.782 +.endif
   1.783 +
   1.784 +.if src_bpp > 0
   1.785 +    ldr         SRC, [sp, #40]
   1.786 +.endif
   1.787 +.if mask_bpp > 0
   1.788 +    ldr         MASK, [sp, #48]
   1.789 +.endif
   1.790 +    PF mov      PF_X, #0
   1.791 +.if src_bpp > 0
   1.792 +    ldr         SRC_STRIDE, [sp, #44]
   1.793 +.endif
   1.794 +.if mask_bpp > 0
   1.795 +    ldr         MASK_STRIDE, [sp, #52]
   1.796 +.endif
   1.797 +    mov         DST_R, DST_W
   1.798 +
   1.799 +.if src_bpp == 24
   1.800 +    sub         SRC_STRIDE, SRC_STRIDE, W
   1.801 +    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
   1.802 +.endif
   1.803 +.if mask_bpp == 24
   1.804 +    sub         MASK_STRIDE, MASK_STRIDE, W
   1.805 +    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
   1.806 +.endif
   1.807 +.if dst_w_bpp == 24
   1.808 +    sub         DST_STRIDE, DST_STRIDE, W
   1.809 +    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
   1.810 +.endif
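
Sketch of why the 24bpp strides are pre-biased here: 24bpp has no power-of-two
pixels-to-bytes shift, so the stride is reduced by 3*w once, up front, and
advance_to_next_scanline then skips the usual 'sub ..., W, lsl #shift' correction
for 24bpp formats:

    #include <stdint.h>

    static uint8_t *next_row_24bpp (uint8_t *row_start,
                                    int32_t stride_bytes, int32_t w)
    {
        int32_t biased_stride = stride_bytes - 3 * w;  /* the subtraction above */
        uint8_t *end_of_row   = row_start + 3 * w;     /* after one scanline    */
        return end_of_row + biased_stride;             /* == row_start + stride */
    }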
   1.811 +
   1.812 +/*
   1.813 + * Setup advanced prefetcher initial state
   1.814 + */
   1.815 +    PF mov      PF_SRC, SRC
   1.816 +    PF mov      PF_DST, DST_R
   1.817 +    PF mov      PF_MASK, MASK
   1.818 +    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
   1.819 +    PF mov      PF_CTL, H, lsl #4
   1.820 +    PF add      PF_CTL, #(prefetch_distance - 0x10)
   1.821 +
   1.822 +    init
   1.823 +.if regs_shortage
   1.824 +    .save       {r0, r1}
   1.825 +    push        {r0, r1}
   1.826 +.endif
   1.827 +    subs        H, H, #1
   1.828 +.if regs_shortage
   1.829 +    str         H, [sp, #4] /* save updated height to stack */
   1.830 +.else
   1.831 +    mov         ORIG_W, W
   1.832 +.endif
   1.833 +    blt         9f
   1.834 +    cmp         W, #(pixblock_size * 2)
   1.835 +    blt         8f
   1.836 +/*
    1.837 + * This is the start of the pipelined loop, which is optimized for
   1.838 + * long scanlines
   1.839 + */
   1.840 +0:
   1.841 +    ensure_destination_ptr_alignment process_pixblock_head, \
   1.842 +                                     process_pixblock_tail, \
   1.843 +                                     process_pixblock_tail_head
   1.844 +
   1.845 +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   1.846 +    pixld_a     pixblock_size, dst_r_bpp, \
   1.847 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   1.848 +    fetch_src_pixblock
   1.849 +    pixld       pixblock_size, mask_bpp, \
   1.850 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1.851 +    PF add      PF_X, PF_X, #pixblock_size
   1.852 +    process_pixblock_head
   1.853 +    cache_preload 0, pixblock_size
   1.854 +    cache_preload_simple
   1.855 +    subs        W, W, #(pixblock_size * 2)
   1.856 +    blt         2f
   1.857 +1:
   1.858 +    process_pixblock_tail_head
   1.859 +    cache_preload_simple
   1.860 +    subs        W, W, #pixblock_size
   1.861 +    bge         1b
   1.862 +2:
   1.863 +    process_pixblock_tail
   1.864 +    pixst_a     pixblock_size, dst_w_bpp, \
   1.865 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   1.866 +
   1.867 +    /* Process the remaining trailing pixels in the scanline */
   1.868 +    process_trailing_pixels 1, 1, \
   1.869 +                            process_pixblock_head, \
   1.870 +                            process_pixblock_tail, \
   1.871 +                            process_pixblock_tail_head
   1.872 +    advance_to_next_scanline 0b
   1.873 +
   1.874 +.if regs_shortage
   1.875 +    pop         {r0, r1}
   1.876 +.endif
   1.877 +    cleanup
   1.878 +    pop         {r4-r12, pc}  /* exit */
   1.879 +/*
   1.880 + * This is the start of the loop, designed to process images with small width
   1.881 + * (less than pixblock_size * 2 pixels). In this case neither pipelining
    1.882 + * nor prefetch is used.
   1.883 + */
   1.884 +8:
   1.885 +    /* Process exactly pixblock_size pixels if needed */
   1.886 +    tst         W, #pixblock_size
   1.887 +    beq         1f
   1.888 +    pixld       pixblock_size, dst_r_bpp, \
   1.889 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   1.890 +    fetch_src_pixblock
   1.891 +    pixld       pixblock_size, mask_bpp, \
   1.892 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1.893 +    process_pixblock_head
   1.894 +    process_pixblock_tail
   1.895 +    pixst       pixblock_size, dst_w_bpp, \
   1.896 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   1.897 +1:
   1.898 +    /* Process the remaining trailing pixels in the scanline */
   1.899 +    process_trailing_pixels 0, 0, \
   1.900 +                            process_pixblock_head, \
   1.901 +                            process_pixblock_tail, \
   1.902 +                            process_pixblock_tail_head
   1.903 +    advance_to_next_scanline 8b
   1.904 +9:
   1.905 +.if regs_shortage
   1.906 +    pop         {r0, r1}
   1.907 +.endif
   1.908 +    cleanup
   1.909 +    pop         {r4-r12, pc}  /* exit */
   1.910 +    .fnend
   1.911 +
   1.912 +    .purgem     fetch_src_pixblock
   1.913 +    .purgem     pixld_src
   1.914 +
   1.915 +    .unreq      SRC
   1.916 +    .unreq      MASK
   1.917 +    .unreq      DST_R
   1.918 +    .unreq      DST_W
   1.919 +    .unreq      ORIG_W
   1.920 +    .unreq      W
   1.921 +    .unreq      H
   1.922 +    .unreq      SRC_STRIDE
   1.923 +    .unreq      DST_STRIDE
   1.924 +    .unreq      MASK_STRIDE
   1.925 +    .unreq      PF_CTL
   1.926 +    .unreq      PF_X
   1.927 +    .unreq      PF_SRC
   1.928 +    .unreq      PF_DST
   1.929 +    .unreq      PF_MASK
   1.930 +    .unreq      DUMMY
   1.931 +    .endfunc
   1.932 +.endm
   1.933 +
   1.934 +/*
    1.935 + * A simplified variant of the function generation template for single
    1.936 + * scanline processing (used for implementing pixman combine functions)
   1.937 + */
   1.938 +.macro generate_composite_function_scanline        use_nearest_scaling, \
   1.939 +                                                   fname, \
   1.940 +                                                   src_bpp_, \
   1.941 +                                                   mask_bpp_, \
   1.942 +                                                   dst_w_bpp_, \
   1.943 +                                                   flags, \
   1.944 +                                                   pixblock_size_, \
   1.945 +                                                   init, \
   1.946 +                                                   cleanup, \
   1.947 +                                                   process_pixblock_head, \
   1.948 +                                                   process_pixblock_tail, \
   1.949 +                                                   process_pixblock_tail_head, \
   1.950 +                                                   dst_w_basereg_ = 28, \
   1.951 +                                                   dst_r_basereg_ = 4, \
   1.952 +                                                   src_basereg_   = 0, \
   1.953 +                                                   mask_basereg_  = 24
   1.954 +
   1.955 +    .func fname
   1.956 +    .global fname
   1.957 +    /* For ELF format also set function visibility to hidden */
   1.958 +#ifdef __ELF__
   1.959 +    .hidden fname
   1.960 +    .type fname, %function
   1.961 +#endif
   1.962 +fname:
   1.963 +    .fnstart
   1.964 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   1.965 +/*
   1.966 + * Make some macro arguments globally visible and accessible
   1.967 + * from other macros
   1.968 + */
   1.969 +    .set src_bpp, src_bpp_
   1.970 +    .set mask_bpp, mask_bpp_
   1.971 +    .set dst_w_bpp, dst_w_bpp_
   1.972 +    .set pixblock_size, pixblock_size_
   1.973 +    .set dst_w_basereg, dst_w_basereg_
   1.974 +    .set dst_r_basereg, dst_r_basereg_
   1.975 +    .set src_basereg, src_basereg_
   1.976 +    .set mask_basereg, mask_basereg_
   1.977 +
   1.978 +.if use_nearest_scaling != 0
   1.979 +    /*
   1.980 +     * Assign symbolic names to registers for nearest scaling
   1.981 +     */
   1.982 +    W           .req        r0
   1.983 +    DST_W       .req        r1
   1.984 +    SRC         .req        r2
   1.985 +    VX          .req        r3
   1.986 +    UNIT_X      .req        ip
   1.987 +    MASK        .req        lr
   1.988 +    TMP1        .req        r4
   1.989 +    TMP2        .req        r5
   1.990 +    DST_R       .req        r6
   1.991 +    SRC_WIDTH_FIXED .req        r7
   1.992 +
   1.993 +    .macro pixld_src x:vararg
   1.994 +        pixld_s x
   1.995 +    .endm
   1.996 +
   1.997 +    ldr         UNIT_X, [sp]
   1.998 +    .save       {r4-r8, lr}
   1.999 +    push        {r4-r8, lr}
  1.1000 +    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
  1.1001 +    .if mask_bpp != 0
  1.1002 +    ldr         MASK, [sp, #(24 + 8)]
  1.1003 +    .endif
  1.1004 +.else
  1.1005 +    /*
  1.1006 +     * Assign symbolic names to registers
  1.1007 +     */
  1.1008 +    W           .req        r0      /* width (is updated during processing) */
  1.1009 +    DST_W       .req        r1      /* destination buffer pointer for writes */
  1.1010 +    SRC         .req        r2      /* source buffer pointer */
  1.1011 +    DST_R       .req        ip      /* destination buffer pointer for reads */
  1.1012 +    MASK        .req        r3      /* mask pointer */
  1.1013 +
  1.1014 +    .macro pixld_src x:vararg
  1.1015 +        pixld x
  1.1016 +    .endm
  1.1017 +.endif
  1.1018 +
  1.1019 +.if (((flags) & FLAG_DST_READWRITE) != 0)
  1.1020 +    .set dst_r_bpp, dst_w_bpp
  1.1021 +.else
  1.1022 +    .set dst_r_bpp, 0
  1.1023 +.endif
  1.1024 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
  1.1025 +    .set DEINTERLEAVE_32BPP_ENABLED, 1
  1.1026 +.else
  1.1027 +    .set DEINTERLEAVE_32BPP_ENABLED, 0
  1.1028 +.endif
  1.1029 +
  1.1030 +    .macro fetch_src_pixblock
  1.1031 +        pixld_src   pixblock_size, src_bpp, \
  1.1032 +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
  1.1033 +    .endm
  1.1034 +
  1.1035 +    init
  1.1036 +    mov         DST_R, DST_W
  1.1037 +
  1.1038 +    cmp         W, #pixblock_size
  1.1039 +    blt         8f
  1.1040 +
  1.1041 +    ensure_destination_ptr_alignment process_pixblock_head, \
  1.1042 +                                     process_pixblock_tail, \
  1.1043 +                                     process_pixblock_tail_head
  1.1044 +
  1.1045 +    subs        W, W, #pixblock_size
  1.1046 +    blt         7f
  1.1047 +
  1.1048 +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
  1.1049 +    pixld_a     pixblock_size, dst_r_bpp, \
  1.1050 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
  1.1051 +    fetch_src_pixblock
  1.1052 +    pixld       pixblock_size, mask_bpp, \
  1.1053 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
  1.1054 +    process_pixblock_head
  1.1055 +    subs        W, W, #pixblock_size
  1.1056 +    blt         2f
  1.1057 +1:
  1.1058 +    process_pixblock_tail_head
  1.1059 +    subs        W, W, #pixblock_size
  1.1060 +    bge         1b
  1.1061 +2:
  1.1062 +    process_pixblock_tail
  1.1063 +    pixst_a     pixblock_size, dst_w_bpp, \
  1.1064 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
  1.1065 +7:
  1.1066 +    /* Process the remaining trailing pixels in the scanline (dst aligned) */
  1.1067 +    process_trailing_pixels 0, 1, \
  1.1068 +                            process_pixblock_head, \
  1.1069 +                            process_pixblock_tail, \
  1.1070 +                            process_pixblock_tail_head
  1.1071 +
  1.1072 +    cleanup
  1.1073 +.if use_nearest_scaling != 0
  1.1074 +    pop         {r4-r8, pc}  /* exit */
  1.1075 +.else
  1.1076 +    bx          lr  /* exit */
  1.1077 +.endif
  1.1078 +8:
  1.1079 +    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
  1.1080 +    process_trailing_pixels 0, 0, \
  1.1081 +                            process_pixblock_head, \
  1.1082 +                            process_pixblock_tail, \
  1.1083 +                            process_pixblock_tail_head
  1.1084 +
  1.1085 +    cleanup
  1.1086 +
  1.1087 +.if use_nearest_scaling != 0
  1.1088 +    pop         {r4-r8, pc}  /* exit */
  1.1089 +
  1.1090 +    .unreq      DST_R
  1.1091 +    .unreq      SRC
  1.1092 +    .unreq      W
  1.1093 +    .unreq      VX
  1.1094 +    .unreq      UNIT_X
  1.1095 +    .unreq      TMP1
  1.1096 +    .unreq      TMP2
  1.1097 +    .unreq      DST_W
  1.1098 +    .unreq      MASK
  1.1099 +    .unreq      SRC_WIDTH_FIXED
  1.1100 +
  1.1101 +.else
  1.1102 +    bx          lr  /* exit */
  1.1103 +
  1.1104 +    .unreq      SRC
  1.1105 +    .unreq      MASK
  1.1106 +    .unreq      DST_R
  1.1107 +    .unreq      DST_W
  1.1108 +    .unreq      W
  1.1109 +.endif
  1.1110 +
  1.1111 +    .purgem     fetch_src_pixblock
  1.1112 +    .purgem     pixld_src
  1.1113 +
  1.1114 +    .fnend
  1.1115 +    .endfunc
  1.1116 +.endm
  1.1117 +
  1.1118 +.macro generate_composite_function_single_scanline x:vararg
  1.1119 +    generate_composite_function_scanline 0, x
  1.1120 +.endm
  1.1121 +
  1.1122 +.macro generate_composite_function_nearest_scanline x:vararg
  1.1123 +    generate_composite_function_scanline 1, x
  1.1124 +.endm
  1.1125 +
  1.1126 +/* Default prologue/epilogue, nothing special needs to be done */
  1.1127 +
  1.1128 +.macro default_init
  1.1129 +.endm
  1.1130 +
  1.1131 +.macro default_cleanup
  1.1132 +.endm
  1.1133 +
  1.1134 +/*
  1.1135 + * Prologue/epilogue variant which additionally saves/restores d8-d15
   1.1136 + * registers (they need to be saved/restored by the callee according to the ABI).
  1.1137 + * This is required if the code needs to use all the NEON registers.
  1.1138 + */
  1.1139 +
  1.1140 +.macro default_init_need_all_regs
  1.1141 +    .vsave      {d8-d15}
  1.1142 +    vpush       {d8-d15}
  1.1143 +.endm
  1.1144 +
  1.1145 +.macro default_cleanup_need_all_regs
  1.1146 +    vpop        {d8-d15}
  1.1147 +.endm
  1.1148 +
  1.1149 +/******************************************************************************/
  1.1150 +
  1.1151 +/*
   1.1152 + * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
  1.1153 + * into a planar a8r8g8b8 format (with a, r, g, b color components
  1.1154 + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
  1.1155 + *
  1.1156 + * Warning: the conversion is destructive and the original
  1.1157 + *          value (in) is lost.
  1.1158 + */
  1.1159 +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
  1.1160 +    vshrn.u16   out_r, in,    #8
  1.1161 +    vshrn.u16   out_g, in,    #3
  1.1162 +    vsli.u16    in,    in,    #5
  1.1163 +    vmov.u8     out_a, #255
  1.1164 +    vsri.u8     out_r, out_r, #5
  1.1165 +    vsri.u8     out_g, out_g, #6
  1.1166 +    vshrn.u16   out_b, in,    #2
  1.1167 +.endm
  1.1168 +
  1.1169 +.macro convert_0565_to_x888 in, out_r, out_g, out_b
  1.1170 +    vshrn.u16   out_r, in,    #8
  1.1171 +    vshrn.u16   out_g, in,    #3
  1.1172 +    vsli.u16    in,    in,    #5
  1.1173 +    vsri.u8     out_r, out_r, #5
  1.1174 +    vsri.u8     out_g, out_g, #6
  1.1175 +    vshrn.u16   out_b, in,    #2
  1.1176 +.endm
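
What the vshrn/vsli/vsri sequence computes, per pixel, in scalar C (the x888
variant simply skips the alpha assignment):

    #include <stdint.h>

    static uint32_t expand_0565_to_8888 (uint16_t p)
    {
        uint32_t r5 = (p >> 11) & 0x1f;
        uint32_t g6 = (p >>  5) & 0x3f;
        uint32_t b5 =  p        & 0x1f;

        /* widen each channel by replicating its top bits into the low bits */
        uint32_t r8 = (r5 << 3) | (r5 >> 2);
        uint32_t g8 = (g6 << 2) | (g6 >> 4);
        uint32_t b8 = (b5 << 3) | (b5 >> 2);

        return 0xff000000u | (r8 << 16) | (g8 << 8) | b8;   /* a8r8g8b8 */
    }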
  1.1177 +
  1.1178 +/*
   1.1179 + * Conversion from planar a8r8g8b8 format (with the r, g, b color components
   1.1180 + * in 64-bit registers in_r, in_g, in_b respectively; alpha is not used) into
   1.1181 + * 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two temporary
   1.1182 + * 128-bit registers (tmp1, tmp2).
  1.1183 + */
  1.1184 +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
  1.1185 +    vshll.u8    tmp1, in_g, #8
  1.1186 +    vshll.u8    out, in_r, #8
  1.1187 +    vshll.u8    tmp2, in_b, #8
  1.1188 +    vsri.u16    out, tmp1, #5
  1.1189 +    vsri.u16    out, tmp2, #11
  1.1190 +.endm
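
The scalar equivalent of this packing, per pixel: keep the top 5/6/5 bits of each
channel and drop alpha:

    #include <stdint.h>

    static uint16_t pack_8888_to_0565 (uint32_t p)
    {
        uint32_t r8 = (p >> 16) & 0xff;
        uint32_t g8 = (p >>  8) & 0xff;
        uint32_t b8 =  p        & 0xff;

        return (uint16_t) (((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3));
    }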
  1.1191 +
  1.1192 +/*
  1.1193 + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
   1.1194 + * returned in the (out0, out1) register pair. Requires one temporary
   1.1195 + * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
   1.1196 + * value of 'in' is lost.
  1.1197 + */
  1.1198 +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
  1.1199 +    vshl.u16    out0, in,   #5  /* G top 6 bits */
  1.1200 +    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
  1.1201 +    vsri.u16    in,   in,   #5  /* R is ready in top bits */
  1.1202 +    vsri.u16    out0, out0, #6  /* G is ready in top bits */
  1.1203 +    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
  1.1204 +    vshr.u16    out1, in,   #8  /* R is in place */
  1.1205 +    vsri.u16    out0, tmp,  #8  /* G & B is in place */
  1.1206 +    vzip.u16    out0, out1      /* everything is in place */
  1.1207 +.endm
