gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,3650 @@
     1.4 +/*
     1.5 + * Copyright © 2009 Nokia Corporation
     1.6 + *
     1.7 + * Permission is hereby granted, free of charge, to any person obtaining a
     1.8 + * copy of this software and associated documentation files (the "Software"),
     1.9 + * to deal in the Software without restriction, including without limitation
    1.10 + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.11 + * and/or sell copies of the Software, and to permit persons to whom the
    1.12 + * Software is furnished to do so, subject to the following conditions:
    1.13 + *
    1.14 + * The above copyright notice and this permission notice (including the next
    1.15 + * paragraph) shall be included in all copies or substantial portions of the
    1.16 + * Software.
    1.17 + *
    1.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    1.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.24 + * DEALINGS IN THE SOFTWARE.
    1.25 + *
    1.26 + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    1.27 + */
    1.28 +
    1.29 +/*
    1.30 + * This file contains implementations of NEON optimized pixel processing
     1.31 + * functions. There is no full and detailed tutorial, but some functions
     1.32 + * (those which expose some new or interesting features) are
     1.33 + * extensively commented and can be used as examples.
     1.34 + *
     1.35 + * You may want to have a look at the comments for the following functions:
    1.36 + *  - pixman_composite_over_8888_0565_asm_neon
    1.37 + *  - pixman_composite_over_n_8_0565_asm_neon
    1.38 + */
    1.39 +
    1.40 +/* Prevent the stack from becoming executable for no reason... */
    1.41 +#if defined(__linux__) && defined(__ELF__)
    1.42 +.section .note.GNU-stack,"",%progbits
    1.43 +#endif
    1.44 +
    1.45 +    .text
    1.46 +    .fpu neon
    1.47 +    .arch armv7a
    1.48 +    .object_arch armv4
    1.49 +    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    1.50 +    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    1.51 +    .arm
    1.52 +    .altmacro
    1.53 +    .p2align 2
    1.54 +
    1.55 +#include "pixman-private.h"
    1.56 +#include "pixman-arm-neon-asm.h"
    1.57 +
    1.58 +/* Global configuration options and preferences */
    1.59 +
    1.60 +/*
     1.61 + * The code can optionally make use of unaligned memory accesses to improve
     1.62 + * the performance of handling leading/trailing pixels for each scanline.
     1.63 + * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
     1.64 + * for example on Linux, if unaligned memory accesses are not configured
     1.65 + * to generate exceptions.
    1.66 + */
    1.67 +.set RESPECT_STRICT_ALIGNMENT, 1
    1.68 +
    1.69 +/*
    1.70 + * Set default prefetch type. There is a choice between the following options:
    1.71 + *
     1.72 + * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
     1.73 + * as a NOP, to work around some HW bugs, or for whatever other reason)
    1.74 + *
    1.75 + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
     1.76 + * advanced prefetch introduces heavy overhead)
    1.77 + *
    1.78 + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
    1.79 + * which can run ARM and NEON instructions simultaneously so that extra ARM
    1.80 + * instructions do not add (many) extra cycles, but improve prefetch efficiency)
    1.81 + *
     1.82 + * Note: some types of function can't support advanced prefetch and fall
     1.83 + *       back to the simple one (those which handle 24bpp pixels)
    1.84 + */
    1.85 +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
    1.86 +
    1.87 +/* Prefetch distance in pixels for simple prefetch */
    1.88 +.set PREFETCH_DISTANCE_SIMPLE, 64
    1.89 +
    1.90 +/*
    1.91 + * Implementation of pixman_composite_over_8888_0565_asm_neon
    1.92 + *
     1.93 + * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
     1.94 + * buffer and performs the OVER compositing operation. The C function
     1.95 + * fast_composite_over_8888_0565 from pixman-fast-path.c does the same and
     1.96 + * can be used as a reference.
    1.96 + *
    1.97 + * First we need to have some NEON assembly code which can do the actual
    1.98 + * operation on the pixels and provide it to the template macro.
    1.99 + *
    1.100 + * The template macro quite conveniently takes care of emitting all the necessary
   1.101 + * code for memory reading and writing (including quite tricky cases of
   1.102 + * handling unaligned leading/trailing pixels), so we only need to deal with
   1.103 + * the data in NEON registers.
   1.104 + *
    1.105 + * The generally recommended NEON register allocation is the following:
   1.106 + * d0,  d1,  d2,  d3  - contain loaded source pixel data
   1.107 + * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
    1.108 + * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
   1.109 + * d28, d29, d30, d31 - place for storing the result (destination pixels)
   1.110 + *
   1.111 + * As can be seen above, four 64-bit NEON registers are used for keeping
   1.112 + * intermediate pixel data and up to 8 pixels can be processed in one step
   1.113 + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
   1.114 + *
    1.115 + * This particular function uses the following register allocation:
   1.116 + * d0,  d1,  d2,  d3  - contain loaded source pixel data
   1.117 + * d4,  d5            - contain loaded destination pixels (they are needed)
   1.118 + * d28, d29           - place for storing the result (destination pixels)
   1.119 + */
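+
+/*
+ * Before looking at the NEON code, it may help to see the per-pixel
+ * math spelled out in scalar C. This is only an illustrative sketch
+ * with invented variable names (the real reference implementation is
+ * fast_composite_over_8888_0565 in pixman-fast-path.c):
+ *
+ *     uint32_t s  = *src++;            // a8r8g8b8 source pixel
+ *     uint16_t d  = *dst;              // r5g6b5 destination pixel
+ *     uint32_t ia = 255 - (s >> 24);   // inverted source alpha
+ *
+ *     // expand destination to 8 bits per channel (replicating the
+ *     // top bits into the low bits, so 0x1f expands to 0xff)
+ *     uint32_t dr = ((d >> 8) & 0xf8) | (d >> 13);
+ *     uint32_t dg = ((d >> 3) & 0xfc) | ((d >> 9) & 0x03);
+ *     uint32_t db = ((d << 3) & 0xf8) | ((d >> 2) & 0x07);
+ *
+ *     // OVER: result = src + dst * (1 - src.alpha), per channel,
+ *     // with rounding division by 255 (the real code also saturates)
+ *     uint32_t r = ((s >> 16) & 0xff) + (dr * ia + 127) / 255;
+ *     uint32_t g = ((s >>  8) & 0xff) + (dg * ia + 127) / 255;
+ *     uint32_t b = ( s        & 0xff) + (db * ia + 127) / 255;
+ *
+ *     // repack the result to r5g6b5
+ *     *dst++ = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+ */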
   1.120 +
   1.121 +/*
    1.122 + * Step one. We need to have some code to do some arithmetic on pixel data.
   1.123 + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
   1.124 + * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
   1.125 + * perform all the needed calculations and write the result to {d28, d29}.
   1.126 + * The rationale for having two macros and not just one will be explained
    1.127 + * later. In practice, any single monolithic function which does the work can
   1.128 + * be split into two parts in any arbitrary way without affecting correctness.
   1.129 + *
    1.130 + * There is one special trick here too. The common template macro can optionally
   1.131 + * make our life a bit easier by doing R, G, B, A color components
   1.132 + * deinterleaving for 32bpp pixel formats (and this feature is used in
   1.133 + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
   1.134 + * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
   1.135 + * actually use d0 register for blue channel (a vector of eight 8-bit
   1.136 + * values), d1 register for green, d2 for red and d3 for alpha. This
   1.137 + * simple conversion can be also done with a few NEON instructions:
   1.138 + *
   1.139 + * Packed to planar conversion:
   1.140 + *  vuzp.8 d0, d1
   1.141 + *  vuzp.8 d2, d3
   1.142 + *  vuzp.8 d1, d3
   1.143 + *  vuzp.8 d0, d2
   1.144 + *
   1.145 + * Planar to packed conversion:
   1.146 + *  vzip.8 d0, d2
   1.147 + *  vzip.8 d1, d3
   1.148 + *  vzip.8 d2, d3
   1.149 + *  vzip.8 d0, d1
   1.150 + *
    1.151 + * But pixels can also be loaded directly in planar format using the
    1.152 + * VLD4.8 NEON instruction. It is 1 cycle slower than VLD1.32, so this is
    1.153 + * not always desirable; that's why deinterleaving is optional.
   1.154 + *
   1.155 + * But anyway, here is the code:
   1.156 + */
   1.157 +.macro pixman_composite_over_8888_0565_process_pixblock_head
    1.158 +    /* convert the 8 r5g6b5 pixels in {d4, d5} to planar 8-bit format
    1.159 +       and put the data into d6 - red, d7 - green, d30 - blue */
   1.160 +    vshrn.u16   d6, q2, #8
   1.161 +    vshrn.u16   d7, q2, #3
   1.162 +    vsli.u16    q2, q2, #5
   1.163 +    vsri.u8     d6, d6, #5
   1.164 +    vmvn.8      d3, d3      /* invert source alpha */
   1.165 +    vsri.u8     d7, d7, #6
   1.166 +    vshrn.u16   d30, q2, #2
   1.167 +    /* now do alpha blending, storing results in 8-bit planar format
   1.168 +       into d16 - red, d19 - green, d18 - blue */
   1.169 +    vmull.u8    q10, d3, d6
   1.170 +    vmull.u8    q11, d3, d7
   1.171 +    vmull.u8    q12, d3, d30
   1.172 +    vrshr.u16   q13, q10, #8
   1.173 +    vrshr.u16   q3, q11, #8
   1.174 +    vrshr.u16   q15, q12, #8
   1.175 +    vraddhn.u16 d20, q10, q13
   1.176 +    vraddhn.u16 d23, q11, q3
   1.177 +    vraddhn.u16 d22, q12, q15
   1.178 +.endm
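+
+/*
+ * A note on the vmull/vrshr/vraddhn pattern above: together these
+ * three instructions perform an exact rounding division by 255 of the
+ * 16-bit products, i.e. (a * b + 127) / 255 for each 8-bit lane.
+ * A scalar C sketch of one lane:
+ *
+ *     uint32_t t = a * b;               // vmull.u8
+ *     uint32_t u = (t + 128) >> 8;      // vrshr.u16 #8
+ *     uint8_t  q = (t + u + 128) >> 8;  // vraddhn.u16
+ *     // q == (t + 127) / 255 for all t in [0, 255 * 255]
+ */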
   1.179 +
   1.180 +.macro pixman_composite_over_8888_0565_process_pixblock_tail
   1.181 +    /* ... continue alpha blending */
   1.182 +    vqadd.u8    d16, d2, d20
   1.183 +    vqadd.u8    q9, q0, q11
   1.184 +    /* convert the result to r5g6b5 and store it into {d28, d29} */
   1.185 +    vshll.u8    q14, d16, #8
   1.186 +    vshll.u8    q8, d19, #8
   1.187 +    vshll.u8    q9, d18, #8
   1.188 +    vsri.u16    q14, q8, #5
   1.189 +    vsri.u16    q14, q9, #11
   1.190 +.endm
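+
+/*
+ * The vshll/vsri sequence in the tail is just the usual repacking of
+ * 8-bit-per-channel data into r5g6b5, expressed with shift-and-insert
+ * instructions. A scalar C sketch of the same packing:
+ *
+ *     uint16_t t;
+ *     t = (uint16_t)r << 8;                // vshll.u8 #8
+ *     t = (t & 0xf800) | ((g << 8) >> 5);  // vsri.u16 #5 keeps top 5 bits
+ *     t = (t & 0xffe0) | ((b << 8) >> 11); // vsri.u16 #11 keeps top 11 bits
+ *     // i.e. t == ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
+ */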
   1.191 +
   1.192 +/*
    1.193 + * OK, now we have almost everything that we need. Using the above two
    1.194 + * macros, the work can already get done. But now we want to optimize
    1.195 + * it a bit. The ARM Cortex-A8 is an in-order core, and it benefits a lot
    1.196 + * from good code scheduling and software pipelining.
   1.197 + *
   1.198 + * Let's construct some code, which will run in the core main loop.
   1.199 + * Some pseudo-code of the main loop will look like this:
   1.200 + *   head
   1.201 + *   while (...) {
   1.202 + *     tail
   1.203 + *     head
   1.204 + *   }
   1.205 + *   tail
   1.206 + *
    1.207 + * It may look a bit weird, but this setup allows hiding instruction
    1.208 + * latencies better and also utilizing the dual-issue capability more
    1.209 + * efficiently (pairing load/store and ALU instructions).
   1.210 + *
   1.211 + * So what we need now is a '*_tail_head' macro, which will be used
   1.212 + * in the core main loop. A trivial straightforward implementation
   1.213 + * of this macro would look like this:
   1.214 + *
   1.215 + *   pixman_composite_over_8888_0565_process_pixblock_tail
   1.216 + *   vst1.16     {d28, d29}, [DST_W, :128]!
   1.217 + *   vld1.16     {d4, d5}, [DST_R, :128]!
   1.218 + *   vld4.32     {d0, d1, d2, d3}, [SRC]!
   1.219 + *   pixman_composite_over_8888_0565_process_pixblock_head
   1.220 + *   cache_preload 8, 8
   1.221 + *
    1.222 + * Now it has also got some VLD/VST instructions. We simply can't move from
    1.223 + * processing one block of pixels to the next one with just arithmetic.
    1.224 + * The previously processed data needs to be written to memory and new
    1.225 + * data needs to be fetched. Fortunately, this main loop does not deal
    1.226 + * with partial leading/trailing pixels and can load/store a full block
    1.227 + * of pixels in bulk. Additionally, the destination buffer is already
    1.228 + * 16-byte aligned here (which is good for performance).
   1.229 + *
   1.230 + * New things here are DST_R, DST_W, SRC and MASK identifiers. These
   1.231 + * are the aliases for ARM registers which are used as pointers for
   1.232 + * accessing data. We maintain separate pointers for reading and writing
   1.233 + * destination buffer (DST_R and DST_W).
   1.234 + *
    1.235 + * Another new thing is the 'cache_preload' macro. It is used for
    1.236 + * prefetching data into the CPU L2 cache and improves performance when
    1.237 + * dealing with images which are far larger than the cache size. It takes
    1.238 + * one argument (actually two, but they need to be the same here) - the
    1.239 + * number of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can
    1.240 + * provide some details about this macro. Moreover, if good performance
    1.241 + * is needed, the code from this macro needs to be copied into the
    1.242 + * '*_tail_head' macro and mixed with the rest of the code for optimal
    1.243 + * instruction scheduling. We are actually doing that below.
   1.244 + *
   1.245 + * Now after all the explanations, here is the optimized code.
    1.246 + * Different instruction streams (originating from '*_head', '*_tail'
   1.247 + * and 'cache_preload' macro) use different indentation levels for
   1.248 + * better readability. Actually taking the code from one of these
   1.249 + * indentation levels and ignoring a few VLD/VST instructions would
   1.250 + * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
   1.251 + * macro!
   1.252 + */
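+
+/*
+ * For orientation, here is a simplified C-like sketch of what the
+ * interleaved PF instructions implement per pixel block ('pld' stands
+ * for the ARM preload-data hint; the real code also uses the high bits
+ * of PF_CTL to stop prefetching near the bottom of the image):
+ *
+ *     pf_x += 8;                 // advance by one block of pixels
+ *     if (pf_ctl & 0xf) {        // still ramping up to the configured
+ *         pf_x += 8;             // prefetch distance
+ *         pf_ctl--;
+ *     }
+ *     pld (pf_src + pf_x * src_bpp / 8);
+ *     pld (pf_dst + pf_x * dst_bpp / 8);
+ *     if (pf_x >= orig_w) {      // ran past the end of the scanline,
+ *         pf_x -= orig_w;        // wrap around to the next one and
+ *         pf_ctl -= 0x10;        // touch it to pull it into the cache
+ *         pf_src += src_stride * src_bpp / 8;
+ *         pf_dst += dst_stride * dst_bpp / 8;
+ *     }
+ */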
   1.253 +
   1.254 +#if 1
   1.255 +
   1.256 +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
   1.257 +        vqadd.u8    d16, d2, d20
   1.258 +    vld1.16     {d4, d5}, [DST_R, :128]!
   1.259 +        vqadd.u8    q9, q0, q11
   1.260 +    vshrn.u16   d6, q2, #8
   1.261 +    fetch_src_pixblock
   1.262 +    vshrn.u16   d7, q2, #3
   1.263 +    vsli.u16    q2, q2, #5
   1.264 +        vshll.u8    q14, d16, #8
   1.265 +                                    PF add PF_X, PF_X, #8
   1.266 +        vshll.u8    q8, d19, #8
   1.267 +                                    PF tst PF_CTL, #0xF
   1.268 +    vsri.u8     d6, d6, #5
   1.269 +                                    PF addne PF_X, PF_X, #8
   1.270 +    vmvn.8      d3, d3
   1.271 +                                    PF subne PF_CTL, PF_CTL, #1
   1.272 +    vsri.u8     d7, d7, #6
   1.273 +    vshrn.u16   d30, q2, #2
   1.274 +    vmull.u8    q10, d3, d6
   1.275 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.276 +    vmull.u8    q11, d3, d7
   1.277 +    vmull.u8    q12, d3, d30
   1.278 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.279 +        vsri.u16    q14, q8, #5
   1.280 +                                    PF cmp PF_X, ORIG_W
   1.281 +        vshll.u8    q9, d18, #8
   1.282 +    vrshr.u16   q13, q10, #8
   1.283 +                                    PF subge PF_X, PF_X, ORIG_W
   1.284 +    vrshr.u16   q3, q11, #8
   1.285 +    vrshr.u16   q15, q12, #8
   1.286 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.287 +        vsri.u16    q14, q9, #11
   1.288 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.289 +    vraddhn.u16 d20, q10, q13
   1.290 +    vraddhn.u16 d23, q11, q3
   1.291 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.292 +    vraddhn.u16 d22, q12, q15
   1.293 +        vst1.16     {d28, d29}, [DST_W, :128]!
   1.294 +.endm
   1.295 +
   1.296 +#else
   1.297 +
   1.298 +/* If we did not care much about the performance, we would just use this... */
   1.299 +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
   1.300 +    pixman_composite_over_8888_0565_process_pixblock_tail
   1.301 +    vst1.16     {d28, d29}, [DST_W, :128]!
   1.302 +    vld1.16     {d4, d5}, [DST_R, :128]!
   1.303 +    fetch_src_pixblock
   1.304 +    pixman_composite_over_8888_0565_process_pixblock_head
   1.305 +    cache_preload 8, 8
   1.306 +.endm
   1.307 +
   1.308 +#endif
   1.309 +
   1.310 +/*
   1.311 + * And now the final part. We are using 'generate_composite_function' macro
    1.312 + * to put all the stuff together. We specify the name of the function we
    1.313 + * want to get, the number of bits per pixel for the source, mask and
    1.314 + * destination (0 if unused, like the mask in this case). Next come some bit
   1.315 + * flags:
    1.316 + *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
    1.317 + *                             and written; for a write-only buffer we would
    1.318 + *                             use the FLAG_DST_WRITEONLY flag instead
   1.319 + *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
   1.320 + *                             and separate color channels for 32bpp format.
   1.321 + * The next things are:
    1.322 + *  - the number of pixels processed per iteration (8 in this case, because
    1.323 + *    that's the maximum that can fit into four 64-bit NEON registers).
    1.324 + *  - the prefetch distance, measured in pixel blocks. In this case it is
    1.325 + *    5 blocks of 8 pixels, i.e. 40 pixels, or up to 160 bytes. The optimal
    1.326 + *    prefetch distance can be selected by running some benchmarks.
   1.327 + *
    1.328 + * After that we specify some macros. Here these are 'default_init' and
    1.329 + * 'default_cleanup', which are empty (but it is possible to have custom
    1.330 + * init/cleanup macros that can save/restore some extra NEON registers
    1.331 + * like d8-d15, or do anything else), followed by
   1.332 + * 'pixman_composite_over_8888_0565_process_pixblock_head',
   1.333 + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
   1.334 + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
   1.335 + * which we got implemented above.
   1.336 + *
    1.337 + * The last part is the NEON register allocation scheme.
   1.338 + */
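+
+/*
+ * For reference, the C side declares the generated function with a
+ * prototype along the following lines (see the binding macros in
+ * pixman-arm-common.h; this is a sketch, not a copy):
+ *
+ *     void
+ *     pixman_composite_over_8888_0565_asm_neon (int32_t   w,
+ *                                               int32_t   h,
+ *                                               uint16_t *dst,
+ *                                               int32_t   dst_stride,
+ *                                               uint32_t *src,
+ *                                               int32_t   src_stride);
+ */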
   1.339 +generate_composite_function \
   1.340 +    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
   1.341 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.342 +    8, /* number of pixels, processed in a single block */ \
   1.343 +    5, /* prefetch distance */ \
   1.344 +    default_init, \
   1.345 +    default_cleanup, \
   1.346 +    pixman_composite_over_8888_0565_process_pixblock_head, \
   1.347 +    pixman_composite_over_8888_0565_process_pixblock_tail, \
   1.348 +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
   1.349 +    28, /* dst_w_basereg */ \
   1.350 +    4,  /* dst_r_basereg */ \
   1.351 +    0,  /* src_basereg   */ \
   1.352 +    24  /* mask_basereg  */
   1.353 +
   1.354 +/******************************************************************************/
   1.355 +
   1.356 +.macro pixman_composite_over_n_0565_process_pixblock_head
    1.357 +    /* convert the 8 r5g6b5 pixels in {d4, d5} to planar 8-bit format
    1.358 +       and put the data into d6 - red, d7 - green, d30 - blue */
   1.359 +    vshrn.u16   d6, q2, #8
   1.360 +    vshrn.u16   d7, q2, #3
   1.361 +    vsli.u16    q2, q2, #5
   1.362 +    vsri.u8     d6, d6, #5
   1.363 +    vsri.u8     d7, d7, #6
   1.364 +    vshrn.u16   d30, q2, #2
   1.365 +    /* now do alpha blending, storing results in 8-bit planar format
   1.366 +       into d16 - red, d19 - green, d18 - blue */
   1.367 +    vmull.u8    q10, d3, d6
   1.368 +    vmull.u8    q11, d3, d7
   1.369 +    vmull.u8    q12, d3, d30
   1.370 +    vrshr.u16   q13, q10, #8
   1.371 +    vrshr.u16   q3, q11, #8
   1.372 +    vrshr.u16   q15, q12, #8
   1.373 +    vraddhn.u16 d20, q10, q13
   1.374 +    vraddhn.u16 d23, q11, q3
   1.375 +    vraddhn.u16 d22, q12, q15
   1.376 +.endm
   1.377 +
   1.378 +.macro pixman_composite_over_n_0565_process_pixblock_tail
   1.379 +    /* ... continue alpha blending */
   1.380 +    vqadd.u8    d16, d2, d20
   1.381 +    vqadd.u8    q9, q0, q11
   1.382 +    /* convert the result to r5g6b5 and store it into {d28, d29} */
   1.383 +    vshll.u8    q14, d16, #8
   1.384 +    vshll.u8    q8, d19, #8
   1.385 +    vshll.u8    q9, d18, #8
   1.386 +    vsri.u16    q14, q8, #5
   1.387 +    vsri.u16    q14, q9, #11
   1.388 +.endm
   1.389 +
    1.390 +/* TODO: expand macros and do better instruction scheduling */
   1.391 +.macro pixman_composite_over_n_0565_process_pixblock_tail_head
   1.392 +    pixman_composite_over_n_0565_process_pixblock_tail
   1.393 +    vld1.16     {d4, d5}, [DST_R, :128]!
   1.394 +    vst1.16     {d28, d29}, [DST_W, :128]!
   1.395 +    pixman_composite_over_n_0565_process_pixblock_head
   1.396 +    cache_preload 8, 8
   1.397 +.endm
   1.398 +
   1.399 +.macro pixman_composite_over_n_0565_init
   1.400 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1.401 +    vld1.32     {d3[0]}, [DUMMY]
   1.402 +    vdup.8      d0, d3[0]
   1.403 +    vdup.8      d1, d3[1]
   1.404 +    vdup.8      d2, d3[2]
   1.405 +    vdup.8      d3, d3[3]
   1.406 +    vmvn.8      d3, d3      /* invert source alpha */
   1.407 +.endm
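+
+/*
+ * The init macro above loads the solid a8r8g8b8 source from the stack
+ * and splats each component into its own vector. In scalar terms
+ * (a sketch; 'splat8' is an invented helper meaning "replicate the
+ * byte across all 8 lanes", and byte 0 is blue because the 32-bit
+ * load is little-endian):
+ *
+ *     d0 = splat8 ( color        & 0xff);   // blue
+ *     d1 = splat8 ((color >>  8) & 0xff);   // green
+ *     d2 = splat8 ((color >> 16) & 0xff);   // red
+ *     d3 = splat8 (~(color >> 24) & 0xff);  // inverted alpha
+ */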
   1.408 +
   1.409 +generate_composite_function \
   1.410 +    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
   1.411 +    FLAG_DST_READWRITE, \
   1.412 +    8, /* number of pixels, processed in a single block */ \
   1.413 +    5, /* prefetch distance */ \
   1.414 +    pixman_composite_over_n_0565_init, \
   1.415 +    default_cleanup, \
   1.416 +    pixman_composite_over_n_0565_process_pixblock_head, \
   1.417 +    pixman_composite_over_n_0565_process_pixblock_tail, \
   1.418 +    pixman_composite_over_n_0565_process_pixblock_tail_head, \
   1.419 +    28, /* dst_w_basereg */ \
   1.420 +    4,  /* dst_r_basereg */ \
   1.421 +    0,  /* src_basereg   */ \
   1.422 +    24  /* mask_basereg  */
   1.423 +
   1.424 +/******************************************************************************/
   1.425 +
   1.426 +.macro pixman_composite_src_8888_0565_process_pixblock_head
   1.427 +    vshll.u8    q8, d1, #8
   1.428 +    vshll.u8    q14, d2, #8
   1.429 +    vshll.u8    q9, d0, #8
   1.430 +.endm
   1.431 +
   1.432 +.macro pixman_composite_src_8888_0565_process_pixblock_tail
   1.433 +    vsri.u16    q14, q8, #5
   1.434 +    vsri.u16    q14, q9, #11
   1.435 +.endm
   1.436 +
   1.437 +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
   1.438 +        vsri.u16    q14, q8, #5
   1.439 +                                    PF add PF_X, PF_X, #8
   1.440 +                                    PF tst PF_CTL, #0xF
   1.441 +    fetch_src_pixblock
   1.442 +                                    PF addne PF_X, PF_X, #8
   1.443 +                                    PF subne PF_CTL, PF_CTL, #1
   1.444 +        vsri.u16    q14, q9, #11
   1.445 +                                    PF cmp PF_X, ORIG_W
   1.446 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.447 +    vshll.u8    q8, d1, #8
   1.448 +        vst1.16     {d28, d29}, [DST_W, :128]!
   1.449 +                                    PF subge PF_X, PF_X, ORIG_W
   1.450 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.451 +    vshll.u8    q14, d2, #8
   1.452 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.453 +    vshll.u8    q9, d0, #8
   1.454 +.endm
   1.455 +
   1.456 +generate_composite_function \
   1.457 +    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
   1.458 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1.459 +    8, /* number of pixels, processed in a single block */ \
   1.460 +    10, /* prefetch distance */ \
   1.461 +    default_init, \
   1.462 +    default_cleanup, \
   1.463 +    pixman_composite_src_8888_0565_process_pixblock_head, \
   1.464 +    pixman_composite_src_8888_0565_process_pixblock_tail, \
   1.465 +    pixman_composite_src_8888_0565_process_pixblock_tail_head
   1.466 +
   1.467 +/******************************************************************************/
   1.468 +
   1.469 +.macro pixman_composite_src_0565_8888_process_pixblock_head
   1.470 +    vshrn.u16   d30, q0, #8
   1.471 +    vshrn.u16   d29, q0, #3
   1.472 +    vsli.u16    q0, q0, #5
   1.473 +    vmov.u8     d31, #255
   1.474 +    vsri.u8     d30, d30, #5
   1.475 +    vsri.u8     d29, d29, #6
   1.476 +    vshrn.u16   d28, q0, #2
   1.477 +.endm
   1.478 +
   1.479 +.macro pixman_composite_src_0565_8888_process_pixblock_tail
   1.480 +.endm
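+
+/*
+ * The head above expands r5g6b5 to 8 bits per channel, replicating the
+ * top bits into the low bits so that 0x1f expands to 0xff (a plain
+ * shift would only reach 0xf8). A scalar C sketch for one pixel:
+ *
+ *     uint8_t r = ((s >> 8) & 0xf8) | (s >> 13);
+ *     uint8_t g = ((s >> 3) & 0xfc) | ((s >> 9) & 0x03);
+ *     uint8_t b = ((s << 3) & 0xf8) | ((s >> 2) & 0x07);
+ *     uint8_t a = 0xff;                    // vmov.u8 d31, #255
+ */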
   1.481 +
    1.482 +/* TODO: expand macros and do better instruction scheduling */
   1.483 +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
   1.484 +    pixman_composite_src_0565_8888_process_pixblock_tail
   1.485 +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   1.486 +    fetch_src_pixblock
   1.487 +    pixman_composite_src_0565_8888_process_pixblock_head
   1.488 +    cache_preload 8, 8
   1.489 +.endm
   1.490 +
   1.491 +generate_composite_function \
   1.492 +    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
   1.493 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1.494 +    8, /* number of pixels, processed in a single block */ \
   1.495 +    10, /* prefetch distance */ \
   1.496 +    default_init, \
   1.497 +    default_cleanup, \
   1.498 +    pixman_composite_src_0565_8888_process_pixblock_head, \
   1.499 +    pixman_composite_src_0565_8888_process_pixblock_tail, \
   1.500 +    pixman_composite_src_0565_8888_process_pixblock_tail_head
   1.501 +
   1.502 +/******************************************************************************/
   1.503 +
   1.504 +.macro pixman_composite_add_8_8_process_pixblock_head
   1.505 +    vqadd.u8    q14, q0, q2
   1.506 +    vqadd.u8    q15, q1, q3
   1.507 +.endm
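+
+/*
+ * ADD is simply a per-channel saturating add; a scalar C sketch of
+ * what each vqadd.u8 lane computes:
+ *
+ *     unsigned t = s + d;
+ *     d = (t > 255) ? 255 : t;
+ */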
   1.508 +
   1.509 +.macro pixman_composite_add_8_8_process_pixblock_tail
   1.510 +.endm
   1.511 +
   1.512 +.macro pixman_composite_add_8_8_process_pixblock_tail_head
   1.513 +    fetch_src_pixblock
   1.514 +                                    PF add PF_X, PF_X, #32
   1.515 +                                    PF tst PF_CTL, #0xF
   1.516 +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1.517 +                                    PF addne PF_X, PF_X, #32
   1.518 +                                    PF subne PF_CTL, PF_CTL, #1
   1.519 +        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1.520 +                                    PF cmp PF_X, ORIG_W
   1.521 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.522 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.523 +                                    PF subge PF_X, PF_X, ORIG_W
   1.524 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.525 +    vqadd.u8    q14, q0, q2
   1.526 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.527 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.528 +    vqadd.u8    q15, q1, q3
   1.529 +.endm
   1.530 +
   1.531 +generate_composite_function \
   1.532 +    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
   1.533 +    FLAG_DST_READWRITE, \
   1.534 +    32, /* number of pixels, processed in a single block */ \
   1.535 +    10, /* prefetch distance */ \
   1.536 +    default_init, \
   1.537 +    default_cleanup, \
   1.538 +    pixman_composite_add_8_8_process_pixblock_head, \
   1.539 +    pixman_composite_add_8_8_process_pixblock_tail, \
   1.540 +    pixman_composite_add_8_8_process_pixblock_tail_head
   1.541 +
   1.542 +/******************************************************************************/
   1.543 +
   1.544 +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
   1.545 +    fetch_src_pixblock
   1.546 +                                    PF add PF_X, PF_X, #8
   1.547 +                                    PF tst PF_CTL, #0xF
   1.548 +    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
   1.549 +                                    PF addne PF_X, PF_X, #8
   1.550 +                                    PF subne PF_CTL, PF_CTL, #1
   1.551 +        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
   1.552 +                                    PF cmp PF_X, ORIG_W
   1.553 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.554 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.555 +                                    PF subge PF_X, PF_X, ORIG_W
   1.556 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.557 +    vqadd.u8    q14, q0, q2
   1.558 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.559 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.560 +    vqadd.u8    q15, q1, q3
   1.561 +.endm
   1.562 +
   1.563 +generate_composite_function \
   1.564 +    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
   1.565 +    FLAG_DST_READWRITE, \
   1.566 +    8, /* number of pixels, processed in a single block */ \
   1.567 +    10, /* prefetch distance */ \
   1.568 +    default_init, \
   1.569 +    default_cleanup, \
   1.570 +    pixman_composite_add_8_8_process_pixblock_head, \
   1.571 +    pixman_composite_add_8_8_process_pixblock_tail, \
   1.572 +    pixman_composite_add_8888_8888_process_pixblock_tail_head
   1.573 +
   1.574 +generate_composite_function_single_scanline \
   1.575 +    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
   1.576 +    FLAG_DST_READWRITE, \
   1.577 +    8, /* number of pixels, processed in a single block */ \
   1.578 +    default_init, \
   1.579 +    default_cleanup, \
   1.580 +    pixman_composite_add_8_8_process_pixblock_head, \
   1.581 +    pixman_composite_add_8_8_process_pixblock_tail, \
   1.582 +    pixman_composite_add_8888_8888_process_pixblock_tail_head
   1.583 +
   1.584 +/******************************************************************************/
   1.585 +
   1.586 +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
   1.587 +    vmvn.8      d24, d3  /* get inverted alpha */
   1.588 +    /* do alpha blending */
   1.589 +    vmull.u8    q8, d24, d4
   1.590 +    vmull.u8    q9, d24, d5
   1.591 +    vmull.u8    q10, d24, d6
   1.592 +    vmull.u8    q11, d24, d7
   1.593 +.endm
   1.594 +
   1.595 +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
   1.596 +    vrshr.u16   q14, q8, #8
   1.597 +    vrshr.u16   q15, q9, #8
   1.598 +    vrshr.u16   q12, q10, #8
   1.599 +    vrshr.u16   q13, q11, #8
   1.600 +    vraddhn.u16 d28, q14, q8
   1.601 +    vraddhn.u16 d29, q15, q9
   1.602 +    vraddhn.u16 d30, q12, q10
   1.603 +    vraddhn.u16 d31, q13, q11
   1.604 +.endm
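+
+/*
+ * OUT_REVERSE keeps the destination weighted by the inverted source
+ * alpha: dst = dst * (255 - src.alpha) / 255 per channel. A scalar C
+ * sketch for one channel, using the same rounding trick as before:
+ *
+ *     uint32_t t = d * (255 - sa);            // vmvn + vmull.u8
+ *     d = (t + ((t + 128) >> 8) + 128) >> 8;  // vrshr + vraddhn
+ */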
   1.605 +
   1.606 +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
   1.607 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1.608 +        vrshr.u16   q14, q8, #8
   1.609 +                                    PF add PF_X, PF_X, #8
   1.610 +                                    PF tst PF_CTL, #0xF
   1.611 +        vrshr.u16   q15, q9, #8
   1.612 +        vrshr.u16   q12, q10, #8
   1.613 +        vrshr.u16   q13, q11, #8
   1.614 +                                    PF addne PF_X, PF_X, #8
   1.615 +                                    PF subne PF_CTL, PF_CTL, #1
   1.616 +        vraddhn.u16 d28, q14, q8
   1.617 +        vraddhn.u16 d29, q15, q9
   1.618 +                                    PF cmp PF_X, ORIG_W
   1.619 +        vraddhn.u16 d30, q12, q10
   1.620 +        vraddhn.u16 d31, q13, q11
   1.621 +    fetch_src_pixblock
   1.622 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.623 +    vmvn.8      d22, d3
   1.624 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.625 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1.626 +                                    PF subge PF_X, PF_X, ORIG_W
   1.627 +    vmull.u8    q8, d22, d4
   1.628 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.629 +    vmull.u8    q9, d22, d5
   1.630 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.631 +    vmull.u8    q10, d22, d6
   1.632 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.633 +    vmull.u8    q11, d22, d7
   1.634 +.endm
   1.635 +
   1.636 +generate_composite_function_single_scanline \
   1.637 +    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
   1.638 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.639 +    8, /* number of pixels, processed in a single block */ \
   1.640 +    default_init, \
   1.641 +    default_cleanup, \
   1.642 +    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
   1.643 +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
   1.644 +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
   1.645 +
   1.646 +/******************************************************************************/
   1.647 +
   1.648 +.macro pixman_composite_over_8888_8888_process_pixblock_head
   1.649 +    pixman_composite_out_reverse_8888_8888_process_pixblock_head
   1.650 +.endm
   1.651 +
   1.652 +.macro pixman_composite_over_8888_8888_process_pixblock_tail
   1.653 +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
   1.654 +    vqadd.u8    q14, q0, q14
   1.655 +    vqadd.u8    q15, q1, q15
   1.656 +.endm
   1.657 +
   1.658 +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
   1.659 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1.660 +        vrshr.u16   q14, q8, #8
   1.661 +                                    PF add PF_X, PF_X, #8
   1.662 +                                    PF tst PF_CTL, #0xF
   1.663 +        vrshr.u16   q15, q9, #8
   1.664 +        vrshr.u16   q12, q10, #8
   1.665 +        vrshr.u16   q13, q11, #8
   1.666 +                                    PF addne PF_X, PF_X, #8
   1.667 +                                    PF subne PF_CTL, PF_CTL, #1
   1.668 +        vraddhn.u16 d28, q14, q8
   1.669 +        vraddhn.u16 d29, q15, q9
   1.670 +                                    PF cmp PF_X, ORIG_W
   1.671 +        vraddhn.u16 d30, q12, q10
   1.672 +        vraddhn.u16 d31, q13, q11
   1.673 +        vqadd.u8    q14, q0, q14
   1.674 +        vqadd.u8    q15, q1, q15
   1.675 +    fetch_src_pixblock
   1.676 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   1.677 +    vmvn.8      d22, d3
   1.678 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.679 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1.680 +                                    PF subge PF_X, PF_X, ORIG_W
   1.681 +    vmull.u8    q8, d22, d4
   1.682 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.683 +    vmull.u8    q9, d22, d5
   1.684 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   1.685 +    vmull.u8    q10, d22, d6
   1.686 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.687 +    vmull.u8    q11, d22, d7
   1.688 +.endm
   1.689 +
   1.690 +generate_composite_function \
   1.691 +    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
   1.692 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.693 +    8, /* number of pixels, processed in a single block */ \
   1.694 +    5, /* prefetch distance */ \
   1.695 +    default_init, \
   1.696 +    default_cleanup, \
   1.697 +    pixman_composite_over_8888_8888_process_pixblock_head, \
   1.698 +    pixman_composite_over_8888_8888_process_pixblock_tail, \
   1.699 +    pixman_composite_over_8888_8888_process_pixblock_tail_head
   1.700 +
   1.701 +generate_composite_function_single_scanline \
   1.702 +    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
   1.703 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.704 +    8, /* number of pixels, processed in a single block */ \
   1.705 +    default_init, \
   1.706 +    default_cleanup, \
   1.707 +    pixman_composite_over_8888_8888_process_pixblock_head, \
   1.708 +    pixman_composite_over_8888_8888_process_pixblock_tail, \
   1.709 +    pixman_composite_over_8888_8888_process_pixblock_tail_head
   1.710 +
   1.711 +/******************************************************************************/
   1.712 +
   1.713 +.macro pixman_composite_over_n_8888_process_pixblock_head
   1.714 +    /* deinterleaved source pixels in {d0, d1, d2, d3} */
   1.715 +    /* inverted alpha in {d24} */
   1.716 +    /* destination pixels in {d4, d5, d6, d7} */
   1.717 +    vmull.u8    q8, d24, d4
   1.718 +    vmull.u8    q9, d24, d5
   1.719 +    vmull.u8    q10, d24, d6
   1.720 +    vmull.u8    q11, d24, d7
   1.721 +.endm
   1.722 +
   1.723 +.macro pixman_composite_over_n_8888_process_pixblock_tail
   1.724 +    vrshr.u16   q14, q8, #8
   1.725 +    vrshr.u16   q15, q9, #8
   1.726 +    vrshr.u16   q2, q10, #8
   1.727 +    vrshr.u16   q3, q11, #8
   1.728 +    vraddhn.u16 d28, q14, q8
   1.729 +    vraddhn.u16 d29, q15, q9
   1.730 +    vraddhn.u16 d30, q2, q10
   1.731 +    vraddhn.u16 d31, q3, q11
   1.732 +    vqadd.u8    q14, q0, q14
   1.733 +    vqadd.u8    q15, q1, q15
   1.734 +.endm
   1.735 +
   1.736 +.macro pixman_composite_over_n_8888_process_pixblock_tail_head
   1.737 +        vrshr.u16   q14, q8, #8
   1.738 +        vrshr.u16   q15, q9, #8
   1.739 +        vrshr.u16   q2, q10, #8
   1.740 +        vrshr.u16   q3, q11, #8
   1.741 +        vraddhn.u16 d28, q14, q8
   1.742 +        vraddhn.u16 d29, q15, q9
   1.743 +        vraddhn.u16 d30, q2, q10
   1.744 +        vraddhn.u16 d31, q3, q11
   1.745 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1.746 +        vqadd.u8    q14, q0, q14
   1.747 +                                    PF add PF_X, PF_X, #8
   1.748 +                                    PF tst PF_CTL, #0x0F
   1.749 +                                    PF addne PF_X, PF_X, #8
   1.750 +                                    PF subne PF_CTL, PF_CTL, #1
   1.751 +        vqadd.u8    q15, q1, q15
   1.752 +                                    PF cmp PF_X, ORIG_W
   1.753 +    vmull.u8    q8, d24, d4
   1.754 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.755 +    vmull.u8    q9, d24, d5
   1.756 +                                    PF subge PF_X, PF_X, ORIG_W
   1.757 +    vmull.u8    q10, d24, d6
   1.758 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.759 +    vmull.u8    q11, d24, d7
   1.760 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.761 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1.762 +.endm
   1.763 +
   1.764 +.macro pixman_composite_over_n_8888_init
   1.765 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1.766 +    vld1.32     {d3[0]}, [DUMMY]
   1.767 +    vdup.8      d0, d3[0]
   1.768 +    vdup.8      d1, d3[1]
   1.769 +    vdup.8      d2, d3[2]
   1.770 +    vdup.8      d3, d3[3]
   1.771 +    vmvn.8      d24, d3  /* get inverted alpha */
   1.772 +.endm
   1.773 +
   1.774 +generate_composite_function \
   1.775 +    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
   1.776 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.777 +    8, /* number of pixels, processed in a single block */ \
   1.778 +    5, /* prefetch distance */ \
   1.779 +    pixman_composite_over_n_8888_init, \
   1.780 +    default_cleanup, \
   1.781 +    pixman_composite_over_8888_8888_process_pixblock_head, \
   1.782 +    pixman_composite_over_8888_8888_process_pixblock_tail, \
   1.783 +    pixman_composite_over_n_8888_process_pixblock_tail_head
   1.784 +
   1.785 +/******************************************************************************/
   1.786 +
   1.787 +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
   1.788 +        vrshr.u16   q14, q8, #8
   1.789 +                                    PF add PF_X, PF_X, #8
   1.790 +                                    PF tst PF_CTL, #0xF
   1.791 +        vrshr.u16   q15, q9, #8
   1.792 +        vrshr.u16   q12, q10, #8
   1.793 +        vrshr.u16   q13, q11, #8
   1.794 +                                    PF addne PF_X, PF_X, #8
   1.795 +                                    PF subne PF_CTL, PF_CTL, #1
   1.796 +        vraddhn.u16 d28, q14, q8
   1.797 +        vraddhn.u16 d29, q15, q9
   1.798 +                                    PF cmp PF_X, ORIG_W
   1.799 +        vraddhn.u16 d30, q12, q10
   1.800 +        vraddhn.u16 d31, q13, q11
   1.801 +        vqadd.u8    q14, q0, q14
   1.802 +        vqadd.u8    q15, q1, q15
   1.803 +    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
   1.804 +    vmvn.8      d22, d3
   1.805 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1.806 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1.807 +                                    PF subge PF_X, PF_X, ORIG_W
   1.808 +    vmull.u8    q8, d22, d4
   1.809 +                                    PF subges PF_CTL, PF_CTL, #0x10
   1.810 +    vmull.u8    q9, d22, d5
   1.811 +    vmull.u8    q10, d22, d6
   1.812 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1.813 +    vmull.u8    q11, d22, d7
   1.814 +.endm
   1.815 +
   1.816 +.macro pixman_composite_over_reverse_n_8888_init
   1.817 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1.818 +    vld1.32     {d7[0]}, [DUMMY]
   1.819 +    vdup.8      d4, d7[0]
   1.820 +    vdup.8      d5, d7[1]
   1.821 +    vdup.8      d6, d7[2]
   1.822 +    vdup.8      d7, d7[3]
   1.823 +.endm
   1.824 +
   1.825 +generate_composite_function \
   1.826 +    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
   1.827 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.828 +    8, /* number of pixels, processed in a single block */ \
   1.829 +    5, /* prefetch distance */ \
   1.830 +    pixman_composite_over_reverse_n_8888_init, \
   1.831 +    default_cleanup, \
   1.832 +    pixman_composite_over_8888_8888_process_pixblock_head, \
   1.833 +    pixman_composite_over_8888_8888_process_pixblock_tail, \
   1.834 +    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
   1.835 +    28, /* dst_w_basereg */ \
   1.836 +    0,  /* dst_r_basereg */ \
   1.837 +    4,  /* src_basereg   */ \
   1.838 +    24  /* mask_basereg  */
   1.839 +
   1.840 +/******************************************************************************/
   1.841 +
   1.842 +.macro pixman_composite_over_8888_8_0565_process_pixblock_head
   1.843 +    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
   1.844 +    vmull.u8    q1,  d24, d9
   1.845 +    vmull.u8    q6,  d24, d10
   1.846 +    vmull.u8    q7,  d24, d11
   1.847 +        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
   1.848 +        vshrn.u16   d7,  q2, #3
   1.849 +        vsli.u16    q2,  q2, #5
   1.850 +    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
   1.851 +    vrshr.u16   q9,  q1,  #8
   1.852 +    vrshr.u16   q10, q6,  #8
   1.853 +    vrshr.u16   q11, q7,  #8
   1.854 +    vraddhn.u16 d0,  q0,  q8
   1.855 +    vraddhn.u16 d1,  q1,  q9
   1.856 +    vraddhn.u16 d2,  q6,  q10
   1.857 +    vraddhn.u16 d3,  q7,  q11
   1.858 +        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
   1.859 +        vsri.u8     d7,  d7, #6
   1.860 +    vmvn.8      d3,  d3
   1.861 +        vshrn.u16   d30, q2, #2
   1.862 +    vmull.u8    q8,  d3, d6     /* now do alpha blending */
   1.863 +    vmull.u8    q9,  d3, d7
   1.864 +    vmull.u8    q10, d3, d30
   1.865 +.endm
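+
+/*
+ * The IN step at the top of the head macro multiplies every source
+ * channel by the 8-bit mask before the usual OVER. A scalar C sketch
+ * for one channel:
+ *
+ *     uint32_t t = s * m;                     // vmull.u8
+ *     s = (t + ((t + 128) >> 8) + 128) >> 8;  // vrshr + vraddhn,
+ *                                             // i.e. s * m / 255 rounded
+ */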
   1.866 +
   1.867 +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
   1.868 +    /* 3 cycle bubble (after vmull.u8) */
   1.869 +    vrshr.u16   q13, q8,  #8
   1.870 +    vrshr.u16   q11, q9,  #8
   1.871 +    vrshr.u16   q15, q10, #8
   1.872 +    vraddhn.u16 d16, q8,  q13
   1.873 +    vraddhn.u16 d27, q9,  q11
   1.874 +    vraddhn.u16 d26, q10, q15
   1.875 +    vqadd.u8    d16, d2,  d16
   1.876 +    /* 1 cycle bubble */
   1.877 +    vqadd.u8    q9,  q0,  q13
   1.878 +    vshll.u8    q14, d16, #8    /* convert to 16bpp */
   1.879 +    vshll.u8    q8,  d19, #8
   1.880 +    vshll.u8    q9,  d18, #8
   1.881 +    vsri.u16    q14, q8,  #5
   1.882 +    /* 1 cycle bubble */
   1.883 +    vsri.u16    q14, q9,  #11
   1.884 +.endm
   1.885 +
   1.886 +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
   1.887 +    vld1.16     {d4, d5}, [DST_R, :128]!
   1.888 +    vshrn.u16   d6,  q2,  #8
   1.889 +    fetch_mask_pixblock
   1.890 +    vshrn.u16   d7,  q2,  #3
   1.891 +    fetch_src_pixblock
   1.892 +    vmull.u8    q6,  d24, d10
   1.893 +        vrshr.u16   q13, q8,  #8
   1.894 +        vrshr.u16   q11, q9,  #8
   1.895 +        vrshr.u16   q15, q10, #8
   1.896 +        vraddhn.u16 d16, q8,  q13
   1.897 +        vraddhn.u16 d27, q9,  q11
   1.898 +        vraddhn.u16 d26, q10, q15
   1.899 +        vqadd.u8    d16, d2,  d16
   1.900 +    vmull.u8    q1,  d24, d9
   1.901 +        vqadd.u8    q9,  q0,  q13
   1.902 +        vshll.u8    q14, d16, #8
   1.903 +    vmull.u8    q0,  d24, d8
   1.904 +        vshll.u8    q8,  d19, #8
   1.905 +        vshll.u8    q9,  d18, #8
   1.906 +        vsri.u16    q14, q8,  #5
   1.907 +    vmull.u8    q7,  d24, d11
   1.908 +        vsri.u16    q14, q9,  #11
   1.909 +
   1.910 +    cache_preload 8, 8
   1.911 +
   1.912 +    vsli.u16    q2,  q2,  #5
   1.913 +    vrshr.u16   q8,  q0,  #8
   1.914 +    vrshr.u16   q9,  q1,  #8
   1.915 +    vrshr.u16   q10, q6,  #8
   1.916 +    vrshr.u16   q11, q7,  #8
   1.917 +    vraddhn.u16 d0,  q0,  q8
   1.918 +    vraddhn.u16 d1,  q1,  q9
   1.919 +    vraddhn.u16 d2,  q6,  q10
   1.920 +    vraddhn.u16 d3,  q7,  q11
   1.921 +    vsri.u8     d6,  d6,  #5
   1.922 +    vsri.u8     d7,  d7,  #6
   1.923 +    vmvn.8      d3,  d3
   1.924 +    vshrn.u16   d30, q2,  #2
   1.925 +    vst1.16     {d28, d29}, [DST_W, :128]!
   1.926 +    vmull.u8    q8,  d3,  d6
   1.927 +    vmull.u8    q9,  d3,  d7
   1.928 +    vmull.u8    q10, d3,  d30
   1.929 +.endm
   1.930 +
   1.931 +generate_composite_function \
   1.932 +    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
   1.933 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.934 +    8, /* number of pixels, processed in a single block */ \
   1.935 +    5, /* prefetch distance */ \
   1.936 +    default_init_need_all_regs, \
   1.937 +    default_cleanup_need_all_regs, \
   1.938 +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
   1.939 +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   1.940 +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
   1.941 +    28, /* dst_w_basereg */ \
   1.942 +    4,  /* dst_r_basereg */ \
   1.943 +    8,  /* src_basereg   */ \
   1.944 +    24  /* mask_basereg  */
   1.945 +
   1.946 +/******************************************************************************/
   1.947 +
   1.948 +/*
    1.949 + * This function needs a special initialization for the solid source.
    1.950 + * Solid source pixel data is fetched from the stack at the ARGS_STACK_OFFSET
    1.951 + * offset, split into color components and replicated into the d8-d11
    1.952 + * registers. Additionally, this function needs all the NEON registers,
    1.953 + * so it has to save the d8-d15 registers, which are callee-saved according
    1.954 + * to the ABI. These registers are restored in the 'cleanup' macro. All
    1.955 + * the other NEON registers are caller-saved, so they can be clobbered
    1.956 + * freely without introducing any problems.
   1.957 + */
   1.958 +.macro pixman_composite_over_n_8_0565_init
   1.959 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
   1.960 +    .vsave      {d8-d15}
   1.961 +    vpush       {d8-d15}
   1.962 +    vld1.32     {d11[0]}, [DUMMY]
   1.963 +    vdup.8      d8, d11[0]
   1.964 +    vdup.8      d9, d11[1]
   1.965 +    vdup.8      d10, d11[2]
   1.966 +    vdup.8      d11, d11[3]
   1.967 +.endm
   1.968 +
   1.969 +.macro pixman_composite_over_n_8_0565_cleanup
   1.970 +    vpop        {d8-d15}
   1.971 +.endm
   1.972 +
   1.973 +generate_composite_function \
   1.974 +    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
   1.975 +    FLAG_DST_READWRITE, \
   1.976 +    8, /* number of pixels, processed in a single block */ \
   1.977 +    5, /* prefetch distance */ \
   1.978 +    pixman_composite_over_n_8_0565_init, \
   1.979 +    pixman_composite_over_n_8_0565_cleanup, \
   1.980 +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
   1.981 +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   1.982 +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
   1.983 +
   1.984 +/******************************************************************************/
   1.985 +
   1.986 +.macro pixman_composite_over_8888_n_0565_init
   1.987 +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
   1.988 +    .vsave      {d8-d15}
   1.989 +    vpush       {d8-d15}
   1.990 +    vld1.32     {d24[0]}, [DUMMY]
   1.991 +    vdup.8      d24, d24[3]
   1.992 +.endm
   1.993 +
   1.994 +.macro pixman_composite_over_8888_n_0565_cleanup
   1.995 +    vpop        {d8-d15}
   1.996 +.endm
   1.997 +
   1.998 +generate_composite_function \
   1.999 +    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
  1.1000 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.1001 +    8, /* number of pixels, processed in a single block */ \
  1.1002 +    5, /* prefetch distance */ \
  1.1003 +    pixman_composite_over_8888_n_0565_init, \
  1.1004 +    pixman_composite_over_8888_n_0565_cleanup, \
  1.1005 +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
  1.1006 +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
  1.1007 +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
  1.1008 +    28, /* dst_w_basereg */ \
  1.1009 +    4,  /* dst_r_basereg */ \
  1.1010 +    8,  /* src_basereg   */ \
  1.1011 +    24  /* mask_basereg  */
  1.1012 +
  1.1013 +/******************************************************************************/
  1.1014 +
  1.1015 +.macro pixman_composite_src_0565_0565_process_pixblock_head
  1.1016 +.endm
  1.1017 +
  1.1018 +.macro pixman_composite_src_0565_0565_process_pixblock_tail
  1.1019 +.endm
  1.1020 +
  1.1021 +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
  1.1022 +    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
  1.1023 +    fetch_src_pixblock
  1.1024 +    cache_preload 16, 16
  1.1025 +.endm
  1.1026 +
  1.1027 +generate_composite_function \
  1.1028 +    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
  1.1029 +    FLAG_DST_WRITEONLY, \
  1.1030 +    16, /* number of pixels, processed in a single block */ \
  1.1031 +    10, /* prefetch distance */ \
  1.1032 +    default_init, \
  1.1033 +    default_cleanup, \
  1.1034 +    pixman_composite_src_0565_0565_process_pixblock_head, \
  1.1035 +    pixman_composite_src_0565_0565_process_pixblock_tail, \
  1.1036 +    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
  1.1037 +    0, /* dst_w_basereg */ \
  1.1038 +    0, /* dst_r_basereg */ \
  1.1039 +    0, /* src_basereg   */ \
  1.1040 +    0  /* mask_basereg  */
  1.1041 +
  1.1042 +/******************************************************************************/
  1.1043 +
  1.1044 +.macro pixman_composite_src_n_8_process_pixblock_head
  1.1045 +.endm
  1.1046 +
  1.1047 +.macro pixman_composite_src_n_8_process_pixblock_tail
  1.1048 +.endm
  1.1049 +
  1.1050 +.macro pixman_composite_src_n_8_process_pixblock_tail_head
  1.1051 +    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
  1.1052 +.endm
  1.1053 +
  1.1054 +.macro pixman_composite_src_n_8_init
  1.1055 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1056 +    vld1.32     {d0[0]}, [DUMMY]
  1.1057 +    vsli.u64    d0, d0, #8
  1.1058 +    vsli.u64    d0, d0, #16
  1.1059 +    vsli.u64    d0, d0, #32
  1.1060 +    vorr        d1, d0, d0
  1.1061 +    vorr        q1, q0, q0
  1.1062 +.endm
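+
+/*
+ * The three vsli.u64 instructions splat the low byte of the solid
+ * color across the whole 64-bit register, doubling the filled width
+ * on each step; the vorr instructions then copy it to d1-d3. A C
+ * sketch of the splat:
+ *
+ *     uint64_t v = color & 0xff;
+ *     v |= v << 8;     // vsli.u64 #8  -> 2 identical bytes
+ *     v |= v << 16;    // vsli.u64 #16 -> 4 identical bytes
+ *     v |= v << 32;    // vsli.u64 #32 -> 8 identical bytes
+ */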
  1.1063 +
  1.1064 +.macro pixman_composite_src_n_8_cleanup
  1.1065 +.endm
  1.1066 +
  1.1067 +generate_composite_function \
  1.1068 +    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
  1.1069 +    FLAG_DST_WRITEONLY, \
  1.1070 +    32, /* number of pixels, processed in a single block */ \
  1.1071 +    0,  /* prefetch distance */ \
  1.1072 +    pixman_composite_src_n_8_init, \
  1.1073 +    pixman_composite_src_n_8_cleanup, \
  1.1074 +    pixman_composite_src_n_8_process_pixblock_head, \
  1.1075 +    pixman_composite_src_n_8_process_pixblock_tail, \
  1.1076 +    pixman_composite_src_n_8_process_pixblock_tail_head, \
  1.1077 +    0, /* dst_w_basereg */ \
  1.1078 +    0, /* dst_r_basereg */ \
  1.1079 +    0, /* src_basereg   */ \
  1.1080 +    0  /* mask_basereg  */
  1.1081 +
  1.1082 +/******************************************************************************/
  1.1083 +
  1.1084 +.macro pixman_composite_src_n_0565_process_pixblock_head
  1.1085 +.endm
  1.1086 +
  1.1087 +.macro pixman_composite_src_n_0565_process_pixblock_tail
  1.1088 +.endm
  1.1089 +
  1.1090 +.macro pixman_composite_src_n_0565_process_pixblock_tail_head
  1.1091 +    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
  1.1092 +.endm
  1.1093 +
  1.1094 +.macro pixman_composite_src_n_0565_init
  1.1095 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1096 +    vld1.32     {d0[0]}, [DUMMY]
  1.1097 +    vsli.u64    d0, d0, #16
  1.1098 +    vsli.u64    d0, d0, #32
  1.1099 +    vorr        d1, d0, d0
  1.1100 +    vorr        q1, q0, q0
  1.1101 +.endm
  1.1102 +
  1.1103 +.macro pixman_composite_src_n_0565_cleanup
  1.1104 +.endm
  1.1105 +
  1.1106 +generate_composite_function \
  1.1107 +    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
  1.1108 +    FLAG_DST_WRITEONLY, \
  1.1109 +    16, /* number of pixels, processed in a single block */ \
  1.1110 +    0,  /* prefetch distance */ \
  1.1111 +    pixman_composite_src_n_0565_init, \
  1.1112 +    pixman_composite_src_n_0565_cleanup, \
  1.1113 +    pixman_composite_src_n_0565_process_pixblock_head, \
  1.1114 +    pixman_composite_src_n_0565_process_pixblock_tail, \
  1.1115 +    pixman_composite_src_n_0565_process_pixblock_tail_head, \
  1.1116 +    0, /* dst_w_basereg */ \
  1.1117 +    0, /* dst_r_basereg */ \
  1.1118 +    0, /* src_basereg   */ \
  1.1119 +    0  /* mask_basereg  */
  1.1120 +
  1.1121 +/******************************************************************************/
  1.1122 +
  1.1123 +.macro pixman_composite_src_n_8888_process_pixblock_head
  1.1124 +.endm
  1.1125 +
  1.1126 +.macro pixman_composite_src_n_8888_process_pixblock_tail
  1.1127 +.endm
  1.1128 +
  1.1129 +.macro pixman_composite_src_n_8888_process_pixblock_tail_head
  1.1130 +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
  1.1131 +.endm
  1.1132 +
  1.1133 +.macro pixman_composite_src_n_8888_init
  1.1134 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1135 +    vld1.32     {d0[0]}, [DUMMY]
  1.1136 +    vsli.u64    d0, d0, #32
  1.1137 +    vorr        d1, d0, d0
  1.1138 +    vorr        q1, q0, q0
  1.1139 +.endm
  1.1140 +
  1.1141 +.macro pixman_composite_src_n_8888_cleanup
  1.1142 +.endm
  1.1143 +
  1.1144 +generate_composite_function \
  1.1145 +    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
  1.1146 +    FLAG_DST_WRITEONLY, \
  1.1147 +    8, /* number of pixels, processed in a single block */ \
  1.1148 +    0, /* prefetch distance */ \
  1.1149 +    pixman_composite_src_n_8888_init, \
  1.1150 +    pixman_composite_src_n_8888_cleanup, \
  1.1151 +    pixman_composite_src_n_8888_process_pixblock_head, \
  1.1152 +    pixman_composite_src_n_8888_process_pixblock_tail, \
  1.1153 +    pixman_composite_src_n_8888_process_pixblock_tail_head, \
  1.1154 +    0, /* dst_w_basereg */ \
  1.1155 +    0, /* dst_r_basereg */ \
  1.1156 +    0, /* src_basereg   */ \
  1.1157 +    0  /* mask_basereg  */
  1.1158 +
  1.1159 +/******************************************************************************/
  1.1160 +
  1.1161 +.macro pixman_composite_src_8888_8888_process_pixblock_head
  1.1162 +.endm
  1.1163 +
  1.1164 +.macro pixman_composite_src_8888_8888_process_pixblock_tail
  1.1165 +.endm
  1.1166 +
  1.1167 +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
  1.1168 +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
  1.1169 +    fetch_src_pixblock
  1.1170 +    cache_preload 8, 8
  1.1171 +.endm
  1.1172 +
  1.1173 +generate_composite_function \
  1.1174 +    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
  1.1175 +    FLAG_DST_WRITEONLY, \
  1.1176 +    8, /* number of pixels, processed in a single block */ \
  1.1177 +    10, /* prefetch distance */ \
  1.1178 +    default_init, \
  1.1179 +    default_cleanup, \
  1.1180 +    pixman_composite_src_8888_8888_process_pixblock_head, \
  1.1181 +    pixman_composite_src_8888_8888_process_pixblock_tail, \
  1.1182 +    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
  1.1183 +    0, /* dst_w_basereg */ \
  1.1184 +    0, /* dst_r_basereg */ \
  1.1185 +    0, /* src_basereg   */ \
  1.1186 +    0  /* mask_basereg  */
  1.1187 +
  1.1188 +/******************************************************************************/
  1.1189 +
  1.1190 +.macro pixman_composite_src_x888_8888_process_pixblock_head
  1.1191 +    vorr     q0, q0, q2
  1.1192 +    vorr     q1, q1, q2
  1.1193 +.endm
  1.1194 +
  1.1195 +.macro pixman_composite_src_x888_8888_process_pixblock_tail
  1.1196 +.endm
  1.1197 +
  1.1198 +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
  1.1199 +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
  1.1200 +    fetch_src_pixblock
  1.1201 +    vorr     q0, q0, q2
  1.1202 +    vorr     q1, q1, q2
  1.1203 +    cache_preload 8, 8
  1.1204 +.endm
  1.1205 +
  1.1206 +.macro pixman_composite_src_x888_8888_init
  1.1207 +    vmov.u8  q2, #0xFF
  1.1208 +    vshl.u32 q2, q2, #24
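         +    /* q2 now holds 0xFF000000 in every 32-bit lane; ORing it into
         +     * the x8r8g8b8 source (see the head macro) forces alpha to 0xFF */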
  1.1209 +.endm
  1.1210 +
  1.1211 +generate_composite_function \
  1.1212 +    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
  1.1213 +    FLAG_DST_WRITEONLY, \
  1.1214 +    8, /* number of pixels, processed in a single block */ \
  1.1215 +    10, /* prefetch distance */ \
  1.1216 +    pixman_composite_src_x888_8888_init, \
  1.1217 +    default_cleanup, \
  1.1218 +    pixman_composite_src_x888_8888_process_pixblock_head, \
  1.1219 +    pixman_composite_src_x888_8888_process_pixblock_tail, \
  1.1220 +    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
  1.1221 +    0, /* dst_w_basereg */ \
  1.1222 +    0, /* dst_r_basereg */ \
  1.1223 +    0, /* src_basereg   */ \
  1.1224 +    0  /* mask_basereg  */
  1.1225 +
  1.1226 +/******************************************************************************/
  1.1227 +
  1.1228 +.macro pixman_composite_src_n_8_8888_process_pixblock_head
  1.1229 +    /* expecting solid source in {d0, d1, d2, d3} */
  1.1230 +    /* mask is in d24 (d25, d26, d27 are unused) */
  1.1231 +
  1.1232 +    /* in */
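         +    /*
         +     * multiply each channel by the a8 mask: vmull gives t = m * s as
         +     * a 16-bit product, vrsra adds back (t + 128) >> 8, and the
         +     * rounding narrow (vrshrn #8) in the tail finishes the usual
         +     * NEON idiom for t / 255 with rounding
         +     */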
  1.1233 +    vmull.u8    q8, d24, d0
  1.1234 +    vmull.u8    q9, d24, d1
  1.1235 +    vmull.u8    q10, d24, d2
  1.1236 +    vmull.u8    q11, d24, d3
  1.1237 +    vrsra.u16   q8, q8, #8
  1.1238 +    vrsra.u16   q9, q9, #8
  1.1239 +    vrsra.u16   q10, q10, #8
  1.1240 +    vrsra.u16   q11, q11, #8
  1.1241 +.endm
  1.1242 +
  1.1243 +.macro pixman_composite_src_n_8_8888_process_pixblock_tail
  1.1244 +    vrshrn.u16  d28, q8, #8
  1.1245 +    vrshrn.u16  d29, q9, #8
  1.1246 +    vrshrn.u16  d30, q10, #8
  1.1247 +    vrshrn.u16  d31, q11, #8
  1.1248 +.endm
  1.1249 +
  1.1250 +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
  1.1251 +    fetch_mask_pixblock
  1.1252 +                                    PF add PF_X, PF_X, #8
  1.1253 +        vrshrn.u16  d28, q8, #8
  1.1254 +                                    PF tst PF_CTL, #0x0F
  1.1255 +        vrshrn.u16  d29, q9, #8
  1.1256 +                                    PF addne PF_X, PF_X, #8
  1.1257 +        vrshrn.u16  d30, q10, #8
  1.1258 +                                    PF subne PF_CTL, PF_CTL, #1
  1.1259 +        vrshrn.u16  d31, q11, #8
  1.1260 +                                    PF cmp PF_X, ORIG_W
  1.1261 +    vmull.u8    q8, d24, d0
  1.1262 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1.1263 +    vmull.u8    q9, d24, d1
  1.1264 +                                    PF subge PF_X, PF_X, ORIG_W
  1.1265 +    vmull.u8    q10, d24, d2
  1.1266 +                                    PF subges PF_CTL, PF_CTL, #0x10
  1.1267 +    vmull.u8    q11, d24, d3
  1.1268 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1.1269 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1270 +    vrsra.u16   q8, q8, #8
  1.1271 +    vrsra.u16   q9, q9, #8
  1.1272 +    vrsra.u16   q10, q10, #8
  1.1273 +    vrsra.u16   q11, q11, #8
  1.1274 +.endm
  1.1275 +
  1.1276 +.macro pixman_composite_src_n_8_8888_init
  1.1277 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1278 +    vld1.32     {d3[0]}, [DUMMY]
  1.1279 +    vdup.8      d0, d3[0]
  1.1280 +    vdup.8      d1, d3[1]
  1.1281 +    vdup.8      d2, d3[2]
  1.1282 +    vdup.8      d3, d3[3]
  1.1283 +.endm
  1.1284 +
  1.1285 +.macro pixman_composite_src_n_8_8888_cleanup
  1.1286 +.endm
  1.1287 +
  1.1288 +generate_composite_function \
  1.1289 +    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
  1.1290 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  1.1291 +    8, /* number of pixels, processed in a single block */ \
  1.1292 +    5, /* prefetch distance */ \
  1.1293 +    pixman_composite_src_n_8_8888_init, \
  1.1294 +    pixman_composite_src_n_8_8888_cleanup, \
  1.1295 +    pixman_composite_src_n_8_8888_process_pixblock_head, \
  1.1296 +    pixman_composite_src_n_8_8888_process_pixblock_tail, \
  1.1297 +    pixman_composite_src_n_8_8888_process_pixblock_tail_head
  1.1298 +
  1.1299 +/******************************************************************************/
  1.1300 +
  1.1301 +.macro pixman_composite_src_n_8_8_process_pixblock_head
  1.1302 +    vmull.u8    q0, d24, d16
  1.1303 +    vmull.u8    q1, d25, d16
  1.1304 +    vmull.u8    q2, d26, d16
  1.1305 +    vmull.u8    q3, d27, d16
  1.1306 +    vrsra.u16   q0, q0,  #8
  1.1307 +    vrsra.u16   q1, q1,  #8
  1.1308 +    vrsra.u16   q2, q2,  #8
  1.1309 +    vrsra.u16   q3, q3,  #8
  1.1310 +.endm
  1.1311 +
  1.1312 +.macro pixman_composite_src_n_8_8_process_pixblock_tail
  1.1313 +    vrshrn.u16  d28, q0, #8
  1.1314 +    vrshrn.u16  d29, q1, #8
  1.1315 +    vrshrn.u16  d30, q2, #8
  1.1316 +    vrshrn.u16  d31, q3, #8
  1.1317 +.endm
  1.1318 +
  1.1319 +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
  1.1320 +    fetch_mask_pixblock
  1.1321 +                                    PF add PF_X, PF_X, #8
  1.1322 +        vrshrn.u16  d28, q0, #8
  1.1323 +                                    PF tst PF_CTL, #0x0F
  1.1324 +        vrshrn.u16  d29, q1, #8
  1.1325 +                                    PF addne PF_X, PF_X, #8
  1.1326 +        vrshrn.u16  d30, q2, #8
  1.1327 +                                    PF subne PF_CTL, PF_CTL, #1
  1.1328 +        vrshrn.u16  d31, q3, #8
  1.1329 +                                    PF cmp PF_X, ORIG_W
  1.1330 +    vmull.u8    q0,  d24, d16
  1.1331 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1.1332 +    vmull.u8    q1,  d25, d16
  1.1333 +                                    PF subge PF_X, PF_X, ORIG_W
  1.1334 +    vmull.u8    q2,  d26, d16
  1.1335 +                                    PF subges PF_CTL, PF_CTL, #0x10
  1.1336 +    vmull.u8    q3,  d27, d16
  1.1337 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1.1338 +        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1339 +    vrsra.u16   q0, q0,  #8
  1.1340 +    vrsra.u16   q1, q1,  #8
  1.1341 +    vrsra.u16   q2, q2,  #8
  1.1342 +    vrsra.u16   q3, q3,  #8
  1.1343 +.endm
  1.1344 +
  1.1345 +.macro pixman_composite_src_n_8_8_init
  1.1346 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1347 +    vld1.32     {d16[0]}, [DUMMY]
  1.1348 +    vdup.8      d16, d16[3]
  1.1349 +.endm
  1.1350 +
  1.1351 +.macro pixman_composite_src_n_8_8_cleanup
  1.1352 +.endm
  1.1353 +
  1.1354 +generate_composite_function \
  1.1355 +    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
  1.1356 +    FLAG_DST_WRITEONLY, \
  1.1357 +    32, /* number of pixels, processed in a single block */ \
  1.1358 +    5, /* prefetch distance */ \
  1.1359 +    pixman_composite_src_n_8_8_init, \
  1.1360 +    pixman_composite_src_n_8_8_cleanup, \
  1.1361 +    pixman_composite_src_n_8_8_process_pixblock_head, \
  1.1362 +    pixman_composite_src_n_8_8_process_pixblock_tail, \
  1.1363 +    pixman_composite_src_n_8_8_process_pixblock_tail_head
  1.1364 +
  1.1365 +/******************************************************************************/
  1.1366 +
  1.1367 +.macro pixman_composite_over_n_8_8888_process_pixblock_head
  1.1368 +    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
  1.1369 +    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
  1.1370 +    /* and destination data in {d4, d5, d6, d7} */
  1.1371 +    /* mask is in d24 (d25, d26, d27 are unused) */
  1.1372 +
  1.1373 +    /* in */
  1.1374 +    vmull.u8    q6, d24, d8
  1.1375 +    vmull.u8    q7, d24, d9
  1.1376 +    vmull.u8    q8, d24, d10
  1.1377 +    vmull.u8    q9, d24, d11
  1.1378 +    vrshr.u16   q10, q6, #8
  1.1379 +    vrshr.u16   q11, q7, #8
  1.1380 +    vrshr.u16   q12, q8, #8
  1.1381 +    vrshr.u16   q13, q9, #8
  1.1382 +    vraddhn.u16 d0, q6, q10
  1.1383 +    vraddhn.u16 d1, q7, q11
  1.1384 +    vraddhn.u16 d2, q8, q12
  1.1385 +    vraddhn.u16 d3, q9, q13
  1.1386 +    vmvn.8      d25, d3  /* get inverted alpha */
  1.1387 +    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
  1.1388 +    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
  1.1389 +    /* now do alpha blending */
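         +    /* OVER: dst' = src' + dst * (255 - alpha(src')) / 255; the final
         +     * saturating add (vqadd) is done in the tail */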
  1.1390 +    vmull.u8    q8, d25, d4
  1.1391 +    vmull.u8    q9, d25, d5
  1.1392 +    vmull.u8    q10, d25, d6
  1.1393 +    vmull.u8    q11, d25, d7
  1.1394 +.endm
  1.1395 +
  1.1396 +.macro pixman_composite_over_n_8_8888_process_pixblock_tail
  1.1397 +    vrshr.u16   q14, q8, #8
  1.1398 +    vrshr.u16   q15, q9, #8
  1.1399 +    vrshr.u16   q6, q10, #8
  1.1400 +    vrshr.u16   q7, q11, #8
  1.1401 +    vraddhn.u16 d28, q14, q8
  1.1402 +    vraddhn.u16 d29, q15, q9
  1.1403 +    vraddhn.u16 d30, q6, q10
  1.1404 +    vraddhn.u16 d31, q7, q11
  1.1405 +    vqadd.u8    q14, q0, q14
  1.1406 +    vqadd.u8    q15, q1, q15
  1.1407 +.endm
  1.1408 +
  1.1409 +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
  1.1410 +        vrshr.u16   q14, q8, #8
  1.1411 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1412 +        vrshr.u16   q15, q9, #8
  1.1413 +    fetch_mask_pixblock
  1.1414 +        vrshr.u16   q6, q10, #8
  1.1415 +                                    PF add PF_X, PF_X, #8
  1.1416 +        vrshr.u16   q7, q11, #8
  1.1417 +                                    PF tst PF_CTL, #0x0F
  1.1418 +        vraddhn.u16 d28, q14, q8
  1.1419 +                                    PF addne PF_X, PF_X, #8
  1.1420 +        vraddhn.u16 d29, q15, q9
  1.1421 +                                    PF subne PF_CTL, PF_CTL, #1
  1.1422 +        vraddhn.u16 d30, q6, q10
  1.1423 +                                    PF cmp PF_X, ORIG_W
  1.1424 +        vraddhn.u16 d31, q7, q11
  1.1425 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
  1.1426 +    vmull.u8    q6, d24, d8
  1.1427 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1.1428 +    vmull.u8    q7, d24, d9
  1.1429 +                                    PF subge PF_X, PF_X, ORIG_W
  1.1430 +    vmull.u8    q8, d24, d10
  1.1431 +                                    PF subges PF_CTL, PF_CTL, #0x10
  1.1432 +    vmull.u8    q9, d24, d11
  1.1433 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
  1.1434 +        vqadd.u8    q14, q0, q14
  1.1435 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1.1436 +        vqadd.u8    q15, q1, q15
  1.1437 +    vrshr.u16   q10, q6, #8
  1.1438 +    vrshr.u16   q11, q7, #8
  1.1439 +    vrshr.u16   q12, q8, #8
  1.1440 +    vrshr.u16   q13, q9, #8
  1.1441 +    vraddhn.u16 d0, q6, q10
  1.1442 +    vraddhn.u16 d1, q7, q11
  1.1443 +    vraddhn.u16 d2, q8, q12
  1.1444 +    vraddhn.u16 d3, q9, q13
  1.1445 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1446 +    vmvn.8      d25, d3
  1.1447 +    vmull.u8    q8, d25, d4
  1.1448 +    vmull.u8    q9, d25, d5
  1.1449 +    vmull.u8    q10, d25, d6
  1.1450 +    vmull.u8    q11, d25, d7
  1.1451 +.endm
  1.1452 +
  1.1453 +.macro pixman_composite_over_n_8_8888_init
  1.1454 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1455 +    .vsave      {d8-d15}
  1.1456 +    vpush       {d8-d15}
  1.1457 +    vld1.32     {d11[0]}, [DUMMY]
  1.1458 +    vdup.8      d8, d11[0]
  1.1459 +    vdup.8      d9, d11[1]
  1.1460 +    vdup.8      d10, d11[2]
  1.1461 +    vdup.8      d11, d11[3]
  1.1462 +.endm
  1.1463 +
  1.1464 +.macro pixman_composite_over_n_8_8888_cleanup
  1.1465 +    vpop        {d8-d15}
  1.1466 +.endm
  1.1467 +
  1.1468 +generate_composite_function \
  1.1469 +    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
  1.1470 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.1471 +    8, /* number of pixels, processed in a single block */ \
  1.1472 +    5, /* prefetch distance */ \
  1.1473 +    pixman_composite_over_n_8_8888_init, \
  1.1474 +    pixman_composite_over_n_8_8888_cleanup, \
  1.1475 +    pixman_composite_over_n_8_8888_process_pixblock_head, \
  1.1476 +    pixman_composite_over_n_8_8888_process_pixblock_tail, \
  1.1477 +    pixman_composite_over_n_8_8888_process_pixblock_tail_head
  1.1478 +
  1.1479 +/******************************************************************************/
  1.1480 +
  1.1481 +.macro pixman_composite_over_n_8_8_process_pixblock_head
  1.1482 +    vmull.u8    q0,  d24, d8
  1.1483 +    vmull.u8    q1,  d25, d8
  1.1484 +    vmull.u8    q6,  d26, d8
  1.1485 +    vmull.u8    q7,  d27, d8
  1.1486 +    vrshr.u16   q10, q0,  #8
  1.1487 +    vrshr.u16   q11, q1,  #8
  1.1488 +    vrshr.u16   q12, q6,  #8
  1.1489 +    vrshr.u16   q13, q7,  #8
  1.1490 +    vraddhn.u16 d0,  q0,  q10
  1.1491 +    vraddhn.u16 d1,  q1,  q11
  1.1492 +    vraddhn.u16 d2,  q6,  q12
  1.1493 +    vraddhn.u16 d3,  q7,  q13
  1.1494 +    vmvn.8      q12, q0
  1.1495 +    vmvn.8      q13, q1
  1.1496 +    vmull.u8    q8,  d24, d4
  1.1497 +    vmull.u8    q9,  d25, d5
  1.1498 +    vmull.u8    q10, d26, d6
  1.1499 +    vmull.u8    q11, d27, d7
  1.1500 +.endm
  1.1501 +
  1.1502 +.macro pixman_composite_over_n_8_8_process_pixblock_tail
  1.1503 +    vrshr.u16   q14, q8,  #8
  1.1504 +    vrshr.u16   q15, q9,  #8
  1.1505 +    vrshr.u16   q12, q10, #8
  1.1506 +    vrshr.u16   q13, q11, #8
  1.1507 +    vraddhn.u16 d28, q14, q8
  1.1508 +    vraddhn.u16 d29, q15, q9
  1.1509 +    vraddhn.u16 d30, q12, q10
  1.1510 +    vraddhn.u16 d31, q13, q11
  1.1511 +    vqadd.u8    q14, q0,  q14
  1.1512 +    vqadd.u8    q15, q1,  q15
  1.1513 +.endm
  1.1514 +
  1.1515 +/* TODO: expand macros and do better instruction scheduling */
  1.1516 +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
  1.1517 +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1518 +    pixman_composite_over_n_8_8_process_pixblock_tail
  1.1519 +    fetch_mask_pixblock
  1.1520 +    cache_preload 32, 32
  1.1521 +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1522 +    pixman_composite_over_n_8_8_process_pixblock_head
  1.1523 +.endm
  1.1524 +
  1.1525 +.macro pixman_composite_over_n_8_8_init
  1.1526 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1527 +    .vsave      {d8-d15}
  1.1528 +    vpush       {d8-d15}
  1.1529 +    vld1.32     {d8[0]}, [DUMMY]
  1.1530 +    vdup.8      d8, d8[3]
  1.1531 +.endm
  1.1532 +
  1.1533 +.macro pixman_composite_over_n_8_8_cleanup
  1.1534 +    vpop        {d8-d15}
  1.1535 +.endm
  1.1536 +
  1.1537 +generate_composite_function \
  1.1538 +    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
  1.1539 +    FLAG_DST_READWRITE, \
  1.1540 +    32, /* number of pixels, processed in a single block */ \
  1.1541 +    5, /* prefetch distance */ \
  1.1542 +    pixman_composite_over_n_8_8_init, \
  1.1543 +    pixman_composite_over_n_8_8_cleanup, \
  1.1544 +    pixman_composite_over_n_8_8_process_pixblock_head, \
  1.1545 +    pixman_composite_over_n_8_8_process_pixblock_tail, \
  1.1546 +    pixman_composite_over_n_8_8_process_pixblock_tail_head
  1.1547 +
  1.1548 +/******************************************************************************/
  1.1549 +
  1.1550 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
  1.1551 +    /*
  1.1552 +     * 'combine_mask_ca' replacement
  1.1553 +     *
  1.1554 +     * input:  solid src (n) in {d8,  d9,  d10, d11}
  1.1555 +     *         dest in          {d4,  d5,  d6,  d7 }
  1.1556 +     *         mask in          {d24, d25, d26, d27}
  1.1557 +     * output: updated src in   {d0,  d1,  d2,  d3 }
  1.1558 +     *         updated mask in  {d24, d25, d26, d3 }
  1.1559 +     */
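         +    /*
         +     * i.e. per channel: src' = src * mask / 255 and
         +     * mask' = mask * alpha(src) / 255; the alpha lanes of src' and
         +     * mask' are the same product, which is why d3 serves both outputs
         +     */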
  1.1560 +    vmull.u8    q0,  d24, d8
  1.1561 +    vmull.u8    q1,  d25, d9
  1.1562 +    vmull.u8    q6,  d26, d10
  1.1563 +    vmull.u8    q7,  d27, d11
  1.1564 +    vmull.u8    q9,  d11, d25
  1.1565 +    vmull.u8    q12, d11, d24
  1.1566 +    vmull.u8    q13, d11, d26
  1.1567 +    vrshr.u16   q8,  q0,  #8
  1.1568 +    vrshr.u16   q10, q1,  #8
  1.1569 +    vrshr.u16   q11, q6,  #8
  1.1570 +    vraddhn.u16 d0,  q0,  q8
  1.1571 +    vraddhn.u16 d1,  q1,  q10
  1.1572 +    vraddhn.u16 d2,  q6,  q11
  1.1573 +    vrshr.u16   q11, q12, #8
  1.1574 +    vrshr.u16   q8,  q9,  #8
  1.1575 +    vrshr.u16   q6,  q13, #8
  1.1576 +    vrshr.u16   q10, q7,  #8
  1.1577 +    vraddhn.u16 d24, q12, q11
  1.1578 +    vraddhn.u16 d25, q9,  q8
  1.1579 +    vraddhn.u16 d26, q13, q6
  1.1580 +    vraddhn.u16 d3,  q7,  q10
  1.1581 +    /*
  1.1582 +     * 'combine_over_ca' replacement
  1.1583 +     *
  1.1584 +     * output: updated dest in {d28, d29, d30, d31}
  1.1585 +     */
  1.1586 +    vmvn.8      q12, q12
  1.1587 +    vmvn.8      d26, d26
  1.1588 +    vmull.u8    q8,  d24, d4
  1.1589 +    vmull.u8    q9,  d25, d5
  1.1590 +    vmvn.8      d27, d3
  1.1591 +    vmull.u8    q10, d26, d6
  1.1592 +    vmull.u8    q11, d27, d7
  1.1593 +.endm
  1.1594 +
  1.1595 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
  1.1596 +    /* ... continue 'combine_over_ca' replacement */
  1.1597 +    vrshr.u16   q14, q8,  #8
  1.1598 +    vrshr.u16   q15, q9,  #8
  1.1599 +    vrshr.u16   q6,  q10, #8
  1.1600 +    vrshr.u16   q7,  q11, #8
  1.1601 +    vraddhn.u16 d28, q14, q8
  1.1602 +    vraddhn.u16 d29, q15, q9
  1.1603 +    vraddhn.u16 d30, q6,  q10
  1.1604 +    vraddhn.u16 d31, q7,  q11
  1.1605 +    vqadd.u8    q14, q0,  q14
  1.1606 +    vqadd.u8    q15, q1,  q15
  1.1607 +.endm
  1.1608 +
  1.1609 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
  1.1610 +        vrshr.u16   q14, q8, #8
  1.1611 +        vrshr.u16   q15, q9, #8
  1.1612 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1613 +        vrshr.u16   q6, q10, #8
  1.1614 +        vrshr.u16   q7, q11, #8
  1.1615 +        vraddhn.u16 d28, q14, q8
  1.1616 +        vraddhn.u16 d29, q15, q9
  1.1617 +        vraddhn.u16 d30, q6, q10
  1.1618 +        vraddhn.u16 d31, q7, q11
  1.1619 +    fetch_mask_pixblock
  1.1620 +        vqadd.u8    q14, q0, q14
  1.1621 +        vqadd.u8    q15, q1, q15
  1.1622 +    cache_preload 8, 8
  1.1623 +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
  1.1624 +    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1625 +.endm
  1.1626 +
  1.1627 +.macro pixman_composite_over_n_8888_8888_ca_init
  1.1628 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1629 +    .vsave      {d8-d15}
  1.1630 +    vpush       {d8-d15}
  1.1631 +    vld1.32     {d11[0]}, [DUMMY]
  1.1632 +    vdup.8      d8, d11[0]
  1.1633 +    vdup.8      d9, d11[1]
  1.1634 +    vdup.8      d10, d11[2]
  1.1635 +    vdup.8      d11, d11[3]
  1.1636 +.endm
  1.1637 +
  1.1638 +.macro pixman_composite_over_n_8888_8888_ca_cleanup
  1.1639 +    vpop        {d8-d15}
  1.1640 +.endm
  1.1641 +
  1.1642 +generate_composite_function \
  1.1643 +    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
  1.1644 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.1645 +    8, /* number of pixels, processed in a single block */ \
  1.1646 +    5, /* prefetch distance */ \
  1.1647 +    pixman_composite_over_n_8888_8888_ca_init, \
  1.1648 +    pixman_composite_over_n_8888_8888_ca_cleanup, \
  1.1649 +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
  1.1650 +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
  1.1651 +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
  1.1652 +
  1.1653 +/******************************************************************************/
  1.1654 +
  1.1655 +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
  1.1656 +    /*
  1.1657 +     * 'combine_mask_ca' replacement
  1.1658 +     *
  1.1659 +     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
  1.1660 +     *         mask in          {d24, d25, d26}       [B, G, R]
  1.1661 +     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
  1.1662 +     *         updated mask in  {d24, d25, d26}       [B, G, R]
  1.1663 +     */
  1.1664 +    vmull.u8    q0,  d24, d8
  1.1665 +    vmull.u8    q1,  d25, d9
  1.1666 +    vmull.u8    q6,  d26, d10
  1.1667 +    vmull.u8    q9,  d11, d25
  1.1668 +    vmull.u8    q12, d11, d24
  1.1669 +    vmull.u8    q13, d11, d26
  1.1670 +    vrshr.u16   q8,  q0,  #8
  1.1671 +    vrshr.u16   q10, q1,  #8
  1.1672 +    vrshr.u16   q11, q6,  #8
  1.1673 +    vraddhn.u16 d0,  q0,  q8
  1.1674 +    vraddhn.u16 d1,  q1,  q10
  1.1675 +    vraddhn.u16 d2,  q6,  q11
  1.1676 +    vrshr.u16   q11, q12, #8
  1.1677 +    vrshr.u16   q8,  q9,  #8
  1.1678 +    vrshr.u16   q6,  q13, #8
  1.1679 +    vraddhn.u16 d24, q12, q11
  1.1680 +    vraddhn.u16 d25, q9,  q8
  1.1681 +    /*
  1.1682 +     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
  1.1683 +     * and put data into d16 - blue, d17 - green, d18 - red
  1.1684 +     */
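         +    /*
         +     * the vsri/vsli pairs replicate the top bits of each 5/6-bit
         +     * channel into the low bits, i.e. x -> (x << 3) | (x >> 2) for
         +     * 5-bit fields, approximating the exact x * 255 / 31 expansion
         +     */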
  1.1685 +       vshrn.u16   d17, q2,  #3
  1.1686 +       vshrn.u16   d18, q2,  #8
  1.1687 +    vraddhn.u16 d26, q13, q6
  1.1688 +       vsli.u16    q2,  q2,  #5
  1.1689 +       vsri.u8     d18, d18, #5
  1.1690 +       vsri.u8     d17, d17, #6
  1.1691 +    /*
  1.1692 +     * 'combine_over_ca' replacement
  1.1693 +     *
  1.1694 +     * output: updated dest in d16 - blue, d17 - green, d18 - red
  1.1695 +     */
  1.1696 +    vmvn.8      q12, q12
  1.1697 +       vshrn.u16   d16, q2,  #2
  1.1698 +    vmvn.8      d26, d26
  1.1699 +    vmull.u8    q6,  d16, d24
  1.1700 +    vmull.u8    q7,  d17, d25
  1.1701 +    vmull.u8    q11, d18, d26
  1.1702 +.endm
  1.1703 +
  1.1704 +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
  1.1705 +    /* ... continue 'combine_over_ca' replacement */
  1.1706 +    vrshr.u16   q10, q6,  #8
  1.1707 +    vrshr.u16   q14, q7,  #8
  1.1708 +    vrshr.u16   q15, q11, #8
  1.1709 +    vraddhn.u16 d16, q10, q6
  1.1710 +    vraddhn.u16 d17, q14, q7
  1.1711 +    vraddhn.u16 d18, q15, q11
  1.1712 +    vqadd.u8    q8,  q0,  q8
  1.1713 +    vqadd.u8    d18, d2,  d18
  1.1714 +    /*
  1.1715 +     * convert the results in d16, d17, d18 to r5g6b5 and store
  1.1716 +     * them into {d28, d29}
  1.1717 +     */
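         +    /* vshll places each 8-bit channel in the high byte of a 16-bit
         +     * lane and the two vsri insertions pack them as r:g:b = 5:6:5 */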
  1.1718 +    vshll.u8    q14, d18, #8
  1.1719 +    vshll.u8    q10, d17, #8
  1.1720 +    vshll.u8    q15, d16, #8
  1.1721 +    vsri.u16    q14, q10, #5
  1.1722 +    vsri.u16    q14, q15, #11
  1.1723 +.endm
  1.1724 +
  1.1725 +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
  1.1726 +    fetch_mask_pixblock
  1.1727 +        vrshr.u16   q10, q6, #8
  1.1728 +        vrshr.u16   q14, q7, #8
  1.1729 +    vld1.16     {d4, d5}, [DST_R, :128]!
  1.1730 +        vrshr.u16   q15, q11, #8
  1.1731 +        vraddhn.u16 d16, q10, q6
  1.1732 +        vraddhn.u16 d17, q14, q7
  1.1733 +        vraddhn.u16 d22, q15, q11
  1.1734 +            /* process_pixblock_head */
  1.1735 +            /*
  1.1736 +             * 'combine_mask_ca' replacement
  1.1737 +             *
  1.1738 +             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
  1.1739 +             *         mask in          {d24, d25, d26}       [B, G, R]
  1.1740 +             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
  1.1741 +             *         updated mask in  {d24, d25, d26}       [B, G, R]
  1.1742 +             */
  1.1743 +            vmull.u8    q6,  d26, d10
  1.1744 +        vqadd.u8    q8,  q0, q8
  1.1745 +            vmull.u8    q0,  d24, d8
  1.1746 +        vqadd.u8    d22, d2, d22
  1.1747 +            vmull.u8    q1,  d25, d9
  1.1748 +        /*
  1.1749 +         * convert the result in d16, d17, d22 to r5g6b5 and store
  1.1750 +         * it into {d28, d29}
  1.1751 +         */
  1.1752 +        vshll.u8    q14, d22, #8
  1.1753 +        vshll.u8    q10, d17, #8
  1.1754 +        vshll.u8    q15, d16, #8
  1.1755 +            vmull.u8    q9,  d11, d25
  1.1756 +        vsri.u16    q14, q10, #5
  1.1757 +            vmull.u8    q12, d11, d24
  1.1758 +            vmull.u8    q13, d11, d26
  1.1759 +        vsri.u16    q14, q15, #11
  1.1760 +    cache_preload 8, 8
  1.1761 +            vrshr.u16   q8,  q0,  #8
  1.1762 +            vrshr.u16   q10, q1,  #8
  1.1763 +            vrshr.u16   q11, q6,  #8
  1.1764 +            vraddhn.u16 d0,  q0,  q8
  1.1765 +            vraddhn.u16 d1,  q1,  q10
  1.1766 +            vraddhn.u16 d2,  q6,  q11
  1.1767 +            vrshr.u16   q11, q12, #8
  1.1768 +            vrshr.u16   q8,  q9,  #8
  1.1769 +            vrshr.u16   q6,  q13, #8
  1.1770 +            vraddhn.u16 d24, q12, q11
  1.1771 +            vraddhn.u16 d25, q9,  q8
  1.1772 +                /*
  1.1773 +                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
  1.1774 +                 * 8-bit format and put data into d16 - blue, d17 - green,
  1.1775 +                 * d18 - red
  1.1776 +                 */
  1.1777 +                vshrn.u16   d17, q2,  #3
  1.1778 +                vshrn.u16   d18, q2,  #8
  1.1779 +            vraddhn.u16 d26, q13, q6
  1.1780 +                vsli.u16    q2,  q2,  #5
  1.1781 +                vsri.u8     d17, d17, #6
  1.1782 +                vsri.u8     d18, d18, #5
  1.1783 +            /*
  1.1784 +             * 'combine_over_ca' replacement
  1.1785 +             *
  1.1786 +             * output: updated dest in d16 - blue, d17 - green, d18 - red
  1.1787 +             */
  1.1788 +            vmvn.8      q12, q12
  1.1789 +                vshrn.u16   d16, q2,  #2
  1.1790 +            vmvn.8      d26, d26
  1.1791 +            vmull.u8    q7,  d17, d25
  1.1792 +            vmull.u8    q6,  d16, d24
  1.1793 +            vmull.u8    q11, d18, d26
  1.1794 +    vst1.16     {d28, d29}, [DST_W, :128]!
  1.1795 +.endm
  1.1796 +
  1.1797 +.macro pixman_composite_over_n_8888_0565_ca_init
  1.1798 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1799 +    .vsave      {d8-d15}
  1.1800 +    vpush       {d8-d15}
  1.1801 +    vld1.32     {d11[0]}, [DUMMY]
  1.1802 +    vdup.8      d8, d11[0]
  1.1803 +    vdup.8      d9, d11[1]
  1.1804 +    vdup.8      d10, d11[2]
  1.1805 +    vdup.8      d11, d11[3]
  1.1806 +.endm
  1.1807 +
  1.1808 +.macro pixman_composite_over_n_8888_0565_ca_cleanup
  1.1809 +    vpop        {d8-d15}
  1.1810 +.endm
  1.1811 +
  1.1812 +generate_composite_function \
  1.1813 +    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
  1.1814 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.1815 +    8, /* number of pixels, processed in a single block */ \
  1.1816 +    5, /* prefetch distance */ \
  1.1817 +    pixman_composite_over_n_8888_0565_ca_init, \
  1.1818 +    pixman_composite_over_n_8888_0565_ca_cleanup, \
  1.1819 +    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
  1.1820 +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
  1.1821 +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
  1.1822 +
  1.1823 +/******************************************************************************/
  1.1824 +
  1.1825 +.macro pixman_composite_in_n_8_process_pixblock_head
  1.1826 +    /* expecting source data in {d0, d1, d2, d3} */
  1.1827 +    /* and destination data in {d4, d5, d6, d7} */
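         +    /* IN: dst' = dst * alpha / 255, with the solid alpha replicated
         +     * into d3 by the init macro; d0-d2 are unused */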
  1.1828 +    vmull.u8    q8,  d4,  d3
  1.1829 +    vmull.u8    q9,  d5,  d3
  1.1830 +    vmull.u8    q10, d6,  d3
  1.1831 +    vmull.u8    q11, d7,  d3
  1.1832 +.endm
  1.1833 +
  1.1834 +.macro pixman_composite_in_n_8_process_pixblock_tail
  1.1835 +    vrshr.u16   q14, q8,  #8
  1.1836 +    vrshr.u16   q15, q9,  #8
  1.1837 +    vrshr.u16   q12, q10, #8
  1.1838 +    vrshr.u16   q13, q11, #8
  1.1839 +    vraddhn.u16 d28, q8,  q14
  1.1840 +    vraddhn.u16 d29, q9,  q15
  1.1841 +    vraddhn.u16 d30, q10, q12
  1.1842 +    vraddhn.u16 d31, q11, q13
  1.1843 +.endm
  1.1844 +
  1.1845 +.macro pixman_composite_in_n_8_process_pixblock_tail_head
  1.1846 +    pixman_composite_in_n_8_process_pixblock_tail
  1.1847 +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1848 +    cache_preload 32, 32
  1.1849 +    pixman_composite_in_n_8_process_pixblock_head
  1.1850 +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1851 +.endm
  1.1852 +
  1.1853 +.macro pixman_composite_in_n_8_init
  1.1854 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1855 +    vld1.32     {d3[0]}, [DUMMY]
  1.1856 +    vdup.8      d3, d3[3]
  1.1857 +.endm
  1.1858 +
  1.1859 +.macro pixman_composite_in_n_8_cleanup
  1.1860 +.endm
  1.1861 +
  1.1862 +generate_composite_function \
  1.1863 +    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
  1.1864 +    FLAG_DST_READWRITE, \
  1.1865 +    32, /* number of pixels, processed in a single block */ \
  1.1866 +    5, /* prefetch distance */ \
  1.1867 +    pixman_composite_in_n_8_init, \
  1.1868 +    pixman_composite_in_n_8_cleanup, \
  1.1869 +    pixman_composite_in_n_8_process_pixblock_head, \
  1.1870 +    pixman_composite_in_n_8_process_pixblock_tail, \
  1.1871 +    pixman_composite_in_n_8_process_pixblock_tail_head, \
  1.1872 +    28, /* dst_w_basereg */ \
  1.1873 +    4,  /* dst_r_basereg */ \
  1.1874 +    0,  /* src_basereg   */ \
  1.1875 +    24  /* mask_basereg  */
  1.1876 +
  1.1877 +.macro pixman_composite_add_n_8_8_process_pixblock_head
  1.1878 +    /* expecting source data in {d8, d9, d10, d11} */
  1.1879 +    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
  1.1880 +    /* and destination data in {d4, d5, d6, d7} */
  1.1881 +    /* mask is in d24, d25, d26, d27 */
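         +    /* dst' = saturate(dst + mask * alpha(src) / 255): the mask is
         +     * multiplied by the solid source's alpha (d11), normalized, and
         +     * added to the destination with a saturating vqadd */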
  1.1882 +    vmull.u8    q0, d24, d11
  1.1883 +    vmull.u8    q1, d25, d11
  1.1884 +    vmull.u8    q6, d26, d11
  1.1885 +    vmull.u8    q7, d27, d11
  1.1886 +    vrshr.u16   q10, q0, #8
  1.1887 +    vrshr.u16   q11, q1, #8
  1.1888 +    vrshr.u16   q12, q6, #8
  1.1889 +    vrshr.u16   q13, q7, #8
  1.1890 +    vraddhn.u16 d0, q0, q10
  1.1891 +    vraddhn.u16 d1, q1, q11
  1.1892 +    vraddhn.u16 d2, q6, q12
  1.1893 +    vraddhn.u16 d3, q7, q13
  1.1894 +    vqadd.u8    q14, q0, q2
  1.1895 +    vqadd.u8    q15, q1, q3
  1.1896 +.endm
  1.1897 +
  1.1898 +.macro pixman_composite_add_n_8_8_process_pixblock_tail
  1.1899 +.endm
  1.1900 +
  1.1901 +/* TODO: expand macros and do better instruction scheduling */
  1.1902 +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
  1.1903 +    pixman_composite_add_n_8_8_process_pixblock_tail
  1.1904 +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1905 +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1906 +    fetch_mask_pixblock
  1.1907 +    cache_preload 32, 32
  1.1908 +    pixman_composite_add_n_8_8_process_pixblock_head
  1.1909 +.endm
  1.1910 +
  1.1911 +.macro pixman_composite_add_n_8_8_init
  1.1912 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.1913 +    .vsave      {d8-d15}
  1.1914 +    vpush       {d8-d15}
  1.1915 +    vld1.32     {d11[0]}, [DUMMY]
  1.1916 +    vdup.8      d11, d11[3]
  1.1917 +.endm
  1.1918 +
  1.1919 +.macro pixman_composite_add_n_8_8_cleanup
  1.1920 +    vpop        {d8-d15}
  1.1921 +.endm
  1.1922 +
  1.1923 +generate_composite_function \
  1.1924 +    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
  1.1925 +    FLAG_DST_READWRITE, \
  1.1926 +    32, /* number of pixels, processed in a single block */ \
  1.1927 +    5, /* prefetch distance */ \
  1.1928 +    pixman_composite_add_n_8_8_init, \
  1.1929 +    pixman_composite_add_n_8_8_cleanup, \
  1.1930 +    pixman_composite_add_n_8_8_process_pixblock_head, \
  1.1931 +    pixman_composite_add_n_8_8_process_pixblock_tail, \
  1.1932 +    pixman_composite_add_n_8_8_process_pixblock_tail_head
  1.1933 +
  1.1934 +/******************************************************************************/
  1.1935 +
  1.1936 +.macro pixman_composite_add_8_8_8_process_pixblock_head
  1.1937 +    /* expecting source data in {d0, d1, d2, d3} */
  1.1938 +    /* destination data in {d4, d5, d6, d7} */
  1.1939 +    /* mask in {d24, d25, d26, d27} */
  1.1940 +    vmull.u8    q8, d24, d0
  1.1941 +    vmull.u8    q9, d25, d1
  1.1942 +    vmull.u8    q10, d26, d2
  1.1943 +    vmull.u8    q11, d27, d3
  1.1944 +    vrshr.u16   q0, q8, #8
  1.1945 +    vrshr.u16   q1, q9, #8
  1.1946 +    vrshr.u16   q12, q10, #8
  1.1947 +    vrshr.u16   q13, q11, #8
  1.1948 +    vraddhn.u16 d0, q0, q8
  1.1949 +    vraddhn.u16 d1, q1, q9
  1.1950 +    vraddhn.u16 d2, q12, q10
  1.1951 +    vraddhn.u16 d3, q13, q11
  1.1952 +    vqadd.u8    q14, q0, q2
  1.1953 +    vqadd.u8    q15, q1, q3
  1.1954 +.endm
  1.1955 +
  1.1956 +.macro pixman_composite_add_8_8_8_process_pixblock_tail
  1.1957 +.endm
  1.1958 +
  1.1959 +/* TODO: expand macros and do better instruction scheduling */
  1.1960 +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
  1.1961 +    pixman_composite_add_8_8_8_process_pixblock_tail
  1.1962 +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.1963 +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.1964 +    fetch_mask_pixblock
  1.1965 +    fetch_src_pixblock
  1.1966 +    cache_preload 32, 32
  1.1967 +    pixman_composite_add_8_8_8_process_pixblock_head
  1.1968 +.endm
  1.1969 +
  1.1970 +.macro pixman_composite_add_8_8_8_init
  1.1971 +.endm
  1.1972 +
  1.1973 +.macro pixman_composite_add_8_8_8_cleanup
  1.1974 +.endm
  1.1975 +
  1.1976 +generate_composite_function \
  1.1977 +    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
  1.1978 +    FLAG_DST_READWRITE, \
  1.1979 +    32, /* number of pixels, processed in a single block */ \
  1.1980 +    5, /* prefetch distance */ \
  1.1981 +    pixman_composite_add_8_8_8_init, \
  1.1982 +    pixman_composite_add_8_8_8_cleanup, \
  1.1983 +    pixman_composite_add_8_8_8_process_pixblock_head, \
  1.1984 +    pixman_composite_add_8_8_8_process_pixblock_tail, \
  1.1985 +    pixman_composite_add_8_8_8_process_pixblock_tail_head
  1.1986 +
  1.1987 +/******************************************************************************/
  1.1988 +
  1.1989 +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
  1.1990 +    /* expecting source data in {d0, d1, d2, d3} */
  1.1991 +    /* destination data in {d4, d5, d6, d7} */
  1.1992 +    /* mask in {d24, d25, d26, d27} */
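         +    /* dst' = saturate(dst + src * alpha(mask) / 255); only the
         +     * mask's alpha lane (d27) takes part in the multiply */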
  1.1993 +    vmull.u8    q8,  d27, d0
  1.1994 +    vmull.u8    q9,  d27, d1
  1.1995 +    vmull.u8    q10, d27, d2
  1.1996 +    vmull.u8    q11, d27, d3
  1.1997 +    /* 1 cycle bubble */
  1.1998 +    vrsra.u16   q8,  q8,  #8
  1.1999 +    vrsra.u16   q9,  q9,  #8
  1.2000 +    vrsra.u16   q10, q10, #8
  1.2001 +    vrsra.u16   q11, q11, #8
  1.2002 +.endm
  1.2003 +
  1.2004 +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
  1.2005 +    /* 2 cycle bubble */
  1.2006 +    vrshrn.u16  d28, q8,  #8
  1.2007 +    vrshrn.u16  d29, q9,  #8
  1.2008 +    vrshrn.u16  d30, q10, #8
  1.2009 +    vrshrn.u16  d31, q11, #8
  1.2010 +    vqadd.u8    q14, q2,  q14
  1.2011 +    /* 1 cycle bubble */
  1.2012 +    vqadd.u8    q15, q3,  q15
  1.2013 +.endm
  1.2014 +
  1.2015 +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  1.2016 +    fetch_src_pixblock
  1.2017 +        vrshrn.u16  d28, q8,  #8
  1.2018 +    fetch_mask_pixblock
  1.2019 +        vrshrn.u16  d29, q9,  #8
  1.2020 +    vmull.u8    q8,  d27, d0
  1.2021 +        vrshrn.u16  d30, q10, #8
  1.2022 +    vmull.u8    q9,  d27, d1
  1.2023 +        vrshrn.u16  d31, q11, #8
  1.2024 +    vmull.u8    q10, d27, d2
  1.2025 +        vqadd.u8    q14, q2,  q14
  1.2026 +    vmull.u8    q11, d27, d3
  1.2027 +        vqadd.u8    q15, q3,  q15
  1.2028 +    vrsra.u16   q8,  q8,  #8
  1.2029 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1.2030 +    vrsra.u16   q9,  q9,  #8
  1.2031 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1.2032 +    vrsra.u16   q10, q10, #8
  1.2033 +
  1.2034 +    cache_preload 8, 8
  1.2035 +
  1.2036 +    vrsra.u16   q11, q11, #8
  1.2037 +.endm
  1.2038 +
  1.2039 +generate_composite_function \
  1.2040 +    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
  1.2041 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2042 +    8, /* number of pixels, processed in a single block */ \
  1.2043 +    10, /* prefetch distance */ \
  1.2044 +    default_init, \
  1.2045 +    default_cleanup, \
  1.2046 +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  1.2047 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  1.2048 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  1.2049 +
  1.2050 +generate_composite_function_single_scanline \
  1.2051 +    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
  1.2052 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2053 +    8, /* number of pixels, processed in a single block */ \
  1.2054 +    default_init, \
  1.2055 +    default_cleanup, \
  1.2056 +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  1.2057 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  1.2058 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  1.2059 +
  1.2060 +/******************************************************************************/
  1.2061 +
  1.2062 +generate_composite_function \
  1.2063 +    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
  1.2064 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2065 +    8, /* number of pixels, processed in a single block */ \
  1.2066 +    5, /* prefetch distance */ \
  1.2067 +    default_init, \
  1.2068 +    default_cleanup, \
  1.2069 +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  1.2070 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  1.2071 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  1.2072 +    28, /* dst_w_basereg */ \
  1.2073 +    4,  /* dst_r_basereg */ \
  1.2074 +    0,  /* src_basereg   */ \
  1.2075 +    27  /* mask_basereg  */
  1.2076 +
  1.2077 +/******************************************************************************/
  1.2078 +
  1.2079 +.macro pixman_composite_add_n_8_8888_init
  1.2080 +    add         DUMMY, sp, #ARGS_STACK_OFFSET
  1.2081 +    vld1.32     {d3[0]}, [DUMMY]
  1.2082 +    vdup.8      d0, d3[0]
  1.2083 +    vdup.8      d1, d3[1]
  1.2084 +    vdup.8      d2, d3[2]
  1.2085 +    vdup.8      d3, d3[3]
  1.2086 +.endm
  1.2087 +
  1.2088 +.macro pixman_composite_add_n_8_8888_cleanup
  1.2089 +.endm
  1.2090 +
  1.2091 +generate_composite_function \
  1.2092 +    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
  1.2093 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2094 +    8, /* number of pixels, processed in a single block */ \
  1.2095 +    5, /* prefetch distance */ \
  1.2096 +    pixman_composite_add_n_8_8888_init, \
  1.2097 +    pixman_composite_add_n_8_8888_cleanup, \
  1.2098 +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  1.2099 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  1.2100 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  1.2101 +    28, /* dst_w_basereg */ \
  1.2102 +    4,  /* dst_r_basereg */ \
  1.2103 +    0,  /* src_basereg   */ \
  1.2104 +    27  /* mask_basereg  */
  1.2105 +
  1.2106 +/******************************************************************************/
  1.2107 +
  1.2108 +.macro pixman_composite_add_8888_n_8888_init
  1.2109 +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
  1.2110 +    vld1.32     {d27[0]}, [DUMMY]
  1.2111 +    vdup.8      d27, d27[3]
  1.2112 +.endm
  1.2113 +
  1.2114 +.macro pixman_composite_add_8888_n_8888_cleanup
  1.2115 +.endm
  1.2116 +
  1.2117 +generate_composite_function \
  1.2118 +    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
  1.2119 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2120 +    8, /* number of pixels, processed in a single block */ \
  1.2121 +    5, /* prefetch distance */ \
  1.2122 +    pixman_composite_add_8888_n_8888_init, \
  1.2123 +    pixman_composite_add_8888_n_8888_cleanup, \
  1.2124 +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  1.2125 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  1.2126 +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  1.2127 +    28, /* dst_w_basereg */ \
  1.2128 +    4,  /* dst_r_basereg */ \
  1.2129 +    0,  /* src_basereg   */ \
  1.2130 +    27  /* mask_basereg  */
  1.2131 +
  1.2132 +/******************************************************************************/
  1.2133 +
  1.2134 +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  1.2135 +    /* expecting source data in {d0, d1, d2, d3} */
  1.2136 +    /* destination data in {d4, d5, d6, d7} */
  1.2137 +    /* solid mask is in d15 */
  1.2138 +
  1.2139 +    /* 'in' */
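         +    /* src IN mask: src' = src * alpha(mask) / 255 with the solid
         +     * mask alpha in d15; the OUT_REVERSE part below then computes
         +     * dst' = dst * (255 - alpha(src')) / 255 */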
  1.2140 +    vmull.u8    q8, d15, d3
  1.2141 +    vmull.u8    q6, d15, d2
  1.2142 +    vmull.u8    q5, d15, d1
  1.2143 +    vmull.u8    q4, d15, d0
  1.2144 +    vrshr.u16   q13, q8, #8
  1.2145 +    vrshr.u16   q12, q6, #8
  1.2146 +    vrshr.u16   q11, q5, #8
  1.2147 +    vrshr.u16   q10, q4, #8
  1.2148 +    vraddhn.u16 d3, q8, q13
  1.2149 +    vraddhn.u16 d2, q6, q12
  1.2150 +    vraddhn.u16 d1, q5, q11
  1.2151 +    vraddhn.u16 d0, q4, q10
  1.2152 +    vmvn.8      d24, d3  /* get inverted alpha */
  1.2153 +    /* now do alpha blending */
  1.2154 +    vmull.u8    q8, d24, d4
  1.2155 +    vmull.u8    q9, d24, d5
  1.2156 +    vmull.u8    q10, d24, d6
  1.2157 +    vmull.u8    q11, d24, d7
  1.2158 +.endm
  1.2159 +
  1.2160 +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  1.2161 +    vrshr.u16   q14, q8, #8
  1.2162 +    vrshr.u16   q15, q9, #8
  1.2163 +    vrshr.u16   q12, q10, #8
  1.2164 +    vrshr.u16   q13, q11, #8
  1.2165 +    vraddhn.u16 d28, q14, q8
  1.2166 +    vraddhn.u16 d29, q15, q9
  1.2167 +    vraddhn.u16 d30, q12, q10
  1.2168 +    vraddhn.u16 d31, q13, q11
  1.2169 +.endm
  1.2170 +
  1.2171 +/* TODO: expand macros and do better instruction scheduling */
  1.2172 +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
  1.2173 +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  1.2174 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  1.2175 +    fetch_src_pixblock
  1.2176 +    cache_preload 8, 8
  1.2177 +    fetch_mask_pixblock
  1.2178 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  1.2179 +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  1.2180 +.endm
  1.2181 +
  1.2182 +generate_composite_function_single_scanline \
  1.2183 +    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
  1.2184 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2185 +    8, /* number of pixels, processed in a single block */ \
  1.2186 +    default_init_need_all_regs, \
  1.2187 +    default_cleanup_need_all_regs, \
  1.2188 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
  1.2189 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
  1.2190 +    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
  1.2191 +    28, /* dst_w_basereg */ \
  1.2192 +    4,  /* dst_r_basereg */ \
  1.2193 +    0,  /* src_basereg   */ \
  1.2194 +    12  /* mask_basereg  */
  1.2195 +
  1.2196 +/******************************************************************************/
  1.2197 +
  1.2198 +.macro pixman_composite_over_8888_n_8888_process_pixblock_head
  1.2199 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  1.2200 +.endm
  1.2201 +
  1.2202 +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
  1.2203 +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  1.2204 +    vqadd.u8    q14, q0, q14
  1.2205 +    vqadd.u8    q15, q1, q15
  1.2206 +.endm
  1.2207 +
  1.2208 +/* TODO: expand macros and do better instruction scheduling */
  1.2209 +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
  1.2210 +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  1.2211 +    pixman_composite_over_8888_n_8888_process_pixblock_tail
  1.2212 +    fetch_src_pixblock
  1.2213 +    cache_preload 8, 8
  1.2214 +    pixman_composite_over_8888_n_8888_process_pixblock_head
  1.2215 +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  1.2216 +.endm
  1.2217 +
  1.2218 +.macro pixman_composite_over_8888_n_8888_init
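         +    /* NOTE: #48 should equal ARGS_STACK_OFFSET + 8 (cf.
         +     * pixman_composite_add_8888_n_8888_init above), addressing the
         +     * solid mask argument; sp is read before the vpush below
         +     * changes it */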
  1.2219 +    add         DUMMY, sp, #48
  1.2220 +    .vsave      {d8-d15}
  1.2221 +    vpush       {d8-d15}
  1.2222 +    vld1.32     {d15[0]}, [DUMMY]
  1.2223 +    vdup.8      d15, d15[3]
  1.2224 +.endm
  1.2225 +
  1.2226 +.macro pixman_composite_over_8888_n_8888_cleanup
  1.2227 +    vpop        {d8-d15}
  1.2228 +.endm
  1.2229 +
  1.2230 +generate_composite_function \
  1.2231 +    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
  1.2232 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2233 +    8, /* number of pixels, processed in a single block */ \
  1.2234 +    5, /* prefetch distance */ \
  1.2235 +    pixman_composite_over_8888_n_8888_init, \
  1.2236 +    pixman_composite_over_8888_n_8888_cleanup, \
  1.2237 +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
  1.2238 +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
  1.2239 +    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
  1.2240 +
  1.2241 +/******************************************************************************/
  1.2242 +
  1.2243 +/* TODO: expand macros and do better instruction scheduling */
  1.2244 +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
  1.2245 +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  1.2246 +    pixman_composite_over_8888_n_8888_process_pixblock_tail
  1.2247 +    fetch_src_pixblock
  1.2248 +    cache_preload 8, 8
  1.2249 +    fetch_mask_pixblock
  1.2250 +    pixman_composite_over_8888_n_8888_process_pixblock_head
  1.2251 +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  1.2252 +.endm
  1.2253 +
  1.2254 +generate_composite_function \
  1.2255 +    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
  1.2256 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2257 +    8, /* number of pixels, processed in a single block */ \
  1.2258 +    5, /* prefetch distance */ \
  1.2259 +    default_init_need_all_regs, \
  1.2260 +    default_cleanup_need_all_regs, \
  1.2261 +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
  1.2262 +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
  1.2263 +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
  1.2264 +    28, /* dst_w_basereg */ \
  1.2265 +    4,  /* dst_r_basereg */ \
  1.2266 +    0,  /* src_basereg   */ \
  1.2267 +    12  /* mask_basereg  */
  1.2268 +
  1.2269 +generate_composite_function_single_scanline \
  1.2270 +    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
  1.2271 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2272 +    8, /* number of pixels, processed in a single block */ \
  1.2273 +    default_init_need_all_regs, \
  1.2274 +    default_cleanup_need_all_regs, \
  1.2275 +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
  1.2276 +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
  1.2277 +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
  1.2278 +    28, /* dst_w_basereg */ \
  1.2279 +    4,  /* dst_r_basereg */ \
  1.2280 +    0,  /* src_basereg   */ \
  1.2281 +    12  /* mask_basereg  */
  1.2282 +
  1.2283 +/******************************************************************************/
  1.2284 +
  1.2285 +/* TODO: expand macros and do better instruction scheduling */
  1.2286 +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
  1.2287 +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  1.2288 +    pixman_composite_over_8888_n_8888_process_pixblock_tail
  1.2289 +    fetch_src_pixblock
  1.2290 +    cache_preload 8, 8
  1.2291 +    fetch_mask_pixblock
  1.2292 +    pixman_composite_over_8888_n_8888_process_pixblock_head
  1.2293 +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  1.2294 +.endm
  1.2295 +
  1.2296 +generate_composite_function \
  1.2297 +    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
  1.2298 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1.2299 +    8, /* number of pixels, processed in a single block */ \
  1.2300 +    5, /* prefetch distance */ \
  1.2301 +    default_init_need_all_regs, \
  1.2302 +    default_cleanup_need_all_regs, \
  1.2303 +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
  1.2304 +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
  1.2305 +    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
  1.2306 +    28, /* dst_w_basereg */ \
  1.2307 +    4,  /* dst_r_basereg */ \
  1.2308 +    0,  /* src_basereg   */ \
  1.2309 +    15  /* mask_basereg  */
  1.2310 +
  1.2311 +/******************************************************************************/
  1.2312 +
  1.2313 +.macro pixman_composite_src_0888_0888_process_pixblock_head
  1.2314 +.endm
  1.2315 +
  1.2316 +.macro pixman_composite_src_0888_0888_process_pixblock_tail
  1.2317 +.endm
  1.2318 +
  1.2319 +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
  1.2320 +    vst3.8 {d0, d1, d2}, [DST_W]!
  1.2321 +    fetch_src_pixblock
  1.2322 +    cache_preload 8, 8
  1.2323 +.endm
  1.2324 +
  1.2325 +generate_composite_function \
  1.2326 +    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
  1.2327 +    FLAG_DST_WRITEONLY, \
  1.2328 +    8, /* number of pixels, processed in a single block */ \
  1.2329 +    10, /* prefetch distance */ \
  1.2330 +    default_init, \
  1.2331 +    default_cleanup, \
  1.2332 +    pixman_composite_src_0888_0888_process_pixblock_head, \
  1.2333 +    pixman_composite_src_0888_0888_process_pixblock_tail, \
  1.2334 +    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
  1.2335 +    0, /* dst_w_basereg */ \
  1.2336 +    0, /* dst_r_basereg */ \
  1.2337 +    0, /* src_basereg   */ \
  1.2338 +    0  /* mask_basereg  */
  1.2339 +
  1.2340 +/******************************************************************************/
  1.2341 +
  1.2342 +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
  1.2343 +    vswp   d0, d2
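         +    /* vswp exchanges the red and blue planes deinterleaved by vld3,
         +     * reversing the component order for the 8888 destination */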
  1.2344 +.endm
  1.2345 +
  1.2346 +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
  1.2347 +.endm
  1.2348 +
  1.2349 +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
  1.2350 +    vst4.8 {d0, d1, d2, d3}, [DST_W]!
  1.2351 +    fetch_src_pixblock
  1.2352 +    vswp   d0, d2
  1.2353 +    cache_preload 8, 8
  1.2354 +.endm
  1.2355 +
  1.2356 +.macro pixman_composite_src_0888_8888_rev_init
  1.2357 +    veor   d3, d3, d3
  1.2358 +.endm
  1.2359 +
  1.2360 +generate_composite_function \
  1.2361 +    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
  1.2362 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  1.2363 +    8, /* number of pixels, processed in a single block */ \
  1.2364 +    10, /* prefetch distance */ \
  1.2365 +    pixman_composite_src_0888_8888_rev_init, \
  1.2366 +    default_cleanup, \
  1.2367 +    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
  1.2368 +    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
  1.2369 +    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
  1.2370 +    0, /* dst_w_basereg */ \
  1.2371 +    0, /* dst_r_basereg */ \
  1.2372 +    0, /* src_basereg   */ \
  1.2373 +    0  /* mask_basereg  */
  1.2374 +
  1.2375 +/******************************************************************************/
  1.2376 +
  1.2377 +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
  1.2378 +    vshll.u8    q8, d1, #8
  1.2379 +    vshll.u8    q9, d2, #8
  1.2380 +.endm
  1.2381 +
  1.2382 +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
  1.2383 +    vshll.u8    q14, d0, #8
  1.2384 +    vsri.u16    q14, q8, #5
  1.2385 +    vsri.u16    q14, q9, #11
  1.2386 +.endm
  1.2387 +
  1.2388 +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
  1.2389 +        vshll.u8    q14, d0, #8
  1.2390 +    fetch_src_pixblock
  1.2391 +        vsri.u16    q14, q8, #5
  1.2392 +        vsri.u16    q14, q9, #11
  1.2393 +    vshll.u8    q8, d1, #8
  1.2394 +        vst1.16 {d28, d29}, [DST_W, :128]!
  1.2395 +    vshll.u8    q9, d2, #8
  1.2396 +.endm
  1.2397 +
  1.2398 +generate_composite_function \
  1.2399 +    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
  1.2400 +    FLAG_DST_WRITEONLY, \
  1.2401 +    8, /* number of pixels, processed in a single block */ \
  1.2402 +    10, /* prefetch distance */ \
  1.2403 +    default_init, \
  1.2404 +    default_cleanup, \
  1.2405 +    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
  1.2406 +    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
  1.2407 +    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
  1.2408 +    28, /* dst_w_basereg */ \
  1.2409 +    0, /* dst_r_basereg */ \
  1.2410 +    0, /* src_basereg   */ \
  1.2411 +    0  /* mask_basereg  */
  1.2412 +
  1.2413 +/******************************************************************************/
  1.2414 +
  1.2415 +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
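         +    /* premultiply: each color channel is multiplied by the source
         +     * alpha in d3; rounding normalization and the alpha/channel
         +     * shuffle (vswp d3, d31) happen in the tail */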
  1.2416 +    vmull.u8    q8, d3, d0
  1.2417 +    vmull.u8    q9, d3, d1
  1.2418 +    vmull.u8    q10, d3, d2
  1.2419 +.endm
  1.2420 +
  1.2421 +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
  1.2422 +    vrshr.u16   q11, q8, #8
  1.2423 +    vswp        d3, d31
  1.2424 +    vrshr.u16   q12, q9, #8
  1.2425 +    vrshr.u16   q13, q10, #8
  1.2426 +    vraddhn.u16 d30, q11, q8
  1.2427 +    vraddhn.u16 d29, q12, q9
  1.2428 +    vraddhn.u16 d28, q13, q10
  1.2429 +.endm
  1.2430 +
  1.2431 +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
  1.2432 +        vrshr.u16   q11, q8, #8
  1.2433 +        vswp        d3, d31
  1.2434 +        vrshr.u16   q12, q9, #8
  1.2435 +        vrshr.u16   q13, q10, #8
  1.2436 +    fetch_src_pixblock
  1.2437 +        vraddhn.u16 d30, q11, q8
  1.2438 +                                    PF add PF_X, PF_X, #8
  1.2439 +                                    PF tst PF_CTL, #0xF
  1.2440 +                                    PF addne PF_X, PF_X, #8
  1.2441 +                                    PF subne PF_CTL, PF_CTL, #1
  1.2442 +        vraddhn.u16 d29, q12, q9
  1.2443 +        vraddhn.u16 d28, q13, q10
  1.2444 +    vmull.u8    q8, d3, d0
  1.2445 +    vmull.u8    q9, d3, d1
  1.2446 +    vmull.u8    q10, d3, d2
  1.2447 +        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
  1.2448 +                                    PF cmp PF_X, ORIG_W
  1.2449 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
  1.2450 +                                    PF subge PF_X, PF_X, ORIG_W
  1.2451 +                                    PF subges PF_CTL, PF_CTL, #0x10
  1.2452 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
  1.2453 +.endm
  1.2454 +
  1.2455 +generate_composite_function \
  1.2456 +    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
  1.2457 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  1.2458 +    8, /* number of pixels, processed in a single block */ \
  1.2459 +    10, /* prefetch distance */ \
  1.2460 +    default_init, \
  1.2461 +    default_cleanup, \
  1.2462 +    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
  1.2463 +    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
  1.2464 +    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
  1.2465 +    28, /* dst_w_basereg */ \
  1.2466 +    0, /* dst_r_basereg */ \
  1.2467 +    0, /* src_basereg   */ \
  1.2468 +    0  /* mask_basereg  */
  1.2469 +
  1.2470 +/******************************************************************************/
  1.2471 +
  1.2472 +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
  1.2473 +    vmull.u8    q8, d3, d0
  1.2474 +    vmull.u8    q9, d3, d1
  1.2475 +    vmull.u8    q10, d3, d2
  1.2476 +.endm
  1.2477 +
  1.2478 +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
  1.2479 +    vrshr.u16   q11, q8, #8
  1.2480 +    vswp        d3, d31
  1.2481 +    vrshr.u16   q12, q9, #8
  1.2482 +    vrshr.u16   q13, q10, #8
  1.2483 +    vraddhn.u16 d28, q11, q8
  1.2484 +    vraddhn.u16 d29, q12, q9
  1.2485 +    vraddhn.u16 d30, q13, q10
  1.2486 +.endm
  1.2487 +
  1.2488 +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
  1.2489 +        vrshr.u16   q11, q8, #8
  1.2490 +        vswp        d3, d31
  1.2491 +        vrshr.u16   q12, q9, #8
  1.2492 +        vrshr.u16   q13, q10, #8
  1.2493 +    fetch_src_pixblock
  1.2494 +        vraddhn.u16 d28, q11, q8
  1.2495 +                                    PF add PF_X, PF_X, #8
  1.2496 +                                    PF tst PF_CTL, #0xF
  1.2497 +                                    PF addne PF_X, PF_X, #8
  1.2498 +                                    PF subne PF_CTL, PF_CTL, #1
  1.2499 +        vraddhn.u16 d29, q12, q9
  1.2500 +        vraddhn.u16 d30, q13, q10
  1.2501 +    vmull.u8    q8, d3, d0
  1.2502 +    vmull.u8    q9, d3, d1
  1.2503 +    vmull.u8    q10, d3, d2
  1.2504 +        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
  1.2505 +                                    PF cmp PF_X, ORIG_W
  1.2506 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
  1.2507 +                                    PF subge PF_X, PF_X, ORIG_W
  1.2508 +                                    PF subges PF_CTL, PF_CTL, #0x10
  1.2509 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
  1.2510 +.endm
  1.2511 +
  1.2512 +generate_composite_function \
  1.2513 +    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
  1.2514 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1.2515 +    8, /* number of pixels processed in a single block */ \
  1.2516 +    10, /* prefetch distance */ \
  1.2517 +    default_init, \
  1.2518 +    default_cleanup, \
  1.2519 +    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
  1.2520 +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
  1.2521 +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
  1.2522 +    28, /* dst_w_basereg */ \
  1.2523 +    0, /* dst_r_basereg */ \
  1.2524 +    0, /* src_basereg   */ \
  1.2525 +    0  /* mask_basereg  */
  1.2526 +
  1.2527 +/******************************************************************************/
  1.2528 +
  1.2529 +.macro pixman_composite_over_0565_8_0565_process_pixblock_head
  1.2530 +    /* mask is in d15 */
  1.2531 +    convert_0565_to_x888 q4, d2, d1, d0
  1.2532 +    convert_0565_to_x888 q5, d6, d5, d4
  1.2533 +    /* source pixel data is in      {d0, d1, d2, XX} */
  1.2534 +    /* destination pixel data is in {d4, d5, d6, XX} */
  1.2535 +    vmvn.8      d7,  d15
  1.2536 +    vmull.u8    q6,  d15, d2
  1.2537 +    vmull.u8    q5,  d15, d1
  1.2538 +    vmull.u8    q4,  d15, d0
  1.2539 +    vmull.u8    q8,  d7,  d4
  1.2540 +    vmull.u8    q9,  d7,  d5
  1.2541 +    vmull.u8    q13, d7,  d6
  1.2542 +    vrshr.u16   q12, q6,  #8
  1.2543 +    vrshr.u16   q11, q5,  #8
  1.2544 +    vrshr.u16   q10, q4,  #8
  1.2545 +    vraddhn.u16 d2,  q6,  q12
  1.2546 +    vraddhn.u16 d1,  q5,  q11
  1.2547 +    vraddhn.u16 d0,  q4,  q10
  1.2548 +.endm
  1.2549 +
  1.2550 +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
  1.2551 +    vrshr.u16   q14, q8,  #8
  1.2552 +    vrshr.u16   q15, q9,  #8
  1.2553 +    vrshr.u16   q12, q13, #8
  1.2554 +    vraddhn.u16 d28, q14, q8
  1.2555 +    vraddhn.u16 d29, q15, q9
  1.2556 +    vraddhn.u16 d30, q12, q13
  1.2557 +    vqadd.u8    q0,  q0,  q14
  1.2558 +    vqadd.u8    q1,  q1,  q15
  1.2559 +    /* 32bpp result is in {d0, d1, d2, XX} */
  1.2560 +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  1.2561 +.endm
  1.2562 +
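          +/*
          + * Taken together, the head and tail above compute a mask-weighted
          + * OVER between two r5g6b5 pixels: both pixels are first widened to
          + * 8 bits per channel (convert_0565_to_x888 replicates the top bits
          + * into the low ones), blended, and packed back. Per channel, in
          + * scalar terms (a sketch reusing the mul_div_255 model above;
          + * sat_add_u8 models vqadd.u8):
          + *
          + *     static inline uint8_t sat_add_u8 (uint8_t a, uint8_t b)
          + *     {
          + *         unsigned s = (unsigned) a + b;
          + *         return s > 255 ? 255 : (uint8_t) s;
          + *     }
          + *
          + *     result = sat_add_u8 (mul_div_255 (src_c, m),
          + *                          mul_div_255 (dst_c, 255 - m));
          + */
          +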
   1.2563 +/* TODO: expand macros and do better instruction scheduling */
  1.2564 +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
  1.2565 +    fetch_mask_pixblock
  1.2566 +    pixman_composite_over_0565_8_0565_process_pixblock_tail
  1.2567 +    fetch_src_pixblock
  1.2568 +    vld1.16    {d10, d11}, [DST_R, :128]!
  1.2569 +    cache_preload 8, 8
  1.2570 +    pixman_composite_over_0565_8_0565_process_pixblock_head
  1.2571 +    vst1.16    {d28, d29}, [DST_W, :128]!
  1.2572 +.endm
  1.2573 +
  1.2574 +generate_composite_function \
  1.2575 +    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
  1.2576 +    FLAG_DST_READWRITE, \
   1.2577 +    8, /* number of pixels processed in a single block */ \
  1.2578 +    5, /* prefetch distance */ \
  1.2579 +    default_init_need_all_regs, \
  1.2580 +    default_cleanup_need_all_regs, \
  1.2581 +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
  1.2582 +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  1.2583 +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  1.2584 +    28, /* dst_w_basereg */ \
  1.2585 +    10,  /* dst_r_basereg */ \
  1.2586 +    8,  /* src_basereg   */ \
  1.2587 +    15  /* mask_basereg  */
  1.2588 +
  1.2589 +/******************************************************************************/
  1.2590 +
  1.2591 +.macro pixman_composite_over_0565_n_0565_init
  1.2592 +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
  1.2593 +    .vsave      {d8-d15}
  1.2594 +    vpush       {d8-d15}
  1.2595 +    vld1.32     {d15[0]}, [DUMMY]
  1.2596 +    vdup.8      d15, d15[3]
  1.2597 +.endm
  1.2598 +
  1.2599 +.macro pixman_composite_over_0565_n_0565_cleanup
  1.2600 +    vpop        {d8-d15}
  1.2601 +.endm
  1.2602 +
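          +/*
          + * The solid-mask variant reuses the 0565_8_0565 pixel pipeline above
          + * unchanged: init loads the constant mask once and splats its alpha
          + * byte across d15 (vdup.8 d15, d15[3]), which is where the per-pixel
          + * code expects the mask to live.
          + */
          +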
  1.2603 +generate_composite_function \
  1.2604 +    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
  1.2605 +    FLAG_DST_READWRITE, \
   1.2606 +    8, /* number of pixels processed in a single block */ \
  1.2607 +    5, /* prefetch distance */ \
  1.2608 +    pixman_composite_over_0565_n_0565_init, \
  1.2609 +    pixman_composite_over_0565_n_0565_cleanup, \
  1.2610 +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
  1.2611 +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  1.2612 +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  1.2613 +    28, /* dst_w_basereg */ \
  1.2614 +    10, /* dst_r_basereg */ \
  1.2615 +    8,  /* src_basereg   */ \
  1.2616 +    15  /* mask_basereg  */
  1.2617 +
  1.2618 +/******************************************************************************/
  1.2619 +
  1.2620 +.macro pixman_composite_add_0565_8_0565_process_pixblock_head
  1.2621 +    /* mask is in d15 */
  1.2622 +    convert_0565_to_x888 q4, d2, d1, d0
  1.2623 +    convert_0565_to_x888 q5, d6, d5, d4
  1.2624 +    /* source pixel data is in      {d0, d1, d2, XX} */
  1.2625 +    /* destination pixel data is in {d4, d5, d6, XX} */
  1.2626 +    vmull.u8    q6,  d15, d2
  1.2627 +    vmull.u8    q5,  d15, d1
  1.2628 +    vmull.u8    q4,  d15, d0
  1.2629 +    vrshr.u16   q12, q6,  #8
  1.2630 +    vrshr.u16   q11, q5,  #8
  1.2631 +    vrshr.u16   q10, q4,  #8
  1.2632 +    vraddhn.u16 d2,  q6,  q12
  1.2633 +    vraddhn.u16 d1,  q5,  q11
  1.2634 +    vraddhn.u16 d0,  q4,  q10
  1.2635 +.endm
  1.2636 +
  1.2637 +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
  1.2638 +    vqadd.u8    q0,  q0,  q2
  1.2639 +    vqadd.u8    q1,  q1,  q3
  1.2640 +    /* 32bpp result is in {d0, d1, d2, XX} */
  1.2641 +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  1.2642 +.endm
  1.2643 +
   1.2644 +/* TODO: expand macros and do better instruction scheduling */
  1.2645 +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
  1.2646 +    fetch_mask_pixblock
  1.2647 +    pixman_composite_add_0565_8_0565_process_pixblock_tail
  1.2648 +    fetch_src_pixblock
  1.2649 +    vld1.16    {d10, d11}, [DST_R, :128]!
  1.2650 +    cache_preload 8, 8
  1.2651 +    pixman_composite_add_0565_8_0565_process_pixblock_head
  1.2652 +    vst1.16    {d28, d29}, [DST_W, :128]!
  1.2653 +.endm
  1.2654 +
  1.2655 +generate_composite_function \
  1.2656 +    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
  1.2657 +    FLAG_DST_READWRITE, \
   1.2658 +    8, /* number of pixels processed in a single block */ \
  1.2659 +    5, /* prefetch distance */ \
  1.2660 +    default_init_need_all_regs, \
  1.2661 +    default_cleanup_need_all_regs, \
  1.2662 +    pixman_composite_add_0565_8_0565_process_pixblock_head, \
  1.2663 +    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
  1.2664 +    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
  1.2665 +    28, /* dst_w_basereg */ \
  1.2666 +    10, /* dst_r_basereg */ \
  1.2667 +    8,  /* src_basereg   */ \
  1.2668 +    15  /* mask_basereg  */
  1.2669 +
  1.2670 +/******************************************************************************/
  1.2671 +
  1.2672 +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
  1.2673 +    /* mask is in d15 */
  1.2674 +    convert_0565_to_x888 q5, d6, d5, d4
  1.2675 +    /* destination pixel data is in {d4, d5, d6, xx} */
  1.2676 +    vmvn.8      d24, d15 /* get inverted alpha */
  1.2677 +    /* now do alpha blending */
  1.2678 +    vmull.u8    q8, d24, d4
  1.2679 +    vmull.u8    q9, d24, d5
  1.2680 +    vmull.u8    q10, d24, d6
  1.2681 +.endm
  1.2682 +
  1.2683 +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
  1.2684 +    vrshr.u16   q14, q8, #8
  1.2685 +    vrshr.u16   q15, q9, #8
  1.2686 +    vrshr.u16   q12, q10, #8
  1.2687 +    vraddhn.u16 d0, q14, q8
  1.2688 +    vraddhn.u16 d1, q15, q9
  1.2689 +    vraddhn.u16 d2, q12, q10
  1.2690 +    /* 32bpp result is in {d0, d1, d2, XX} */
  1.2691 +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  1.2692 +.endm
  1.2693 +
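          +/*
          + * OUT_REVERSE with an a8 mask just scales the destination by the
          + * inverted mask; per expanded channel, in terms of the mul_div_255
          + * model above:
          + *
          + *     dst_c = mul_div_255 (dst_c, 255 - m);  // vmvn.8 + vmull.u8 + ...
          + */
          +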
   1.2694 +/* TODO: expand macros and do better instruction scheduling */
  1.2695 +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
  1.2696 +    fetch_src_pixblock
  1.2697 +    pixman_composite_out_reverse_8_0565_process_pixblock_tail
  1.2698 +    vld1.16    {d10, d11}, [DST_R, :128]!
  1.2699 +    cache_preload 8, 8
  1.2700 +    pixman_composite_out_reverse_8_0565_process_pixblock_head
  1.2701 +    vst1.16    {d28, d29}, [DST_W, :128]!
  1.2702 +.endm
  1.2703 +
  1.2704 +generate_composite_function \
  1.2705 +    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
  1.2706 +    FLAG_DST_READWRITE, \
   1.2707 +    8, /* number of pixels processed in a single block */ \
  1.2708 +    5, /* prefetch distance */ \
  1.2709 +    default_init_need_all_regs, \
  1.2710 +    default_cleanup_need_all_regs, \
  1.2711 +    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
  1.2712 +    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
  1.2713 +    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
  1.2714 +    28, /* dst_w_basereg */ \
  1.2715 +    10, /* dst_r_basereg */ \
  1.2716 +    15, /* src_basereg   */ \
  1.2717 +    0   /* mask_basereg  */
  1.2718 +
  1.2719 +/******************************************************************************/
  1.2720 +
  1.2721 +.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
  1.2722 +    /* src is in d0 */
  1.2723 +    /* destination pixel data is in {d4, d5, d6, d7} */
  1.2724 +    vmvn.8      d1, d0 /* get inverted alpha */
  1.2725 +    /* now do alpha blending */
  1.2726 +    vmull.u8    q8, d1, d4
  1.2727 +    vmull.u8    q9, d1, d5
  1.2728 +    vmull.u8    q10, d1, d6
  1.2729 +    vmull.u8    q11, d1, d7
  1.2730 +.endm
  1.2731 +
  1.2732 +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
  1.2733 +    vrshr.u16   q14, q8, #8
  1.2734 +    vrshr.u16   q15, q9, #8
  1.2735 +    vrshr.u16   q12, q10, #8
  1.2736 +    vrshr.u16   q13, q11, #8
  1.2737 +    vraddhn.u16 d28, q14, q8
  1.2738 +    vraddhn.u16 d29, q15, q9
  1.2739 +    vraddhn.u16 d30, q12, q10
  1.2740 +    vraddhn.u16 d31, q13, q11
  1.2741 +    /* 32bpp result is in {d28, d29, d30, d31} */
  1.2742 +.endm
  1.2743 +
   1.2744 +/* TODO: expand macros and do better instruction scheduling */
  1.2745 +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
  1.2746 +    fetch_src_pixblock
  1.2747 +    pixman_composite_out_reverse_8_8888_process_pixblock_tail
  1.2748 +    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
  1.2749 +    cache_preload 8, 8
  1.2750 +    pixman_composite_out_reverse_8_8888_process_pixblock_head
  1.2751 +    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
  1.2752 +.endm
  1.2753 +
  1.2754 +generate_composite_function \
  1.2755 +    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
  1.2756 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.2757 +    8, /* number of pixels processed in a single block */ \
  1.2758 +    5, /* prefetch distance */ \
  1.2759 +    default_init, \
  1.2760 +    default_cleanup, \
  1.2761 +    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
  1.2762 +    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
  1.2763 +    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
  1.2764 +    28, /* dst_w_basereg */ \
  1.2765 +    4, /* dst_r_basereg */ \
  1.2766 +    0, /* src_basereg   */ \
  1.2767 +    0   /* mask_basereg  */
  1.2768 +
  1.2769 +/******************************************************************************/
  1.2770 +
  1.2771 +generate_composite_function_nearest_scanline \
  1.2772 +    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
  1.2773 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.2774 +    8, /* number of pixels processed in a single block */ \
  1.2775 +    default_init, \
  1.2776 +    default_cleanup, \
  1.2777 +    pixman_composite_over_8888_8888_process_pixblock_head, \
  1.2778 +    pixman_composite_over_8888_8888_process_pixblock_tail, \
  1.2779 +    pixman_composite_over_8888_8888_process_pixblock_tail_head
  1.2780 +
  1.2781 +generate_composite_function_nearest_scanline \
  1.2782 +    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
  1.2783 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.2784 +    8, /* number of pixels processed in a single block */ \
  1.2785 +    default_init, \
  1.2786 +    default_cleanup, \
  1.2787 +    pixman_composite_over_8888_0565_process_pixblock_head, \
  1.2788 +    pixman_composite_over_8888_0565_process_pixblock_tail, \
  1.2789 +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
  1.2790 +    28, /* dst_w_basereg */ \
  1.2791 +    4,  /* dst_r_basereg */ \
  1.2792 +    0,  /* src_basereg   */ \
  1.2793 +    24  /* mask_basereg  */
  1.2794 +
  1.2795 +generate_composite_function_nearest_scanline \
  1.2796 +    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
  1.2797 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1.2798 +    8, /* number of pixels processed in a single block */ \
  1.2799 +    default_init, \
  1.2800 +    default_cleanup, \
  1.2801 +    pixman_composite_src_8888_0565_process_pixblock_head, \
  1.2802 +    pixman_composite_src_8888_0565_process_pixblock_tail, \
  1.2803 +    pixman_composite_src_8888_0565_process_pixblock_tail_head
  1.2804 +
  1.2805 +generate_composite_function_nearest_scanline \
  1.2806 +    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
  1.2807 +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
   1.2808 +    8, /* number of pixels processed in a single block */ \
  1.2809 +    default_init, \
  1.2810 +    default_cleanup, \
  1.2811 +    pixman_composite_src_0565_8888_process_pixblock_head, \
  1.2812 +    pixman_composite_src_0565_8888_process_pixblock_tail, \
  1.2813 +    pixman_composite_src_0565_8888_process_pixblock_tail_head
  1.2814 +
  1.2815 +generate_composite_function_nearest_scanline \
  1.2816 +    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
  1.2817 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
   1.2818 +    8, /* number of pixels processed in a single block */ \
  1.2819 +    default_init_need_all_regs, \
  1.2820 +    default_cleanup_need_all_regs, \
  1.2821 +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
  1.2822 +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
  1.2823 +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
  1.2824 +    28, /* dst_w_basereg */ \
  1.2825 +    4,  /* dst_r_basereg */ \
  1.2826 +    8,  /* src_basereg   */ \
  1.2827 +    24  /* mask_basereg  */
  1.2828 +
  1.2829 +generate_composite_function_nearest_scanline \
  1.2830 +    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
  1.2831 +    FLAG_DST_READWRITE, \
   1.2832 +    8, /* number of pixels processed in a single block */ \
  1.2833 +    default_init_need_all_regs, \
  1.2834 +    default_cleanup_need_all_regs, \
  1.2835 +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
  1.2836 +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  1.2837 +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  1.2838 +    28, /* dst_w_basereg */ \
  1.2839 +    10,  /* dst_r_basereg */ \
  1.2840 +    8,  /* src_basereg   */ \
  1.2841 +    15  /* mask_basereg  */
  1.2842 +
  1.2843 +/******************************************************************************/
  1.2844 +
  1.2845 +/* Supplementary macro for setting function attributes */
  1.2846 +.macro pixman_asm_function fname
  1.2847 +    .func fname
  1.2848 +    .global fname
  1.2849 +#ifdef __ELF__
  1.2850 +    .hidden fname
  1.2851 +    .type fname, %function
  1.2852 +#endif
  1.2853 +fname:
  1.2854 +.endm
  1.2855 +
  1.2856 +/*
   1.2857 + * Bilinear scaling support code. Pixel fetching, color format conversion,
   1.2858 + * and interpolation are provided as separate macros that can be used as
   1.2859 + * basic building blocks for constructing bilinear scanline functions.
  1.2860 + */
  1.2861 +
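          +/*
          + * All of the macros below work on the same fixed-point scheme: X is a
          + * 16.16 coordinate (X >> 16 selects the pixel pair, the fraction
          + * provides the horizontal weight), STRIDE is the offset from the top
          + * to the bottom scanline, d28/d29 hold the vertical weights and q15
          + * the horizontal ones. Per channel the arithmetic is, roughly (a
          + * scalar C sketch with illustrative names; BITS stands for
          + * BILINEAR_INTERPOLATION_BITS from pixman-private.h):
          + *
          + *     static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
          + *                                      uint8_t bl, uint8_t br,
          + *                                      unsigned wt, unsigned wb,
          + *                                      unsigned dx)
          + *     {
          + *         // vertical pass: wt + wb == 1 << BITS
          + *         unsigned l = tl * wt + bl * wb;  // vmull.u8 + vmlal.u8
          + *         unsigned r = tr * wt + br * wb;
          + *         // horizontal pass, dx in [0, 1 << BITS):
          + *         // (l << BITS) - l*dx + r*dx == l*((1 << BITS) - dx) + r*dx
          + *         return (uint8_t) (((l << BITS) - l * dx + r * dx)
          + *                           >> (2 * BITS)); // vshll/vmlsl/vmlal/vshrn
          + *     }
          + */
          +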
  1.2862 +.macro bilinear_load_8888 reg1, reg2, tmp
  1.2863 +    mov       TMP1, X, asr #16
  1.2864 +    add       X, X, UX
  1.2865 +    add       TMP1, TOP, TMP1, asl #2
  1.2866 +    vld1.32   {reg1}, [TMP1], STRIDE
  1.2867 +    vld1.32   {reg2}, [TMP1]
  1.2868 +.endm
  1.2869 +
  1.2870 +.macro bilinear_load_0565 reg1, reg2, tmp
  1.2871 +    mov       TMP1, X, asr #16
  1.2872 +    add       X, X, UX
  1.2873 +    add       TMP1, TOP, TMP1, asl #1
  1.2874 +    vld1.32   {reg2[0]}, [TMP1], STRIDE
  1.2875 +    vld1.32   {reg2[1]}, [TMP1]
  1.2876 +    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
  1.2877 +.endm
  1.2878 +
  1.2879 +.macro bilinear_load_and_vertical_interpolate_two_8888 \
  1.2880 +                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
  1.2881 +
  1.2882 +    bilinear_load_8888 reg1, reg2, tmp1
  1.2883 +    vmull.u8  acc1, reg1, d28
  1.2884 +    vmlal.u8  acc1, reg2, d29
  1.2885 +    bilinear_load_8888 reg3, reg4, tmp2
  1.2886 +    vmull.u8  acc2, reg3, d28
  1.2887 +    vmlal.u8  acc2, reg4, d29
  1.2888 +.endm
  1.2889 +
  1.2890 +.macro bilinear_load_and_vertical_interpolate_four_8888 \
  1.2891 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
  1.2892 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  1.2893 +
  1.2894 +    bilinear_load_and_vertical_interpolate_two_8888 \
  1.2895 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
  1.2896 +    bilinear_load_and_vertical_interpolate_two_8888 \
  1.2897 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  1.2898 +.endm
  1.2899 +
  1.2900 +.macro bilinear_load_and_vertical_interpolate_two_0565 \
  1.2901 +                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
  1.2902 +
  1.2903 +    mov       TMP1, X, asr #16
  1.2904 +    add       X, X, UX
  1.2905 +    add       TMP1, TOP, TMP1, asl #1
  1.2906 +    mov       TMP2, X, asr #16
  1.2907 +    add       X, X, UX
  1.2908 +    add       TMP2, TOP, TMP2, asl #1
  1.2909 +    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
  1.2910 +    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
  1.2911 +    vld1.32   {acc2lo[1]}, [TMP1]
  1.2912 +    vld1.32   {acc2hi[1]}, [TMP2]
  1.2913 +    convert_0565_to_x888 acc2, reg3, reg2, reg1
  1.2914 +    vzip.u8   reg1, reg3
  1.2915 +    vzip.u8   reg2, reg4
  1.2916 +    vzip.u8   reg3, reg4
  1.2917 +    vzip.u8   reg1, reg2
  1.2918 +    vmull.u8  acc1, reg1, d28
  1.2919 +    vmlal.u8  acc1, reg2, d29
  1.2920 +    vmull.u8  acc2, reg3, d28
  1.2921 +    vmlal.u8  acc2, reg4, d29
  1.2922 +.endm
  1.2923 +
  1.2924 +.macro bilinear_load_and_vertical_interpolate_four_0565 \
  1.2925 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
  1.2926 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  1.2927 +
  1.2928 +    mov       TMP1, X, asr #16
  1.2929 +    add       X, X, UX
  1.2930 +    add       TMP1, TOP, TMP1, asl #1
  1.2931 +    mov       TMP2, X, asr #16
  1.2932 +    add       X, X, UX
  1.2933 +    add       TMP2, TOP, TMP2, asl #1
  1.2934 +    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
  1.2935 +    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
  1.2936 +    vld1.32   {xacc2lo[1]}, [TMP1]
  1.2937 +    vld1.32   {xacc2hi[1]}, [TMP2]
  1.2938 +    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
  1.2939 +    mov       TMP1, X, asr #16
  1.2940 +    add       X, X, UX
  1.2941 +    add       TMP1, TOP, TMP1, asl #1
  1.2942 +    mov       TMP2, X, asr #16
  1.2943 +    add       X, X, UX
  1.2944 +    add       TMP2, TOP, TMP2, asl #1
  1.2945 +    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
  1.2946 +    vzip.u8   xreg1, xreg3
  1.2947 +    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
  1.2948 +    vzip.u8   xreg2, xreg4
  1.2949 +    vld1.32   {yacc2lo[1]}, [TMP1]
  1.2950 +    vzip.u8   xreg3, xreg4
  1.2951 +    vld1.32   {yacc2hi[1]}, [TMP2]
  1.2952 +    vzip.u8   xreg1, xreg2
  1.2953 +    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
  1.2954 +    vmull.u8  xacc1, xreg1, d28
  1.2955 +    vzip.u8   yreg1, yreg3
  1.2956 +    vmlal.u8  xacc1, xreg2, d29
  1.2957 +    vzip.u8   yreg2, yreg4
  1.2958 +    vmull.u8  xacc2, xreg3, d28
  1.2959 +    vzip.u8   yreg3, yreg4
  1.2960 +    vmlal.u8  xacc2, xreg4, d29
  1.2961 +    vzip.u8   yreg1, yreg2
  1.2962 +    vmull.u8  yacc1, yreg1, d28
  1.2963 +    vmlal.u8  yacc1, yreg2, d29
  1.2964 +    vmull.u8  yacc2, yreg3, d28
  1.2965 +    vmlal.u8  yacc2, yreg4, d29
  1.2966 +.endm
  1.2967 +
  1.2968 +.macro bilinear_store_8888 numpix, tmp1, tmp2
  1.2969 +.if numpix == 4
  1.2970 +    vst1.32   {d0, d1}, [OUT, :128]!
  1.2971 +.elseif numpix == 2
  1.2972 +    vst1.32   {d0}, [OUT, :64]!
  1.2973 +.elseif numpix == 1
  1.2974 +    vst1.32   {d0[0]}, [OUT, :32]!
  1.2975 +.else
   1.2976 +    .error "bilinear_store_8888 numpix is unsupported"
  1.2977 +.endif
  1.2978 +.endm
  1.2979 +
  1.2980 +.macro bilinear_store_0565 numpix, tmp1, tmp2
  1.2981 +    vuzp.u8 d0, d1
  1.2982 +    vuzp.u8 d2, d3
  1.2983 +    vuzp.u8 d1, d3
  1.2984 +    vuzp.u8 d0, d2
  1.2985 +    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
  1.2986 +.if numpix == 4
  1.2987 +    vst1.16   {d2}, [OUT, :64]!
  1.2988 +.elseif numpix == 2
  1.2989 +    vst1.32   {d2[0]}, [OUT, :32]!
  1.2990 +.elseif numpix == 1
  1.2991 +    vst1.16   {d2[0]}, [OUT, :16]!
  1.2992 +.else
   1.2993 +    .error "bilinear_store_0565 numpix is unsupported"
  1.2994 +.endif
  1.2995 +.endm
  1.2996 +
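          +/*
          + * The four vuzp.u8 steps in bilinear_store_0565 deinterleave the
          + * packed 32-bit pixels into planar per-channel registers, which is
          + * the layout convert_8888_to_0565 expects; the store width is then
          + * picked to match the pixel count.
          + */
          +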
  1.2997 +.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
  1.2998 +    bilinear_load_&src_fmt d0, d1, d2
  1.2999 +    vmull.u8  q1, d0, d28
  1.3000 +    vmlal.u8  q1, d1, d29
   1.3001 +    /* 5-cycle bubble */
  1.3002 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  1.3003 +    vmlsl.u16 q0, d2, d30
  1.3004 +    vmlal.u16 q0, d3, d30
   1.3005 +    /* 5-cycle bubble */
  1.3006 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.3007 +    /* 3-cycle bubble */
  1.3008 +    vmovn.u16 d0, q0
   1.3009 +    /* 1-cycle bubble */
  1.3010 +    bilinear_store_&dst_fmt 1, q2, q3
  1.3011 +.endm
  1.3012 +
  1.3013 +.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
  1.3014 +    bilinear_load_and_vertical_interpolate_two_&src_fmt \
  1.3015 +                q1, q11, d0, d1, d20, d21, d22, d23
  1.3016 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  1.3017 +    vmlsl.u16 q0, d2, d30
  1.3018 +    vmlal.u16 q0, d3, d30
  1.3019 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
  1.3020 +    vmlsl.u16 q10, d22, d31
  1.3021 +    vmlal.u16 q10, d23, d31
  1.3022 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3023 +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3024 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3025 +    vadd.u16  q12, q12, q13
  1.3026 +    vmovn.u16 d0, q0
  1.3027 +    bilinear_store_&dst_fmt 2, q2, q3
  1.3028 +.endm
  1.3029 +
  1.3030 +.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
  1.3031 +    bilinear_load_and_vertical_interpolate_four_&src_fmt \
  1.3032 +                q1, q11, d0, d1, d20, d21, d22, d23 \
  1.3033 +                q3, q9,  d4, d5, d16, d17, d18, d19
  1.3034 +    pld       [TMP1, PF_OFFS]
  1.3035 +    sub       TMP1, TMP1, STRIDE
  1.3036 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  1.3037 +    vmlsl.u16 q0, d2, d30
  1.3038 +    vmlal.u16 q0, d3, d30
  1.3039 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
  1.3040 +    vmlsl.u16 q10, d22, d31
  1.3041 +    vmlal.u16 q10, d23, d31
  1.3042 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3043 +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
  1.3044 +    vmlsl.u16 q2, d6, d30
  1.3045 +    vmlal.u16 q2, d7, d30
  1.3046 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
  1.3047 +    pld       [TMP2, PF_OFFS]
  1.3048 +    vmlsl.u16 q8, d18, d31
  1.3049 +    vmlal.u16 q8, d19, d31
  1.3050 +    vadd.u16  q12, q12, q13
  1.3051 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3052 +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3053 +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3054 +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3055 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3056 +    vmovn.u16 d0, q0
  1.3057 +    vmovn.u16 d1, q2
  1.3058 +    vadd.u16  q12, q12, q13
  1.3059 +    bilinear_store_&dst_fmt 4, q2, q3
  1.3060 +.endm
  1.3061 +
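          +/*
          + * The head/tail/tail_head wrappers below allow a format combination
          + * to provide its own software-pipelined implementation: when a
          + * 'have_bilinear_interpolate_{four,eight}_pixels_<src>_<dst>' symbol
          + * is defined (see further down in this file), the specialized macros
          + * are used; otherwise everything falls back to the generic
          + * bilinear_interpolate_four_pixels above.
          + */
          +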
  1.3062 +.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  1.3063 +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  1.3064 +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
  1.3065 +.else
  1.3066 +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
  1.3067 +.endif
  1.3068 +.endm
  1.3069 +
  1.3070 +.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  1.3071 +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  1.3072 +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
  1.3073 +.endif
  1.3074 +.endm
  1.3075 +
  1.3076 +.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  1.3077 +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  1.3078 +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
  1.3079 +.else
  1.3080 +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
  1.3081 +.endif
  1.3082 +.endm
  1.3083 +
  1.3084 +.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
  1.3085 +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  1.3086 +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
  1.3087 +.else
  1.3088 +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  1.3089 +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  1.3090 +.endif
  1.3091 +.endm
  1.3092 +
  1.3093 +.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
  1.3094 +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  1.3095 +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
  1.3096 +.else
  1.3097 +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  1.3098 +.endif
  1.3099 +.endm
  1.3100 +
  1.3101 +.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
  1.3102 +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  1.3103 +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
  1.3104 +.else
  1.3105 +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  1.3106 +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  1.3107 +.endif
  1.3108 +.endm
  1.3109 +
  1.3110 +.set BILINEAR_FLAG_UNROLL_4,          0
  1.3111 +.set BILINEAR_FLAG_UNROLL_8,          1
  1.3112 +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
  1.3113 +
  1.3114 +/*
  1.3115 + * Main template macro for generating NEON optimized bilinear scanline
  1.3116 + * functions.
  1.3117 + *
   1.3118 + * The bilinear scanline scaler macro template takes the following arguments:
   1.3119 + *  fname             - name of the function to generate
   1.3120 + *  src_fmt           - source color format (8888 or 0565)
   1.3121 + *  dst_fmt           - destination color format (8888 or 0565)
   1.3122 + *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel in bytes
   1.3123 + *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination pixel in bytes
   1.3124 + *  prefetch_distance - prefetch in the source image by that many
   1.3125 + *                      pixels ahead
          + *  flags             - any combination of the BILINEAR_FLAG_* values defined above
          + */
  1.3126 +
  1.3127 +.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
  1.3128 +                                       src_bpp_shift, dst_bpp_shift, \
  1.3129 +                                       prefetch_distance, flags
  1.3130 +
  1.3131 +pixman_asm_function fname
  1.3132 +    OUT       .req      r0
  1.3133 +    TOP       .req      r1
  1.3134 +    BOTTOM    .req      r2
  1.3135 +    WT        .req      r3
  1.3136 +    WB        .req      r4
  1.3137 +    X         .req      r5
  1.3138 +    UX        .req      r6
  1.3139 +    WIDTH     .req      ip
  1.3140 +    TMP1      .req      r3
  1.3141 +    TMP2      .req      r4
  1.3142 +    PF_OFFS   .req      r7
  1.3143 +    TMP3      .req      r8
  1.3144 +    TMP4      .req      r9
  1.3145 +    STRIDE    .req      r2
  1.3146 +
  1.3147 +    .fnstart
  1.3148 +    mov       ip, sp
  1.3149 +    .save     {r4, r5, r6, r7, r8, r9}
  1.3150 +    push      {r4, r5, r6, r7, r8, r9}
  1.3151 +    mov       PF_OFFS, #prefetch_distance
  1.3152 +    ldmia     ip, {WB, X, UX, WIDTH}
  1.3153 +    mul       PF_OFFS, PF_OFFS, UX
  1.3154 +
  1.3155 +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
  1.3156 +    .vsave    {d8-d15}
  1.3157 +    vpush     {d8-d15}
  1.3158 +.endif
  1.3159 +
  1.3160 +    sub       STRIDE, BOTTOM, TOP
  1.3161 +    .unreq    BOTTOM
  1.3162 +
  1.3163 +    cmp       WIDTH, #0
  1.3164 +    ble       3f
  1.3165 +
  1.3166 +    vdup.u16  q12, X
  1.3167 +    vdup.u16  q13, UX
  1.3168 +    vdup.u8   d28, WT
  1.3169 +    vdup.u8   d29, WB
  1.3170 +    vadd.u16  d25, d25, d26
  1.3171 +
  1.3172 +    /* ensure good destination alignment  */
  1.3173 +    cmp       WIDTH, #1
  1.3174 +    blt       0f
  1.3175 +    tst       OUT, #(1 << dst_bpp_shift)
  1.3176 +    beq       0f
  1.3177 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3178 +    vadd.u16  q12, q12, q13
  1.3179 +    bilinear_interpolate_last_pixel src_fmt, dst_fmt
  1.3180 +    sub       WIDTH, WIDTH, #1
  1.3181 +0:
  1.3182 +    vadd.u16  q13, q13, q13
  1.3183 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3184 +    vadd.u16  q12, q12, q13
  1.3185 +
  1.3186 +    cmp       WIDTH, #2
  1.3187 +    blt       0f
  1.3188 +    tst       OUT, #(1 << (dst_bpp_shift + 1))
  1.3189 +    beq       0f
  1.3190 +    bilinear_interpolate_two_pixels src_fmt, dst_fmt
  1.3191 +    sub       WIDTH, WIDTH, #2
  1.3192 +0:
  1.3193 +.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
  1.3194 +/*********** 8 pixels per iteration *****************/
  1.3195 +    cmp       WIDTH, #4
  1.3196 +    blt       0f
  1.3197 +    tst       OUT, #(1 << (dst_bpp_shift + 2))
  1.3198 +    beq       0f
  1.3199 +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
  1.3200 +    sub       WIDTH, WIDTH, #4
  1.3201 +0:
  1.3202 +    subs      WIDTH, WIDTH, #8
  1.3203 +    blt       1f
  1.3204 +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
  1.3205 +    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
  1.3206 +    subs      WIDTH, WIDTH, #8
  1.3207 +    blt       5f
  1.3208 +0:
  1.3209 +    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
  1.3210 +    subs      WIDTH, WIDTH, #8
  1.3211 +    bge       0b
  1.3212 +5:
  1.3213 +    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
  1.3214 +1:
  1.3215 +    tst       WIDTH, #4
  1.3216 +    beq       2f
  1.3217 +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
  1.3218 +2:
  1.3219 +.else
  1.3220 +/*********** 4 pixels per iteration *****************/
  1.3221 +    subs      WIDTH, WIDTH, #4
  1.3222 +    blt       1f
  1.3223 +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
  1.3224 +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  1.3225 +    subs      WIDTH, WIDTH, #4
  1.3226 +    blt       5f
  1.3227 +0:
  1.3228 +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  1.3229 +    subs      WIDTH, WIDTH, #4
  1.3230 +    bge       0b
  1.3231 +5:
  1.3232 +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  1.3233 +1:
  1.3234 +/****************************************************/
  1.3235 +.endif
  1.3236 +    /* handle the remaining trailing pixels */
  1.3237 +    tst       WIDTH, #2
  1.3238 +    beq       2f
  1.3239 +    bilinear_interpolate_two_pixels src_fmt, dst_fmt
  1.3240 +2:
  1.3241 +    tst       WIDTH, #1
  1.3242 +    beq       3f
  1.3243 +    bilinear_interpolate_last_pixel src_fmt, dst_fmt
  1.3244 +3:
  1.3245 +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
  1.3246 +    vpop      {d8-d15}
  1.3247 +.endif
  1.3248 +    pop       {r4, r5, r6, r7, r8, r9}
  1.3249 +    bx        lr
  1.3250 +    .fnend
  1.3251 +
  1.3252 +    .unreq    OUT
  1.3253 +    .unreq    TOP
  1.3254 +    .unreq    WT
  1.3255 +    .unreq    WB
  1.3256 +    .unreq    X
  1.3257 +    .unreq    UX
  1.3258 +    .unreq    WIDTH
  1.3259 +    .unreq    TMP1
  1.3260 +    .unreq    TMP2
  1.3261 +    .unreq    PF_OFFS
  1.3262 +    .unreq    TMP3
  1.3263 +    .unreq    TMP4
  1.3264 +    .unreq    STRIDE
  1.3265 +.endfunc
  1.3266 +
  1.3267 +.endm
  1.3268 +
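          +/*
          + * Control flow of a generated scanline function, roughly (a C-like
          + * sketch of the 4x-unrolled variant; the head/tail software
          + * pipelining of the main loop is elided):
          + *
          + *     if (width >= 1 && (out & pixel_size))        // align OUT to 2 px
          + *         { interpolate_last_pixel (); width -= 1; }
          + *     if (width >= 2 && (out & (2 * pixel_size)))  // align OUT to 4 px
          + *         { interpolate_two_pixels (); width -= 2; }
          + *     for (; width >= 4; width -= 4)               // main unrolled loop
          + *         interpolate_four_pixels ();
          + *     if (width & 2) interpolate_two_pixels ();    // trailing pixels
          + *     if (width & 1) interpolate_last_pixel ();
          + */
          +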
  1.3269 +/*****************************************************************************/
  1.3270 +
  1.3271 +.set have_bilinear_interpolate_four_pixels_8888_8888, 1
  1.3272 +
  1.3273 +.macro bilinear_interpolate_four_pixels_8888_8888_head
  1.3274 +    mov       TMP1, X, asr #16
  1.3275 +    add       X, X, UX
  1.3276 +    add       TMP1, TOP, TMP1, asl #2
  1.3277 +    mov       TMP2, X, asr #16
  1.3278 +    add       X, X, UX
  1.3279 +    add       TMP2, TOP, TMP2, asl #2
  1.3280 +
  1.3281 +    vld1.32   {d22}, [TMP1], STRIDE
  1.3282 +    vld1.32   {d23}, [TMP1]
  1.3283 +    mov       TMP3, X, asr #16
  1.3284 +    add       X, X, UX
  1.3285 +    add       TMP3, TOP, TMP3, asl #2
  1.3286 +    vmull.u8  q8, d22, d28
  1.3287 +    vmlal.u8  q8, d23, d29
  1.3288 +
  1.3289 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3290 +    vld1.32   {d23}, [TMP2]
  1.3291 +    mov       TMP4, X, asr #16
  1.3292 +    add       X, X, UX
  1.3293 +    add       TMP4, TOP, TMP4, asl #2
  1.3294 +    vmull.u8  q9, d22, d28
  1.3295 +    vmlal.u8  q9, d23, d29
  1.3296 +
  1.3297 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3298 +    vld1.32   {d23}, [TMP3]
  1.3299 +    vmull.u8  q10, d22, d28
  1.3300 +    vmlal.u8  q10, d23, d29
  1.3301 +
  1.3302 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3303 +    vmlsl.u16 q0, d16, d30
  1.3304 +    vmlal.u16 q0, d17, d30
  1.3305 +
  1.3306 +    pld       [TMP4, PF_OFFS]
  1.3307 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3308 +    vld1.32   {d17}, [TMP4]
  1.3309 +    pld       [TMP4, PF_OFFS]
  1.3310 +    vmull.u8  q11, d16, d28
  1.3311 +    vmlal.u8  q11, d17, d29
  1.3312 +
  1.3313 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3314 +    vmlsl.u16 q1, d18, d31
  1.3315 +.endm
  1.3316 +
  1.3317 +.macro bilinear_interpolate_four_pixels_8888_8888_tail
  1.3318 +    vmlal.u16 q1, d19, d31
  1.3319 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3320 +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3321 +    vmlsl.u16 q2, d20, d30
  1.3322 +    vmlal.u16 q2, d21, d30
  1.3323 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3324 +    vmlsl.u16 q3, d22, d31
  1.3325 +    vmlal.u16 q3, d23, d31
  1.3326 +    vadd.u16  q12, q12, q13
  1.3327 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3328 +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3329 +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3330 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3331 +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3332 +    vmovn.u16 d6, q0
  1.3333 +    vmovn.u16 d7, q2
  1.3334 +    vadd.u16  q12, q12, q13
  1.3335 +    vst1.32   {d6, d7}, [OUT, :128]!
  1.3336 +.endm
  1.3337 +
  1.3338 +.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
  1.3339 +    mov       TMP1, X, asr #16
  1.3340 +    add       X, X, UX
  1.3341 +    add       TMP1, TOP, TMP1, asl #2
  1.3342 +    mov       TMP2, X, asr #16
  1.3343 +    add       X, X, UX
  1.3344 +    add       TMP2, TOP, TMP2, asl #2
  1.3345 +        vmlal.u16 q1, d19, d31
  1.3346 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3347 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3348 +        vmlsl.u16 q2, d20, d30
  1.3349 +        vmlal.u16 q2, d21, d30
  1.3350 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3351 +    vld1.32   {d20}, [TMP1], STRIDE
  1.3352 +        vmlsl.u16 q3, d22, d31
  1.3353 +        vmlal.u16 q3, d23, d31
  1.3354 +    vld1.32   {d21}, [TMP1]
  1.3355 +    vmull.u8  q8, d20, d28
  1.3356 +    vmlal.u8  q8, d21, d29
  1.3357 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3358 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3359 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3360 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3361 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3362 +        vadd.u16  q12, q12, q13
  1.3363 +    vld1.32   {d23}, [TMP2]
  1.3364 +    vmull.u8  q9, d22, d28
  1.3365 +    mov       TMP3, X, asr #16
  1.3366 +    add       X, X, UX
  1.3367 +    add       TMP3, TOP, TMP3, asl #2
  1.3368 +    mov       TMP4, X, asr #16
  1.3369 +    add       X, X, UX
  1.3370 +    add       TMP4, TOP, TMP4, asl #2
  1.3371 +    vmlal.u8  q9, d23, d29
  1.3372 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3373 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3374 +    vld1.32   {d23}, [TMP3]
  1.3375 +    vmull.u8  q10, d22, d28
  1.3376 +    vmlal.u8  q10, d23, d29
  1.3377 +        vmovn.u16 d6, q0
  1.3378 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3379 +        vmovn.u16 d7, q2
  1.3380 +    vmlsl.u16 q0, d16, d30
  1.3381 +    vmlal.u16 q0, d17, d30
  1.3382 +    pld       [TMP4, PF_OFFS]
  1.3383 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3384 +        vadd.u16  q12, q12, q13
  1.3385 +    vld1.32   {d17}, [TMP4]
  1.3386 +    pld       [TMP4, PF_OFFS]
  1.3387 +    vmull.u8  q11, d16, d28
  1.3388 +    vmlal.u8  q11, d17, d29
  1.3389 +        vst1.32   {d6, d7}, [OUT, :128]!
  1.3390 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3391 +    vmlsl.u16 q1, d18, d31
  1.3392 +.endm
  1.3393 +
  1.3394 +/*****************************************************************************/
  1.3395 +
  1.3396 +.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
  1.3397 +
  1.3398 +.macro bilinear_interpolate_eight_pixels_8888_0565_head
  1.3399 +    mov       TMP1, X, asr #16
  1.3400 +    add       X, X, UX
  1.3401 +    add       TMP1, TOP, TMP1, asl #2
  1.3402 +    mov       TMP2, X, asr #16
  1.3403 +    add       X, X, UX
  1.3404 +    add       TMP2, TOP, TMP2, asl #2
  1.3405 +    vld1.32   {d20}, [TMP1], STRIDE
  1.3406 +    vld1.32   {d21}, [TMP1]
  1.3407 +    vmull.u8  q8, d20, d28
  1.3408 +    vmlal.u8  q8, d21, d29
  1.3409 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3410 +    vld1.32   {d23}, [TMP2]
  1.3411 +    vmull.u8  q9, d22, d28
  1.3412 +    mov       TMP3, X, asr #16
  1.3413 +    add       X, X, UX
  1.3414 +    add       TMP3, TOP, TMP3, asl #2
  1.3415 +    mov       TMP4, X, asr #16
  1.3416 +    add       X, X, UX
  1.3417 +    add       TMP4, TOP, TMP4, asl #2
  1.3418 +    vmlal.u8  q9, d23, d29
  1.3419 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3420 +    vld1.32   {d23}, [TMP3]
  1.3421 +    vmull.u8  q10, d22, d28
  1.3422 +    vmlal.u8  q10, d23, d29
  1.3423 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3424 +    vmlsl.u16 q0, d16, d30
  1.3425 +    vmlal.u16 q0, d17, d30
  1.3426 +    pld       [TMP4, PF_OFFS]
  1.3427 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3428 +    vld1.32   {d17}, [TMP4]
  1.3429 +    pld       [TMP4, PF_OFFS]
  1.3430 +    vmull.u8  q11, d16, d28
  1.3431 +    vmlal.u8  q11, d17, d29
  1.3432 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3433 +    vmlsl.u16 q1, d18, d31
  1.3434 +
  1.3435 +    mov       TMP1, X, asr #16
  1.3436 +    add       X, X, UX
  1.3437 +    add       TMP1, TOP, TMP1, asl #2
  1.3438 +    mov       TMP2, X, asr #16
  1.3439 +    add       X, X, UX
  1.3440 +    add       TMP2, TOP, TMP2, asl #2
  1.3441 +        vmlal.u16 q1, d19, d31
  1.3442 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3443 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3444 +        vmlsl.u16 q2, d20, d30
  1.3445 +        vmlal.u16 q2, d21, d30
  1.3446 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3447 +    vld1.32   {d20}, [TMP1], STRIDE
  1.3448 +        vmlsl.u16 q3, d22, d31
  1.3449 +        vmlal.u16 q3, d23, d31
  1.3450 +    vld1.32   {d21}, [TMP1]
  1.3451 +    vmull.u8  q8, d20, d28
  1.3452 +    vmlal.u8  q8, d21, d29
  1.3453 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3454 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3455 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3456 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3457 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3458 +        vadd.u16  q12, q12, q13
  1.3459 +    vld1.32   {d23}, [TMP2]
  1.3460 +    vmull.u8  q9, d22, d28
  1.3461 +    mov       TMP3, X, asr #16
  1.3462 +    add       X, X, UX
  1.3463 +    add       TMP3, TOP, TMP3, asl #2
  1.3464 +    mov       TMP4, X, asr #16
  1.3465 +    add       X, X, UX
  1.3466 +    add       TMP4, TOP, TMP4, asl #2
  1.3467 +    vmlal.u8  q9, d23, d29
  1.3468 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3469 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3470 +    vld1.32   {d23}, [TMP3]
  1.3471 +    vmull.u8  q10, d22, d28
  1.3472 +    vmlal.u8  q10, d23, d29
  1.3473 +        vmovn.u16 d8, q0
  1.3474 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3475 +        vmovn.u16 d9, q2
  1.3476 +    vmlsl.u16 q0, d16, d30
  1.3477 +    vmlal.u16 q0, d17, d30
  1.3478 +    pld       [TMP4, PF_OFFS]
  1.3479 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3480 +        vadd.u16  q12, q12, q13
  1.3481 +    vld1.32   {d17}, [TMP4]
  1.3482 +    pld       [TMP4, PF_OFFS]
  1.3483 +    vmull.u8  q11, d16, d28
  1.3484 +    vmlal.u8  q11, d17, d29
  1.3485 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3486 +    vmlsl.u16 q1, d18, d31
  1.3487 +.endm
  1.3488 +
  1.3489 +.macro bilinear_interpolate_eight_pixels_8888_0565_tail
  1.3490 +    vmlal.u16 q1, d19, d31
  1.3491 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3492 +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3493 +    vmlsl.u16 q2, d20, d30
  1.3494 +    vmlal.u16 q2, d21, d30
  1.3495 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3496 +    vmlsl.u16 q3, d22, d31
  1.3497 +    vmlal.u16 q3, d23, d31
  1.3498 +    vadd.u16  q12, q12, q13
  1.3499 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3500 +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3501 +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3502 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3503 +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3504 +    vmovn.u16 d10, q0
  1.3505 +    vmovn.u16 d11, q2
  1.3506 +    vadd.u16  q12, q12, q13
  1.3507 +
  1.3508 +    vuzp.u8   d8, d9
  1.3509 +    vuzp.u8   d10, d11
  1.3510 +    vuzp.u8   d9, d11
  1.3511 +    vuzp.u8   d8, d10
  1.3512 +    vshll.u8  q6, d9, #8
  1.3513 +    vshll.u8  q5, d10, #8
  1.3514 +    vshll.u8  q7, d8, #8
  1.3515 +    vsri.u16  q5, q6, #5
  1.3516 +    vsri.u16  q5, q7, #11
  1.3517 +    vst1.32   {d10, d11}, [OUT, :128]!
  1.3518 +.endm
  1.3519 +
  1.3520 +.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
  1.3521 +    mov       TMP1, X, asr #16
  1.3522 +    add       X, X, UX
  1.3523 +    add       TMP1, TOP, TMP1, asl #2
  1.3524 +    mov       TMP2, X, asr #16
  1.3525 +    add       X, X, UX
  1.3526 +    add       TMP2, TOP, TMP2, asl #2
  1.3527 +        vmlal.u16 q1, d19, d31
  1.3528 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3529 +            vuzp.u8 d8, d9
  1.3530 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3531 +        vmlsl.u16 q2, d20, d30
  1.3532 +        vmlal.u16 q2, d21, d30
  1.3533 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3534 +    vld1.32   {d20}, [TMP1], STRIDE
  1.3535 +        vmlsl.u16 q3, d22, d31
  1.3536 +        vmlal.u16 q3, d23, d31
  1.3537 +    vld1.32   {d21}, [TMP1]
  1.3538 +    vmull.u8  q8, d20, d28
  1.3539 +    vmlal.u8  q8, d21, d29
  1.3540 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3541 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3542 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3543 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3544 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3545 +        vadd.u16  q12, q12, q13
  1.3546 +    vld1.32   {d23}, [TMP2]
  1.3547 +    vmull.u8  q9, d22, d28
  1.3548 +    mov       TMP3, X, asr #16
  1.3549 +    add       X, X, UX
  1.3550 +    add       TMP3, TOP, TMP3, asl #2
  1.3551 +    mov       TMP4, X, asr #16
  1.3552 +    add       X, X, UX
  1.3553 +    add       TMP4, TOP, TMP4, asl #2
  1.3554 +    vmlal.u8  q9, d23, d29
  1.3555 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3556 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3557 +    vld1.32   {d23}, [TMP3]
  1.3558 +    vmull.u8  q10, d22, d28
  1.3559 +    vmlal.u8  q10, d23, d29
  1.3560 +        vmovn.u16 d10, q0
  1.3561 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3562 +        vmovn.u16 d11, q2
  1.3563 +    vmlsl.u16 q0, d16, d30
  1.3564 +    vmlal.u16 q0, d17, d30
  1.3565 +    pld       [TMP4, PF_OFFS]
  1.3566 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3567 +        vadd.u16  q12, q12, q13
  1.3568 +    vld1.32   {d17}, [TMP4]
  1.3569 +    pld       [TMP4, PF_OFFS]
  1.3570 +    vmull.u8  q11, d16, d28
  1.3571 +    vmlal.u8  q11, d17, d29
  1.3572 +            vuzp.u8 d10, d11
  1.3573 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3574 +    vmlsl.u16 q1, d18, d31
  1.3575 +
  1.3576 +    mov       TMP1, X, asr #16
  1.3577 +    add       X, X, UX
  1.3578 +    add       TMP1, TOP, TMP1, asl #2
  1.3579 +    mov       TMP2, X, asr #16
  1.3580 +    add       X, X, UX
  1.3581 +    add       TMP2, TOP, TMP2, asl #2
  1.3582 +        vmlal.u16 q1, d19, d31
  1.3583 +            vuzp.u8 d9, d11
  1.3584 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3585 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  1.3586 +            vuzp.u8 d8, d10
  1.3587 +        vmlsl.u16 q2, d20, d30
  1.3588 +        vmlal.u16 q2, d21, d30
  1.3589 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  1.3590 +    vld1.32   {d20}, [TMP1], STRIDE
  1.3591 +        vmlsl.u16 q3, d22, d31
  1.3592 +        vmlal.u16 q3, d23, d31
  1.3593 +    vld1.32   {d21}, [TMP1]
  1.3594 +    vmull.u8  q8, d20, d28
  1.3595 +    vmlal.u8  q8, d21, d29
  1.3596 +            vshll.u8  q6, d9, #8
  1.3597 +            vshll.u8  q5, d10, #8
  1.3598 +            vshll.u8  q7, d8, #8
  1.3599 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3600 +            vsri.u16  q5, q6, #5
  1.3601 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3602 +            vsri.u16  q5, q7, #11
  1.3603 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3604 +    vld1.32   {d22}, [TMP2], STRIDE
  1.3605 +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.3606 +        vadd.u16  q12, q12, q13
  1.3607 +    vld1.32   {d23}, [TMP2]
  1.3608 +    vmull.u8  q9, d22, d28
  1.3609 +    mov       TMP3, X, asr #16
  1.3610 +    add       X, X, UX
  1.3611 +    add       TMP3, TOP, TMP3, asl #2
  1.3612 +    mov       TMP4, X, asr #16
  1.3613 +    add       X, X, UX
  1.3614 +    add       TMP4, TOP, TMP4, asl #2
  1.3615 +    vmlal.u8  q9, d23, d29
  1.3616 +    vld1.32   {d22}, [TMP3], STRIDE
  1.3617 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.3618 +    vld1.32   {d23}, [TMP3]
  1.3619 +    vmull.u8  q10, d22, d28
  1.3620 +    vmlal.u8  q10, d23, d29
  1.3621 +        vmovn.u16 d8, q0
  1.3622 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.3623 +        vmovn.u16 d9, q2
  1.3624 +    vmlsl.u16 q0, d16, d30
  1.3625 +    vmlal.u16 q0, d17, d30
  1.3626 +    pld       [TMP4, PF_OFFS]
  1.3627 +    vld1.32   {d16}, [TMP4], STRIDE
  1.3628 +        vadd.u16  q12, q12, q13
  1.3629 +    vld1.32   {d17}, [TMP4]
  1.3630 +    pld       [TMP4, PF_OFFS]
  1.3631 +    vmull.u8  q11, d16, d28
  1.3632 +    vmlal.u8  q11, d17, d29
  1.3633 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.3634 +            vst1.32   {d10, d11}, [OUT, :128]!
  1.3635 +    vmlsl.u16 q1, d18, d31
  1.3636 +.endm
  1.3637 +/*****************************************************************************/
  1.3638 +
  1.3639 +generate_bilinear_scanline_func \
  1.3640 +    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
  1.3641 +    2, 2, 28, BILINEAR_FLAG_UNROLL_4
  1.3642 +
  1.3643 +generate_bilinear_scanline_func \
  1.3644 +    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
  1.3645 +    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
  1.3646 +
  1.3647 +generate_bilinear_scanline_func \
  1.3648 +    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
  1.3649 +    1, 2, 28, BILINEAR_FLAG_UNROLL_4
  1.3650 +
  1.3651 +generate_bilinear_scanline_func \
  1.3652 +    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
  1.3653 +    1, 1, 28, BILINEAR_FLAG_UNROLL_4
