gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which are exposing some new or interesting features) are
 * extensively commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 *  - pixman_composite_over_8888_0565_asm_neon
 *  - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set the default prefetch type. There is a choice between the following
 * options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously, so that extra ARM
 * instructions do not add (many) extra cycles but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 *       to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs the OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need
 * to deal with the data in NEON registers.
 *
 * The recommended general NEON register allocation is the following:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data, and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5            - contain loaded destination pixels (they are needed)
 * d28, d29           - place for storing the result (destination pixels)
 */
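
/*
 * As an informative aside (not used by the assembly itself), the per-pixel
 * math implemented here is roughly the following C sketch of the OVER
 * operator with premultiplied alpha; compare with
 * fast_composite_over_8888_0565 in pixman-fast-path.c. 'div255' is a
 * helper name used only in this sketch:
 *
 *   uint8_t ialpha = 255 - src_a;
 *   dst_r = src_r + div255 (dst_r * ialpha);
 *   dst_g = src_g + div255 (dst_g * ialpha);
 *   dst_b = src_b + div255 (dst_b * ialpha);
 *
 * where div255(x) is a correctly rounded division by 255 and the
 * destination channels are first expanded from 5/6/5 to 8 bits (and the
 * result packed back afterwards). The additions are done with saturation
 * (VQADD) in the code below.
 */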

/*
 * Step one. We need to have some code to do some arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by doing R, G, B, A color component
 * deinterleaving for 32bpp pixel formats (and this feature is used in the
 * 'pixman_composite_over_8888_0565_asm_neon' function). It means that
 * instead of having 8 packed pixels in the {d0, d1, d2, d3} registers, we
 * actually use the d0 register for the blue channel (a vector of eight
 * 8-bit values), d1 for green, d2 for red and d3 for alpha. This simple
 * conversion can also be done with a few NEON instructions:
 *
 * Packed to planar conversion:
 *  vuzp.8 d0, d1
 *  vuzp.8 d2, d3
 *  vuzp.8 d1, d3
 *  vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *  vzip.8 d0, d2
 *  vzip.8 d1, d3
 *  vzip.8 d2, d3
 *  vzip.8 d0, d1
 *
 * But pixels can be loaded directly in planar format using the VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm
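
/*
 * A short walkthrough of the head macro above (informative comment).
 * For the r5g6b5 -> planar conversion, VSHRN extracts the topmost bits
 * of each channel and VSRI replicates the most significant bits into
 * the freshly opened low bits, so a 5-bit red value 'rrrrr' becomes the
 * 8-bit value 'rrrrr' followed by its own top 3 bits. The VRSHR/VRADDHN
 * pair then implements a correctly rounded division by 255. With
 * t = x * ialpha (from VMULL, 16 bits per channel):
 *
 *   result = (t + ((t + 128) >> 8) + 128) >> 8  ==  round (t / 255)
 *
 * VRSHR computes the inner rounded shift and VRADDHN performs the outer
 * rounded add plus the narrowing to 8 bits in a single instruction.
 */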

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm
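
/*
 * Packing back to r5g6b5 (informative comment): VSHLL.U8 #8 widens each
 * 8-bit channel into the upper byte of a 16-bit lane, so q14 holds
 * red << 8. VSRI q14, q8, #5 then shifts the green lane right by 5 and
 * inserts it below red, and VSRI q14, q9, #11 does the same for blue,
 * leaving 'rrrrrggggggbbbbb' in each 16-bit result pixel.
 */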

/*
 * OK, now we have almost everything that we need. Using the above two
 * macros, the work can be done correctly. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *   head
 *   while (...) {
 *     tail
 *     head
 *   }
 *   tail
 *
 * It may look a bit weird, but this setup allows hiding instruction
 * latencies better and also utilizing the dual-issue capability more
 * efficiently (making pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Now it also has some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetic.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in bulk. Additionally, the destination buffer is already
 * 16-byte aligned here (which is good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used to prefetch
 * data into the CPU L2 cache and improves performance when dealing with
 * images which are far larger than the cache size. It takes one argument
 * (actually two, but they need to be the same here) - the number of pixels
 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
 * details about this macro. Moreover, if good performance is needed,
 * the code from this macro needs to be copied into the '*_tail_head' macro
 * and mixed with the rest of the code for optimal instruction scheduling.
 * We are actually doing that below.
 *
 * Now, after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from the '*_head', '*_tail' or
 * 'cache_preload' macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We are specifying the name of the
 * function which we want to get, the number of bits per pixel for the
 * source, mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written; for a write-only buffer we would
 *                             use the FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for the 32bpp
 *                             format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. The optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros. These are 'default_init' and
 * 'default_cleanup' here, which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we have implemented above.
 *
 * The last part is the NEON registers allocation scheme.
 */
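
/*
 * For orientation only: the generated function is called from the C side
 * via the binding macros in pixman-arm-common.h. Assuming the usual
 * SRC_DST binding (an assumption of this comment, not something this
 * file guarantees), the prototype is expected to look roughly like:
 *
 *   void pixman_composite_over_8888_0565_asm_neon (int32_t   w,
 *                                                  int32_t   h,
 *                                                  uint16_t *dst,
 *                                                  int32_t   dst_stride,
 *                                                  uint32_t *src,
 *                                                  int32_t   src_stride);
 */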
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm
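
/*
 * Informative note on the init macro below: the solid source color is
 * passed on the stack as a single a8r8g8b8 word. VLD1.32 {d3[0]} loads
 * it, so on a little-endian core the bytes land in the d3 lanes as
 * 0: blue, 1: green, 2: red, 3: alpha. The VDUPs then replicate each
 * component across a whole 64-bit register; e.g. opaque blue 0xFF0000FF
 * yields d0 = all 0xFF, d1 = d2 = 0 and d3 = all 0xFF, which the final
 * VMVN inverts to zero.
 */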

.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm
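
/*
 * This is the same bit-replication trick as in the 'over' head macros
 * above, just in the r5g6b5 -> a8r8g8b8 direction, with the results
 * going straight into the store registers d28-d31. The VMOV of #255
 * into d31 supplies the opaque alpha channel that r5g6b5 pixels lack.
 */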

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm
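
/*
 * Informative: VQADD.U8 is a saturating add, which is exactly the ADD
 * compositing operator clamped at 255 - e.g. 0xF0 + 0x20 yields 0xFF
 * instead of wrapping around to 0x10.
 */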

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
        vrshr.u16   q2, q10, #8
        vrshr.u16   q3, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q2, q10
        vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vqadd.u8    q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vqadd.u8    q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8    q1,  d24, d9
    vmull.u8    q6,  d24, d10
    vmull.u8    q7,  d24, d11
        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
        vshrn.u16   d7,  q2, #3
        vsli.u16    q2,  q2, #5
    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
    vrshr.u16   q9,  q1,  #8
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q11, q7,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q9
    vraddhn.u16 d2,  q6,  q10
    vraddhn.u16 d3,  q7,  q11
        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
        vsri.u8     d7,  d7, #6
    vmvn.8      d3,  d3
        vshrn.u16   d30, q2, #2
    vmull.u8    q8,  d3, d6     /* now do alpha blending */
    vmull.u8    q9,  d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8,  #8
    vrshr.u16   q11, q9,  #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8,  q13
    vraddhn.u16 d27, q9,  q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2,  d16
    /* 1 cycle bubble */
    vqadd.u8    q9,  q0,  q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8,  #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9,  #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6,  q2,  #8
    fetch_mask_pixblock
    vshrn.u16   d7,  q2,  #3
    fetch_src_pixblock
    vmull.u8    q6,  d24, d10
        vrshr.u16   q13, q8,  #8
        vrshr.u16   q11, q9,  #8
        vrshr.u16   q15, q10, #8
        vraddhn.u16 d16, q8,  q13
        vraddhn.u16 d27, q9,  q11
        vraddhn.u16 d26, q10, q15
        vqadd.u8    d16, d2,  d16
    vmull.u8    q1,  d24, d9
        vqadd.u8    q9,  q0,  q13
        vshll.u8    q14, d16, #8
    vmull.u8    q0,  d24, d8
        vshll.u8    q8,  d19, #8
        vshll.u8    q9,  d18, #8
        vsri.u16    q14, q8,  #5
    vmull.u8    q7,  d24, d11
        vsri.u16    q14, q9,  #11

    cache_preload 8, 8

    vsli.u16    q2,  q2,  #5
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q9,  q1,  #8
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q11, q7,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q9
    vraddhn.u16 d2,  q6,  q10
    vraddhn.u16 d3,  q7,  q11
    vsri.u8     d6,  d6,  #5
    vsri.u8     d7,  d7,  #6
    vmvn.8      d3,  d3
    vshrn.u16   d30, q2,  #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8,  d3,  d6
    vmull.u8    q9,  d3,  d7
    vmull.u8    q10, d3,  d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization of the solid source.
 * The solid source pixel data is fetched from the stack at the
 * ARGS_STACK_OFFSET offset, split into color components and replicated
 * in the d8-d11 registers. Additionally, this function needs all the
 * NEON registers, so it has to save the d8-d15 registers, which are
 * callee saved according to the ABI. These registers are restored in
 * the 'cleanup' macro. All the other NEON registers are caller saved,
 * so they can be clobbered freely without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d24[0]}, [DUMMY]
    vdup.8      d24, d24[3]
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm
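
/*
 * Informative note on the init macro above: the 8-bit solid value is
 * splatted across a full 64-bit register by repeated doubling with
 * VSLI (shift left and insert). E.g. if the 32-bit stack argument
 * holds 0x000000ab, d0 goes 0x000000ab -> 0x0000abab -> 0xabababab ->
 * 0xabababababababab after the #8, #16 and #32 steps, and the VORRs
 * copy the pattern into d1 and q1 so that a full 32-byte block can be
 * stored per iteration.
 */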

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr     q0, q0, q2
    vorr     q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr     q0, q0, q2
    vorr     q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    vmov.u8  q2, #0xFF
    vshl.u32 q2, q2, #24
.endm
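
/*
 * Informative: after the init above, every 32-bit lane of q2 holds
 * 0xFF000000, so the VORR instructions in the head macro simply force
 * the undefined alpha byte of x8r8g8b8 pixels to 255,
 * e.g. 0x00123456 | 0xFF000000 = 0xFF123456.
 */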

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm
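
/*
 * Informative: VMULL + VRSRA + VRSHRN is another exact
 * division-by-255 idiom, equivalent to the VRSHR/VRADDHN pattern used
 * earlier. With t = mask * channel:
 *
 *   t += (t + 128) >> 8;        VRSRA.U16  (rounded shift-right accumulate)
 *   result = (t + 128) >> 8;    VRSHRN.U16 (rounded narrowing shift)
 *
 * which equals round (t / 255) for all 8-bit inputs.
 */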
  1240 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
  1241     vrshrn.u16  d28, q8, #8
  1242     vrshrn.u16  d29, q9, #8
  1243     vrshrn.u16  d30, q10, #8
  1244     vrshrn.u16  d31, q11, #8
  1245 .endm
  1247 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
  1248     fetch_mask_pixblock
  1249                                     PF add PF_X, PF_X, #8
  1250         vrshrn.u16  d28, q8, #8
  1251                                     PF tst PF_CTL, #0x0F
  1252         vrshrn.u16  d29, q9, #8
  1253                                     PF addne PF_X, PF_X, #8
  1254         vrshrn.u16  d30, q10, #8
  1255                                     PF subne PF_CTL, PF_CTL, #1
  1256         vrshrn.u16  d31, q11, #8
  1257                                     PF cmp PF_X, ORIG_W
  1258     vmull.u8    q8, d24, d0
  1259                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1260     vmull.u8    q9, d24, d1
  1261                                     PF subge PF_X, PF_X, ORIG_W
  1262     vmull.u8    q10, d24, d2
  1263                                     PF subges PF_CTL, PF_CTL, #0x10
  1264     vmull.u8    q11, d24, d3
  1265                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1266         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1267     vrsra.u16   q8, q8, #8
  1268     vrsra.u16   q9, q9, #8
  1269     vrsra.u16   q10, q10, #8
  1270     vrsra.u16   q11, q11, #8
  1271 .endm
  1273 .macro pixman_composite_src_n_8_8888_init
  1274     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1275     vld1.32     {d3[0]}, [DUMMY]
  1276     vdup.8      d0, d3[0]
  1277     vdup.8      d1, d3[1]
  1278     vdup.8      d2, d3[2]
  1279     vdup.8      d3, d3[3]
  1280 .endm
  1282 .macro pixman_composite_src_n_8_8888_cleanup
  1283 .endm
  1285 generate_composite_function \
  1286     pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
  1287     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  1288     8, /* number of pixels, processed in a single block */ \
  1289     5, /* prefetch distance */ \
  1290     pixman_composite_src_n_8_8888_init, \
  1291     pixman_composite_src_n_8_8888_cleanup, \
  1292     pixman_composite_src_n_8_8888_process_pixblock_head, \
  1293     pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head
  1296 /******************************************************************************/
  1298 .macro pixman_composite_src_n_8_8_process_pixblock_head
  1299     vmull.u8    q0, d24, d16
  1300     vmull.u8    q1, d25, d16
  1301     vmull.u8    q2, d26, d16
  1302     vmull.u8    q3, d27, d16
  1303     vrsra.u16   q0, q0,  #8
  1304     vrsra.u16   q1, q1,  #8
  1305     vrsra.u16   q2, q2,  #8
  1306     vrsra.u16   q3, q3,  #8
  1307 .endm
  1309 .macro pixman_composite_src_n_8_8_process_pixblock_tail
  1310     vrshrn.u16  d28, q0, #8
  1311     vrshrn.u16  d29, q1, #8
  1312     vrshrn.u16  d30, q2, #8
  1313     vrshrn.u16  d31, q3, #8
  1314 .endm
  1316 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
  1317     fetch_mask_pixblock
  1318                                     PF add PF_X, PF_X, #8
  1319         vrshrn.u16  d28, q0, #8
  1320                                     PF tst PF_CTL, #0x0F
  1321         vrshrn.u16  d29, q1, #8
  1322                                     PF addne PF_X, PF_X, #8
  1323         vrshrn.u16  d30, q2, #8
  1324                                     PF subne PF_CTL, PF_CTL, #1
  1325         vrshrn.u16  d31, q3, #8
  1326                                     PF cmp PF_X, ORIG_W
  1327     vmull.u8    q0,  d24, d16
  1328                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1329     vmull.u8    q1,  d25, d16
  1330                                     PF subge PF_X, PF_X, ORIG_W
  1331     vmull.u8    q2,  d26, d16
  1332                                     PF subges PF_CTL, PF_CTL, #0x10
  1333     vmull.u8    q3,  d27, d16
  1334                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1335         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1336     vrsra.u16   q0, q0,  #8
  1337     vrsra.u16   q1, q1,  #8
  1338     vrsra.u16   q2, q2,  #8
  1339     vrsra.u16   q3, q3,  #8
  1340 .endm
  1342 .macro pixman_composite_src_n_8_8_init
  1343     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1344     vld1.32     {d16[0]}, [DUMMY]
  1345     vdup.8      d16, d16[3]
  1346 .endm
  1348 .macro pixman_composite_src_n_8_8_cleanup
  1349 .endm
  1351 generate_composite_function \
  1352     pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
  1353     FLAG_DST_WRITEONLY, \
  1354     32, /* number of pixels, processed in a single block */ \
  1355     5, /* prefetch distance */ \
  1356     pixman_composite_src_n_8_8_init, \
  1357     pixman_composite_src_n_8_8_cleanup, \
  1358     pixman_composite_src_n_8_8_process_pixblock_head, \
  1359     pixman_composite_src_n_8_8_process_pixblock_tail, \
  1360     pixman_composite_src_n_8_8_process_pixblock_tail_head
  1362 /******************************************************************************/
  1364 .macro pixman_composite_over_n_8_8888_process_pixblock_head
  1365     /* expecting deinterleaved source data in {d8, d9, d10, d11} */
  1366     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
  1367     /* and destination data in {d4, d5, d6, d7} */
  1368     /* mask is in d24 (d25, d26, d27 are unused) */
  1370     /* in */
  1371     vmull.u8    q6, d24, d8
  1372     vmull.u8    q7, d24, d9
  1373     vmull.u8    q8, d24, d10
  1374     vmull.u8    q9, d24, d11
  1375     vrshr.u16   q10, q6, #8
  1376     vrshr.u16   q11, q7, #8
  1377     vrshr.u16   q12, q8, #8
  1378     vrshr.u16   q13, q9, #8
  1379     vraddhn.u16 d0, q6, q10
  1380     vraddhn.u16 d1, q7, q11
  1381     vraddhn.u16 d2, q8, q12
  1382     vraddhn.u16 d3, q9, q13
  1383     vmvn.8      d25, d3  /* get inverted alpha */
  1384     /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
  1385     /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
  1386     /* now do alpha blending */
  1387     vmull.u8    q8, d25, d4
  1388     vmull.u8    q9, d25, d5
  1389     vmull.u8    q10, d25, d6
  1390     vmull.u8    q11, d25, d7
  1391 .endm
  1393 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
  1394     vrshr.u16   q14, q8, #8
  1395     vrshr.u16   q15, q9, #8
  1396     vrshr.u16   q6, q10, #8
  1397     vrshr.u16   q7, q11, #8
  1398     vraddhn.u16 d28, q14, q8
  1399     vraddhn.u16 d29, q15, q9
  1400     vraddhn.u16 d30, q6, q10
  1401     vraddhn.u16 d31, q7, q11
  1402     vqadd.u8    q14, q0, q14
  1403     vqadd.u8    q15, q1, q15
  1404 .endm
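/*
 * Head and tail above together implement OVER for a solid source under an
 * a8 mask.  With div255 as sketched earlier, the per-channel math is
 * roughly (C sketch, illustrative only):
 *
 *     s = div255 (src * m);                  // the 'in' part, lands in d0..d3
 *     d = div255 (dst * (255 - alpha (s)));  // scale dest by inverse alpha
 *     result = sat_add8 (s, d);              // the final vqadd.u8 pair
 */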
  1406 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
  1407         vrshr.u16   q14, q8, #8
  1408     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1409         vrshr.u16   q15, q9, #8
  1410     fetch_mask_pixblock
  1411         vrshr.u16   q6, q10, #8
  1412                                     PF add PF_X, PF_X, #8
  1413         vrshr.u16   q7, q11, #8
  1414                                     PF tst PF_CTL, #0x0F
  1415         vraddhn.u16 d28, q14, q8
  1416                                     PF addne PF_X, PF_X, #8
  1417         vraddhn.u16 d29, q15, q9
  1418                                     PF subne PF_CTL, PF_CTL, #1
  1419         vraddhn.u16 d30, q6, q10
  1420                                     PF cmp PF_X, ORIG_W
  1421         vraddhn.u16 d31, q7, q11
  1422                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
  1423     vmull.u8    q6, d24, d8
  1424                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
  1425     vmull.u8    q7, d24, d9
  1426                                     PF subge PF_X, PF_X, ORIG_W
  1427     vmull.u8    q8, d24, d10
  1428                                     PF subges PF_CTL, PF_CTL, #0x10
  1429     vmull.u8    q9, d24, d11
  1430                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
  1431         vqadd.u8    q14, q0, q14
  1432                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
  1433         vqadd.u8    q15, q1, q15
  1434     vrshr.u16   q10, q6, #8
  1435     vrshr.u16   q11, q7, #8
  1436     vrshr.u16   q12, q8, #8
  1437     vrshr.u16   q13, q9, #8
  1438     vraddhn.u16 d0, q6, q10
  1439     vraddhn.u16 d1, q7, q11
  1440     vraddhn.u16 d2, q8, q12
  1441     vraddhn.u16 d3, q9, q13
  1442         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1443     vmvn.8      d25, d3
  1444     vmull.u8    q8, d25, d4
  1445     vmull.u8    q9, d25, d5
  1446     vmull.u8    q10, d25, d6
  1447     vmull.u8    q11, d25, d7
  1448 .endm
  1450 .macro pixman_composite_over_n_8_8888_init
  1451     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1452     .vsave      {d8-d15}
  1453     vpush       {d8-d15}
  1454     vld1.32     {d11[0]}, [DUMMY]
  1455     vdup.8      d8, d11[0]
  1456     vdup.8      d9, d11[1]
  1457     vdup.8      d10, d11[2]
  1458     vdup.8      d11, d11[3]
  1459 .endm
  1461 .macro pixman_composite_over_n_8_8888_cleanup
  1462     vpop        {d8-d15}
  1463 .endm
  1465 generate_composite_function \
  1466     pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
  1467     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1468     8, /* number of pixels, processed in a single block */ \
  1469     5, /* prefetch distance */ \
  1470     pixman_composite_over_n_8_8888_init, \
  1471     pixman_composite_over_n_8_8888_cleanup, \
  1472     pixman_composite_over_n_8_8888_process_pixblock_head, \
  1473     pixman_composite_over_n_8_8888_process_pixblock_tail, \
  1474     pixman_composite_over_n_8_8888_process_pixblock_tail_head
  1476 /******************************************************************************/
  1478 .macro pixman_composite_over_n_8_8_process_pixblock_head
  1479     vmull.u8    q0,  d24, d8
  1480     vmull.u8    q1,  d25, d8
  1481     vmull.u8    q6,  d26, d8
  1482     vmull.u8    q7,  d27, d8
  1483     vrshr.u16   q10, q0,  #8
  1484     vrshr.u16   q11, q1,  #8
  1485     vrshr.u16   q12, q6,  #8
  1486     vrshr.u16   q13, q7,  #8
  1487     vraddhn.u16 d0,  q0,  q10
  1488     vraddhn.u16 d1,  q1,  q11
  1489     vraddhn.u16 d2,  q6,  q12
  1490     vraddhn.u16 d3,  q7,  q13
  1491     vmvn.8      q12, q0
  1492     vmvn.8      q13, q1
  1493     vmull.u8    q8,  d24, d4
  1494     vmull.u8    q9,  d25, d5
  1495     vmull.u8    q10, d26, d6
  1496     vmull.u8    q11, d27, d7
  1497 .endm
  1499 .macro pixman_composite_over_n_8_8_process_pixblock_tail
  1500     vrshr.u16   q14, q8,  #8
  1501     vrshr.u16   q15, q9,  #8
  1502     vrshr.u16   q12, q10, #8
  1503     vrshr.u16   q13, q11, #8
  1504     vraddhn.u16 d28, q14, q8
  1505     vraddhn.u16 d29, q15, q9
  1506     vraddhn.u16 d30, q12, q10
  1507     vraddhn.u16 d31, q13, q11
  1508     vqadd.u8    q14, q0,  q14
  1509     vqadd.u8    q15, q1,  q15
  1510 .endm
/* TODO: expand macros and do better instruction scheduling */
  1513 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
  1514     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1515     pixman_composite_over_n_8_8_process_pixblock_tail
  1516     fetch_mask_pixblock
  1517     cache_preload 32, 32
  1518     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1519     pixman_composite_over_n_8_8_process_pixblock_head
  1520 .endm
  1522 .macro pixman_composite_over_n_8_8_init
  1523     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1524     .vsave      {d8-d15}
  1525     vpush       {d8-d15}
  1526     vld1.32     {d8[0]}, [DUMMY]
  1527     vdup.8      d8, d8[3]
  1528 .endm
  1530 .macro pixman_composite_over_n_8_8_cleanup
  1531     vpop        {d8-d15}
  1532 .endm
  1534 generate_composite_function \
  1535     pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
  1536     FLAG_DST_READWRITE, \
  1537     32, /* number of pixels, processed in a single block */ \
  1538     5, /* prefetch distance */ \
  1539     pixman_composite_over_n_8_8_init, \
  1540     pixman_composite_over_n_8_8_cleanup, \
  1541     pixman_composite_over_n_8_8_process_pixblock_head, \
  1542     pixman_composite_over_n_8_8_process_pixblock_tail, \
  1543     pixman_composite_over_n_8_8_process_pixblock_tail_head
  1545 /******************************************************************************/
  1547 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
  1548     /*
  1549      * 'combine_mask_ca' replacement
  1551      * input:  solid src (n) in {d8,  d9,  d10, d11}
  1552      *         dest in          {d4,  d5,  d6,  d7 }
  1553      *         mask in          {d24, d25, d26, d27}
  1554      * output: updated src in   {d0,  d1,  d2,  d3 }
  1555      *         updated mask in  {d24, d25, d26, d3 }
  1556      */
  1557     vmull.u8    q0,  d24, d8
  1558     vmull.u8    q1,  d25, d9
  1559     vmull.u8    q6,  d26, d10
  1560     vmull.u8    q7,  d27, d11
  1561     vmull.u8    q9,  d11, d25
  1562     vmull.u8    q12, d11, d24
  1563     vmull.u8    q13, d11, d26
  1564     vrshr.u16   q8,  q0,  #8
  1565     vrshr.u16   q10, q1,  #8
  1566     vrshr.u16   q11, q6,  #8
  1567     vraddhn.u16 d0,  q0,  q8
  1568     vraddhn.u16 d1,  q1,  q10
  1569     vraddhn.u16 d2,  q6,  q11
  1570     vrshr.u16   q11, q12, #8
  1571     vrshr.u16   q8,  q9,  #8
  1572     vrshr.u16   q6,  q13, #8
  1573     vrshr.u16   q10, q7,  #8
  1574     vraddhn.u16 d24, q12, q11
  1575     vraddhn.u16 d25, q9,  q8
  1576     vraddhn.u16 d26, q13, q6
  1577     vraddhn.u16 d3,  q7,  q10
  1578     /*
  1579      * 'combine_over_ca' replacement
  1581      * output: updated dest in {d28, d29, d30, d31}
  1582      */
  1583     vmvn.8      q12, q12
  1584     vmvn.8      d26, d26
  1585     vmull.u8    q8,  d24, d4
  1586     vmull.u8    q9,  d25, d5
  1587     vmvn.8      d27, d3
  1588     vmull.u8    q10, d26, d6
  1589     vmull.u8    q11, d27, d7
  1590 .endm
  1592 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
  1593     /* ... continue 'combine_over_ca' replacement */
  1594     vrshr.u16   q14, q8,  #8
  1595     vrshr.u16   q15, q9,  #8
  1596     vrshr.u16   q6,  q10, #8
  1597     vrshr.u16   q7,  q11, #8
  1598     vraddhn.u16 d28, q14, q8
  1599     vraddhn.u16 d29, q15, q9
  1600     vraddhn.u16 d30, q6,  q10
  1601     vraddhn.u16 d31, q7,  q11
  1602     vqadd.u8    q14, q0,  q14
  1603     vqadd.u8    q15, q1,  q15
  1604 .endm
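/*
 * Component-alpha variant: the 8888 mask supplies a separate 8-bit factor
 * per colour channel.  Per channel c, the 'combine_mask_ca' part computes
 * (C sketch, illustrative only; div255 as sketched earlier):
 *
 *     s[c] = div255 (src[c] * m[c]);   // updated source
 *     m[c] = div255 (m[c] * src_a);    // updated mask
 *
 * and the 'combine_over_ca' part then blends with the destination:
 *
 *     result[c] = sat_add8 (s[c], div255 (dst[c] * (255 - m[c])));
 */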
  1606 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
  1607         vrshr.u16   q14, q8, #8
  1608         vrshr.u16   q15, q9, #8
  1609     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1610         vrshr.u16   q6, q10, #8
  1611         vrshr.u16   q7, q11, #8
  1612         vraddhn.u16 d28, q14, q8
  1613         vraddhn.u16 d29, q15, q9
  1614         vraddhn.u16 d30, q6, q10
  1615         vraddhn.u16 d31, q7, q11
  1616     fetch_mask_pixblock
  1617         vqadd.u8    q14, q0, q14
  1618         vqadd.u8    q15, q1, q15
  1619     cache_preload 8, 8
  1620     pixman_composite_over_n_8888_8888_ca_process_pixblock_head
  1621     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1622 .endm
  1624 .macro pixman_composite_over_n_8888_8888_ca_init
  1625     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1626     .vsave      {d8-d15}
  1627     vpush       {d8-d15}
  1628     vld1.32     {d11[0]}, [DUMMY]
  1629     vdup.8      d8, d11[0]
  1630     vdup.8      d9, d11[1]
  1631     vdup.8      d10, d11[2]
  1632     vdup.8      d11, d11[3]
  1633 .endm
  1635 .macro pixman_composite_over_n_8888_8888_ca_cleanup
  1636     vpop        {d8-d15}
  1637 .endm
  1639 generate_composite_function \
  1640     pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
  1641     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1642     8, /* number of pixels, processed in a single block */ \
  1643     5, /* prefetch distance */ \
  1644     pixman_composite_over_n_8888_8888_ca_init, \
  1645     pixman_composite_over_n_8888_8888_ca_cleanup, \
  1646     pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
  1647     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
  1648     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
  1650 /******************************************************************************/
  1652 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
  1653     /*
  1654      * 'combine_mask_ca' replacement
  1656      * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
  1657      *         mask in          {d24, d25, d26}       [B, G, R]
  1658      * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
  1659      *         updated mask in  {d24, d25, d26}       [B, G, R]
  1660      */
  1661     vmull.u8    q0,  d24, d8
  1662     vmull.u8    q1,  d25, d9
  1663     vmull.u8    q6,  d26, d10
  1664     vmull.u8    q9,  d11, d25
  1665     vmull.u8    q12, d11, d24
  1666     vmull.u8    q13, d11, d26
  1667     vrshr.u16   q8,  q0,  #8
  1668     vrshr.u16   q10, q1,  #8
  1669     vrshr.u16   q11, q6,  #8
  1670     vraddhn.u16 d0,  q0,  q8
  1671     vraddhn.u16 d1,  q1,  q10
  1672     vraddhn.u16 d2,  q6,  q11
  1673     vrshr.u16   q11, q12, #8
  1674     vrshr.u16   q8,  q9,  #8
  1675     vrshr.u16   q6,  q13, #8
  1676     vraddhn.u16 d24, q12, q11
  1677     vraddhn.u16 d25, q9,  q8
  1678     /*
  1679      * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
  1680      * and put data into d16 - blue, d17 - green, d18 - red
  1681      */
  1682        vshrn.u16   d17, q2,  #3
  1683        vshrn.u16   d18, q2,  #8
  1684     vraddhn.u16 d26, q13, q6
  1685        vsli.u16    q2,  q2,  #5
  1686        vsri.u8     d18, d18, #5
  1687        vsri.u8     d17, d17, #6
  1688     /*
  1689      * 'combine_over_ca' replacement
  1691      * output: updated dest in d16 - blue, d17 - green, d18 - red
  1692      */
  1693     vmvn.8      q12, q12
  1694        vshrn.u16   d16, q2,  #2
  1695     vmvn.8      d26, d26
  1696     vmull.u8    q6,  d16, d24
  1697     vmull.u8    q7,  d17, d25
  1698     vmull.u8    q11, d18, d26
  1699 .endm
  1701 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
  1702     /* ... continue 'combine_over_ca' replacement */
  1703     vrshr.u16   q10, q6,  #8
  1704     vrshr.u16   q14, q7,  #8
  1705     vrshr.u16   q15, q11, #8
  1706     vraddhn.u16 d16, q10, q6
  1707     vraddhn.u16 d17, q14, q7
  1708     vraddhn.u16 d18, q15, q11
  1709     vqadd.u8    q8,  q0,  q8
  1710     vqadd.u8    d18, d2,  d18
  1711     /*
  1712      * convert the results in d16, d17, d18 to r5g6b5 and store
  1713      * them into {d28, d29}
  1714      */
  1715     vshll.u8    q14, d18, #8
  1716     vshll.u8    q10, d17, #8
  1717     vshll.u8    q15, d16, #8
  1718     vsri.u16    q14, q10, #5
  1719     vsri.u16    q14, q15, #11
  1720 .endm
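/*
 * The r5g6b5 handling above combines two conversions (C sketches,
 * illustrative only).  Unpacking replicates the top bits of each field
 * into the vacated low bits, so full-scale values stay full-scale
 * (0x1f expands to 0xff):
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 *
 * Packing keeps only the most significant bits of each 8-bit channel:
 *
 *     rgb565 = ((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3);
 *
 * The interleaved vshrn/vsli/vsri and vshll/vsri sequences compute these
 * eight pixels at a time.
 */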
  1722 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
  1723     fetch_mask_pixblock
  1724         vrshr.u16   q10, q6, #8
  1725         vrshr.u16   q14, q7, #8
  1726     vld1.16     {d4, d5}, [DST_R, :128]!
  1727         vrshr.u16   q15, q11, #8
  1728         vraddhn.u16 d16, q10, q6
  1729         vraddhn.u16 d17, q14, q7
  1730         vraddhn.u16 d22, q15, q11
  1731             /* process_pixblock_head */
  1732             /*
  1733              * 'combine_mask_ca' replacement
  1735              * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
  1736              *         mask in          {d24, d25, d26}       [B, G, R]
  1737              * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
  1738              *         updated mask in  {d24, d25, d26}       [B, G, R]
  1739              */
  1740             vmull.u8    q6,  d26, d10
  1741         vqadd.u8    q8,  q0, q8
  1742             vmull.u8    q0,  d24, d8
  1743         vqadd.u8    d22, d2, d22
  1744             vmull.u8    q1,  d25, d9
  1745         /*
  1746          * convert the result in d16, d17, d22 to r5g6b5 and store
  1747          * it into {d28, d29}
  1748          */
  1749         vshll.u8    q14, d22, #8
  1750         vshll.u8    q10, d17, #8
  1751         vshll.u8    q15, d16, #8
  1752             vmull.u8    q9,  d11, d25
  1753         vsri.u16    q14, q10, #5
  1754             vmull.u8    q12, d11, d24
  1755             vmull.u8    q13, d11, d26
  1756         vsri.u16    q14, q15, #11
  1757     cache_preload 8, 8
  1758             vrshr.u16   q8,  q0,  #8
  1759             vrshr.u16   q10, q1,  #8
  1760             vrshr.u16   q11, q6,  #8
  1761             vraddhn.u16 d0,  q0,  q8
  1762             vraddhn.u16 d1,  q1,  q10
  1763             vraddhn.u16 d2,  q6,  q11
  1764             vrshr.u16   q11, q12, #8
  1765             vrshr.u16   q8,  q9,  #8
  1766             vrshr.u16   q6,  q13, #8
  1767             vraddhn.u16 d24, q12, q11
  1768             vraddhn.u16 d25, q9,  q8
  1769                 /*
  1770                  * convert 8 r5g6b5 pixel data from {d4, d5} to planar
                 * 8-bit format and put data into d16 - blue, d17 - green,
                 * d18 - red
  1773                  */
  1774                 vshrn.u16   d17, q2,  #3
  1775                 vshrn.u16   d18, q2,  #8
  1776             vraddhn.u16 d26, q13, q6
  1777                 vsli.u16    q2,  q2,  #5
  1778                 vsri.u8     d17, d17, #6
  1779                 vsri.u8     d18, d18, #5
  1780             /*
  1781              * 'combine_over_ca' replacement
  1783              * output: updated dest in d16 - blue, d17 - green, d18 - red
  1784              */
  1785             vmvn.8      q12, q12
  1786                 vshrn.u16   d16, q2,  #2
  1787             vmvn.8      d26, d26
  1788             vmull.u8    q7,  d17, d25
  1789             vmull.u8    q6,  d16, d24
  1790             vmull.u8    q11, d18, d26
  1791     vst1.16     {d28, d29}, [DST_W, :128]!
  1792 .endm
  1794 .macro pixman_composite_over_n_8888_0565_ca_init
  1795     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1796     .vsave      {d8-d15}
  1797     vpush       {d8-d15}
  1798     vld1.32     {d11[0]}, [DUMMY]
  1799     vdup.8      d8, d11[0]
  1800     vdup.8      d9, d11[1]
  1801     vdup.8      d10, d11[2]
  1802     vdup.8      d11, d11[3]
  1803 .endm
  1805 .macro pixman_composite_over_n_8888_0565_ca_cleanup
  1806     vpop        {d8-d15}
  1807 .endm
  1809 generate_composite_function \
  1810     pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
  1811     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  1812     8, /* number of pixels, processed in a single block */ \
  1813     5, /* prefetch distance */ \
  1814     pixman_composite_over_n_8888_0565_ca_init, \
  1815     pixman_composite_over_n_8888_0565_ca_cleanup, \
  1816     pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
  1817     pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
  1818     pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
  1820 /******************************************************************************/
  1822 .macro pixman_composite_in_n_8_process_pixblock_head
  1823     /* expecting source data in {d0, d1, d2, d3} */
  1824     /* and destination data in {d4, d5, d6, d7} */
  1825     vmull.u8    q8,  d4,  d3
  1826     vmull.u8    q9,  d5,  d3
  1827     vmull.u8    q10, d6,  d3
  1828     vmull.u8    q11, d7,  d3
  1829 .endm
  1831 .macro pixman_composite_in_n_8_process_pixblock_tail
  1832     vrshr.u16   q14, q8,  #8
  1833     vrshr.u16   q15, q9,  #8
  1834     vrshr.u16   q12, q10, #8
  1835     vrshr.u16   q13, q11, #8
  1836     vraddhn.u16 d28, q8,  q14
  1837     vraddhn.u16 d29, q9,  q15
  1838     vraddhn.u16 d30, q10, q12
  1839     vraddhn.u16 d31, q11, q13
  1840 .endm
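/*
 * IN with a solid source: the destination is simply scaled by the solid
 * source alpha held in d3 (C sketch, illustrative only):
 *
 *     result = div255 (dst * src_a);
 */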
  1842 .macro pixman_composite_in_n_8_process_pixblock_tail_head
  1843     pixman_composite_in_n_8_process_pixblock_tail
  1844     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1845     cache_preload 32, 32
  1846     pixman_composite_in_n_8_process_pixblock_head
  1847     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1848 .endm
  1850 .macro pixman_composite_in_n_8_init
  1851     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1852     vld1.32     {d3[0]}, [DUMMY]
  1853     vdup.8      d3, d3[3]
  1854 .endm
  1856 .macro pixman_composite_in_n_8_cleanup
  1857 .endm
  1859 generate_composite_function \
  1860     pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
  1861     FLAG_DST_READWRITE, \
  1862     32, /* number of pixels, processed in a single block */ \
  1863     5, /* prefetch distance */ \
  1864     pixman_composite_in_n_8_init, \
  1865     pixman_composite_in_n_8_cleanup, \
  1866     pixman_composite_in_n_8_process_pixblock_head, \
  1867     pixman_composite_in_n_8_process_pixblock_tail, \
  1868     pixman_composite_in_n_8_process_pixblock_tail_head, \
  1869     28, /* dst_w_basereg */ \
  1870     4,  /* dst_r_basereg */ \
  1871     0,  /* src_basereg   */ \
  1872     24  /* mask_basereg  */
  1874 .macro pixman_composite_add_n_8_8_process_pixblock_head
  1875     /* expecting source data in {d8, d9, d10, d11} */
  1876     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
  1877     /* and destination data in {d4, d5, d6, d7} */
  1878     /* mask is in d24, d25, d26, d27 */
  1879     vmull.u8    q0, d24, d11
  1880     vmull.u8    q1, d25, d11
  1881     vmull.u8    q6, d26, d11
  1882     vmull.u8    q7, d27, d11
  1883     vrshr.u16   q10, q0, #8
  1884     vrshr.u16   q11, q1, #8
  1885     vrshr.u16   q12, q6, #8
  1886     vrshr.u16   q13, q7, #8
  1887     vraddhn.u16 d0, q0, q10
  1888     vraddhn.u16 d1, q1, q11
  1889     vraddhn.u16 d2, q6, q12
  1890     vraddhn.u16 d3, q7, q13
  1891     vqadd.u8    q14, q0, q2
  1892     vqadd.u8    q15, q1, q3
  1893 .endm
  1895 .macro pixman_composite_add_n_8_8_process_pixblock_tail
  1896 .endm
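/*
 * ADD with a solid source and an a8 mask: the mask is scaled by the solid
 * source alpha and added to the destination with unsigned saturation
 * (C sketch, illustrative only):
 *
 *     result = sat_add8 (dst, div255 (m * src_a));
 */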
/* TODO: expand macros and do better instruction scheduling */
  1899 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
  1900     pixman_composite_add_n_8_8_process_pixblock_tail
  1901     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1902     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1903     fetch_mask_pixblock
  1904     cache_preload 32, 32
  1905     pixman_composite_add_n_8_8_process_pixblock_head
  1906 .endm
  1908 .macro pixman_composite_add_n_8_8_init
  1909     add         DUMMY, sp, #ARGS_STACK_OFFSET
  1910     .vsave      {d8-d15}
  1911     vpush       {d8-d15}
  1912     vld1.32     {d11[0]}, [DUMMY]
  1913     vdup.8      d11, d11[3]
  1914 .endm
  1916 .macro pixman_composite_add_n_8_8_cleanup
  1917     vpop        {d8-d15}
  1918 .endm
  1920 generate_composite_function \
  1921     pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
  1922     FLAG_DST_READWRITE, \
  1923     32, /* number of pixels, processed in a single block */ \
  1924     5, /* prefetch distance */ \
  1925     pixman_composite_add_n_8_8_init, \
  1926     pixman_composite_add_n_8_8_cleanup, \
  1927     pixman_composite_add_n_8_8_process_pixblock_head, \
  1928     pixman_composite_add_n_8_8_process_pixblock_tail, \
  1929     pixman_composite_add_n_8_8_process_pixblock_tail_head
  1931 /******************************************************************************/
  1933 .macro pixman_composite_add_8_8_8_process_pixblock_head
  1934     /* expecting source data in {d0, d1, d2, d3} */
  1935     /* destination data in {d4, d5, d6, d7} */
  1936     /* mask in {d24, d25, d26, d27} */
  1937     vmull.u8    q8, d24, d0
  1938     vmull.u8    q9, d25, d1
  1939     vmull.u8    q10, d26, d2
  1940     vmull.u8    q11, d27, d3
  1941     vrshr.u16   q0, q8, #8
  1942     vrshr.u16   q1, q9, #8
  1943     vrshr.u16   q12, q10, #8
  1944     vrshr.u16   q13, q11, #8
  1945     vraddhn.u16 d0, q0, q8
  1946     vraddhn.u16 d1, q1, q9
  1947     vraddhn.u16 d2, q12, q10
  1948     vraddhn.u16 d3, q13, q11
  1949     vqadd.u8    q14, q0, q2
  1950     vqadd.u8    q15, q1, q3
  1951 .endm
  1953 .macro pixman_composite_add_8_8_8_process_pixblock_tail
  1954 .endm
/* TODO: expand macros and do better instruction scheduling */
  1957 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
  1958     pixman_composite_add_8_8_8_process_pixblock_tail
  1959     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
  1960     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
  1961     fetch_mask_pixblock
  1962     fetch_src_pixblock
  1963     cache_preload 32, 32
  1964     pixman_composite_add_8_8_8_process_pixblock_head
  1965 .endm
  1967 .macro pixman_composite_add_8_8_8_init
  1968 .endm
  1970 .macro pixman_composite_add_8_8_8_cleanup
  1971 .endm
  1973 generate_composite_function \
  1974     pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
  1975     FLAG_DST_READWRITE, \
  1976     32, /* number of pixels, processed in a single block */ \
  1977     5, /* prefetch distance */ \
  1978     pixman_composite_add_8_8_8_init, \
  1979     pixman_composite_add_8_8_8_cleanup, \
  1980     pixman_composite_add_8_8_8_process_pixblock_head, \
  1981     pixman_composite_add_8_8_8_process_pixblock_tail, \
  1982     pixman_composite_add_8_8_8_process_pixblock_tail_head
  1984 /******************************************************************************/
  1986 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
  1987     /* expecting source data in {d0, d1, d2, d3} */
  1988     /* destination data in {d4, d5, d6, d7} */
  1989     /* mask in {d24, d25, d26, d27} */
  1990     vmull.u8    q8,  d27, d0
  1991     vmull.u8    q9,  d27, d1
  1992     vmull.u8    q10, d27, d2
  1993     vmull.u8    q11, d27, d3
  1994     /* 1 cycle bubble */
  1995     vrsra.u16   q8,  q8,  #8
  1996     vrsra.u16   q9,  q9,  #8
  1997     vrsra.u16   q10, q10, #8
  1998     vrsra.u16   q11, q11, #8
  1999 .endm
  2001 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
  2002     /* 2 cycle bubble */
  2003     vrshrn.u16  d28, q8,  #8
  2004     vrshrn.u16  d29, q9,  #8
  2005     vrshrn.u16  d30, q10, #8
  2006     vrshrn.u16  d31, q11, #8
  2007     vqadd.u8    q14, q2,  q14
  2008     /* 1 cycle bubble */
  2009     vqadd.u8    q15, q3,  q15
  2010 .endm
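/*
 * The 'cycle bubble' notes above mark pipeline stalls which the
 * interleaved tail_head variant fills with loads, stores and prefetch
 * bookkeeping.  The arithmetic itself is, per channel (C sketch,
 * illustrative only; d27 holds the mask alpha plane):
 *
 *     result = sat_add8 (dst, div255 (src * m_a));
 *
 * The add_8888_8_8888 wrapper further down reuses these macros unchanged
 * by setting mask_basereg to 27, so its a8 mask is fetched straight into
 * the register the code already multiplies by.
 */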
  2012 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  2013     fetch_src_pixblock
  2014         vrshrn.u16  d28, q8,  #8
  2015     fetch_mask_pixblock
  2016         vrshrn.u16  d29, q9,  #8
  2017     vmull.u8    q8,  d27, d0
  2018         vrshrn.u16  d30, q10, #8
  2019     vmull.u8    q9,  d27, d1
  2020         vrshrn.u16  d31, q11, #8
  2021     vmull.u8    q10, d27, d2
  2022         vqadd.u8    q14, q2,  q14
  2023     vmull.u8    q11, d27, d3
  2024         vqadd.u8    q15, q3,  q15
  2025     vrsra.u16   q8,  q8,  #8
  2026     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
  2027     vrsra.u16   q9,  q9,  #8
  2028         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
  2029     vrsra.u16   q10, q10, #8
  2031     cache_preload 8, 8
  2033     vrsra.u16   q11, q11, #8
  2034 .endm
  2036 generate_composite_function \
  2037     pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
  2038     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2039     8, /* number of pixels, processed in a single block */ \
  2040     10, /* prefetch distance */ \
  2041     default_init, \
  2042     default_cleanup, \
  2043     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  2044     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  2045     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  2047 generate_composite_function_single_scanline \
  2048     pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
  2049     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2050     8, /* number of pixels, processed in a single block */ \
  2051     default_init, \
  2052     default_cleanup, \
  2053     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  2054     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  2055     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
  2057 /******************************************************************************/
  2059 generate_composite_function \
  2060     pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
  2061     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2062     8, /* number of pixels, processed in a single block */ \
  2063     5, /* prefetch distance */ \
  2064     default_init, \
  2065     default_cleanup, \
  2066     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  2067     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  2068     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  2069     28, /* dst_w_basereg */ \
  2070     4,  /* dst_r_basereg */ \
  2071     0,  /* src_basereg   */ \
  2072     27  /* mask_basereg  */
  2074 /******************************************************************************/
  2076 .macro pixman_composite_add_n_8_8888_init
  2077     add         DUMMY, sp, #ARGS_STACK_OFFSET
  2078     vld1.32     {d3[0]}, [DUMMY]
  2079     vdup.8      d0, d3[0]
  2080     vdup.8      d1, d3[1]
  2081     vdup.8      d2, d3[2]
  2082     vdup.8      d3, d3[3]
  2083 .endm
  2085 .macro pixman_composite_add_n_8_8888_cleanup
  2086 .endm
  2088 generate_composite_function \
  2089     pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
  2090     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2091     8, /* number of pixels, processed in a single block */ \
  2092     5, /* prefetch distance */ \
  2093     pixman_composite_add_n_8_8888_init, \
  2094     pixman_composite_add_n_8_8888_cleanup, \
  2095     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  2096     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  2097     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  2098     28, /* dst_w_basereg */ \
  2099     4,  /* dst_r_basereg */ \
  2100     0,  /* src_basereg   */ \
  2101     27  /* mask_basereg  */
  2103 /******************************************************************************/
  2105 .macro pixman_composite_add_8888_n_8888_init
  2106     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
  2107     vld1.32     {d27[0]}, [DUMMY]
  2108     vdup.8      d27, d27[3]
  2109 .endm
  2111 .macro pixman_composite_add_8888_n_8888_cleanup
  2112 .endm
  2114 generate_composite_function \
  2115     pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
  2116     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2117     8, /* number of pixels, processed in a single block */ \
  2118     5, /* prefetch distance */ \
  2119     pixman_composite_add_8888_n_8888_init, \
  2120     pixman_composite_add_8888_n_8888_cleanup, \
  2121     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
  2122     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
  2123     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
  2124     28, /* dst_w_basereg */ \
  2125     4,  /* dst_r_basereg */ \
  2126     0,  /* src_basereg   */ \
  2127     27  /* mask_basereg  */
  2129 /******************************************************************************/
  2131 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  2132     /* expecting source data in {d0, d1, d2, d3} */
  2133     /* destination data in {d4, d5, d6, d7} */
  2134     /* solid mask is in d15 */
  2136     /* 'in' */
  2137     vmull.u8    q8, d15, d3
  2138     vmull.u8    q6, d15, d2
  2139     vmull.u8    q5, d15, d1
  2140     vmull.u8    q4, d15, d0
  2141     vrshr.u16   q13, q8, #8
  2142     vrshr.u16   q12, q6, #8
  2143     vrshr.u16   q11, q5, #8
  2144     vrshr.u16   q10, q4, #8
  2145     vraddhn.u16 d3, q8, q13
  2146     vraddhn.u16 d2, q6, q12
  2147     vraddhn.u16 d1, q5, q11
  2148     vraddhn.u16 d0, q4, q10
  2149     vmvn.8      d24, d3  /* get inverted alpha */
  2150     /* now do alpha blending */
  2151     vmull.u8    q8, d24, d4
  2152     vmull.u8    q9, d24, d5
  2153     vmull.u8    q10, d24, d6
  2154     vmull.u8    q11, d24, d7
  2155 .endm
  2157 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  2158     vrshr.u16   q14, q8, #8
  2159     vrshr.u16   q15, q9, #8
  2160     vrshr.u16   q12, q10, #8
  2161     vrshr.u16   q13, q11, #8
  2162     vraddhn.u16 d28, q14, q8
  2163     vraddhn.u16 d29, q15, q9
  2164     vraddhn.u16 d30, q12, q10
  2165     vraddhn.u16 d31, q13, q11
  2166 .endm
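/*
 * OUT_REVERSE with a solid mask: the source is first scaled by the mask,
 * then the destination is scaled by the complement of the resulting alpha
 * (C sketch, illustrative only):
 *
 *     s = div255 (src * m);                  // d15 holds the solid mask
 *     result = div255 (dst * (255 - alpha (s)));
 */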
/* TODO: expand macros and do better instruction scheduling */
  2169 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
  2170     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  2171     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  2172     fetch_src_pixblock
  2173     cache_preload 8, 8
  2174     fetch_mask_pixblock
  2175     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  2176     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  2177 .endm
  2179 generate_composite_function_single_scanline \
  2180     pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
  2181     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2182     8, /* number of pixels, processed in a single block */ \
  2183     default_init_need_all_regs, \
  2184     default_cleanup_need_all_regs, \
  2185     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
  2186     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
  2188     28, /* dst_w_basereg */ \
  2189     4,  /* dst_r_basereg */ \
  2190     0,  /* src_basereg   */ \
  2191     12  /* mask_basereg  */
  2193 /******************************************************************************/
  2195 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
  2196     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
  2197 .endm
  2199 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
  2200     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
  2201     vqadd.u8    q14, q0, q14
  2202     vqadd.u8    q15, q1, q15
  2203 .endm
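/*
 * OVER is OUT_REVERSE plus the masked source, added back with saturation;
 * that is all the two extra vqadd.u8 instructions above do:
 *
 *     result = sat_add8 (s, div255 (dst * (255 - alpha (s))));
 */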
/* TODO: expand macros and do better instruction scheduling */
  2206 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
  2207     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  2208     pixman_composite_over_8888_n_8888_process_pixblock_tail
  2209     fetch_src_pixblock
  2210     cache_preload 8, 8
  2211     pixman_composite_over_8888_n_8888_process_pixblock_head
  2212     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  2213 .endm
  2215 .macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #48 /* skip to the solid mask argument on the stack */
  2217     .vsave      {d8-d15}
  2218     vpush       {d8-d15}
  2219     vld1.32     {d15[0]}, [DUMMY]
  2220     vdup.8      d15, d15[3]
  2221 .endm
  2223 .macro pixman_composite_over_8888_n_8888_cleanup
  2224     vpop        {d8-d15}
  2225 .endm
  2227 generate_composite_function \
  2228     pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
  2229     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2230     8, /* number of pixels, processed in a single block */ \
  2231     5, /* prefetch distance */ \
  2232     pixman_composite_over_8888_n_8888_init, \
  2233     pixman_composite_over_8888_n_8888_cleanup, \
  2234     pixman_composite_over_8888_n_8888_process_pixblock_head, \
  2235     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
  2236     pixman_composite_over_8888_n_8888_process_pixblock_tail_head
  2238 /******************************************************************************/
/* TODO: expand macros and do better instruction scheduling */
  2241 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
  2242     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  2243     pixman_composite_over_8888_n_8888_process_pixblock_tail
  2244     fetch_src_pixblock
  2245     cache_preload 8, 8
  2246     fetch_mask_pixblock
  2247     pixman_composite_over_8888_n_8888_process_pixblock_head
  2248     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  2249 .endm
  2251 generate_composite_function \
  2252     pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
  2253     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2254     8, /* number of pixels, processed in a single block */ \
  2255     5, /* prefetch distance */ \
  2256     default_init_need_all_regs, \
  2257     default_cleanup_need_all_regs, \
  2258     pixman_composite_over_8888_n_8888_process_pixblock_head, \
  2259     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
  2261     28, /* dst_w_basereg */ \
  2262     4,  /* dst_r_basereg */ \
  2263     0,  /* src_basereg   */ \
  2264     12  /* mask_basereg  */
  2266 generate_composite_function_single_scanline \
  2267     pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
  2268     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2269     8, /* number of pixels, processed in a single block */ \
  2270     default_init_need_all_regs, \
  2271     default_cleanup_need_all_regs, \
  2272     pixman_composite_over_8888_n_8888_process_pixblock_head, \
  2273     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
  2275     28, /* dst_w_basereg */ \
  2276     4,  /* dst_r_basereg */ \
  2277     0,  /* src_basereg   */ \
  2278     12  /* mask_basereg  */
  2280 /******************************************************************************/
/* TODO: expand macros and do better instruction scheduling */
  2283 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
  2284     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
  2285     pixman_composite_over_8888_n_8888_process_pixblock_tail
  2286     fetch_src_pixblock
  2287     cache_preload 8, 8
  2288     fetch_mask_pixblock
  2289     pixman_composite_over_8888_n_8888_process_pixblock_head
  2290     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
  2291 .endm
  2293 generate_composite_function \
  2294     pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
  2295     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2296     8, /* number of pixels, processed in a single block */ \
  2297     5, /* prefetch distance */ \
  2298     default_init_need_all_regs, \
  2299     default_cleanup_need_all_regs, \
  2300     pixman_composite_over_8888_n_8888_process_pixblock_head, \
  2301     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
  2303     28, /* dst_w_basereg */ \
  2304     4,  /* dst_r_basereg */ \
  2305     0,  /* src_basereg   */ \
  2306     15  /* mask_basereg  */
  2308 /******************************************************************************/
  2310 .macro pixman_composite_src_0888_0888_process_pixblock_head
  2311 .endm
  2313 .macro pixman_composite_src_0888_0888_process_pixblock_tail
  2314 .endm
  2316 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
  2317     vst3.8 {d0, d1, d2}, [DST_W]!
  2318     fetch_src_pixblock
  2319     cache_preload 8, 8
  2320 .endm
  2322 generate_composite_function \
  2323     pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
  2324     FLAG_DST_WRITEONLY, \
  2325     8, /* number of pixels, processed in a single block */ \
  2326     10, /* prefetch distance */ \
  2327     default_init, \
  2328     default_cleanup, \
  2329     pixman_composite_src_0888_0888_process_pixblock_head, \
  2330     pixman_composite_src_0888_0888_process_pixblock_tail, \
  2331     pixman_composite_src_0888_0888_process_pixblock_tail_head, \
  2332     0, /* dst_w_basereg */ \
  2333     0, /* dst_r_basereg */ \
  2334     0, /* src_basereg   */ \
  2335     0  /* mask_basereg  */
  2337 /******************************************************************************/
  2339 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
  2340     vswp   d0, d2
  2341 .endm
  2343 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
  2344 .endm
  2346 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
  2347     vst4.8 {d0, d1, d2, d3}, [DST_W]!
  2348     fetch_src_pixblock
  2349     vswp   d0, d2
  2350     cache_preload 8, 8
  2351 .endm
  2353 .macro pixman_composite_src_0888_8888_rev_init
  2354     veor   d3, d3, d3
  2355 .endm
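/*
 * b8g8r8 -> x8r8g8b8 with channel reversal: the 24bpp source is fetched
 * as three separate planes (d0, d1, d2), 'vswp d0, d2' exchanges the red
 * and blue planes, and d3 is zeroed once in the init macro to supply the
 * unused alpha byte of the 32bpp destination.
 */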
  2357 generate_composite_function \
  2358     pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
  2359     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  2360     8, /* number of pixels, processed in a single block */ \
  2361     10, /* prefetch distance */ \
  2362     pixman_composite_src_0888_8888_rev_init, \
  2363     default_cleanup, \
  2364     pixman_composite_src_0888_8888_rev_process_pixblock_head, \
  2365     pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
  2366     pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
  2367     0, /* dst_w_basereg */ \
  2368     0, /* dst_r_basereg */ \
  2369     0, /* src_basereg   */ \
  2370     0  /* mask_basereg  */
  2372 /******************************************************************************/
  2374 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
  2375     vshll.u8    q8, d1, #8
  2376     vshll.u8    q9, d2, #8
  2377 .endm
  2379 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
  2380     vshll.u8    q14, d0, #8
  2381     vsri.u16    q14, q8, #5
  2382     vsri.u16    q14, q9, #11
  2383 .endm
  2385 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
  2386         vshll.u8    q14, d0, #8
  2387     fetch_src_pixblock
  2388         vsri.u16    q14, q8, #5
  2389         vsri.u16    q14, q9, #11
  2390     vshll.u8    q8, d1, #8
  2391         vst1.16 {d28, d29}, [DST_W, :128]!
  2392     vshll.u8    q9, d2, #8
  2393 .endm
  2395 generate_composite_function \
  2396     pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
  2397     FLAG_DST_WRITEONLY, \
  2398     8, /* number of pixels, processed in a single block */ \
  2399     10, /* prefetch distance */ \
  2400     default_init, \
  2401     default_cleanup, \
  2402     pixman_composite_src_0888_0565_rev_process_pixblock_head, \
  2403     pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
  2404     pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
  2405     28, /* dst_w_basereg */ \
  2406     0, /* dst_r_basereg */ \
  2407     0, /* src_basereg   */ \
  2408     0  /* mask_basereg  */
  2410 /******************************************************************************/
  2412 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
  2413     vmull.u8    q8, d3, d0
  2414     vmull.u8    q9, d3, d1
  2415     vmull.u8    q10, d3, d2
  2416 .endm
  2418 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
  2419     vrshr.u16   q11, q8, #8
  2420     vswp        d3, d31
  2421     vrshr.u16   q12, q9, #8
  2422     vrshr.u16   q13, q10, #8
  2423     vraddhn.u16 d30, q11, q8
  2424     vraddhn.u16 d29, q12, q9
  2425     vraddhn.u16 d28, q13, q10
  2426 .endm
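/*
 * The pixbuf conversions premultiply the colour channels by alpha while
 * carrying the alpha byte through unchanged (the 'vswp d3, d31' moves it
 * into the output register).  Per channel (C sketch, illustrative only):
 *
 *     c' = div255 (c * a);
 *
 * The only difference between this variant and rpixbuf below is the order
 * in which the narrowed results land in d28..d30, i.e. whether red and
 * blue end up swapped in the destination.
 */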
  2428 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
  2429         vrshr.u16   q11, q8, #8
  2430         vswp        d3, d31
  2431         vrshr.u16   q12, q9, #8
  2432         vrshr.u16   q13, q10, #8
  2433     fetch_src_pixblock
  2434         vraddhn.u16 d30, q11, q8
  2435                                     PF add PF_X, PF_X, #8
  2436                                     PF tst PF_CTL, #0xF
  2437                                     PF addne PF_X, PF_X, #8
  2438                                     PF subne PF_CTL, PF_CTL, #1
  2439         vraddhn.u16 d29, q12, q9
  2440         vraddhn.u16 d28, q13, q10
  2441     vmull.u8    q8, d3, d0
  2442     vmull.u8    q9, d3, d1
  2443     vmull.u8    q10, d3, d2
  2444         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
  2445                                     PF cmp PF_X, ORIG_W
  2446                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
  2447                                     PF subge PF_X, PF_X, ORIG_W
  2448                                     PF subges PF_CTL, PF_CTL, #0x10
  2449                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
  2450 .endm
  2452 generate_composite_function \
  2453     pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
  2454     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  2455     8, /* number of pixels, processed in a single block */ \
  2456     10, /* prefetch distance */ \
  2457     default_init, \
  2458     default_cleanup, \
  2459     pixman_composite_src_pixbuf_8888_process_pixblock_head, \
  2460     pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
  2461     pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
  2462     28, /* dst_w_basereg */ \
  2463     0, /* dst_r_basereg */ \
  2464     0, /* src_basereg   */ \
  2465     0  /* mask_basereg  */
  2467 /******************************************************************************/
  2469 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
  2470     vmull.u8    q8, d3, d0
  2471     vmull.u8    q9, d3, d1
  2472     vmull.u8    q10, d3, d2
  2473 .endm
  2475 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
  2476     vrshr.u16   q11, q8, #8
  2477     vswp        d3, d31
  2478     vrshr.u16   q12, q9, #8
  2479     vrshr.u16   q13, q10, #8
  2480     vraddhn.u16 d28, q11, q8
  2481     vraddhn.u16 d29, q12, q9
  2482     vraddhn.u16 d30, q13, q10
  2483 .endm
  2485 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
  2486         vrshr.u16   q11, q8, #8
  2487         vswp        d3, d31
  2488         vrshr.u16   q12, q9, #8
  2489         vrshr.u16   q13, q10, #8
  2490     fetch_src_pixblock
  2491         vraddhn.u16 d28, q11, q8
  2492                                     PF add PF_X, PF_X, #8
  2493                                     PF tst PF_CTL, #0xF
  2494                                     PF addne PF_X, PF_X, #8
  2495                                     PF subne PF_CTL, PF_CTL, #1
  2496         vraddhn.u16 d29, q12, q9
  2497         vraddhn.u16 d30, q13, q10
  2498     vmull.u8    q8, d3, d0
  2499     vmull.u8    q9, d3, d1
  2500     vmull.u8    q10, d3, d2
  2501         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
  2502                                     PF cmp PF_X, ORIG_W
  2503                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
  2504                                     PF subge PF_X, PF_X, ORIG_W
  2505                                     PF subges PF_CTL, PF_CTL, #0x10
  2506                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
  2507 .endm
  2509 generate_composite_function \
  2510     pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
  2511     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  2512     8, /* number of pixels, processed in a single block */ \
  2513     10, /* prefetch distance */ \
  2514     default_init, \
  2515     default_cleanup, \
  2516     pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
  2517     pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
  2518     pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
  2519     28, /* dst_w_basereg */ \
  2520     0, /* dst_r_basereg */ \
  2521     0, /* src_basereg   */ \
  2522     0  /* mask_basereg  */
  2524 /******************************************************************************/
  2526 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
  2527     /* mask is in d15 */
  2528     convert_0565_to_x888 q4, d2, d1, d0
  2529     convert_0565_to_x888 q5, d6, d5, d4
  2530     /* source pixel data is in      {d0, d1, d2, XX} */
  2531     /* destination pixel data is in {d4, d5, d6, XX} */
  2532     vmvn.8      d7,  d15
  2533     vmull.u8    q6,  d15, d2
  2534     vmull.u8    q5,  d15, d1
  2535     vmull.u8    q4,  d15, d0
  2536     vmull.u8    q8,  d7,  d4
  2537     vmull.u8    q9,  d7,  d5
  2538     vmull.u8    q13, d7,  d6
  2539     vrshr.u16   q12, q6,  #8
  2540     vrshr.u16   q11, q5,  #8
  2541     vrshr.u16   q10, q4,  #8
  2542     vraddhn.u16 d2,  q6,  q12
  2543     vraddhn.u16 d1,  q5,  q11
  2544     vraddhn.u16 d0,  q4,  q10
  2545 .endm
  2547 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
  2548     vrshr.u16   q14, q8,  #8
  2549     vrshr.u16   q15, q9,  #8
  2550     vrshr.u16   q12, q13, #8
  2551     vraddhn.u16 d28, q14, q8
  2552     vraddhn.u16 d29, q15, q9
  2553     vraddhn.u16 d30, q12, q13
  2554     vqadd.u8    q0,  q0,  q14
  2555     vqadd.u8    q1,  q1,  q15
  2556     /* 32bpp result is in {d0, d1, d2, XX} */
  2557     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  2558 .endm
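/*
 * OVER on r5g6b5 with an a8 mask: source and destination are widened to
 * 8-bit planes, blended, and packed back.  Since an r5g6b5 source has no
 * alpha, the mask itself plays the alpha role (C sketch, illustrative
 * only):
 *
 *     s = div255 (unpack (src) * m);
 *     result = pack565 (sat_add8 (s, div255 (unpack (dst) * (255 - m))));
 */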
/* TODO: expand macros and do better instruction scheduling */
  2561 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
  2562     fetch_mask_pixblock
  2563     pixman_composite_over_0565_8_0565_process_pixblock_tail
  2564     fetch_src_pixblock
  2565     vld1.16    {d10, d11}, [DST_R, :128]!
  2566     cache_preload 8, 8
  2567     pixman_composite_over_0565_8_0565_process_pixblock_head
  2568     vst1.16    {d28, d29}, [DST_W, :128]!
  2569 .endm
  2571 generate_composite_function \
  2572     pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
  2573     FLAG_DST_READWRITE, \
  2574     8, /* number of pixels, processed in a single block */ \
  2575     5, /* prefetch distance */ \
  2576     default_init_need_all_regs, \
  2577     default_cleanup_need_all_regs, \
  2578     pixman_composite_over_0565_8_0565_process_pixblock_head, \
  2579     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  2580     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  2581     28, /* dst_w_basereg */ \
  2582     10,  /* dst_r_basereg */ \
  2583     8,  /* src_basereg   */ \
  2584     15  /* mask_basereg  */
  2586 /******************************************************************************/
  2588 .macro pixman_composite_over_0565_n_0565_init
  2589     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
  2590     .vsave      {d8-d15}
  2591     vpush       {d8-d15}
  2592     vld1.32     {d15[0]}, [DUMMY]
  2593     vdup.8      d15, d15[3]
  2594 .endm
  2596 .macro pixman_composite_over_0565_n_0565_cleanup
  2597     vpop        {d8-d15}
  2598 .endm
  2600 generate_composite_function \
  2601     pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
  2602     FLAG_DST_READWRITE, \
  2603     8, /* number of pixels, processed in a single block */ \
  2604     5, /* prefetch distance */ \
  2605     pixman_composite_over_0565_n_0565_init, \
  2606     pixman_composite_over_0565_n_0565_cleanup, \
  2607     pixman_composite_over_0565_8_0565_process_pixblock_head, \
  2608     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  2609     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  2610     28, /* dst_w_basereg */ \
  2611     10, /* dst_r_basereg */ \
  2612     8,  /* src_basereg   */ \
  2613     15  /* mask_basereg  */
  2615 /******************************************************************************/
  2617 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
  2618     /* mask is in d15 */
  2619     convert_0565_to_x888 q4, d2, d1, d0
  2620     convert_0565_to_x888 q5, d6, d5, d4
  2621     /* source pixel data is in      {d0, d1, d2, XX} */
  2622     /* destination pixel data is in {d4, d5, d6, XX} */
  2623     vmull.u8    q6,  d15, d2
  2624     vmull.u8    q5,  d15, d1
  2625     vmull.u8    q4,  d15, d0
  2626     vrshr.u16   q12, q6,  #8
  2627     vrshr.u16   q11, q5,  #8
  2628     vrshr.u16   q10, q4,  #8
  2629     vraddhn.u16 d2,  q6,  q12
  2630     vraddhn.u16 d1,  q5,  q11
  2631     vraddhn.u16 d0,  q4,  q10
  2632 .endm
  2634 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
  2635     vqadd.u8    q0,  q0,  q2
  2636     vqadd.u8    q1,  q1,  q3
  2637     /* 32bpp result is in {d0, d1, d2, XX} */
  2638     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  2639 .endm
/* TODO: expand macros and do better instruction scheduling */
  2642 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
  2643     fetch_mask_pixblock
  2644     pixman_composite_add_0565_8_0565_process_pixblock_tail
  2645     fetch_src_pixblock
  2646     vld1.16    {d10, d11}, [DST_R, :128]!
  2647     cache_preload 8, 8
  2648     pixman_composite_add_0565_8_0565_process_pixblock_head
  2649     vst1.16    {d28, d29}, [DST_W, :128]!
  2650 .endm
  2652 generate_composite_function \
  2653     pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
  2654     FLAG_DST_READWRITE, \
  2655     8, /* number of pixels, processed in a single block */ \
  2656     5, /* prefetch distance */ \
  2657     default_init_need_all_regs, \
  2658     default_cleanup_need_all_regs, \
  2659     pixman_composite_add_0565_8_0565_process_pixblock_head, \
  2660     pixman_composite_add_0565_8_0565_process_pixblock_tail, \
  2661     pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
  2662     28, /* dst_w_basereg */ \
  2663     10, /* dst_r_basereg */ \
  2664     8,  /* src_basereg   */ \
  2665     15  /* mask_basereg  */
  2667 /******************************************************************************/
  2669 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
  2670     /* mask is in d15 */
  2671     convert_0565_to_x888 q5, d6, d5, d4
  2672     /* destination pixel data is in {d4, d5, d6, xx} */
  2673     vmvn.8      d24, d15 /* get inverted alpha */
  2674     /* now do alpha blending */
  2675     vmull.u8    q8, d24, d4
  2676     vmull.u8    q9, d24, d5
  2677     vmull.u8    q10, d24, d6
  2678 .endm
  2680 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
  2681     vrshr.u16   q14, q8, #8
  2682     vrshr.u16   q15, q9, #8
  2683     vrshr.u16   q12, q10, #8
  2684     vraddhn.u16 d0, q14, q8
  2685     vraddhn.u16 d1, q15, q9
  2686     vraddhn.u16 d2, q12, q10
  2687     /* 32bpp result is in {d0, d1, d2, XX} */
  2688     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
  2689 .endm
  2691 /* TODO: expand macros and do better instruction scheduling */
  2692 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
  2693     fetch_src_pixblock
  2694     pixman_composite_out_reverse_8_0565_process_pixblock_tail
  2695     vld1.16    {d10, d11}, [DST_R, :128]!
  2696     cache_preload 8, 8
  2697     pixman_composite_out_reverse_8_0565_process_pixblock_head
  2698     vst1.16    {d28, d29}, [DST_W, :128]!
  2699 .endm
  2701 generate_composite_function \
  2702     pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
  2703     FLAG_DST_READWRITE, \
  2704     8, /* number of pixels processed in a single block */ \
  2705     5, /* prefetch distance */ \
  2706     default_init_need_all_regs, \
  2707     default_cleanup_need_all_regs, \
  2708     pixman_composite_out_reverse_8_0565_process_pixblock_head, \
  2709     pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
  2710     pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
  2711     28, /* dst_w_basereg */ \
  2712     10, /* dst_r_basereg */ \
  2713     15, /* src_basereg   */ \
  2714     0   /* mask_basereg  */
  2716 /******************************************************************************/
  2718 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
  2719     /* src is in d0 */
  2720     /* destination pixel data is in {d4, d5, d6, d7} */
  2721     vmvn.8      d1, d0 /* get inverted alpha */
  2722     /* now do alpha blending */
  2723     vmull.u8    q8, d1, d4
  2724     vmull.u8    q9, d1, d5
  2725     vmull.u8    q10, d1, d6
  2726     vmull.u8    q11, d1, d7
  2727 .endm
  2729 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
  2730     vrshr.u16   q14, q8, #8
  2731     vrshr.u16   q15, q9, #8
  2732     vrshr.u16   q12, q10, #8
  2733     vrshr.u16   q13, q11, #8
  2734     vraddhn.u16 d28, q14, q8
  2735     vraddhn.u16 d29, q15, q9
  2736     vraddhn.u16 d30, q12, q10
  2737     vraddhn.u16 d31, q13, q11
  2738     /* 32bpp result is in {d28, d29, d30, d31} */
  2739 .endm
  2741 /* TODO: expand macros and do better instruction scheduling */
  2742 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
  2743     fetch_src_pixblock
  2744     pixman_composite_out_reverse_8_8888_process_pixblock_tail
  2745     vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
  2746     cache_preload 8, 8
  2747     pixman_composite_out_reverse_8_8888_process_pixblock_head
  2748     vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
  2749 .endm
  2751 generate_composite_function \
  2752     pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
  2753     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2754     8, /* number of pixels processed in a single block */ \
  2755     5, /* prefetch distance */ \
  2756     default_init, \
  2757     default_cleanup, \
  2758     pixman_composite_out_reverse_8_8888_process_pixblock_head, \
  2759     pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
  2760     pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
  2761     28, /* dst_w_basereg */ \
  2762     4,  /* dst_r_basereg */ \
  2763     0,  /* src_basereg   */ \
  2764     0   /* mask_basereg  */
  2766 /******************************************************************************/
  2768 generate_composite_function_nearest_scanline \
  2769     pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
  2770     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2771     8, /* number of pixels processed in a single block */ \
  2772     default_init, \
  2773     default_cleanup, \
  2774     pixman_composite_over_8888_8888_process_pixblock_head, \
  2775     pixman_composite_over_8888_8888_process_pixblock_tail, \
  2776     pixman_composite_over_8888_8888_process_pixblock_tail_head
  2778 generate_composite_function_nearest_scanline \
  2779     pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
  2780     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2781     8, /* number of pixels processed in a single block */ \
  2782     default_init, \
  2783     default_cleanup, \
  2784     pixman_composite_over_8888_0565_process_pixblock_head, \
  2785     pixman_composite_over_8888_0565_process_pixblock_tail, \
  2786     pixman_composite_over_8888_0565_process_pixblock_tail_head, \
  2787     28, /* dst_w_basereg */ \
  2788     4,  /* dst_r_basereg */ \
  2789     0,  /* src_basereg   */ \
  2790     24  /* mask_basereg  */
  2792 generate_composite_function_nearest_scanline \
  2793     pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
  2794     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  2795     8, /* number of pixels processed in a single block */ \
  2796     default_init, \
  2797     default_cleanup, \
  2798     pixman_composite_src_8888_0565_process_pixblock_head, \
  2799     pixman_composite_src_8888_0565_process_pixblock_tail, \
  2800     pixman_composite_src_8888_0565_process_pixblock_tail_head
  2802 generate_composite_function_nearest_scanline \
  2803     pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
  2804     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
  2805     8, /* number of pixels processed in a single block */ \
  2806     default_init, \
  2807     default_cleanup, \
  2808     pixman_composite_src_0565_8888_process_pixblock_head, \
  2809     pixman_composite_src_0565_8888_process_pixblock_tail, \
  2810     pixman_composite_src_0565_8888_process_pixblock_tail_head
  2812 generate_composite_function_nearest_scanline \
  2813     pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
  2814     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
  2815     8, /* number of pixels processed in a single block */ \
  2816     default_init_need_all_regs, \
  2817     default_cleanup_need_all_regs, \
  2818     pixman_composite_over_8888_8_0565_process_pixblock_head, \
  2819     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
  2820     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
  2821     28, /* dst_w_basereg */ \
  2822     4,  /* dst_r_basereg */ \
  2823     8,  /* src_basereg   */ \
  2824     24  /* mask_basereg  */
  2826 generate_composite_function_nearest_scanline \
  2827     pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
  2828     FLAG_DST_READWRITE, \
  2829     8, /* number of pixels processed in a single block */ \
  2830     default_init_need_all_regs, \
  2831     default_cleanup_need_all_regs, \
  2832     pixman_composite_over_0565_8_0565_process_pixblock_head, \
  2833     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
  2834     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
  2835     28, /* dst_w_basereg */ \
  2836     10, /* dst_r_basereg */ \
  2837     8,  /* src_basereg   */ \
  2838     15  /* mask_basereg  */
  2840 /******************************************************************************/
  2842 /* Supplementary macro for setting function attributes */
  2843 .macro pixman_asm_function fname
  2844     .func fname
  2845     .global fname
  2846 #ifdef __ELF__
  2847     .hidden fname
  2848     .type fname, %function
  2849 #endif
  2850 fname:
  2851 .endm
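       /*
        * Illustrative sketch only (my_asm_helper is a hypothetical name,
        * not a function defined in this file): a function emitted with the
        * macro above would look like
        *
        *     pixman_asm_function my_asm_helper
        *         ...
        *         bx lr
        *     .endfunc
        *
        * i.e. the macro provides the .global/.hidden/.type boilerplate and
        * the label, while the body and the closing .endfunc follow it.
        */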
  2853 /*
  2854  * Bilinear scaling support code, providing pixel fetching, color format
  2855  * conversion, and interpolation as separate macros that can be used as the
  2856  * basic building blocks for constructing bilinear scanline functions.
  2857  */
  2859 .macro bilinear_load_8888 reg1, reg2, tmp
  2860     mov       TMP1, X, asr #16
  2861     add       X, X, UX
  2862     add       TMP1, TOP, TMP1, asl #2
  2863     vld1.32   {reg1}, [TMP1], STRIDE
  2864     vld1.32   {reg2}, [TMP1]
  2865 .endm
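       /*
        * Note (added): X is a 16.16 fixed point x coordinate, so 'asr #16'
        * extracts the integer pixel index and 'asl #2' turns it into a byte
        * offset for 32bpp pixels. The two vld1.32 loads fetch the adjacent
        * pixel pair from the top scanline and, via STRIDE (= BOTTOM - TOP),
        * from the bottom scanline, ready for the vertical blend.
        */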
  2867 .macro bilinear_load_0565 reg1, reg2, tmp
  2868     mov       TMP1, X, asr #16
  2869     add       X, X, UX
  2870     add       TMP1, TOP, TMP1, asl #1
  2871     vld1.32   {reg2[0]}, [TMP1], STRIDE
  2872     vld1.32   {reg2[1]}, [TMP1]
  2873     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
  2874 .endm
  2876 .macro bilinear_load_and_vertical_interpolate_two_8888 \
  2877                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
  2879     bilinear_load_8888 reg1, reg2, tmp1
  2880     vmull.u8  acc1, reg1, d28
  2881     vmlal.u8  acc1, reg2, d29
  2882     bilinear_load_8888 reg3, reg4, tmp2
  2883     vmull.u8  acc2, reg3, d28
  2884     vmlal.u8  acc2, reg4, d29
  2885 .endm
  2887 .macro bilinear_load_and_vertical_interpolate_four_8888 \
  2888                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
  2889                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  2891     bilinear_load_and_vertical_interpolate_two_8888 \
  2892                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
  2893     bilinear_load_and_vertical_interpolate_two_8888 \
  2894                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  2895 .endm
  2897 .macro bilinear_load_and_vertical_interpolate_two_0565 \
  2898                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
  2900     mov       TMP1, X, asr #16
  2901     add       X, X, UX
  2902     add       TMP1, TOP, TMP1, asl #1
  2903     mov       TMP2, X, asr #16
  2904     add       X, X, UX
  2905     add       TMP2, TOP, TMP2, asl #1
  2906     vld1.32   {acc2lo[0]}, [TMP1], STRIDE
  2907     vld1.32   {acc2hi[0]}, [TMP2], STRIDE
  2908     vld1.32   {acc2lo[1]}, [TMP1]
  2909     vld1.32   {acc2hi[1]}, [TMP2]
  2910     convert_0565_to_x888 acc2, reg3, reg2, reg1
  2911     vzip.u8   reg1, reg3
  2912     vzip.u8   reg2, reg4
  2913     vzip.u8   reg3, reg4
  2914     vzip.u8   reg1, reg2
  2915     vmull.u8  acc1, reg1, d28
  2916     vmlal.u8  acc1, reg2, d29
  2917     vmull.u8  acc2, reg3, d28
  2918     vmlal.u8  acc2, reg4, d29
  2919 .endm
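       /*
        * Note (added): in the 0565 variant above the two top/bottom pixel
        * pairs are gathered into the lanes of a single q register, widened
        * to x888 in place, and the vzip.u8 steps then shuffle the bytes so
        * that each top/bottom pair ends up in matching lanes for the
        * vertical vmull/vmlal blend with the d28/d29 weights.
        */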
  2921 .macro bilinear_load_and_vertical_interpolate_four_0565 \
  2922                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
  2923                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
  2925     mov       TMP1, X, asr #16
  2926     add       X, X, UX
  2927     add       TMP1, TOP, TMP1, asl #1
  2928     mov       TMP2, X, asr #16
  2929     add       X, X, UX
  2930     add       TMP2, TOP, TMP2, asl #1
  2931     vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
  2932     vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
  2933     vld1.32   {xacc2lo[1]}, [TMP1]
  2934     vld1.32   {xacc2hi[1]}, [TMP2]
  2935     convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
  2936     mov       TMP1, X, asr #16
  2937     add       X, X, UX
  2938     add       TMP1, TOP, TMP1, asl #1
  2939     mov       TMP2, X, asr #16
  2940     add       X, X, UX
  2941     add       TMP2, TOP, TMP2, asl #1
  2942     vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
  2943     vzip.u8   xreg1, xreg3
  2944     vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
  2945     vzip.u8   xreg2, xreg4
  2946     vld1.32   {yacc2lo[1]}, [TMP1]
  2947     vzip.u8   xreg3, xreg4
  2948     vld1.32   {yacc2hi[1]}, [TMP2]
  2949     vzip.u8   xreg1, xreg2
  2950     convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
  2951     vmull.u8  xacc1, xreg1, d28
  2952     vzip.u8   yreg1, yreg3
  2953     vmlal.u8  xacc1, xreg2, d29
  2954     vzip.u8   yreg2, yreg4
  2955     vmull.u8  xacc2, xreg3, d28
  2956     vzip.u8   yreg3, yreg4
  2957     vmlal.u8  xacc2, xreg4, d29
  2958     vzip.u8   yreg1, yreg2
  2959     vmull.u8  yacc1, yreg1, d28
  2960     vmlal.u8  yacc1, yreg2, d29
  2961     vmull.u8  yacc2, yreg3, d28
  2962     vmlal.u8  yacc2, yreg4, d29
  2963 .endm
  2965 .macro bilinear_store_8888 numpix, tmp1, tmp2
  2966 .if numpix == 4
  2967     vst1.32   {d0, d1}, [OUT, :128]!
  2968 .elseif numpix == 2
  2969     vst1.32   {d0}, [OUT, :64]!
  2970 .elseif numpix == 1
  2971     vst1.32   {d0[0]}, [OUT, :32]!
  2972 .else
  2973     .error bilinear_store_8888 numpix is unsupported
  2974 .endif
  2975 .endm
  2977 .macro bilinear_store_0565 numpix, tmp1, tmp2
  2978     vuzp.u8 d0, d1
  2979     vuzp.u8 d2, d3
  2980     vuzp.u8 d1, d3
  2981     vuzp.u8 d0, d2
  2982     convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
  2983 .if numpix == 4
  2984     vst1.16   {d2}, [OUT, :64]!
  2985 .elseif numpix == 2
  2986     vst1.32   {d2[0]}, [OUT, :32]!
  2987 .elseif numpix == 1
  2988     vst1.16   {d2[0]}, [OUT, :16]!
  2989 .else
  2990     .error bilinear_store_0565 numpix is unsupported
  2991 .endif
  2992 .endm
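       /*
        * Note (added): the vuzp.u8 sequence above de-interleaves the packed
        * 32bpp pixels into per-channel registers, which is the layout that
        * convert_8888_to_0565 expects before packing down to 16bpp.
        */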
  2994 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
  2995     bilinear_load_&src_fmt d0, d1, d2
  2996     vmull.u8  q1, d0, d28
  2997     vmlal.u8  q1, d1, d29
  2998     /* 5 cycles bubble */
  2999     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  3000     vmlsl.u16 q0, d2, d30
  3001     vmlal.u16 q0, d3, d30
  3002     /* 5 cycles bubble */
  3003     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3004     /* 3 cycles bubble */
  3005     vmovn.u16 d0, q0
  3006     /* 1 cycle bubble */
  3007     bilinear_store_&dst_fmt 1, q2, q3
  3008 .endm
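       /*
        * Note (added): the single pixel case above shows the interpolation
        * scheme shared by all bilinear paths in this file. The top/bottom
        * pixels are first blended vertically with the constant weights in
        * d28/d29, then the two horizontal neighbours with the per-pixel
        * weight wx taken from d30, roughly
        *
        *     vert   = top * wt + bottom * wb
        *     result = (vert_l * (N - wx) + vert_r * wx)
        *                  >> (2 * BILINEAR_INTERPOLATION_BITS)
        *
        * with N = (1 << BILINEAR_INTERPOLATION_BITS). The "bubble" comments
        * mark NEON result-latency stalls that the wider variants try to
        * hide by interleaving independent work.
        */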
  3010 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
  3011     bilinear_load_and_vertical_interpolate_two_&src_fmt \
  3012                 q1, q11, d0, d1, d20, d21, d22, d23
  3013     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  3014     vmlsl.u16 q0, d2, d30
  3015     vmlal.u16 q0, d3, d30
  3016     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
  3017     vmlsl.u16 q10, d22, d31
  3018     vmlal.u16 q10, d23, d31
  3019     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3020     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  3021     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3022     vadd.u16  q12, q12, q13
  3023     vmovn.u16 d0, q0
  3024     bilinear_store_&dst_fmt 2, q2, q3
  3025 .endm
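       /*
        * Note (added): q12 holds the 16-bit fractional parts of the x
        * coordinates for the current pixel block and q13 the corresponding
        * step; 'vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)'
        * extracts the horizontal weights (d30/d31) from the top fractional
        * bits, after which q12 is advanced with 'vadd.u16 q12, q12, q13'.
        */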
  3027 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
  3028     bilinear_load_and_vertical_interpolate_four_&src_fmt \
  3029                 q1, q11, d0, d1, d20, d21, d22, d23 \
  3030                 q3, q9,  d4, d5, d16, d17, d18, d19
  3031     pld       [TMP1, PF_OFFS]
  3032     sub       TMP1, TMP1, STRIDE
  3033     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
  3034     vmlsl.u16 q0, d2, d30
  3035     vmlal.u16 q0, d3, d30
  3036     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
  3037     vmlsl.u16 q10, d22, d31
  3038     vmlal.u16 q10, d23, d31
  3039     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3040     vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
  3041     vmlsl.u16 q2, d6, d30
  3042     vmlal.u16 q2, d7, d30
  3043     vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
  3044     pld       [TMP2, PF_OFFS]
  3045     vmlsl.u16 q8, d18, d31
  3046     vmlal.u16 q8, d19, d31
  3047     vadd.u16  q12, q12, q13
  3048     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3049     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  3050     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3051     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
  3052     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3053     vmovn.u16 d0, q0
  3054     vmovn.u16 d1, q2
  3055     vadd.u16  q12, q12, q13
  3056     bilinear_store_&dst_fmt 4, q2, q3
  3057 .endm
  3059 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  3060 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  3061     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
  3062 .else
  3063     bilinear_interpolate_four_pixels src_fmt, dst_fmt
  3064 .endif
  3065 .endm
  3067 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  3068 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  3069     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
  3070 .endif
  3071 .endm
  3073 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  3074 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
  3075     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
  3076 .else
  3077     bilinear_interpolate_four_pixels src_fmt, dst_fmt
  3078 .endif
  3079 .endm
  3081 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
  3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  3083     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
  3084 .else
  3085     bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  3086     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  3087 .endif
  3088 .endm
  3090 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
  3091 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  3092     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
  3093 .else
  3094     bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  3095 .endif
  3096 .endm
  3098 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
  3099 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
  3100     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
  3101 .else
  3102     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  3103     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  3104 .endif
  3105 .endm
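       /*
        * Note (added): the *_head/_tail/_tail_head wrappers above dispatch
        * to a format-specific implementation when one is available, which
        * is signalled by defining the corresponding
        * 'have_bilinear_interpolate_*' symbol (as done for 8888->8888 and
        * 8888->0565 further below), and otherwise fall back to the generic
        * four pixel macro.
        */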
  3107 .set BILINEAR_FLAG_UNROLL_4,          0
  3108 .set BILINEAR_FLAG_UNROLL_8,          1
  3109 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
  3111 /*
  3112  * Main template macro for generating NEON optimized bilinear scanline
  3113  * functions.
  3115  * The bilinear scanline scaler macro template takes the following arguments:
  3116  *  fname              - name of the function to generate
  3117  *  src_fmt, dst_fmt   - source/destination color format (8888 or 0565)
  3118  *  src_bpp_shift      - (1 << src_bpp_shift) is the source pixel size in bytes
  3119  *  dst_bpp_shift      - (1 << dst_bpp_shift) is the destination pixel size in bytes
  3120  *  prefetch_distance  - prefetch in the source image by that many pixels ahead
  3121  *  flags              - any combination of the BILINEAR_FLAG_* options defined above
  3122  */
  3124 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
  3125                                        src_bpp_shift, dst_bpp_shift, \
  3126                                        prefetch_distance, flags
  3128 pixman_asm_function fname
  3129     OUT       .req      r0
  3130     TOP       .req      r1
  3131     BOTTOM    .req      r2
  3132     WT        .req      r3
  3133     WB        .req      r4
  3134     X         .req      r5
  3135     UX        .req      r6
  3136     WIDTH     .req      ip
  3137     TMP1      .req      r3
  3138     TMP2      .req      r4
  3139     PF_OFFS   .req      r7
  3140     TMP3      .req      r8
  3141     TMP4      .req      r9
  3142     STRIDE    .req      r2
  3144     .fnstart
  3145     mov       ip, sp
  3146     .save     {r4, r5, r6, r7, r8, r9}
  3147     push      {r4, r5, r6, r7, r8, r9}
  3148     mov       PF_OFFS, #prefetch_distance
  3149     ldmia     ip, {WB, X, UX, WIDTH}
  3150     mul       PF_OFFS, PF_OFFS, UX
  3152 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
  3153     .vsave    {d8-d15}
  3154     vpush     {d8-d15}
  3155 .endif
  3157     sub       STRIDE, BOTTOM, TOP
  3158     .unreq    BOTTOM
  3160     cmp       WIDTH, #0
  3161     ble       3f
  3163     vdup.u16  q12, X
  3164     vdup.u16  q13, UX
  3165     vdup.u8   d28, WT
  3166     vdup.u8   d29, WB
  3167     vadd.u16  d25, d25, d26
  3169     /* ensure good destination alignment */
  3170     cmp       WIDTH, #1
  3171     blt       0f
  3172     tst       OUT, #(1 << dst_bpp_shift)
  3173     beq       0f
  3174     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3175     vadd.u16  q12, q12, q13
  3176     bilinear_interpolate_last_pixel src_fmt, dst_fmt
  3177     sub       WIDTH, WIDTH, #1
  3178 0:
  3179     vadd.u16  q13, q13, q13
  3180     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3181     vadd.u16  q12, q12, q13
  3183     cmp       WIDTH, #2
  3184     blt       0f
  3185     tst       OUT, #(1 << (dst_bpp_shift + 1))
  3186     beq       0f
  3187     bilinear_interpolate_two_pixels src_fmt, dst_fmt
  3188     sub       WIDTH, WIDTH, #2
  3189 0:
  3190 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
  3191 /*********** 8 pixels per iteration *****************/
  3192     cmp       WIDTH, #4
  3193     blt       0f
  3194     tst       OUT, #(1 << (dst_bpp_shift + 2))
  3195     beq       0f
  3196     bilinear_interpolate_four_pixels src_fmt, dst_fmt
  3197     sub       WIDTH, WIDTH, #4
  3198 0:
  3199     subs      WIDTH, WIDTH, #8
  3200     blt       1f
  3201     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
  3202     bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
  3203     subs      WIDTH, WIDTH, #8
  3204     blt       5f
  3205 0:
  3206     bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
  3207     subs      WIDTH, WIDTH, #8
  3208     bge       0b
  3209 5:
  3210     bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
  3211 1:
  3212     tst       WIDTH, #4
  3213     beq       2f
  3214     bilinear_interpolate_four_pixels src_fmt, dst_fmt
  3215 2:
  3216 .else
  3217 /*********** 4 pixels per iteration *****************/
  3218     subs      WIDTH, WIDTH, #4
  3219     blt       1f
  3220     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
  3221     bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
  3222     subs      WIDTH, WIDTH, #4
  3223     blt       5f
  3224 0:
  3225     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
  3226     subs      WIDTH, WIDTH, #4
  3227     bge       0b
  3228 5:
  3229     bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
  3230 1:
  3231 /****************************************************/
  3232 .endif
  3233     /* handle the remaining trailing pixels */
  3234     tst       WIDTH, #2
  3235     beq       2f
  3236     bilinear_interpolate_two_pixels src_fmt, dst_fmt
  3237 2:
  3238     tst       WIDTH, #1
  3239     beq       3f
  3240     bilinear_interpolate_last_pixel src_fmt, dst_fmt
  3241 3:
  3242 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
  3243     vpop      {d8-d15}
  3244 .endif
  3245     pop       {r4, r5, r6, r7, r8, r9}
  3246     bx        lr
  3247     .fnend
  3249     .unreq    OUT
  3250     .unreq    TOP
  3251     .unreq    WT
  3252     .unreq    WB
  3253     .unreq    X
  3254     .unreq    UX
  3255     .unreq    WIDTH
  3256     .unreq    TMP1
  3257     .unreq    TMP2
  3258     .unreq    PF_OFFS
  3259     .unreq    TMP3
  3260     .unreq    TMP4
  3261     .unreq    STRIDE
  3262 .endfunc
  3264 .endm
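       /*
        * Note (added): before entering the unrolled main loop the template
        * above emits up to one single-pixel and one two-pixel step (plus a
        * four-pixel step for the 8x-unrolled variant) so that OUT becomes
        * suitably aligned for the wide aligned stores, and the same small
        * block macros are reused after the loop for the trailing
        * WIDTH % block_size pixels.
        */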
  3266 /*****************************************************************************/
  3268 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
  3270 .macro bilinear_interpolate_four_pixels_8888_8888_head
  3271     mov       TMP1, X, asr #16
  3272     add       X, X, UX
  3273     add       TMP1, TOP, TMP1, asl #2
  3274     mov       TMP2, X, asr #16
  3275     add       X, X, UX
  3276     add       TMP2, TOP, TMP2, asl #2
  3278     vld1.32   {d22}, [TMP1], STRIDE
  3279     vld1.32   {d23}, [TMP1]
  3280     mov       TMP3, X, asr #16
  3281     add       X, X, UX
  3282     add       TMP3, TOP, TMP3, asl #2
  3283     vmull.u8  q8, d22, d28
  3284     vmlal.u8  q8, d23, d29
  3286     vld1.32   {d22}, [TMP2], STRIDE
  3287     vld1.32   {d23}, [TMP2]
  3288     mov       TMP4, X, asr #16
  3289     add       X, X, UX
  3290     add       TMP4, TOP, TMP4, asl #2
  3291     vmull.u8  q9, d22, d28
  3292     vmlal.u8  q9, d23, d29
  3294     vld1.32   {d22}, [TMP3], STRIDE
  3295     vld1.32   {d23}, [TMP3]
  3296     vmull.u8  q10, d22, d28
  3297     vmlal.u8  q10, d23, d29
  3299     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3300     vmlsl.u16 q0, d16, d30
  3301     vmlal.u16 q0, d17, d30
  3303     pld       [TMP4, PF_OFFS]
  3304     vld1.32   {d16}, [TMP4], STRIDE
  3305     vld1.32   {d17}, [TMP4]
  3306     pld       [TMP4, PF_OFFS]
  3307     vmull.u8  q11, d16, d28
  3308     vmlal.u8  q11, d17, d29
  3310     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3311     vmlsl.u16 q1, d18, d31
  3312 .endm
  3314 .macro bilinear_interpolate_four_pixels_8888_8888_tail
  3315     vmlal.u16 q1, d19, d31
  3316     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3317     vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3318     vmlsl.u16 q2, d20, d30
  3319     vmlal.u16 q2, d21, d30
  3320     vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3321     vmlsl.u16 q3, d22, d31
  3322     vmlal.u16 q3, d23, d31
  3323     vadd.u16  q12, q12, q13
  3324     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3325     vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3326     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3327     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3328     vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3329     vmovn.u16 d6, q0
  3330     vmovn.u16 d7, q2
  3331     vadd.u16  q12, q12, q13
  3332     vst1.32   {d6, d7}, [OUT, :128]!
  3333 .endm
  3335 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
  3336     mov       TMP1, X, asr #16
  3337     add       X, X, UX
  3338     add       TMP1, TOP, TMP1, asl #2
  3339     mov       TMP2, X, asr #16
  3340     add       X, X, UX
  3341     add       TMP2, TOP, TMP2, asl #2
  3342         vmlal.u16 q1, d19, d31
  3343         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3344         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3345         vmlsl.u16 q2, d20, d30
  3346         vmlal.u16 q2, d21, d30
  3347         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3348     vld1.32   {d20}, [TMP1], STRIDE
  3349         vmlsl.u16 q3, d22, d31
  3350         vmlal.u16 q3, d23, d31
  3351     vld1.32   {d21}, [TMP1]
  3352     vmull.u8  q8, d20, d28
  3353     vmlal.u8  q8, d21, d29
  3354         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3355         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3356         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3357     vld1.32   {d22}, [TMP2], STRIDE
  3358         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3359         vadd.u16  q12, q12, q13
  3360     vld1.32   {d23}, [TMP2]
  3361     vmull.u8  q9, d22, d28
  3362     mov       TMP3, X, asr #16
  3363     add       X, X, UX
  3364     add       TMP3, TOP, TMP3, asl #2
  3365     mov       TMP4, X, asr #16
  3366     add       X, X, UX
  3367     add       TMP4, TOP, TMP4, asl #2
  3368     vmlal.u8  q9, d23, d29
  3369     vld1.32   {d22}, [TMP3], STRIDE
  3370         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3371     vld1.32   {d23}, [TMP3]
  3372     vmull.u8  q10, d22, d28
  3373     vmlal.u8  q10, d23, d29
  3374         vmovn.u16 d6, q0
  3375     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3376         vmovn.u16 d7, q2
  3377     vmlsl.u16 q0, d16, d30
  3378     vmlal.u16 q0, d17, d30
  3379     pld       [TMP4, PF_OFFS]
  3380     vld1.32   {d16}, [TMP4], STRIDE
  3381         vadd.u16  q12, q12, q13
  3382     vld1.32   {d17}, [TMP4]
  3383     pld       [TMP4, PF_OFFS]
  3384     vmull.u8  q11, d16, d28
  3385     vmlal.u8  q11, d17, d29
  3386         vst1.32   {d6, d7}, [OUT, :128]!
  3387     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3388     vmlsl.u16 q1, d18, d31
  3389 .endm
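       /*
        * Note (added): in the tail_head macro above the deeper indented
        * instructions finish the previous block (its tail) while the flush
        * left ones start the next block (its head); interleaving the two
        * hides the load and multiply latencies, the same software
        * pipelining idea used by the composite tail_head macros earlier in
        * this file.
        */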
  3391 /*****************************************************************************/
  3393 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
  3395 .macro bilinear_interpolate_eight_pixels_8888_0565_head
  3396     mov       TMP1, X, asr #16
  3397     add       X, X, UX
  3398     add       TMP1, TOP, TMP1, asl #2
  3399     mov       TMP2, X, asr #16
  3400     add       X, X, UX
  3401     add       TMP2, TOP, TMP2, asl #2
  3402     vld1.32   {d20}, [TMP1], STRIDE
  3403     vld1.32   {d21}, [TMP1]
  3404     vmull.u8  q8, d20, d28
  3405     vmlal.u8  q8, d21, d29
  3406     vld1.32   {d22}, [TMP2], STRIDE
  3407     vld1.32   {d23}, [TMP2]
  3408     vmull.u8  q9, d22, d28
  3409     mov       TMP3, X, asr #16
  3410     add       X, X, UX
  3411     add       TMP3, TOP, TMP3, asl #2
  3412     mov       TMP4, X, asr #16
  3413     add       X, X, UX
  3414     add       TMP4, TOP, TMP4, asl #2
  3415     vmlal.u8  q9, d23, d29
  3416     vld1.32   {d22}, [TMP3], STRIDE
  3417     vld1.32   {d23}, [TMP3]
  3418     vmull.u8  q10, d22, d28
  3419     vmlal.u8  q10, d23, d29
  3420     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3421     vmlsl.u16 q0, d16, d30
  3422     vmlal.u16 q0, d17, d30
  3423     pld       [TMP4, PF_OFFS]
  3424     vld1.32   {d16}, [TMP4], STRIDE
  3425     vld1.32   {d17}, [TMP4]
  3426     pld       [TMP4, PF_OFFS]
  3427     vmull.u8  q11, d16, d28
  3428     vmlal.u8  q11, d17, d29
  3429     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3430     vmlsl.u16 q1, d18, d31
  3432     mov       TMP1, X, asr #16
  3433     add       X, X, UX
  3434     add       TMP1, TOP, TMP1, asl #2
  3435     mov       TMP2, X, asr #16
  3436     add       X, X, UX
  3437     add       TMP2, TOP, TMP2, asl #2
  3438         vmlal.u16 q1, d19, d31
  3439         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3440         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3441         vmlsl.u16 q2, d20, d30
  3442         vmlal.u16 q2, d21, d30
  3443         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3444     vld1.32   {d20}, [TMP1], STRIDE
  3445         vmlsl.u16 q3, d22, d31
  3446         vmlal.u16 q3, d23, d31
  3447     vld1.32   {d21}, [TMP1]
  3448     vmull.u8  q8, d20, d28
  3449     vmlal.u8  q8, d21, d29
  3450         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3451         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3452         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3453     vld1.32   {d22}, [TMP2], STRIDE
  3454         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3455         vadd.u16  q12, q12, q13
  3456     vld1.32   {d23}, [TMP2]
  3457     vmull.u8  q9, d22, d28
  3458     mov       TMP3, X, asr #16
  3459     add       X, X, UX
  3460     add       TMP3, TOP, TMP3, asl #2
  3461     mov       TMP4, X, asr #16
  3462     add       X, X, UX
  3463     add       TMP4, TOP, TMP4, asl #2
  3464     vmlal.u8  q9, d23, d29
  3465     vld1.32   {d22}, [TMP3], STRIDE
  3466         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3467     vld1.32   {d23}, [TMP3]
  3468     vmull.u8  q10, d22, d28
  3469     vmlal.u8  q10, d23, d29
  3470         vmovn.u16 d8, q0
  3471     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3472         vmovn.u16 d9, q2
  3473     vmlsl.u16 q0, d16, d30
  3474     vmlal.u16 q0, d17, d30
  3475     pld       [TMP4, PF_OFFS]
  3476     vld1.32   {d16}, [TMP4], STRIDE
  3477         vadd.u16  q12, q12, q13
  3478     vld1.32   {d17}, [TMP4]
  3479     pld       [TMP4, PF_OFFS]
  3480     vmull.u8  q11, d16, d28
  3481     vmlal.u8  q11, d17, d29
  3482     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3483     vmlsl.u16 q1, d18, d31
  3484 .endm
  3486 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
  3487     vmlal.u16 q1, d19, d31
  3488     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3489     vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3490     vmlsl.u16 q2, d20, d30
  3491     vmlal.u16 q2, d21, d30
  3492     vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3493     vmlsl.u16 q3, d22, d31
  3494     vmlal.u16 q3, d23, d31
  3495     vadd.u16  q12, q12, q13
  3496     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3497     vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3498     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3499     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3500     vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3501     vmovn.u16 d10, q0
  3502     vmovn.u16 d11, q2
  3503     vadd.u16  q12, q12, q13
  3505     vuzp.u8   d8, d9
  3506     vuzp.u8   d10, d11
  3507     vuzp.u8   d9, d11
  3508     vuzp.u8   d8, d10
  3509     vshll.u8  q6, d9, #8
  3510     vshll.u8  q5, d10, #8
  3511     vshll.u8  q7, d8, #8
  3512     vsri.u16  q5, q6, #5
  3513     vsri.u16  q5, q7, #11
  3514     vst1.32   {d10, d11}, [OUT, :128]!
  3515 .endm
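       /*
        * Note (added): the final vshll/vsri sequence above packs the
        * de-interleaved 8-bit channels directly into r5g6b5: each channel
        * is moved to the top of a 16-bit lane with vshll #8, then the other
        * two channels are inserted below it with shift-right-insert
        * (vsri #5 and #11), avoiding a separate convert_8888_to_0565 pass.
        */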
  3517 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
  3518     mov       TMP1, X, asr #16
  3519     add       X, X, UX
  3520     add       TMP1, TOP, TMP1, asl #2
  3521     mov       TMP2, X, asr #16
  3522     add       X, X, UX
  3523     add       TMP2, TOP, TMP2, asl #2
  3524         vmlal.u16 q1, d19, d31
  3525         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3526             vuzp.u8 d8, d9
  3527         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3528         vmlsl.u16 q2, d20, d30
  3529         vmlal.u16 q2, d21, d30
  3530         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3531     vld1.32   {d20}, [TMP1], STRIDE
  3532         vmlsl.u16 q3, d22, d31
  3533         vmlal.u16 q3, d23, d31
  3534     vld1.32   {d21}, [TMP1]
  3535     vmull.u8  q8, d20, d28
  3536     vmlal.u8  q8, d21, d29
  3537         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3538         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3539         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3540     vld1.32   {d22}, [TMP2], STRIDE
  3541         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3542         vadd.u16  q12, q12, q13
  3543     vld1.32   {d23}, [TMP2]
  3544     vmull.u8  q9, d22, d28
  3545     mov       TMP3, X, asr #16
  3546     add       X, X, UX
  3547     add       TMP3, TOP, TMP3, asl #2
  3548     mov       TMP4, X, asr #16
  3549     add       X, X, UX
  3550     add       TMP4, TOP, TMP4, asl #2
  3551     vmlal.u8  q9, d23, d29
  3552     vld1.32   {d22}, [TMP3], STRIDE
  3553         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3554     vld1.32   {d23}, [TMP3]
  3555     vmull.u8  q10, d22, d28
  3556     vmlal.u8  q10, d23, d29
  3557         vmovn.u16 d10, q0
  3558     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3559         vmovn.u16 d11, q2
  3560     vmlsl.u16 q0, d16, d30
  3561     vmlal.u16 q0, d17, d30
  3562     pld       [TMP4, PF_OFFS]
  3563     vld1.32   {d16}, [TMP4], STRIDE
  3564         vadd.u16  q12, q12, q13
  3565     vld1.32   {d17}, [TMP4]
  3566     pld       [TMP4, PF_OFFS]
  3567     vmull.u8  q11, d16, d28
  3568     vmlal.u8  q11, d17, d29
  3569             vuzp.u8 d10, d11
  3570     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3571     vmlsl.u16 q1, d18, d31
  3573     mov       TMP1, X, asr #16
  3574     add       X, X, UX
  3575     add       TMP1, TOP, TMP1, asl #2
  3576     mov       TMP2, X, asr #16
  3577     add       X, X, UX
  3578     add       TMP2, TOP, TMP2, asl #2
  3579         vmlal.u16 q1, d19, d31
  3580             vuzp.u8 d9, d11
  3581         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3582         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
  3583             vuzp.u8 d8, d10
  3584         vmlsl.u16 q2, d20, d30
  3585         vmlal.u16 q2, d21, d30
  3586         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
  3587     vld1.32   {d20}, [TMP1], STRIDE
  3588         vmlsl.u16 q3, d22, d31
  3589         vmlal.u16 q3, d23, d31
  3590     vld1.32   {d21}, [TMP1]
  3591     vmull.u8  q8, d20, d28
  3592     vmlal.u8  q8, d21, d29
  3593             vshll.u8  q6, d9, #8
  3594             vshll.u8  q5, d10, #8
  3595             vshll.u8  q7, d8, #8
  3596         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  3597             vsri.u16  q5, q6, #5
  3598         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  3599             vsri.u16  q5, q7, #11
  3600         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  3601     vld1.32   {d22}, [TMP2], STRIDE
  3602         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  3603         vadd.u16  q12, q12, q13
  3604     vld1.32   {d23}, [TMP2]
  3605     vmull.u8  q9, d22, d28
  3606     mov       TMP3, X, asr #16
  3607     add       X, X, UX
  3608     add       TMP3, TOP, TMP3, asl #2
  3609     mov       TMP4, X, asr #16
  3610     add       X, X, UX
  3611     add       TMP4, TOP, TMP4, asl #2
  3612     vmlal.u8  q9, d23, d29
  3613     vld1.32   {d22}, [TMP3], STRIDE
  3614         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  3615     vld1.32   {d23}, [TMP3]
  3616     vmull.u8  q10, d22, d28
  3617     vmlal.u8  q10, d23, d29
  3618         vmovn.u16 d8, q0
  3619     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
  3620         vmovn.u16 d9, q2
  3621     vmlsl.u16 q0, d16, d30
  3622     vmlal.u16 q0, d17, d30
  3623     pld       [TMP4, PF_OFFS]
  3624     vld1.32   {d16}, [TMP4], STRIDE
  3625         vadd.u16  q12, q12, q13
  3626     vld1.32   {d17}, [TMP4]
  3627     pld       [TMP4, PF_OFFS]
  3628     vmull.u8  q11, d16, d28
  3629     vmlal.u8  q11, d17, d29
  3630     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
  3631             vst1.32   {d10, d11}, [OUT, :128]!
  3632     vmlsl.u16 q1, d18, d31
  3633 .endm
  3634 /*****************************************************************************/
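       /*
        * Note (added): the concrete bilinear scanline functions are
        * instantiated below; the numeric arguments are the src/dst bpp
        * shifts and the prefetch distance, matching the template
        * parameters documented above.
        */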
  3636 generate_bilinear_scanline_func \
  3637     pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
  3638     2, 2, 28, BILINEAR_FLAG_UNROLL_4
  3640 generate_bilinear_scanline_func \
  3641     pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
  3642     2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
  3644 generate_bilinear_scanline_func \
  3645     pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
  3646     1, 2, 28, BILINEAR_FLAG_UNROLL_4
  3648 generate_bilinear_scanline_func \
  3649     pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
  3650     1, 1, 28, BILINEAR_FLAG_UNROLL_4
