gfx/cairo/libpixman/src/pixman-arm-neon-asm.h

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright © 2009 Nokia Corporation
     3  *
     4  * Permission is hereby granted, free of charge, to any person obtaining a
     5  * copy of this software and associated documentation files (the "Software"),
     6  * to deal in the Software without restriction, including without limitation
     7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     8  * and/or sell copies of the Software, and to permit persons to whom the
     9  * Software is furnished to do so, subject to the following conditions:
    10  *
    11  * The above copyright notice and this permission notice (including the next
    12  * paragraph) shall be included in all copies or substantial portions of the
    13  * Software.
    14  *
    15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    21  * DEALINGS IN THE SOFTWARE.
    22  *
    23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    24  */
    26 /*
    27  * This file contains a macro ('generate_composite_function') which can
    28  * construct 2D image processing functions based on a common template.
    29  * Any combination of source, destination and mask images with 8bpp,
    30  * 16bpp, 24bpp or 32bpp color formats is supported.
    31  *
    32  * This macro takes care of:
    33  *  - handling of leading and trailing unaligned pixels
    34  *  - doing most of the work related to L2 cache preload
    35  *  - encouraging the use of software pipelining for better instruction
    36  *    scheduling
    37  *
    38  * The user of this macro has to provide some configuration parameters
    39  * (bit depths for the images, prefetch distance, etc.) and a set of
    40  * macros which should implement basic code chunks responsible for
    41  * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
    42  * examples.
    43  *
    44  * TODO:
    45  *  - try overlapped pixel method (from Ian Rickards) when processing
    46  *    exactly two blocks of pixels
    47  *  - maybe add an option to do reverse scanline processing
    48  */
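/*
 * Usage sketch (illustrative, not part of the original file). A typical
 * instantiation from 'pixman-arm-neon-asm.S' looks roughly like the
 * following; the exact function and macro names, and the prefetch distance
 * value, are assumptions here and are supplied by the including file:
 *
 *     generate_composite_function \
 *         pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8,  \    <- pixblock_size (pixels processed per block)
 *         5,  \    <- prefetch distance (illustrative value)
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 */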
    50 /*
    51  * Bit flags for the 'generate_composite_function' macro, used to tune
    52  * the behavior of the generated functions.
    53  */
    54 .set FLAG_DST_WRITEONLY,       0
    55 .set FLAG_DST_READWRITE,       1
    56 .set FLAG_DEINTERLEAVE_32BPP,  2
    58 /*
    59  * Offset into the stack where the mask and source pointers/strides can be
    60  * accessed from the 'init' macro. Useful for special handling of a solid mask.
    61  */
    62 .set ARGS_STACK_OFFSET,        40
    64 /*
    65  * Constants for selecting the preferred prefetch type.
    66  */
    67 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
    68 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
    69 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
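/*
 * Note (assumption, not stated in the original file): PREFETCH_TYPE_DEFAULT
 * is not defined in this header; the including file (pixman-arm-neon-asm.S)
 * is expected to set it before instantiating 'generate_composite_function',
 * for example:
 *
 *     .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
 *
 * 'generate_composite_function' starts from that default and downgrades it
 * to SIMPLE or NONE depending on the prefetch distance and color formats.
 */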
    71 /*
    72  * Definitions of supplementary pixld/pixst macros (for partial load/store of
    73  * pixel data).
    74  */
    76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
    77 .if abits > 0
    78     op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
    79 .else
    80     op&.&elem_size {d&reg1}, [&mem_operand&]!
    81 .endif
    82 .endm
    84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
    85 .if abits > 0
    86     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
    87 .else
    88     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
    89 .endif
    90 .endm
    92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
    93 .if abits > 0
    94     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
    95 .else
    96     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
    97 .endif
    98 .endm
   100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
   101     op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
   102 .endm
   104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
   105     op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
   106 .endm
   108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
   109     op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
   110 .endm
   112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
   113 .if numbytes == 32
   114     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
   115                               %(basereg+6), %(basereg+7), mem_operand, abits
   116 .elseif numbytes == 16
   117     pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
   118 .elseif numbytes == 8
   119     pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
   120 .elseif numbytes == 4
   121     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
   122         pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
   123     .elseif elem_size == 16
   124         pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
   125         pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
   126     .else
   127         pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
   128         pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
   129         pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
   130         pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
   131     .endif
   132 .elseif numbytes == 2
   133     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
   134         pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
   135     .else
   136         pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
   137         pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
   138     .endif
   139 .elseif numbytes == 1
   140     pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
   141 .else
   142     .error "unsupported size: numbytes"
   143 .endif
   144 .endm
   146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
   147 .if bpp > 0
   148 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   149     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
   150                       %(basereg+6), %(basereg+7), mem_operand, abits
   151 .elseif (bpp == 24) && (numpix == 8)
   152     pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   153 .elseif (bpp == 24) && (numpix == 4)
   154     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   155     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   156     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   157     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   158 .elseif (bpp == 24) && (numpix == 2)
   159     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   160     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   161 .elseif (bpp == 24) && (numpix == 1)
   162     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   163 .else
   164     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
   165 .endif
   166 .endif
   167 .endm
   169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
   170 .if bpp > 0
   171 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   172     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
   173                       %(basereg+6), %(basereg+7), mem_operand, abits
   174 .elseif (bpp == 24) && (numpix == 8)
   175     pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   176 .elseif (bpp == 24) && (numpix == 4)
   177     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   178     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   179     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   180     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   181 .elseif (bpp == 24) && (numpix == 2)
   182     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   183     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   184 .elseif (bpp == 24) && (numpix == 1)
   185     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   186 .else
   187     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
   188 .endif
   189 .endif
   190 .endm
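/*
 * Expansion sketch (illustrative, not part of the original file): with the
 * default basereg = 0, loading eight 32bpp pixels
 *
 *     pixld 8, 32, 0, SRC
 *
 * expands to
 *
 *     vld4.8  {d4, d5, d6, d7}, [SRC]!
 *
 * when DEINTERLEAVE_32BPP_ENABLED is set (splitting B, G, R, A into separate
 * registers), and otherwise to
 *
 *     vld1.32 {d4, d5, d6, d7}, [SRC]!
 *
 * 'pixst' mirrors this with vst4/vst1.
 */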
   192 .macro pixld_a numpix, bpp, basereg, mem_operand
   193 .if (bpp * numpix) <= 128
   194     pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   195 .else
   196     pixld numpix, bpp, basereg, mem_operand, 128
   197 .endif
   198 .endm
   200 .macro pixst_a numpix, bpp, basereg, mem_operand
   201 .if (bpp * numpix) <= 128
   202     pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   203 .else
   204     pixst numpix, bpp, basereg, mem_operand, 128
   205 .endif
   206 .endm
   208 /*
   209  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
   210  * SRC_WIDTH_FIXED register aliases to be defined)
   211  */
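/*
 * Illustrative note (not part of the original file): VX holds the source x
 * coordinate and UNIT_X the per-pixel step, both in 16.16 fixed point. Each
 * fetched pixel uses index = VX >> 16 (so its byte address is
 * mem_operand + (index << 1) for 16bpp or + (index << 2) for 32bpp), then VX
 * is advanced by UNIT_X and the small 'subpls ... bpl' loop subtracts
 * SRC_WIDTH_FIXED to wrap the coordinate around the source width. Ignoring
 * the wrap, UNIT_X = 0x18000 (a 1.5x step) visits source pixels 0, 1, 3, 4, 6, ...
 */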
   212 .macro pixld1_s elem_size, reg1, mem_operand
   213 .if elem_size == 16
   214     mov     TMP1, VX, asr #16
   215     adds    VX, VX, UNIT_X
   216 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   217     bpl     5b
   218     add     TMP1, mem_operand, TMP1, asl #1
   219     mov     TMP2, VX, asr #16
   220     adds    VX, VX, UNIT_X
   221 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   222     bpl     5b
   223     add     TMP2, mem_operand, TMP2, asl #1
   224     vld1.16 {d&reg1&[0]}, [TMP1, :16]
   225     mov     TMP1, VX, asr #16
   226     adds    VX, VX, UNIT_X
   227 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   228     bpl     5b
   229     add     TMP1, mem_operand, TMP1, asl #1
   230     vld1.16 {d&reg1&[1]}, [TMP2, :16]
   231     mov     TMP2, VX, asr #16
   232     adds    VX, VX, UNIT_X
   233 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   234     bpl     5b
   235     add     TMP2, mem_operand, TMP2, asl #1
   236     vld1.16 {d&reg1&[2]}, [TMP1, :16]
   237     vld1.16 {d&reg1&[3]}, [TMP2, :16]
   238 .elseif elem_size == 32
   239     mov     TMP1, VX, asr #16
   240     adds    VX, VX, UNIT_X
   241 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   242     bpl     5b
   243     add     TMP1, mem_operand, TMP1, asl #2
   244     mov     TMP2, VX, asr #16
   245     adds    VX, VX, UNIT_X
   246 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   247     bpl     5b
   248     add     TMP2, mem_operand, TMP2, asl #2
   249     vld1.32 {d&reg1&[0]}, [TMP1, :32]
   250     vld1.32 {d&reg1&[1]}, [TMP2, :32]
   251 .else
   252     .error "unsupported"
   253 .endif
   254 .endm
   256 .macro pixld2_s elem_size, reg1, reg2, mem_operand
   257 .if 0 /* elem_size == 32 */
   258     mov     TMP1, VX, asr #16
   259     add     VX, VX, UNIT_X, asl #1
   260     add     TMP1, mem_operand, TMP1, asl #2
   261     mov     TMP2, VX, asr #16
   262     sub     VX, VX, UNIT_X
   263     add     TMP2, mem_operand, TMP2, asl #2
   264     vld1.32 {d&reg1&[0]}, [TMP1, :32]
   265     mov     TMP1, VX, asr #16
   266     add     VX, VX, UNIT_X, asl #1
   267     add     TMP1, mem_operand, TMP1, asl #2
   268     vld1.32 {d&reg2&[0]}, [TMP2, :32]
   269     mov     TMP2, VX, asr #16
   270     add     VX, VX, UNIT_X
   271     add     TMP2, mem_operand, TMP2, asl #2
   272     vld1.32 {d&reg1&[1]}, [TMP1, :32]
   273     vld1.32 {d&reg2&[1]}, [TMP2, :32]
   274 .else
   275     pixld1_s elem_size, reg1, mem_operand
   276     pixld1_s elem_size, reg2, mem_operand
   277 .endif
   278 .endm
   280 .macro pixld0_s elem_size, reg1, idx, mem_operand
   281 .if elem_size == 16
   282     mov     TMP1, VX, asr #16
   283     adds    VX, VX, UNIT_X
   284 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   285     bpl     5b
   286     add     TMP1, mem_operand, TMP1, asl #1
   287     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
   288 .elseif elem_size == 32
   289     mov     TMP1, VX, asr #16
   290     adds    VX, VX, UNIT_X
   291 5:  subpls  VX, VX, SRC_WIDTH_FIXED
   292     bpl     5b
   293     add     TMP1, mem_operand, TMP1, asl #2
   294     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
   295 .endif
   296 .endm
   298 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
   299 .if numbytes == 32
   300     pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
   301     pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
   302     pixdeinterleave elem_size, %(basereg+4)
   303 .elseif numbytes == 16
   304     pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
   305 .elseif numbytes == 8
   306     pixld1_s elem_size, %(basereg+1), mem_operand
   307 .elseif numbytes == 4
   308     .if elem_size == 32
   309         pixld0_s elem_size, %(basereg+0), 1, mem_operand
   310     .elseif elem_size == 16
   311         pixld0_s elem_size, %(basereg+0), 2, mem_operand
   312         pixld0_s elem_size, %(basereg+0), 3, mem_operand
   313     .else
   314         pixld0_s elem_size, %(basereg+0), 4, mem_operand
   315         pixld0_s elem_size, %(basereg+0), 5, mem_operand
   316         pixld0_s elem_size, %(basereg+0), 6, mem_operand
   317         pixld0_s elem_size, %(basereg+0), 7, mem_operand
   318     .endif
   319 .elseif numbytes == 2
   320     .if elem_size == 16
   321         pixld0_s elem_size, %(basereg+0), 1, mem_operand
   322     .else
   323         pixld0_s elem_size, %(basereg+0), 2, mem_operand
   324         pixld0_s elem_size, %(basereg+0), 3, mem_operand
   325     .endif
   326 .elseif numbytes == 1
   327     pixld0_s elem_size, %(basereg+0), 1, mem_operand
   328 .else
   329     .error "unsupported size: numbytes"
   330 .endif
   331 .endm
   333 .macro pixld_s numpix, bpp, basereg, mem_operand
   334 .if bpp > 0
   335     pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
   336 .endif
   337 .endm
   339 .macro vuzp8 reg1, reg2
   340     vuzp.8 d&reg1, d&reg2
   341 .endm
   343 .macro vzip8 reg1, reg2
   344     vzip.8 d&reg1, d&reg2
   345 .endm
   347 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   348 .macro pixdeinterleave bpp, basereg
   349 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   350     vuzp8 %(basereg+0), %(basereg+1)
   351     vuzp8 %(basereg+2), %(basereg+3)
   352     vuzp8 %(basereg+1), %(basereg+3)
   353     vuzp8 %(basereg+0), %(basereg+2)
   354 .endif
   355 .endm
   357 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   358 .macro pixinterleave bpp, basereg
   359 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   360     vzip8 %(basereg+0), %(basereg+2)
   361     vzip8 %(basereg+1), %(basereg+3)
   362     vzip8 %(basereg+2), %(basereg+3)
   363     vzip8 %(basereg+0), %(basereg+1)
   364 .endif
   365 .endm
   367 /*
   368  * This is a macro for implementing cache preload. The main idea is that
   369  * cache preload logic is mostly independent of the rest of the pixel
   370  * processing code. It starts at the top left pixel and moves forward
   371  * across pixels and can jump across scanlines. Prefetch distance is
   372  * handled in an 'incremental' way: it starts from 0 and advances to the
   373  * optimal distance over time. After reaching optimal prefetch distance,
   374  * it is kept constant. There are some checks which prevent prefetching
   375  * unneeded pixel lines below the image (but it still can prefetch a bit
   376  * more data on the right side of the image - not a big issue and may
   377  * actually be helpful when rendering text glyphs). An additional trick is
   378  * the use of an LDR instruction for prefetch instead of PLD when moving to
   379  * the next line; the point is that we have a high chance of getting a TLB
   380  * miss in this case, and PLD would be useless.
   381  *
   382  * This sounds like it may introduce a noticeable overhead (when working with
   383  * fully cached data). But in reality, due to having a separate pipeline and
   384  * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
   385  * execute simultaneously with NEON and be completely shadowed by it. Thus
   386  * we get no performance overhead at all (*). This looks like a very nice
   387  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
   388  * but can still implement some rather advanced prefetch logic in software
   389  * for almost zero cost!
   390  *
   391  * (*) The overhead of the prefetcher is visible when running some trivial
   392  * pixel processing such as a simple copy. Anyway, having prefetch is a must
   393  * when working with graphics data.
   394  */
   395 .macro PF a, x:vararg
   396 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
   397     a x
   398 .endif
   399 .endm
   401 .macro cache_preload std_increment, boost_increment
   402 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
   403 .if regs_shortage
   404     PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
   405 .endif
   406 .if std_increment != 0
   407     PF add PF_X, PF_X, #std_increment
   408 .endif
   409     PF tst PF_CTL, #0xF
   410     PF addne PF_X, PF_X, #boost_increment
   411     PF subne PF_CTL, PF_CTL, #1
   412     PF cmp PF_X, ORIG_W
   413 .if src_bpp_shift >= 0
   414     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   415 .endif
   416 .if dst_r_bpp != 0
   417     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   418 .endif
   419 .if mask_bpp_shift >= 0
   420     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   421 .endif
   422     PF subge PF_X, PF_X, ORIG_W
   423     PF subges PF_CTL, PF_CTL, #0x10
   424 .if src_bpp_shift >= 0
   425     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   426 .endif
   427 .if dst_r_bpp != 0
   428     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   429 .endif
   430 .if mask_bpp_shift >= 0
   431     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   432 .endif
   433 .endif
   434 .endm
   436 .macro cache_preload_simple
   437 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
   438 .if src_bpp > 0
   439     pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
   440 .endif
   441 .if dst_r_bpp > 0
   442     pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
   443 .endif
   444 .if mask_bpp > 0
   445     pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
   446 .endif
   447 .endif
   448 .endm
   450 .macro fetch_mask_pixblock
   451     pixld       pixblock_size, mask_bpp, \
   452                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   453 .endm
   455 /*
   456  * Macro which is used to process leading pixels until the destination
   457  * pointer is properly aligned (on a 16-byte boundary). When the destination
   458  * buffer uses 16bpp format, this is unnecessary, or even pointless.
   459  */
   460 .macro ensure_destination_ptr_alignment process_pixblock_head, \
   461                                         process_pixblock_tail, \
   462                                         process_pixblock_tail_head
   463 .if dst_w_bpp != 24
   464     tst         DST_R, #0xF
   465     beq         2f
   467 .irp lowbit, 1, 2, 4, 8, 16
   468 local skip1
   469 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   470 .if lowbit < 16 /* we don't need more than 16-byte alignment */
   471     tst         DST_R, #lowbit
   472     beq         1f
   473 .endif
   474     pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
   475     pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
   476 .if dst_r_bpp > 0
   477     pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
   478 .else
   479     add         DST_R, DST_R, #lowbit
   480 .endif
   481     PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
   482     sub         W, W, #(lowbit * 8 / dst_w_bpp)
   483 1:
   484 .endif
   485 .endr
   486     pixdeinterleave src_bpp, src_basereg
   487     pixdeinterleave mask_bpp, mask_basereg
   488     pixdeinterleave dst_r_bpp, dst_r_basereg
   490     process_pixblock_head
   491     cache_preload 0, pixblock_size
   492     cache_preload_simple
   493     process_pixblock_tail
   495     pixinterleave dst_w_bpp, dst_w_basereg
   496 .irp lowbit, 1, 2, 4, 8, 16
   497 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   498 .if lowbit < 16 /* we don't need more than 16-byte alignment */
   499     tst         DST_W, #lowbit
   500     beq         1f
   501 .endif
   502     pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
   503 1:
   504 .endif
   505 .endr
   506 .endif
   507 2:
   508 .endm
   510 /*
   511  * Special code for processing up to (pixblock_size - 1) remaining
   512  * trailing pixels. As SIMD processing performs operations on
   513  * pixblock_size pixels, anything smaller than this has to be loaded
   514  * and stored in a special way. Loading and storing of pixel data is
   515  * performed in such a way that we fill some 'slots' in the NEON
   516  * registers (some slots naturally are unused), then perform compositing
   517  * operation as usual. In the end, the data is taken from these 'slots'
   518  * and saved to memory.
   519  *
   520  * cache_preload_flag - allows prefetch to be suppressed by
   521  *                      setting it to 0
   522  * dst_aligned_flag   - selects whether destination buffer
   523  *                      is aligned
   524  */
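/*
 * Worked example (illustrative, not part of the original file): with
 * pixblock_size = 8 and 5 trailing pixels left (W & 7 == 5), the .irp loop
 * below tests the chunk sizes 4, 2 and 1 against W and processes a 4-pixel
 * chunk followed by a 1-pixel chunk; the unused register 'slots' just carry
 * garbage through the compositing step and are never stored back.
 */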
   525 .macro process_trailing_pixels cache_preload_flag, \
   526                                dst_aligned_flag, \
   527                                process_pixblock_head, \
   528                                process_pixblock_tail, \
   529                                process_pixblock_tail_head
   530     tst         W, #(pixblock_size - 1)
   531     beq         2f
   532 .irp chunk_size, 16, 8, 4, 2, 1
   533 .if pixblock_size > chunk_size
   534     tst         W, #chunk_size
   535     beq         1f
   536     pixld_src   chunk_size, src_bpp, src_basereg, SRC
   537     pixld       chunk_size, mask_bpp, mask_basereg, MASK
   538 .if dst_aligned_flag != 0
   539     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   540 .else
   541     pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   542 .endif
   543 .if cache_preload_flag != 0
   544     PF add      PF_X, PF_X, #chunk_size
   545 .endif
   546 1:
   547 .endif
   548 .endr
   549     pixdeinterleave src_bpp, src_basereg
   550     pixdeinterleave mask_bpp, mask_basereg
   551     pixdeinterleave dst_r_bpp, dst_r_basereg
   553     process_pixblock_head
   554 .if cache_preload_flag != 0
   555     cache_preload 0, pixblock_size
   556     cache_preload_simple
   557 .endif
   558     process_pixblock_tail
   559     pixinterleave dst_w_bpp, dst_w_basereg
   560 .irp chunk_size, 16, 8, 4, 2, 1
   561 .if pixblock_size > chunk_size
   562     tst         W, #chunk_size
   563     beq         1f
   564 .if dst_aligned_flag != 0
   565     pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   566 .else
   567     pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   568 .endif
   569 1:
   570 .endif
   571 .endr
   572 2:
   573 .endm
   575 /*
   576  * Macro which performs all the operations needed to switch to the next
   577  * scanline and start the next loop iteration unless all the scanlines
   578  * are already processed.
   579  */
   580 .macro advance_to_next_scanline start_of_loop_label
   581 .if regs_shortage
   582     ldrd        W, [sp] /* load W and H (width and height) from stack */
   583 .else
   584     mov         W, ORIG_W
   585 .endif
   586     add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
   587 .if src_bpp != 0
   588     add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
   589 .endif
   590 .if mask_bpp != 0
   591     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
   592 .endif
   593 .if (dst_w_bpp != 24)
   594     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
   595 .endif
   596 .if (src_bpp != 24) && (src_bpp != 0)
   597     sub         SRC, SRC, W, lsl #src_bpp_shift
   598 .endif
   599 .if (mask_bpp != 24) && (mask_bpp != 0)
   600     sub         MASK, MASK, W, lsl #mask_bpp_shift
   601 .endif
   602     subs        H, H, #1
   603     mov         DST_R, DST_W
   604 .if regs_shortage
   605     str         H, [sp, #4] /* save updated height to stack */
   606 .endif
   607     bge         start_of_loop_label
   608 .endm
   610 /*
   611  * Registers are allocated in the following way by default:
   612  * d0, d1, d2, d3     - reserved for loading source pixel data
   613  * d4, d5, d6, d7     - reserved for loading destination pixel data
   614  * d24, d25, d26, d27 - reserved for loading mask pixel data
   615  * d28, d29, d30, d31 - final destination pixel data for writeback to memory
   616  */
   617 .macro generate_composite_function fname, \
   618                                    src_bpp_, \
   619                                    mask_bpp_, \
   620                                    dst_w_bpp_, \
   621                                    flags, \
   622                                    pixblock_size_, \
   623                                    prefetch_distance, \
   624                                    init, \
   625                                    cleanup, \
   626                                    process_pixblock_head, \
   627                                    process_pixblock_tail, \
   628                                    process_pixblock_tail_head, \
   629                                    dst_w_basereg_ = 28, \
   630                                    dst_r_basereg_ = 4, \
   631                                    src_basereg_   = 0, \
   632                                    mask_basereg_  = 24
   634     .func fname
   635     .global fname
   636     /* For ELF format also set function visibility to hidden */
   637 #ifdef __ELF__
   638     .hidden fname
   639     .type fname, %function
   640 #endif
   641 fname:
   642     .fnstart
   643     .save       {r4-r12, lr}
   644     push        {r4-r12, lr}        /* save all registers */
   646 /*
   647  * Select the prefetch type for this function. If the prefetch distance is
   648  * set to 0, prefetch is disabled; if one of the color formats is 24bpp,
   649  * SIMPLE prefetch has to be used instead of ADVANCED.
   650  */
   651     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
   652 .if prefetch_distance == 0
   653     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   654 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
   655         ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
   656     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
   657 .endif
   659 /*
   660  * Make some macro arguments globally visible and accessible
   661  * from other macros
   662  */
   663     .set src_bpp, src_bpp_
   664     .set mask_bpp, mask_bpp_
   665     .set dst_w_bpp, dst_w_bpp_
   666     .set pixblock_size, pixblock_size_
   667     .set dst_w_basereg, dst_w_basereg_
   668     .set dst_r_basereg, dst_r_basereg_
   669     .set src_basereg, src_basereg_
   670     .set mask_basereg, mask_basereg_
   672     .macro pixld_src x:vararg
   673         pixld x
   674     .endm
   675     .macro fetch_src_pixblock
   676         pixld_src   pixblock_size, src_bpp, \
   677                     (src_basereg - pixblock_size * src_bpp / 64), SRC
   678     .endm
   679 /*
   680  * Assign symbolic names to registers
   681  */
   682     W           .req        r0      /* width (is updated during processing) */
   683     H           .req        r1      /* height (is updated during processing) */
   684     DST_W       .req        r2      /* destination buffer pointer for writes */
   685     DST_STRIDE  .req        r3      /* destination image stride */
   686     SRC         .req        r4      /* source buffer pointer */
   687     SRC_STRIDE  .req        r5      /* source image stride */
   688     DST_R       .req        r6      /* destination buffer pointer for reads */
   690     MASK        .req        r7      /* mask pointer */
   691     MASK_STRIDE .req        r8      /* mask stride */
   693     PF_CTL      .req        r9      /* combined lines counter and prefetch */
   694                                     /* distance increment counter */
   695     PF_X        .req        r10     /* pixel index in a scanline for current */
   696                                     /* prefetch position */
   697     PF_SRC      .req        r11     /* pointer to source scanline start */
   698                                     /* for prefetch purposes */
   699     PF_DST      .req        r12     /* pointer to destination scanline start */
   700                                     /* for prefetch purposes */
   701     PF_MASK     .req        r14     /* pointer to mask scanline start */
   702                                     /* for prefetch purposes */
   703 /*
   704  * Check whether we have enough registers for all the local variables.
   705  * If we don't have enough registers, original width and height are
   706  * kept on top of the stack (and the 'regs_shortage' variable is set to indicate
   707  * this for the rest of the code). Even if there are enough registers, the
   708  * allocation scheme may be a bit different depending on whether source
   709  * or mask is not used.
   710  */
   711 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
   712     ORIG_W      .req        r10     /* saved original width */
   713     DUMMY       .req        r12     /* temporary register */
   714     .set        regs_shortage, 0
   715 .elseif mask_bpp == 0
   716     ORIG_W      .req        r7      /* saved original width */
   717     DUMMY       .req        r8      /* temporary register */
   718     .set        regs_shortage, 0
   719 .elseif src_bpp == 0
   720     ORIG_W      .req        r4      /* saved original width */
   721     DUMMY       .req        r5      /* temporary register */
   722     .set        regs_shortage, 0
   723 .else
   724     ORIG_W      .req        r1      /* saved original width */
   725     DUMMY       .req        r1      /* temporary register */
   726     .set        regs_shortage, 1
   727 .endif
   729     .set mask_bpp_shift, -1
   730 .if src_bpp == 32
   731     .set src_bpp_shift, 2
   732 .elseif src_bpp == 24
   733     .set src_bpp_shift, 0
   734 .elseif src_bpp == 16
   735     .set src_bpp_shift, 1
   736 .elseif src_bpp == 8
   737     .set src_bpp_shift, 0
   738 .elseif src_bpp == 0
   739     .set src_bpp_shift, -1
   740 .else
   741     .error "requested src bpp (src_bpp) is not supported"
   742 .endif
   743 .if mask_bpp == 32
   744     .set mask_bpp_shift, 2
   745 .elseif mask_bpp == 24
   746     .set mask_bpp_shift, 0
   747 .elseif mask_bpp == 8
   748     .set mask_bpp_shift, 0
   749 .elseif mask_bpp == 0
   750     .set mask_bpp_shift, -1
   751 .else
   752     .error "requested mask bpp (mask_bpp) is not supported"
   753 .endif
   754 .if dst_w_bpp == 32
   755     .set dst_bpp_shift, 2
   756 .elseif dst_w_bpp == 24
   757     .set dst_bpp_shift, 0
   758 .elseif dst_w_bpp == 16
   759     .set dst_bpp_shift, 1
   760 .elseif dst_w_bpp == 8
   761     .set dst_bpp_shift, 0
   762 .else
   763     .error "requested dst bpp (dst_w_bpp) is not supported"
   764 .endif
   766 .if (((flags) & FLAG_DST_READWRITE) != 0)
   767     .set dst_r_bpp, dst_w_bpp
   768 .else
   769     .set dst_r_bpp, 0
   770 .endif
   771 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   772     .set DEINTERLEAVE_32BPP_ENABLED, 1
   773 .else
   774     .set DEINTERLEAVE_32BPP_ENABLED, 0
   775 .endif
   777 .if prefetch_distance < 0 || prefetch_distance > 15
   778     .error "invalid prefetch distance (prefetch_distance)"
   779 .endif
   781 .if src_bpp > 0
   782     ldr         SRC, [sp, #40]
   783 .endif
   784 .if mask_bpp > 0
   785     ldr         MASK, [sp, #48]
   786 .endif
   787     PF mov      PF_X, #0
   788 .if src_bpp > 0
   789     ldr         SRC_STRIDE, [sp, #44]
   790 .endif
   791 .if mask_bpp > 0
   792     ldr         MASK_STRIDE, [sp, #52]
   793 .endif
   794     mov         DST_R, DST_W
   796 .if src_bpp == 24
   797     sub         SRC_STRIDE, SRC_STRIDE, W
   798     sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
   799 .endif
   800 .if mask_bpp == 24
   801     sub         MASK_STRIDE, MASK_STRIDE, W
   802     sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
   803 .endif
   804 .if dst_w_bpp == 24
   805     sub         DST_STRIDE, DST_STRIDE, W
   806     sub         DST_STRIDE, DST_STRIDE, W, lsl #1
   807 .endif
   809 /*
   810  * Setup advanced prefetcher initial state
   811  */
   812     PF mov      PF_SRC, SRC
   813     PF mov      PF_DST, DST_R
   814     PF mov      PF_MASK, MASK
   815     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
   816     PF mov      PF_CTL, H, lsl #4
   817     PF add      PF_CTL, #(prefetch_distance - 0x10)
   819     init
   820 .if regs_shortage
   821     .save       {r0, r1}
   822     push        {r0, r1}
   823 .endif
   824     subs        H, H, #1
   825 .if regs_shortage
   826     str         H, [sp, #4] /* save updated height to stack */
   827 .else
   828     mov         ORIG_W, W
   829 .endif
   830     blt         9f
   831     cmp         W, #(pixblock_size * 2)
   832     blt         8f
   833 /*
   834  * This is the start of the pipelined loop, which is optimized for
   835  * long scanlines
   836  */
   837 0:
   838     ensure_destination_ptr_alignment process_pixblock_head, \
   839                                      process_pixblock_tail, \
   840                                      process_pixblock_tail_head
   842     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   843     pixld_a     pixblock_size, dst_r_bpp, \
   844                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   845     fetch_src_pixblock
   846     pixld       pixblock_size, mask_bpp, \
   847                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   848     PF add      PF_X, PF_X, #pixblock_size
   849     process_pixblock_head
   850     cache_preload 0, pixblock_size
   851     cache_preload_simple
   852     subs        W, W, #(pixblock_size * 2)
   853     blt         2f
   854 1:
   855     process_pixblock_tail_head
   856     cache_preload_simple
   857     subs        W, W, #pixblock_size
   858     bge         1b
   859 2:
   860     process_pixblock_tail
   861     pixst_a     pixblock_size, dst_w_bpp, \
   862                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   864     /* Process the remaining trailing pixels in the scanline */
   865     process_trailing_pixels 1, 1, \
   866                             process_pixblock_head, \
   867                             process_pixblock_tail, \
   868                             process_pixblock_tail_head
   869     advance_to_next_scanline 0b
   871 .if regs_shortage
   872     pop         {r0, r1}
   873 .endif
   874     cleanup
   875     pop         {r4-r12, pc}  /* exit */
   876 /*
   877  * This is the start of the loop, designed to process images with small width
   878  * (less than pixblock_size * 2 pixels). In this case neither pipelining
   879  * nor prefetch is used.
   880  */
   881 8:
   882     /* Process exactly pixblock_size pixels if needed */
   883     tst         W, #pixblock_size
   884     beq         1f
   885     pixld       pixblock_size, dst_r_bpp, \
   886                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   887     fetch_src_pixblock
   888     pixld       pixblock_size, mask_bpp, \
   889                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   890     process_pixblock_head
   891     process_pixblock_tail
   892     pixst       pixblock_size, dst_w_bpp, \
   893                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   894 1:
   895     /* Process the remaining trailing pixels in the scanline */
   896     process_trailing_pixels 0, 0, \
   897                             process_pixblock_head, \
   898                             process_pixblock_tail, \
   899                             process_pixblock_tail_head
   900     advance_to_next_scanline 8b
   901 9:
   902 .if regs_shortage
   903     pop         {r0, r1}
   904 .endif
   905     cleanup
   906     pop         {r4-r12, pc}  /* exit */
   907     .fnend
   909     .purgem     fetch_src_pixblock
   910     .purgem     pixld_src
   912     .unreq      SRC
   913     .unreq      MASK
   914     .unreq      DST_R
   915     .unreq      DST_W
   916     .unreq      ORIG_W
   917     .unreq      W
   918     .unreq      H
   919     .unreq      SRC_STRIDE
   920     .unreq      DST_STRIDE
   921     .unreq      MASK_STRIDE
   922     .unreq      PF_CTL
   923     .unreq      PF_X
   924     .unreq      PF_SRC
   925     .unreq      PF_DST
   926     .unreq      PF_MASK
   927     .unreq      DUMMY
   928     .endfunc
   929 .endm
   931 /*
   932  * A simplified variant of the function generation template for single
   933  * scanline processing (for implementing pixman combine functions)
   934  */
   935 .macro generate_composite_function_scanline        use_nearest_scaling, \
   936                                                    fname, \
   937                                                    src_bpp_, \
   938                                                    mask_bpp_, \
   939                                                    dst_w_bpp_, \
   940                                                    flags, \
   941                                                    pixblock_size_, \
   942                                                    init, \
   943                                                    cleanup, \
   944                                                    process_pixblock_head, \
   945                                                    process_pixblock_tail, \
   946                                                    process_pixblock_tail_head, \
   947                                                    dst_w_basereg_ = 28, \
   948                                                    dst_r_basereg_ = 4, \
   949                                                    src_basereg_   = 0, \
   950                                                    mask_basereg_  = 24
   952     .func fname
   953     .global fname
   954     /* For ELF format also set function visibility to hidden */
   955 #ifdef __ELF__
   956     .hidden fname
   957     .type fname, %function
   958 #endif
   959 fname:
   960     .fnstart
   961     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   962 /*
   963  * Make some macro arguments globally visible and accessible
   964  * from other macros
   965  */
   966     .set src_bpp, src_bpp_
   967     .set mask_bpp, mask_bpp_
   968     .set dst_w_bpp, dst_w_bpp_
   969     .set pixblock_size, pixblock_size_
   970     .set dst_w_basereg, dst_w_basereg_
   971     .set dst_r_basereg, dst_r_basereg_
   972     .set src_basereg, src_basereg_
   973     .set mask_basereg, mask_basereg_
   975 .if use_nearest_scaling != 0
   976     /*
   977      * Assign symbolic names to registers for nearest scaling
   978      */
   979     W           .req        r0
   980     DST_W       .req        r1
   981     SRC         .req        r2
   982     VX          .req        r3
   983     UNIT_X      .req        ip
   984     MASK        .req        lr
   985     TMP1        .req        r4
   986     TMP2        .req        r5
   987     DST_R       .req        r6
   988     SRC_WIDTH_FIXED .req        r7
   990     .macro pixld_src x:vararg
   991         pixld_s x
   992     .endm
   994     ldr         UNIT_X, [sp]
   995     .save       {r4-r8, lr}
   996     push        {r4-r8, lr}
   997     ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
   998     .if mask_bpp != 0
   999     ldr         MASK, [sp, #(24 + 8)]
  1000     .endif
  1001 .else
  1002     /*
  1003      * Assign symbolic names to registers
  1004      */
  1005     W           .req        r0      /* width (is updated during processing) */
  1006     DST_W       .req        r1      /* destination buffer pointer for writes */
  1007     SRC         .req        r2      /* source buffer pointer */
  1008     DST_R       .req        ip      /* destination buffer pointer for reads */
  1009     MASK        .req        r3      /* mask pointer */
  1011     .macro pixld_src x:vararg
  1012         pixld x
  1013     .endm
  1014 .endif
  1016 .if (((flags) & FLAG_DST_READWRITE) != 0)
  1017     .set dst_r_bpp, dst_w_bpp
  1018 .else
  1019     .set dst_r_bpp, 0
  1020 .endif
  1021 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
  1022     .set DEINTERLEAVE_32BPP_ENABLED, 1
  1023 .else
  1024     .set DEINTERLEAVE_32BPP_ENABLED, 0
  1025 .endif
  1027     .macro fetch_src_pixblock
  1028         pixld_src   pixblock_size, src_bpp, \
  1029                     (src_basereg - pixblock_size * src_bpp / 64), SRC
  1030     .endm
  1032     init
  1033     mov         DST_R, DST_W
  1035     cmp         W, #pixblock_size
  1036     blt         8f
  1038     ensure_destination_ptr_alignment process_pixblock_head, \
  1039                                      process_pixblock_tail, \
  1040                                      process_pixblock_tail_head
  1042     subs        W, W, #pixblock_size
  1043     blt         7f
  1045     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
  1046     pixld_a     pixblock_size, dst_r_bpp, \
  1047                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
  1048     fetch_src_pixblock
  1049     pixld       pixblock_size, mask_bpp, \
  1050                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
  1051     process_pixblock_head
  1052     subs        W, W, #pixblock_size
  1053     blt         2f
  1054 1:
  1055     process_pixblock_tail_head
  1056     subs        W, W, #pixblock_size
  1057     bge         1b
  1058 2:
  1059     process_pixblock_tail
  1060     pixst_a     pixblock_size, dst_w_bpp, \
  1061                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
  1062 7:
  1063     /* Process the remaining trailing pixels in the scanline (dst aligned) */
  1064     process_trailing_pixels 0, 1, \
  1065                             process_pixblock_head, \
  1066                             process_pixblock_tail, \
  1067                             process_pixblock_tail_head
  1069     cleanup
  1070 .if use_nearest_scaling != 0
  1071     pop         {r4-r8, pc}  /* exit */
  1072 .else
  1073     bx          lr  /* exit */
  1074 .endif
  1075 8:
  1076     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
  1077     process_trailing_pixels 0, 0, \
  1078                             process_pixblock_head, \
  1079                             process_pixblock_tail, \
  1080                             process_pixblock_tail_head
  1082     cleanup
  1084 .if use_nearest_scaling != 0
  1085     pop         {r4-r8, pc}  /* exit */
  1087     .unreq      DST_R
  1088     .unreq      SRC
  1089     .unreq      W
  1090     .unreq      VX
  1091     .unreq      UNIT_X
  1092     .unreq      TMP1
  1093     .unreq      TMP2
  1094     .unreq      DST_W
  1095     .unreq      MASK
  1096     .unreq      SRC_WIDTH_FIXED
  1098 .else
  1099     bx          lr  /* exit */
  1101     .unreq      SRC
  1102     .unreq      MASK
  1103     .unreq      DST_R
  1104     .unreq      DST_W
  1105     .unreq      W
  1106 .endif
  1108     .purgem     fetch_src_pixblock
  1109     .purgem     pixld_src
  1111     .fnend
  1112     .endfunc
  1113 .endm
  1115 .macro generate_composite_function_single_scanline x:vararg
  1116     generate_composite_function_scanline 0, x
  1117 .endm
  1119 .macro generate_composite_function_nearest_scanline x:vararg
  1120     generate_composite_function_scanline 1, x
  1121 .endm
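/*
 * Usage sketch (illustrative; the names below are assumptions loosely modeled
 * on 'pixman-arm-neon-asm.S' and may not match it verbatim):
 *
 *     generate_composite_function_single_scanline \
 *         pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8,  \    <- pixblock_size
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 *
 * Unlike 'generate_composite_function', the scanline variants take no
 * prefetch distance argument: prefetch is disabled for single scanlines.
 */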
  1123 /* Default prologue/epilogue, nothing special needs to be done */
  1125 .macro default_init
  1126 .endm
  1128 .macro default_cleanup
  1129 .endm
  1131 /*
  1132  * Prologue/epilogue variant which additionally saves/restores d8-d15
  1133  * registers (they need to be saved/restored by the callee according to the ABI).
  1134  * This is required if the code needs to use all the NEON registers.
  1135  */
  1137 .macro default_init_need_all_regs
  1138     .vsave      {d8-d15}
  1139     vpush       {d8-d15}
  1140 .endm
  1142 .macro default_cleanup_need_all_regs
  1143     vpop        {d8-d15}
  1144 .endm
  1146 /******************************************************************************/
  1148 /*
  1149  * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
  1150  * into a planar a8r8g8b8 format (with a, r, g, b color components
  1151  * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
  1153  * Warning: the conversion is destructive and the original
  1154  *          value (in) is lost.
  1155  */
  1156 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
  1157     vshrn.u16   out_r, in,    #8
  1158     vshrn.u16   out_g, in,    #3
  1159     vsli.u16    in,    in,    #5
  1160     vmov.u8     out_a, #255
  1161     vsri.u8     out_r, out_r, #5
  1162     vsri.u8     out_g, out_g, #6
  1163     vshrn.u16   out_b, in,    #2
  1164 .endm
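/*
 * Illustrative walk-through (not part of the original file) for one 16-bit
 * lane laid out as rrrrrggggggbbbbb:
 *   - vshrn.u16 #8 gives out_r = rrrrrggg; vsri.u8 #5 then replicates the
 *     top red bits into the low bits, expanding the 5-bit field to rrrrrrrr
 *   - vshrn.u16 #3 gives out_g = ggggggbb; vsri.u8 #6 expands it to gggggggg
 *   - vsli.u16 #5 moves the 5 blue bits up so that vshrn.u16 #2 produces
 *     out_b = bbbbbbbb in the same way
 *   - out_a is simply set to 255
 */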
  1166 .macro convert_0565_to_x888 in, out_r, out_g, out_b
  1167     vshrn.u16   out_r, in,    #8
  1168     vshrn.u16   out_g, in,    #3
  1169     vsli.u16    in,    in,    #5
  1170     vsri.u8     out_r, out_r, #5
  1171     vsri.u8     out_g, out_g, #6
  1172     vshrn.u16   out_b, in,    #2
  1173 .endm
  1175 /*
  1176  * Conversion from planar a8r8g8b8 format (with the r, g, b color components
  1177  * in 64-bit registers in_r, in_g, in_b respectively; alpha is ignored) into 8
  1178  * r5g6b5 pixels packed in a 128-bit register (out). Requires two temporary 128-bit
  1179  * registers (tmp1, tmp2)
  1180  */
  1181 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
  1182     vshll.u8    tmp1, in_g, #8
  1183     vshll.u8    out, in_r, #8
  1184     vshll.u8    tmp2, in_b, #8
  1185     vsri.u16    out, tmp1, #5
  1186     vsri.u16    out, tmp2, #11
  1187 .endm
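/*
 * Illustrative note (not part of the original file): each channel is first
 * widened into the top 8 bits of a 16-bit lane (vshll.u8 #8); vsri.u16 #5
 * then inserts the top 6 green bits below the 5 red bits, and vsri.u16 #11
 * inserts the top 5 blue bits into the remaining low bits, yielding
 * rrrrrggggggbbbbb. The low bits of each 8-bit channel are truncated away.
 */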
  1189 /*
  1190  * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
  1191  * returned in the (out0, out1) register pair. Requires one temporary
  1192  * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
  1193  * value from 'in' is lost.
  1194  */
  1195 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
  1196     vshl.u16    out0, in,   #5  /* G top 6 bits */
  1197     vshl.u16    tmp,  in,   #11 /* B top 5 bits */
  1198     vsri.u16    in,   in,   #5  /* R is ready in top bits */
  1199     vsri.u16    out0, out0, #6  /* G is ready in top bits */
  1200     vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
  1201     vshr.u16    out1, in,   #8  /* R is in place */
  1202     vsri.u16    out0, tmp,  #8  /* G & B is in place */
  1203     vzip.u16    out0, out1      /* everything is in place */
  1204 .endm
