Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
26 /*
27 * This file contains a macro ('generate_composite_function') which can
28 * construct 2D image processing functions based on a common template.
29 * Any combination of source, destination and mask images with 8bpp,
30 * 16bpp, 24bpp or 32bpp color formats is supported.
31 *
32 * This macro takes care of:
33 * - handling of leading and trailing unaligned pixels
34 * - doing most of the work related to L2 cache preload
35 * - encouraging the use of software pipelining for better instruction
36 * scheduling
37 *
38 * The user of this macro has to provide some configuration parameters
39 * (bit depths for the images, prefetch distance, etc.) and a set of
40 * macros that implement the basic code chunks responsible for
41 * pixel processing. See the 'pixman-arm-neon-asm.S' file for real usage
42 * examples; an illustrative sketch of one invocation follows this comment.
43 *
44 * TODO:
45 * - try overlapped pixel method (from Ian Rickards) when processing
46 * exactly two blocks of pixels
47 * - maybe add an option to do reverse scanline processing
48 */
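/*
 * Illustrative sketch only (not part of this header): the real invocations
 * live in 'pixman-arm-neon-asm.S'. Assuming the
 * pixman_composite_over_8888_0565_* pixblock macros are defined there, a
 * typical use of the template looks roughly like this:
 *
 *     generate_composite_function \
 *         pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8, \
 *         5, \
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_0565_process_pixblock_head, \
 *         pixman_composite_over_8888_0565_process_pixblock_tail, \
 *         pixman_composite_over_8888_0565_process_pixblock_tail_head
 *
 * Here 32, 0 and 16 are the source, mask and destination bpp, 8 is the
 * pixblock size and 5 is the prefetch distance; the trailing base register
 * arguments are left at their defaults.
 */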
50 /*
51 * Bit flags for the 'generate_composite_function' macro, used
52 * to tune the behavior of the generated functions.
53 */
54 .set FLAG_DST_WRITEONLY, 0
55 .set FLAG_DST_READWRITE, 1
56 .set FLAG_DEINTERLEAVE_32BPP, 2
58 /*
59 * Offset into the stack where the mask and source pointers/strides can be accessed
60 * from the 'init' macro. This is useful for special handling of a solid mask.
61 */
62 .set ARGS_STACK_OFFSET, 40
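/*
 * Illustrative sketch (not part of this header): an 'init' macro for a solid
 * source or mask can pick the solid color up from this stack location,
 * roughly along these lines (the register choice here is just an example):
 *
 *     add     DUMMY, sp, #ARGS_STACK_OFFSET
 *     vld1.32 {d3[0]}, [DUMMY]       @ load the solid a8r8g8b8 color
 *     vdup.8  d0, d3[0]              @ splat the individual channels
 *     vdup.8  d1, d3[1]
 *     vdup.8  d2, d3[2]
 *     vdup.8  d3, d3[3]
 */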
64 /*
65 * Constants for selecting the preferred prefetch type.
66 */
67 .set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
68 .set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
69 .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
71 /*
72 * Definitions of supplementary pixld/pixst macros (for partial load/store of
73 * pixel data).
74 */
76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
77 .if abits > 0
78 op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
79 .else
80 op&.&elem_size {d&reg1}, [&mem_operand&]!
81 .endif
82 .endm
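/*
 * For reference, an invocation like 'pixldst1 vld1, 8, 1, SRC, 64' expands
 * to 'vld1.8 {d1}, [SRC, :64]!' (the '&' operators are gas macro argument
 * concatenation).
 */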
84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
85 .if abits > 0
86 op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
87 .else
88 op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
89 .endif
90 .endm
92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
93 .if abits > 0
94 op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
95 .else
96 op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
97 .endif
98 .endm
100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
101 op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
102 .endm
104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
105 op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
106 .endm
108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
109 op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
110 .endm
112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
113 .if numbytes == 32
114 pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
115 %(basereg+6), %(basereg+7), mem_operand, abits
116 .elseif numbytes == 16
117 pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
118 .elseif numbytes == 8
119 pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
120 .elseif numbytes == 4
121 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
122 pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
123 .elseif elem_size == 16
124 pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
125 pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
126 .else
127 pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
128 pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
129 pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
130 pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
131 .endif
132 .elseif numbytes == 2
133 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
134 pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
135 .else
136 pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
137 pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
138 .endif
139 .elseif numbytes == 1
140 pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
141 .else
142 .error "unsupported size: numbytes"
143 .endif
144 .endm
146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
147 .if bpp > 0
148 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
149 pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
150 %(basereg+6), %(basereg+7), mem_operand, abits
151 .elseif (bpp == 24) && (numpix == 8)
152 pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
153 .elseif (bpp == 24) && (numpix == 4)
154 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
155 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
156 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
157 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
158 .elseif (bpp == 24) && (numpix == 2)
159 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
160 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
161 .elseif (bpp == 24) && (numpix == 1)
162 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
163 .else
164 pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
165 .endif
166 .endif
167 .endm
169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
170 .if bpp > 0
171 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
172 pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
173 %(basereg+6), %(basereg+7), mem_operand, abits
174 .elseif (bpp == 24) && (numpix == 8)
175 pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
176 .elseif (bpp == 24) && (numpix == 4)
177 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
178 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
179 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
180 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
181 .elseif (bpp == 24) && (numpix == 2)
182 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
183 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
184 .elseif (bpp == 24) && (numpix == 1)
185 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
186 .else
187 pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
188 .endif
189 .endif
190 .endm
192 .macro pixld_a numpix, bpp, basereg, mem_operand
193 .if (bpp * numpix) <= 128
194 pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
195 .else
196 pixld numpix, bpp, basereg, mem_operand, 128
197 .endif
198 .endm
200 .macro pixst_a numpix, bpp, basereg, mem_operand
201 .if (bpp * numpix) <= 128
202 pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
203 .else
204 pixst numpix, bpp, basereg, mem_operand, 128
205 .endif
206 .endm
208 /*
209 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
210 * aliases to be defined)
211 */
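/*
 * Note: VX and UNIT_X follow the usual pixman 16.16 fixed-point convention,
 * so 'VX asr #16' is the integer source pixel index and UNIT_X is the step
 * per destination pixel (e.g. UNIT_X = 0x18000 means a step of 1.5 source
 * pixels). SRC_WIDTH_FIXED is the source width in the same 16.16 format and
 * the subpls/bpl loops below wrap VX for NORMAL repeat.
 */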
212 .macro pixld1_s elem_size, reg1, mem_operand
213 .if elem_size == 16
214 mov TMP1, VX, asr #16
215 adds VX, VX, UNIT_X
216 5: subpls VX, VX, SRC_WIDTH_FIXED
217 bpl 5b
218 add TMP1, mem_operand, TMP1, asl #1
219 mov TMP2, VX, asr #16
220 adds VX, VX, UNIT_X
221 5: subpls VX, VX, SRC_WIDTH_FIXED
222 bpl 5b
223 add TMP2, mem_operand, TMP2, asl #1
224 vld1.16 {d&reg1&[0]}, [TMP1, :16]
225 mov TMP1, VX, asr #16
226 adds VX, VX, UNIT_X
227 5: subpls VX, VX, SRC_WIDTH_FIXED
228 bpl 5b
229 add TMP1, mem_operand, TMP1, asl #1
230 vld1.16 {d&reg1&[1]}, [TMP2, :16]
231 mov TMP2, VX, asr #16
232 adds VX, VX, UNIT_X
233 5: subpls VX, VX, SRC_WIDTH_FIXED
234 bpl 5b
235 add TMP2, mem_operand, TMP2, asl #1
236 vld1.16 {d&reg1&[2]}, [TMP1, :16]
237 vld1.16 {d&reg1&[3]}, [TMP2, :16]
238 .elseif elem_size == 32
239 mov TMP1, VX, asr #16
240 adds VX, VX, UNIT_X
241 5: subpls VX, VX, SRC_WIDTH_FIXED
242 bpl 5b
243 add TMP1, mem_operand, TMP1, asl #2
244 mov TMP2, VX, asr #16
245 adds VX, VX, UNIT_X
246 5: subpls VX, VX, SRC_WIDTH_FIXED
247 bpl 5b
248 add TMP2, mem_operand, TMP2, asl #2
249 vld1.32 {d&reg1&[0]}, [TMP1, :32]
250 vld1.32 {d&reg1&[1]}, [TMP2, :32]
251 .else
252 .error "unsupported"
253 .endif
254 .endm
256 .macro pixld2_s elem_size, reg1, reg2, mem_operand
257 .if 0 /* elem_size == 32 */
258 mov TMP1, VX, asr #16
259 add VX, VX, UNIT_X, asl #1
260 add TMP1, mem_operand, TMP1, asl #2
261 mov TMP2, VX, asr #16
262 sub VX, VX, UNIT_X
263 add TMP2, mem_operand, TMP2, asl #2
264 vld1.32 {d&reg1&[0]}, [TMP1, :32]
265 mov TMP1, VX, asr #16
266 add VX, VX, UNIT_X, asl #1
267 add TMP1, mem_operand, TMP1, asl #2
268 vld1.32 {d&reg2&[0]}, [TMP2, :32]
269 mov TMP2, VX, asr #16
270 add VX, VX, UNIT_X
271 add TMP2, mem_operand, TMP2, asl #2
272 vld1.32 {d&reg1&[1]}, [TMP1, :32]
273 vld1.32 {d&reg2&[1]}, [TMP2, :32]
274 .else
275 pixld1_s elem_size, reg1, mem_operand
276 pixld1_s elem_size, reg2, mem_operand
277 .endif
278 .endm
280 .macro pixld0_s elem_size, reg1, idx, mem_operand
281 .if elem_size == 16
282 mov TMP1, VX, asr #16
283 adds VX, VX, UNIT_X
284 5: subpls VX, VX, SRC_WIDTH_FIXED
285 bpl 5b
286 add TMP1, mem_operand, TMP1, asl #1
287 vld1.16 {d&reg1&[idx]}, [TMP1, :16]
288 .elseif elem_size == 32
289 mov TMP1, VX, asr #16
290 adds VX, VX, UNIT_X
291 5: subpls VX, VX, SRC_WIDTH_FIXED
292 bpl 5b
293 add TMP1, mem_operand, TMP1, asl #2
294 vld1.32 {d&reg1&[idx]}, [TMP1, :32]
295 .endif
296 .endm
298 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
299 .if numbytes == 32
300 pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
301 pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
302 pixdeinterleave elem_size, %(basereg+4)
303 .elseif numbytes == 16
304 pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
305 .elseif numbytes == 8
306 pixld1_s elem_size, %(basereg+1), mem_operand
307 .elseif numbytes == 4
308 .if elem_size == 32
309 pixld0_s elem_size, %(basereg+0), 1, mem_operand
310 .elseif elem_size == 16
311 pixld0_s elem_size, %(basereg+0), 2, mem_operand
312 pixld0_s elem_size, %(basereg+0), 3, mem_operand
313 .else
314 pixld0_s elem_size, %(basereg+0), 4, mem_operand
315 pixld0_s elem_size, %(basereg+0), 5, mem_operand
316 pixld0_s elem_size, %(basereg+0), 6, mem_operand
317 pixld0_s elem_size, %(basereg+0), 7, mem_operand
318 .endif
319 .elseif numbytes == 2
320 .if elem_size == 16
321 pixld0_s elem_size, %(basereg+0), 1, mem_operand
322 .else
323 pixld0_s elem_size, %(basereg+0), 2, mem_operand
324 pixld0_s elem_size, %(basereg+0), 3, mem_operand
325 .endif
326 .elseif numbytes == 1
327 pixld0_s elem_size, %(basereg+0), 1, mem_operand
328 .else
329 .error "unsupported size: numbytes"
330 .endif
331 .endm
333 .macro pixld_s numpix, bpp, basereg, mem_operand
334 .if bpp > 0
335 pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
336 .endif
337 .endm
339 .macro vuzp8 reg1, reg2
340 vuzp.8 d&reg1, d&reg2
341 .endm
343 .macro vzip8 reg1, reg2
344 vzip.8 d&reg1, d&reg2
345 .endm
347 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
348 .macro pixdeinterleave bpp, basereg
349 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
350 vuzp8 %(basereg+0), %(basereg+1)
351 vuzp8 %(basereg+2), %(basereg+3)
352 vuzp8 %(basereg+1), %(basereg+3)
353 vuzp8 %(basereg+0), %(basereg+2)
354 .endif
355 .endm
357 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
358 .macro pixinterleave bpp, basereg
359 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
360 vzip8 %(basereg+0), %(basereg+2)
361 vzip8 %(basereg+1), %(basereg+3)
362 vzip8 %(basereg+2), %(basereg+3)
363 vzip8 %(basereg+0), %(basereg+1)
364 .endif
365 .endm
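/*
 * For example (assuming little-endian a8r8g8b8 data, i.e. B, G, R, A byte
 * order in memory), with basereg = 0 the vuzp8 sequence above turns
 *     d0-d3 = interleaved {B,G,R,A} bytes for pixels 0-7
 * into
 *     d0 = B0..B7, d1 = G0..G7, d2 = R0..R7, d3 = A0..A7
 * and the vzip8 sequence in pixinterleave performs the inverse permutation.
 */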
367 /*
368 * This is a macro for implementing cache preload. The main idea is that
369 * cache preload logic is mostly independent of the rest of the pixel
370 * processing code. It starts at the top left pixel and moves forward
371 * across pixels and can jump across scanlines. Prefetch distance is
372 * handled in an 'incremental' way: it starts from 0 and advances to the
373 * optimal distance over time. After reaching the optimal prefetch distance,
374 * it is kept constant. There are some checks which prevent prefetching
375 * unneeded pixel lines below the image (but it can still prefetch a bit
376 * more data past the right edge of the image - not a big issue, and it may
377 * actually be helpful when rendering text glyphs). An additional trick is
378 * the use of an LDR instruction for the prefetch instead of PLD when moving
379 * to the next line: the point is that we have a high chance of a TLB miss
380 * in this case, and PLD would be useless.
381 *
382 * This sounds like it may introduce a noticeable overhead (when working with
383 * fully cached data). But in reality, due to having a separate pipeline and
384 * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
385 * execute simultaneously with NEON and be completely shadowed by it. Thus
386 * we get no performance overhead at all (*). This looks like a very nice
387 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
388 * but can still implement some rather advanced prefetch logic in software
389 * for almost zero cost!
390 *
391 * (*) The overhead of the prefetcher is visible when running some trivial
392 * pixel processing like a simple copy. Anyway, having prefetch is a must
393 * when working with graphics data.
394 */
395 .macro PF a, x:vararg
396 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
397 a x
398 .endif
399 .endm
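/*
 * For example, 'PF add PF_X, PF_X, #pixblock_size' emits the 'add' only when
 * PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED and assembles to nothing
 * otherwise, so the advanced prefetch code can be sprinkled through the main
 * loop at no cost to the simpler configurations.
 */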
401 .macro cache_preload std_increment, boost_increment
402 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
403 .if regs_shortage
404 PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
405 .endif
406 .if std_increment != 0
407 PF add PF_X, PF_X, #std_increment
408 .endif
409 PF tst PF_CTL, #0xF
410 PF addne PF_X, PF_X, #boost_increment
411 PF subne PF_CTL, PF_CTL, #1
412 PF cmp PF_X, ORIG_W
413 .if src_bpp_shift >= 0
414 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
415 .endif
416 .if dst_r_bpp != 0
417 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
418 .endif
419 .if mask_bpp_shift >= 0
420 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
421 .endif
422 PF subge PF_X, PF_X, ORIG_W
423 PF subges PF_CTL, PF_CTL, #0x10
424 .if src_bpp_shift >= 0
425 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
426 .endif
427 .if dst_r_bpp != 0
428 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
429 .endif
430 .if mask_bpp_shift >= 0
431 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
432 .endif
433 .endif
434 .endm
436 .macro cache_preload_simple
437 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
438 .if src_bpp > 0
439 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
440 .endif
441 .if dst_r_bpp > 0
442 pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
443 .endif
444 .if mask_bpp > 0
445 pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
446 .endif
447 .endif
448 .endm
450 .macro fetch_mask_pixblock
451 pixld pixblock_size, mask_bpp, \
452 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
453 .endm
455 /*
456 * Macro which is used to process leading pixels until the destination
457 * pointer is properly aligned (at a 16-byte boundary). When the destination
458 * buffer uses a 16bpp format, this is unnecessary, or even pointless.
459 */
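/*
 * For example, with a 32bpp destination and pixblock_size = 8, a DST_R that
 * is only 4-byte aligned is brought to a 16-byte boundary by processing one
 * pixel in the lowbit = 4 step and two more in the lowbit = 8 step below.
 */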
460 .macro ensure_destination_ptr_alignment process_pixblock_head, \
461 process_pixblock_tail, \
462 process_pixblock_tail_head
463 .if dst_w_bpp != 24
464 tst DST_R, #0xF
465 beq 2f
467 .irp lowbit, 1, 2, 4, 8, 16
468 local skip1
469 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
470 .if lowbit < 16 /* we don't need more than 16-byte alignment */
471 tst DST_R, #lowbit
472 beq 1f
473 .endif
474 pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
475 pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
476 .if dst_r_bpp > 0
477 pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
478 .else
479 add DST_R, DST_R, #lowbit
480 .endif
481 PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
482 sub W, W, #(lowbit * 8 / dst_w_bpp)
483 1:
484 .endif
485 .endr
486 pixdeinterleave src_bpp, src_basereg
487 pixdeinterleave mask_bpp, mask_basereg
488 pixdeinterleave dst_r_bpp, dst_r_basereg
490 process_pixblock_head
491 cache_preload 0, pixblock_size
492 cache_preload_simple
493 process_pixblock_tail
495 pixinterleave dst_w_bpp, dst_w_basereg
496 .irp lowbit, 1, 2, 4, 8, 16
497 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
498 .if lowbit < 16 /* we don't need more than 16-byte alignment */
499 tst DST_W, #lowbit
500 beq 1f
501 .endif
502 pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
503 1:
504 .endif
505 .endr
506 .endif
507 2:
508 .endm
510 /*
511 * Special code for processing up to (pixblock_size - 1) remaining
512 * trailing pixels. As SIMD processing performs operations on
513 * pixblock_size pixels at a time, anything smaller than this has to be loaded
514 * and stored in a special way. Loading and storing of pixel data is
515 * performed in such a way that we fill some 'slots' in the NEON
516 * registers (some slots naturally remain unused), then perform the compositing
517 * operation as usual. In the end, the data is taken from these 'slots'
518 * and saved to memory.
519 *
520 * cache_preload_flag - allows prefetch to be suppressed if
521 * set to 0
522 * dst_aligned_flag - selects whether destination buffer
523 * is aligned
524 */
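/*
 * For example, with pixblock_size = 8 and 5 trailing pixels left in the
 * scanline (W & 7 == 5), the chunk loop below processes a chunk of 4 pixels
 * followed by a chunk of 1 pixel.
 */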
525 .macro process_trailing_pixels cache_preload_flag, \
526 dst_aligned_flag, \
527 process_pixblock_head, \
528 process_pixblock_tail, \
529 process_pixblock_tail_head
530 tst W, #(pixblock_size - 1)
531 beq 2f
532 .irp chunk_size, 16, 8, 4, 2, 1
533 .if pixblock_size > chunk_size
534 tst W, #chunk_size
535 beq 1f
536 pixld_src chunk_size, src_bpp, src_basereg, SRC
537 pixld chunk_size, mask_bpp, mask_basereg, MASK
538 .if dst_aligned_flag != 0
539 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
540 .else
541 pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
542 .endif
543 .if cache_preload_flag != 0
544 PF add PF_X, PF_X, #chunk_size
545 .endif
546 1:
547 .endif
548 .endr
549 pixdeinterleave src_bpp, src_basereg
550 pixdeinterleave mask_bpp, mask_basereg
551 pixdeinterleave dst_r_bpp, dst_r_basereg
553 process_pixblock_head
554 .if cache_preload_flag != 0
555 cache_preload 0, pixblock_size
556 cache_preload_simple
557 .endif
558 process_pixblock_tail
559 pixinterleave dst_w_bpp, dst_w_basereg
560 .irp chunk_size, 16, 8, 4, 2, 1
561 .if pixblock_size > chunk_size
562 tst W, #chunk_size
563 beq 1f
564 .if dst_aligned_flag != 0
565 pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
566 .else
567 pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
568 .endif
569 1:
570 .endif
571 .endr
572 2:
573 .endm
575 /*
576 * Macro which performs all the operations needed to switch to the next
577 * scanline and start the next loop iteration unless all the scanlines
578 * are already processed.
579 */
580 .macro advance_to_next_scanline start_of_loop_label
581 .if regs_shortage
582 ldrd W, [sp] /* load W and H (width and height) from stack */
583 .else
584 mov W, ORIG_W
585 .endif
586 add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
587 .if src_bpp != 0
588 add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
589 .endif
590 .if mask_bpp != 0
591 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
592 .endif
593 .if (dst_w_bpp != 24)
594 sub DST_W, DST_W, W, lsl #dst_bpp_shift
595 .endif
596 .if (src_bpp != 24) && (src_bpp != 0)
597 sub SRC, SRC, W, lsl #src_bpp_shift
598 .endif
599 .if (mask_bpp != 24) && (mask_bpp != 0)
600 sub MASK, MASK, W, lsl #mask_bpp_shift
601 .endif
602 subs H, H, #1
603 mov DST_R, DST_W
604 .if regs_shortage
605 str H, [sp, #4] /* save updated height to stack */
606 .endif
607 bge start_of_loop_label
608 .endm
610 /*
611 * Registers are allocated in the following way by default:
612 * d0, d1, d2, d3 - reserved for loading source pixel data
613 * d4, d5, d6, d7 - reserved for loading destination pixel data
614 * d24, d25, d26, d27 - reserved for loading mask pixel data
615 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
616 */
617 .macro generate_composite_function fname, \
618 src_bpp_, \
619 mask_bpp_, \
620 dst_w_bpp_, \
621 flags, \
622 pixblock_size_, \
623 prefetch_distance, \
624 init, \
625 cleanup, \
626 process_pixblock_head, \
627 process_pixblock_tail, \
628 process_pixblock_tail_head, \
629 dst_w_basereg_ = 28, \
630 dst_r_basereg_ = 4, \
631 src_basereg_ = 0, \
632 mask_basereg_ = 24
634 .func fname
635 .global fname
636 /* For ELF format also set function visibility to hidden */
637 #ifdef __ELF__
638 .hidden fname
639 .type fname, %function
640 #endif
641 fname:
642 .fnstart
643 .save {r4-r12, lr}
644 push {r4-r12, lr} /* save all registers */
646 /*
647 * Select prefetch type for this function. If the prefetch distance is
648 * set to 0, prefetching is disabled entirely; if one of the color formats
649 * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
650 */
651 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
652 .if prefetch_distance == 0
653 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
654 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
655 ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
656 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
657 .endif
659 /*
660 * Make some macro arguments globally visible and accessible
661 * from other macros
662 */
663 .set src_bpp, src_bpp_
664 .set mask_bpp, mask_bpp_
665 .set dst_w_bpp, dst_w_bpp_
666 .set pixblock_size, pixblock_size_
667 .set dst_w_basereg, dst_w_basereg_
668 .set dst_r_basereg, dst_r_basereg_
669 .set src_basereg, src_basereg_
670 .set mask_basereg, mask_basereg_
672 .macro pixld_src x:vararg
673 pixld x
674 .endm
675 .macro fetch_src_pixblock
676 pixld_src pixblock_size, src_bpp, \
677 (src_basereg - pixblock_size * src_bpp / 64), SRC
678 .endm
679 /*
680 * Assign symbolic names to registers
681 */
682 W .req r0 /* width (is updated during processing) */
683 H .req r1 /* height (is updated during processing) */
684 DST_W .req r2 /* destination buffer pointer for writes */
685 DST_STRIDE .req r3 /* destination image stride */
686 SRC .req r4 /* source buffer pointer */
687 SRC_STRIDE .req r5 /* source image stride */
688 DST_R .req r6 /* destination buffer pointer for reads */
690 MASK .req r7 /* mask pointer */
691 MASK_STRIDE .req r8 /* mask stride */
693 PF_CTL .req r9 /* combined lines counter and prefetch */
694 /* distance increment counter */
695 PF_X .req r10 /* pixel index in a scanline for current */
696 /* prefetch position */
697 PF_SRC .req r11 /* pointer to source scanline start */
698 /* for prefetch purposes */
699 PF_DST .req r12 /* pointer to destination scanline start */
700 /* for prefetch purposes */
701 PF_MASK .req r14 /* pointer to mask scanline start */
702 /* for prefetch purposes */
703 /*
704 * Check whether we have enough registers for all the local variables.
705 * If we don't have enough registers, the original width and height are
706 * kept on top of the stack (and the 'regs_shortage' variable is set to
707 * indicate this for the rest of the code). Even if there are enough
708 * registers, the allocation scheme may be a bit different depending on
709 * whether the source or the mask is unused.
710 */
711 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
712 ORIG_W .req r10 /* saved original width */
713 DUMMY .req r12 /* temporary register */
714 .set regs_shortage, 0
715 .elseif mask_bpp == 0
716 ORIG_W .req r7 /* saved original width */
717 DUMMY .req r8 /* temporary register */
718 .set regs_shortage, 0
719 .elseif src_bpp == 0
720 ORIG_W .req r4 /* saved original width */
721 DUMMY .req r5 /* temporary register */
722 .set regs_shortage, 0
723 .else
724 ORIG_W .req r1 /* saved original width */
725 DUMMY .req r1 /* temporary register */
726 .set regs_shortage, 1
727 .endif
729 .set mask_bpp_shift, -1
730 .if src_bpp == 32
731 .set src_bpp_shift, 2
732 .elseif src_bpp == 24
733 .set src_bpp_shift, 0
734 .elseif src_bpp == 16
735 .set src_bpp_shift, 1
736 .elseif src_bpp == 8
737 .set src_bpp_shift, 0
738 .elseif src_bpp == 0
739 .set src_bpp_shift, -1
740 .else
741 .error "requested src bpp (src_bpp) is not supported"
742 .endif
743 .if mask_bpp == 32
744 .set mask_bpp_shift, 2
745 .elseif mask_bpp == 24
746 .set mask_bpp_shift, 0
747 .elseif mask_bpp == 8
748 .set mask_bpp_shift, 0
749 .elseif mask_bpp == 0
750 .set mask_bpp_shift, -1
751 .else
752 .error "requested mask bpp (mask_bpp) is not supported"
753 .endif
754 .if dst_w_bpp == 32
755 .set dst_bpp_shift, 2
756 .elseif dst_w_bpp == 24
757 .set dst_bpp_shift, 0
758 .elseif dst_w_bpp == 16
759 .set dst_bpp_shift, 1
760 .elseif dst_w_bpp == 8
761 .set dst_bpp_shift, 0
762 .else
763 .error "requested dst bpp (dst_w_bpp) is not supported"
764 .endif
766 .if (((flags) & FLAG_DST_READWRITE) != 0)
767 .set dst_r_bpp, dst_w_bpp
768 .else
769 .set dst_r_bpp, 0
770 .endif
771 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
772 .set DEINTERLEAVE_32BPP_ENABLED, 1
773 .else
774 .set DEINTERLEAVE_32BPP_ENABLED, 0
775 .endif
777 .if prefetch_distance < 0 || prefetch_distance > 15
778 .error "invalid prefetch distance (prefetch_distance)"
779 .endif
781 .if src_bpp > 0
782 ldr SRC, [sp, #40]
783 .endif
784 .if mask_bpp > 0
785 ldr MASK, [sp, #48]
786 .endif
787 PF mov PF_X, #0
788 .if src_bpp > 0
789 ldr SRC_STRIDE, [sp, #44]
790 .endif
791 .if mask_bpp > 0
792 ldr MASK_STRIDE, [sp, #52]
793 .endif
794 mov DST_R, DST_W
796 .if src_bpp == 24
797 sub SRC_STRIDE, SRC_STRIDE, W
798 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
799 .endif
800 .if mask_bpp == 24
801 sub MASK_STRIDE, MASK_STRIDE, W
802 sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
803 .endif
804 .if dst_w_bpp == 24
805 sub DST_STRIDE, DST_STRIDE, W
806 sub DST_STRIDE, DST_STRIDE, W, lsl #1
807 .endif
809 /*
810 * Setup advanced prefetcher initial state
811 */
812 PF mov PF_SRC, SRC
813 PF mov PF_DST, DST_R
814 PF mov PF_MASK, MASK
815 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
816 PF mov PF_CTL, H, lsl #4
817 PF add PF_CTL, #(prefetch_distance - 0x10)
819 init
820 .if regs_shortage
821 .save {r0, r1}
822 push {r0, r1}
823 .endif
824 subs H, H, #1
825 .if regs_shortage
826 str H, [sp, #4] /* save updated height to stack */
827 .else
828 mov ORIG_W, W
829 .endif
830 blt 9f
831 cmp W, #(pixblock_size * 2)
832 blt 8f
833 /*
834 * This is the start of the pipelined loop, which is optimized for
835 * long scanlines.
836 */
837 0:
838 ensure_destination_ptr_alignment process_pixblock_head, \
839 process_pixblock_tail, \
840 process_pixblock_tail_head
842 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
843 pixld_a pixblock_size, dst_r_bpp, \
844 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
845 fetch_src_pixblock
846 pixld pixblock_size, mask_bpp, \
847 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
848 PF add PF_X, PF_X, #pixblock_size
849 process_pixblock_head
850 cache_preload 0, pixblock_size
851 cache_preload_simple
852 subs W, W, #(pixblock_size * 2)
853 blt 2f
854 1:
855 process_pixblock_tail_head
856 cache_preload_simple
857 subs W, W, #pixblock_size
858 bge 1b
859 2:
860 process_pixblock_tail
861 pixst_a pixblock_size, dst_w_bpp, \
862 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
864 /* Process the remaining trailing pixels in the scanline */
865 process_trailing_pixels 1, 1, \
866 process_pixblock_head, \
867 process_pixblock_tail, \
868 process_pixblock_tail_head
869 advance_to_next_scanline 0b
871 .if regs_shortage
872 pop {r0, r1}
873 .endif
874 cleanup
875 pop {r4-r12, pc} /* exit */
876 /*
877 * This is the start of the loop, designed to process images with small width
878 * (less than pixblock_size * 2 pixels). In this case neither pipelining
879 * nor prefetch is used.
880 */
881 8:
882 /* Process exactly pixblock_size pixels if needed */
883 tst W, #pixblock_size
884 beq 1f
885 pixld pixblock_size, dst_r_bpp, \
886 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
887 fetch_src_pixblock
888 pixld pixblock_size, mask_bpp, \
889 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
890 process_pixblock_head
891 process_pixblock_tail
892 pixst pixblock_size, dst_w_bpp, \
893 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
894 1:
895 /* Process the remaining trailing pixels in the scanline */
896 process_trailing_pixels 0, 0, \
897 process_pixblock_head, \
898 process_pixblock_tail, \
899 process_pixblock_tail_head
900 advance_to_next_scanline 8b
901 9:
902 .if regs_shortage
903 pop {r0, r1}
904 .endif
905 cleanup
906 pop {r4-r12, pc} /* exit */
907 .fnend
909 .purgem fetch_src_pixblock
910 .purgem pixld_src
912 .unreq SRC
913 .unreq MASK
914 .unreq DST_R
915 .unreq DST_W
916 .unreq ORIG_W
917 .unreq W
918 .unreq H
919 .unreq SRC_STRIDE
920 .unreq DST_STRIDE
921 .unreq MASK_STRIDE
922 .unreq PF_CTL
923 .unreq PF_X
924 .unreq PF_SRC
925 .unreq PF_DST
926 .unreq PF_MASK
927 .unreq DUMMY
928 .endfunc
929 .endm
931 /*
932 * A simplified variant of the function generation template for single
933 * scanline processing (for implementing pixman combine functions).
934 */
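/*
 * Illustrative sketch (not part of this header): a scanline combiner is
 * generated from the same kind of pixblock macros, e.g. roughly:
 *
 *     generate_composite_function_single_scanline \
 *         pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8, \
 *         default_init, \
 *         default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 *
 * Note that, unlike generate_composite_function, there is no prefetch
 * distance argument here.
 */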
935 .macro generate_composite_function_scanline use_nearest_scaling, \
936 fname, \
937 src_bpp_, \
938 mask_bpp_, \
939 dst_w_bpp_, \
940 flags, \
941 pixblock_size_, \
942 init, \
943 cleanup, \
944 process_pixblock_head, \
945 process_pixblock_tail, \
946 process_pixblock_tail_head, \
947 dst_w_basereg_ = 28, \
948 dst_r_basereg_ = 4, \
949 src_basereg_ = 0, \
950 mask_basereg_ = 24
952 .func fname
953 .global fname
954 /* For ELF format also set function visibility to hidden */
955 #ifdef __ELF__
956 .hidden fname
957 .type fname, %function
958 #endif
959 fname:
960 .fnstart
961 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
962 /*
963 * Make some macro arguments globally visible and accessible
964 * from other macros
965 */
966 .set src_bpp, src_bpp_
967 .set mask_bpp, mask_bpp_
968 .set dst_w_bpp, dst_w_bpp_
969 .set pixblock_size, pixblock_size_
970 .set dst_w_basereg, dst_w_basereg_
971 .set dst_r_basereg, dst_r_basereg_
972 .set src_basereg, src_basereg_
973 .set mask_basereg, mask_basereg_
975 .if use_nearest_scaling != 0
976 /*
977 * Assign symbolic names to registers for nearest scaling
978 */
979 W .req r0
980 DST_W .req r1
981 SRC .req r2
982 VX .req r3
983 UNIT_X .req ip
984 MASK .req lr
985 TMP1 .req r4
986 TMP2 .req r5
987 DST_R .req r6
988 SRC_WIDTH_FIXED .req r7
990 .macro pixld_src x:vararg
991 pixld_s x
992 .endm
994 ldr UNIT_X, [sp]
995 .save {r4-r8, lr}
996 push {r4-r8, lr}
997 ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
998 .if mask_bpp != 0
999 ldr MASK, [sp, #(24 + 8)]
1000 .endif
1001 .else
1002 /*
1003 * Assign symbolic names to registers
1004 */
1005 W .req r0 /* width (is updated during processing) */
1006 DST_W .req r1 /* destination buffer pointer for writes */
1007 SRC .req r2 /* source buffer pointer */
1008 DST_R .req ip /* destination buffer pointer for reads */
1009 MASK .req r3 /* mask pointer */
1011 .macro pixld_src x:vararg
1012 pixld x
1013 .endm
1014 .endif
1016 .if (((flags) & FLAG_DST_READWRITE) != 0)
1017 .set dst_r_bpp, dst_w_bpp
1018 .else
1019 .set dst_r_bpp, 0
1020 .endif
1021 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
1022 .set DEINTERLEAVE_32BPP_ENABLED, 1
1023 .else
1024 .set DEINTERLEAVE_32BPP_ENABLED, 0
1025 .endif
1027 .macro fetch_src_pixblock
1028 pixld_src pixblock_size, src_bpp, \
1029 (src_basereg - pixblock_size * src_bpp / 64), SRC
1030 .endm
1032 init
1033 mov DST_R, DST_W
1035 cmp W, #pixblock_size
1036 blt 8f
1038 ensure_destination_ptr_alignment process_pixblock_head, \
1039 process_pixblock_tail, \
1040 process_pixblock_tail_head
1042 subs W, W, #pixblock_size
1043 blt 7f
1045 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
1046 pixld_a pixblock_size, dst_r_bpp, \
1047 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
1048 fetch_src_pixblock
1049 pixld pixblock_size, mask_bpp, \
1050 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
1051 process_pixblock_head
1052 subs W, W, #pixblock_size
1053 blt 2f
1054 1:
1055 process_pixblock_tail_head
1056 subs W, W, #pixblock_size
1057 bge 1b
1058 2:
1059 process_pixblock_tail
1060 pixst_a pixblock_size, dst_w_bpp, \
1061 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1062 7:
1063 /* Process the remaining trailing pixels in the scanline (dst aligned) */
1064 process_trailing_pixels 0, 1, \
1065 process_pixblock_head, \
1066 process_pixblock_tail, \
1067 process_pixblock_tail_head
1069 cleanup
1070 .if use_nearest_scaling != 0
1071 pop {r4-r8, pc} /* exit */
1072 .else
1073 bx lr /* exit */
1074 .endif
1075 8:
1076 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
1077 process_trailing_pixels 0, 0, \
1078 process_pixblock_head, \
1079 process_pixblock_tail, \
1080 process_pixblock_tail_head
1082 cleanup
1084 .if use_nearest_scaling != 0
1085 pop {r4-r8, pc} /* exit */
1087 .unreq DST_R
1088 .unreq SRC
1089 .unreq W
1090 .unreq VX
1091 .unreq UNIT_X
1092 .unreq TMP1
1093 .unreq TMP2
1094 .unreq DST_W
1095 .unreq MASK
1096 .unreq SRC_WIDTH_FIXED
1098 .else
1099 bx lr /* exit */
1101 .unreq SRC
1102 .unreq MASK
1103 .unreq DST_R
1104 .unreq DST_W
1105 .unreq W
1106 .endif
1108 .purgem fetch_src_pixblock
1109 .purgem pixld_src
1111 .fnend
1112 .endfunc
1113 .endm
1115 .macro generate_composite_function_single_scanline x:vararg
1116 generate_composite_function_scanline 0, x
1117 .endm
1119 .macro generate_composite_function_nearest_scanline x:vararg
1120 generate_composite_function_scanline 1, x
1121 .endm
1123 /* Default prologue/epilogue, nothing special needs to be done */
1125 .macro default_init
1126 .endm
1128 .macro default_cleanup
1129 .endm
1131 /*
1132 * Prologue/epilogue variant which additionally saves/restores d8-d15
1133 * registers (they need to be saved/restored by the callee according to the ABI).
1134 * This is required if the code needs to use all the NEON registers.
1135 */
1137 .macro default_init_need_all_regs
1138 .vsave {d8-d15}
1139 vpush {d8-d15}
1140 .endm
1142 .macro default_cleanup_need_all_regs
1143 vpop {d8-d15}
1144 .endm
1146 /******************************************************************************/
1148 /*
1149 * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
1150 * into a planar a8r8g8b8 format (with a, r, g, b color components
1151 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
1152 *
1153 * Warning: the conversion is destructive and the original
1154 * value (in) is lost.
1155 */
1156 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
1157 vshrn.u16 out_r, in, #8
1158 vshrn.u16 out_g, in, #3
1159 vsli.u16 in, in, #5
1160 vmov.u8 out_a, #255
1161 vsri.u8 out_r, out_r, #5
1162 vsri.u8 out_g, out_g, #6
1163 vshrn.u16 out_b, in, #2
1164 .endm
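/*
 * Worked example for the macro above: a pure red r5g6b5 pixel 0xf800
 * (R = 31, G = 0, B = 0) ends up as out_r = (31 << 3) | (31 >> 2) = 0xff,
 * out_g = 0, out_b = 0 and out_a = 0xff, i.e. the 5- and 6-bit channels are
 * expanded to 8 bits by replicating their top bits into the low bits (vsri).
 */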
1166 .macro convert_0565_to_x888 in, out_r, out_g, out_b
1167 vshrn.u16 out_r, in, #8
1168 vshrn.u16 out_g, in, #3
1169 vsli.u16 in, in, #5
1170 vsri.u8 out_r, out_r, #5
1171 vsri.u8 out_g, out_g, #6
1172 vshrn.u16 out_b, in, #2
1173 .endm
1175 /*
1176 * Conversion from planar a8r8g8b8 format (with the r, g, b color components
1177 * in 64-bit registers in_r, in_g, in_b respectively) into 8 r5g6b5
1178 * pixels packed in a 128-bit register (out). Requires two temporary 128-bit
1179 * registers (tmp1, tmp2)
1180 */
1181 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
1182 vshll.u8 tmp1, in_g, #8
1183 vshll.u8 out, in_r, #8
1184 vshll.u8 tmp2, in_b, #8
1185 vsri.u16 out, tmp1, #5
1186 vsri.u16 out, tmp2, #11
1187 .endm
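/*
 * Worked example for the macro above: in_r = 0xff, in_g = 0x00, in_b = 0x00
 * produces out = 0xf800, i.e. out = (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3).
 */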
1189 /*
1190 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
1191 * returned in the (out0, out1) register pair. Requires one temporary
1192 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
1193 * value of 'in' is lost.
1194 */
1195 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
1196 vshl.u16 out0, in, #5 /* G top 6 bits */
1197 vshl.u16 tmp, in, #11 /* B top 5 bits */
1198 vsri.u16 in, in, #5 /* R is ready in top bits */
1199 vsri.u16 out0, out0, #6 /* G is ready in top bits */
1200 vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
1201 vshr.u16 out1, in, #8 /* R is in place */
1202 vsri.u16 out0, tmp, #8 /* G & B is in place */
1203 vzip.u16 out0, out1 /* everything is in place */
1204 .endm