Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
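
/*
 * Illustrative example of how the three code paths divide up (the numbers
 * here are assumptions for the sake of the example, not fixed properties of
 * this file): for a 32bpp source and 32bpp destination, pix_per_block works
 * out as 8 pixels, and a typical tuning value might be prefetch_distance = 3.
 * Under those assumptions, the comparisons made on entry to the generated
 * function classify rows as follows:
 *   - fewer than 7 pixels   -> "narrow" path
 *   - 7 to 46 pixels        -> "medium" path
 *   - 47 pixels or more     -> "wide" path
 * The exact boundaries follow from the tests against 2*16*8/dst_w_bpp - 1
 * and (prefetch_distance+3)*pix_per_block - 1 made further down in
 * generate_composite_function.
 */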

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif
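
/*
 * The 9*4 corresponds to the nine registers (r4-r11, lr) pushed in the
 * function prologue below; with DEBUG_PARAMS defined, a further nine words
 * are reserved for the debug copy of the arguments, hence 9*4+9*4. So, for
 * example, "ldr SRC, [sp, #ARGS_STACK_OFFSET]" reloads the argument that was
 * at [sp] when the function was entered (the source pointer or pixel value).
 */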

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE, 0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond WK&reg0, [base], #4
        op&r&cond WK&reg1, [base], #4
        op&r&cond WK&reg2, [base], #4
        op&r&cond WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond WK&reg0, [base], #4
        op&r&cond WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm
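
/*
 * As a concrete illustration of the macro above: "pixld , 16, 0, SRC, 0"
 * (via pixldst ld) produces an LDMIA of {WK0,WK1,WK2,WK3} from SRC with
 * writeback, while the same invocation with the unaligned flag set becomes
 * four "ldr WKn, [SRC], #4" loads; a conditional store such as
 * "pixst cs, 4, 0, DST" (with FLAG_DST_READWRITE clear) becomes
 * "strcs WK0, [DST], #4".
 */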

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
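
/*
 * PF simply suppresses prefetch-related instructions when prefetching is
 * disabled for the current function. For example, "PF pld, [SRC, #96]"
 * assembles to "pld [SRC, #96]" when PREFETCH_TYPE_CURRENT is
 * PREFETCH_TYPE_STANDARD, and to nothing at all when prefetch_distance was
 * given as 0.
 */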

.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF bic, ptr, base, #31
 .set OFFSET, 0
 .rept prefetch_distance+1
        PF pld, [ptr, #OFFSET]
  .set OFFSET, OFFSET+32
 .endr
 .endif
.endm

.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
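/* Worked example (the specific numbers are assumed purely for illustration):
 * with src_bpp = 32 and dst_w_bpp = 8 there are 4 source bytes per
 * destination byte. If the destination needs 7 leading bytes to reach
 * 16-byte alignment, leading_bytes = 7*32/8 = 28. If the source pointer
 * happens to satisfy (src+28)&31 = 6, then extra_needed = 28-6 = 22, which
 * falls in the "<= 32" band, so exactly one cache line beyond the usual
 * prefetch_distance+1 must be preloaded here.
 */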
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF tst, base, #16
        PF beq, 61f
  .else
   .if bpp/dst_w_bpp == 4
        PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF and, SCRATCH, SCRATCH, #31
        PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF bcs, 61f
        PF bpl, 60f
        PF pld, [ptr, #32*(prefetch_distance+2)]
   .else
        PF mov, SCRATCH, base, lsl #32-5
        PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF bls, 61f
   .endif
  .endif
60:     PF pld, [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF pld, [base, SCRATCH]
   .else
        PF bic, SCRATCH, base, #31
        PF pld, [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
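
/*
 * Example of the prefetch cadence this produces: for a 32bpp source under a
 * 32bpp destination, 256/128*dst_w_bpp/bpp evaluates to 2, so
 * IS_END_OF_GROUP fires on every second (odd-numbered) SUBBLOCK - each STM
 * consumes 16 bytes of source, so one pld is issued per 32-byte source
 * cache line. For an 8bpp mask under a 32bpp destination the group size is
 * 8, and the pld fires on every eighth SUBBLOCK instead.
 */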

.macro preload_trailing bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF and, WK1, base, #31
        PF add, WK1, WK1, WK0, lsl #bpp_shift
        PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF bic, SCRATCH, base, #31
80:     PF pld, [SCRATCH, #32*(prefetch_distance+1)]
        PF add, SCRATCH, SCRATCH, #32
        PF subs, WK1, WK1, #32
        PF bhi, 80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF mov, SCRATCH, base, lsl #32-5
        PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was strictly greater than 32) */
        PF beq, 82f
        PF bic, SCRATCH, base, #31
        PF bcc, 81f
        PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81:     PF pld, [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
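
/*
 * Worked example of the flag trick above (offsets assumed for illustration):
 * if base & 31 = 8 and the trailing data adds 24 bytes, the adds produces
 * Z=1, C=1 (8+24 is exactly 32); adceqs then clears both flags, so only the
 * preload at 81: (prefetch_distance+1) is issued. Had the two offsets summed
 * to, say, 38, C would remain set and both preloads would be issued; had
 * both offsets been 0, Z would remain set and neither would be.
 */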

.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, then
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
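/* Illustration of the narrow case (addresses assumed): a 6-pixel 32bpp row
 * starting 20 bytes into a cache line covers bytes 20..43 of that line, so
 * WK0 (the line containing the first byte) and WK1 (the line containing the
 * last byte) differ and both are preloaded; because no more than 30 bytes
 * are ever output, at most these two cache lines can be involved, hence no
 * loop is needed.
 */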
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF bic, WK0, base, #31
        PF pld, [WK0]
        PF add, WK1, base, X, lsl #bpp_shift
        PF sub, WK1, WK1, #1
        PF bic, WK1, WK1, #31
        PF cmp, WK1, WK0
        PF beq, 90f
        PF pld, [WK1]
90:
  .else
        PF bic, WK0, base, #31
        PF pld, [WK0]
        PF add, WK1, base, X, lsl #bpp_shift
        PF sub, WK1, WK1, #1
        PF bic, WK1, WK1, #31
        PF cmp, WK1, WK0
        PF beq, 92f
91:     PF add, WK0, WK0, #32
        PF cmp, WK0, WK1
        PF pld, [WK0]
        PF bne, 91b
92:
  .endif
 .endif
.endm

.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl 100f
  .endif
  .ifc cond,cs
        bcc 100f
  .endif
  .ifc cond,ne
        beq 100f
  .endif
        conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail cond1, numbytes1, firstreg1
        process_tail cond2, numbytes2, firstreg2
        pixst cond1, numbytes1, firstreg1, DST
        pixst cond2, numbytes2, firstreg2, DST
 .endif
.endm

.macro test_bits_1_0_ptr
        movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
.endm

.macro leading_15bytes process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
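
/*
 * Example: with dst_w_bpp == 8 and 13 leading bytes to reach alignment
 * (WK0 low bits = 0b1101), the first conditional_process2 handles 1 byte
 * (bit 0, "mi") and skips the 2-byte step (bit 1 clear, "cs"), and the
 * second handles 4 and then 8 bytes (bits 2 and 3), 13 bytes in total;
 * the final argument of 1 makes each step also decrement X by the number
 * of pixels it consumed.
 */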

.macro test_bits_3_2_pix
        movs SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm

.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle mask_bpp, MASK, 1
  .else
        preload_middle src_bpp, SRC, 0
        preload_middle mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF pld, [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs X, X, #pix_per_block
        bhs 110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst DST, #16
        bne 111f
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
        b 112f
111:
 .endif
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF and, WK0, X, #pix_per_block-1
 .endif
        preload_trailing src_bpp, src_bpp_shift, SRC
        preload_trailing mask_bpp, mask_bpp_shift, MASK
        preload_trailing dst_r_bpp, dst_bpp_shift, DST
        add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 16, 0, DST
 .endif
        subs X, X, #128/dst_w_bpp
        bhs 120b
        /* Trailing pixels */
        tst X, #128/dst_w_bpp - 1
        beq exit_label
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst X, #16*8/dst_w_bpp
        conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
        /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst MASK, #3
        bne 141f
 .endif
 .if src_bpp == 8 || src_bpp == 16
        tst SRC, #3
        bne 140f
 .endif
        action process_head, process_tail, process_inner_loop, exit_label, 0, 0
 .if src_bpp == 8 || src_bpp == 16
        b exit_label
140:
        action process_head, process_tail, process_inner_loop, exit_label, 1, 0
 .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst SRC, #3
        bne 142f
  .endif
        action process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b exit_label
142:
        action process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm

.macro end_of_line restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word 0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str Y, [sp]
  .endif
 .endif
        add DST, DST, STRIDE_D
 .if src_bpp > 0
        add SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov X, ORIG_W
 .endif
        bhs loop_label
 .ifc "last_one",""
  .if vars_spilled
        b 197f
  .else
        b 198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b 198f
  .endif
 .endif
.endm

.macro generate_composite_function fname, \
                                    src_bpp_, \
                                    mask_bpp_, \
                                    dst_w_bpp_, \
                                    flags_, \
                                    prefetch_distance_, \
                                    init, \
                                    newline, \
                                    cleanup, \
                                    process_head, \
                                    process_tail, \
                                    process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
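
/* Viewed from C (illustrative only - the exact prototype lives in
 * pixman-arm-common.h, not here), the entry conditions above correspond
 * roughly to a call of the form
 *     fname (width, height, dst, dst_stride, src, src_stride, mask, mask_stride);
 * where, under the AAPCS, the first four arguments arrive in r0-r3 and the
 * remainder on the stack - which is why SRC, STRIDE_S, MASK and STRIDE_M are
 * loaded from [sp, #ARGS_STACK_OFFSET...] further down.
 */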

/*
 * Assign symbolic names to registers
 */
X        .req r0  /* pixels to go on this line */
Y        .req r1  /* lines to go */
DST      .req r2  /* destination pixel pointer */
STRIDE_D .req r3  /* destination stride (bytes, minus width) */
SRC      .req r4  /* source pixel pointer */
STRIDE_S .req r5  /* source stride (bytes, minus width) */
MASK     .req r6  /* mask pixel pointer (if applicable) */
STRIDE_M .req r7  /* mask stride (bytes, minus width) */
WK0      .req r8  /* pixel data registers */
WK1      .req r9
WK2      .req r10
WK3      .req r11
SCRATCH  .req r12
ORIG_W   .req r14 /* width (pixels) */

fname:
 .fnstart
 .save {r4-r11, lr}
        push {r4-r11, lr} /* save all registers */

        subs Y, Y, #1
        blo 199f

#ifdef DEBUG_PARAMS
 .pad #9*4
        sub sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr SRC, [sp, #ARGS_STACK_OFFSET]
        ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add Y, Y, #1
        stmia sp, {r0-r7,pc}
        sub Y, Y, #1
#endif

        init

        lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl STRIDE_S, #src_bpp_shift
        sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl STRIDE_M, #mask_bpp_shift
        sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp X, #2*16*8/dst_w_bpp - 1
        blo 170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp X, #(prefetch_distance+3)*pix_per_block - 1
        blo 160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub X, X, #(prefetch_distance+2)*pix_per_block
        mov ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1 src_bpp, WK1, SRC
        preload_leading_step1 mask_bpp, WK2, MASK
        preload_leading_step1 dst_r_bpp, WK3, DST

        tst DST, #15
        beq 154f
        rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF and, WK0, WK0, #15
  .endif

        preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and SCRATCH, SRC, #31
        rsb SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and SCRATCH, MASK, #31
        rsb SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
        tst DST, #15
        beq 164f
        rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst DST, #3
        beq 174f
172:    subs X, X, #1
        blo 177f
        process_head , 1, 0, 1, 1, 0
        process_tail , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 1, 0, DST
  .endif
        tst DST, #3
        bne 172b
 .elseif dst_w_bpp == 16
        tst DST, #2
        beq 174f
        subs X, X, #1
        blo 177f
        process_head , 2, 0, 1, 1, 0
        process_tail , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop {r4-r11, pc} /* exit */
 .fnend

 .ltorg

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
 .endfunc
.endm

.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
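
/*
 * Example: "line_saved_regs STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to
 * (1<<3)|(1<<14) and LINE_SAVED_REG_COUNT to 2, so the hand-encoded
 * stmdb sp!/ldmia sp words in generate_composite_function and end_of_line
 * save and restore r3 and r14 around each line, and the
 * "add sp, sp, #LINE_SAVED_REG_COUNT*4" at the 197: label pops them again.
 */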

.macro nop_macro x:vararg
.endm