gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright © 2012 Raspberry Pi Foundation
     3  * Copyright © 2012 RISC OS Open Ltd
     4  *
     5  * Permission to use, copy, modify, distribute, and sell this software and its
     6  * documentation for any purpose is hereby granted without fee, provided that
     7  * the above copyright notice appear in all copies and that both that
     8  * copyright notice and this permission notice appear in supporting
     9  * documentation, and that the name of the copyright holders not be used in
    10  * advertising or publicity pertaining to distribution of the software without
    11  * specific, written prior permission.  The copyright holders make no
    12  * representations about the suitability of this software for any purpose.  It
    13  * is provided "as is" without express or implied warranty.
    14  *
    15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    22  * SOFTWARE.
    23  *
    24  * Author:  Ben Avison (bavison@riscosopen.org)
    25  *
    26  */
    28 /* Prevent the stack from becoming executable */
    29 #if defined(__linux__) && defined(__ELF__)
    30 .section .note.GNU-stack,"",%progbits
    31 #endif
       /* Assembler setup: the code below uses ARMv6 SIMD instructions (uadd8,
        * sel, uxtb16, pkhtb, ...), hence .arch armv6.  .object_arch sets the
        * architecture recorded in the object file independently of .arch.
        * .altmacro is required by the macros below, which reference arguments
        * by bare name, concatenate with '&' and evaluate %(expr). */
    33 	.text
    34 	.arch armv6
    35 	.object_arch armv4
    36 	.arm
    37 	.altmacro
    38 	.p2align 2
    40 #include "pixman-arm-simd-asm.h"
    42 /* A head macro should do all processing which results in an output of up to
    43  * 16 bytes, as far as the final load instruction. The corresponding tail macro
    44  * should complete the processing of the up-to-16 bytes. The calling macro will
    45  * sometimes choose to insert a preload or a decrement of X between them.
    46  *   cond           ARM condition code for code block
    47  *   numbytes       Number of output bytes that should be generated this time
    48  *   firstreg       First WK register in which to place output
    49  *   unaligned_src  Whether to use non-wordaligned loads of source image
    50  *   unaligned_mask Whether to use non-wordaligned loads of mask image
    51  *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
    52  */
    54 .macro blit_init
               /* Init hook for the plain copy functions.  STRIDE_D and STRIDE_S
                * are handed to line_saved_regs (defined in the included header;
                * presumably this makes them available as scratch within a line
                * - confirm there).  blit_inner_loop aliases them as WK4/WK5. */
    55         line_saved_regs STRIDE_D, STRIDE_S
    56 .endm
    58 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head for a straight copy: the only work is the source load of
                * numbytes via pixld, targeting WK<firstreg> onwards.  The
                * unaligned_mask and preload arguments are accepted but unused.
                * The blit functions pair this with nop_macro as the tail. */
    59         pixld   cond, numbytes, firstreg, SRC, unaligned_src
    60 .endm
    62 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
               /* Inner loop for the copy functions, moving 32 bytes per
                * iteration.  Four extra work registers are created by aliasing
                * WK4-WK7 onto STRIDE_D, STRIDE_S, MASK and STRIDE_M for the
                * duration of the loop.  Only unaligned_src is used; the other
                * macro arguments are ignored. */
    63     WK4     .req    STRIDE_D
    64     WK5     .req    STRIDE_S
    65     WK6     .req    MASK
    66     WK7     .req    STRIDE_M
               /* Two 16-byte loads (first register index 0, then 4), one preload
                * at SRC+SCRATCH, then two 16-byte stores. */
    67 110:    pixld   , 16, 0, SRC, unaligned_src
    68         pixld   , 16, 4, SRC, unaligned_src
    69         pld     [SRC, SCRATCH]
    70         pixst   , 16, 0, DST
    71         pixst   , 16, 4, DST
               /* 32*8/src_bpp = number of pixels in 32 bytes; loop again while
                * the subtraction does not borrow. */
    72         subs    X, X, #32*8/src_bpp
    73         bhs     110b
    74     .unreq  WK4
    75     .unreq  WK5
    76     .unreq  WK6
    77     .unreq  WK7
    78 .endm
    80 generate_composite_function \
    81     pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, /* name; presumably src, mask, dst bpp */ \
    82     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    83     4, /* prefetch distance */ \
    84     blit_init, \
    85     nop_macro, /* newline */ \
    86     nop_macro, /* cleanup */ \
    87     blit_process_head, \
    88     nop_macro, /* process tail */ \
    89     blit_inner_loop /* inner loop */
    91 generate_composite_function \
    92     pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, /* name; presumably src, mask, dst bpp */ \
    93     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    94     4, /* prefetch distance */ \
    95     blit_init, \
    96     nop_macro, /* newline */ \
    97     nop_macro, /* cleanup */ \
    98     blit_process_head, \
    99     nop_macro, /* process tail */ \
   100     blit_inner_loop /* inner loop */
   102 generate_composite_function \
   103     pixman_composite_src_8_8_asm_armv6, 8, 0, 8, /* name; presumably src, mask, dst bpp */ \
   104     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   105     3, /* prefetch distance */ \
   106     blit_init, \
   107     nop_macro, /* newline */ \
   108     nop_macro, /* cleanup */ \
   109     blit_process_head, \
   110     nop_macro, /* process tail */ \
   111     blit_inner_loop /* inner loop */
   113 /******************************************************************************/
   115 .macro src_n_8888_init
               /* Solid fill, 32bpp: fetch the 32-bit fill value from the stack
                * argument area and copy it into the four registers that
                * fill_process_tail stores from (SRC, STRIDE_S, MASK, STRIDE_M). */
   116         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
   117         mov     STRIDE_S, SRC
   118         mov     MASK, SRC
   119         mov     STRIDE_M, SRC
   120 .endm
   122 .macro src_n_0565_init
               /* Solid fill, 16bpp: fetch the 16-bit fill value, duplicate it
                * into both halfwords of SRC, then copy the word into the other
                * three registers that fill_process_tail stores from. */
   123         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
   124         orr     SRC, SRC, lsl #16
   125         mov     STRIDE_S, SRC
   126         mov     MASK, SRC
   127         mov     STRIDE_M, SRC
   128 .endm
   130 .macro src_n_8_init
               /* Solid fill, 8bpp: fetch the fill byte, replicate it into all
                * four bytes of SRC (x2 then x4), then copy the word into the
                * other three registers that fill_process_tail stores from. */
   131         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
   132         orr     SRC, SRC, lsl #8
   133         orr     SRC, SRC, lsl #16
   134         mov     STRIDE_S, SRC
   135         mov     MASK, SRC
   136         mov     STRIDE_M, SRC
   137 .endm
   139 .macro fill_process_tail  cond, numbytes, firstreg
               /* Tail for the solid fill functions: store numbytes of the
                * pre-replicated fill value to DST.  The value lives in SRC,
                * STRIDE_S, MASK and STRIDE_M (set up by the src_n_*_init
                * macros), which are aliased here as WK4-WK7.  The store always
                * starts at register index 4; firstreg is ignored. */
   140     WK4     .req    SRC
   141     WK5     .req    STRIDE_S
   142     WK6     .req    MASK
   143     WK7     .req    STRIDE_M
   144         pixst   cond, numbytes, 4, DST
   145     .unreq  WK4
   146     .unreq  WK5
   147     .unreq  WK6
   148     .unreq  WK7
   149 .endm
   151 generate_composite_function \
   152     pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, /* name; presumably src, mask, dst bpp */ \
   153     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   154     0, /* prefetch distance doesn't apply */ \
   155     src_n_8888_init, /* init: replicate the fill value */ \
   156     nop_macro, /* newline */ \
   157     nop_macro, /* cleanup */ \
   158     nop_macro, /* process head */ \
   159     fill_process_tail /* process tail: performs the store */
   161 generate_composite_function \
   162     pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, /* name; presumably src, mask, dst bpp */ \
   163     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   164     0, /* prefetch distance doesn't apply */ \
   165     src_n_0565_init, /* init: replicate the fill value */ \
   166     nop_macro, /* newline */ \
   167     nop_macro, /* cleanup */ \
   168     nop_macro, /* process head */ \
   169     fill_process_tail /* process tail: performs the store */
   171 generate_composite_function \
   172     pixman_composite_src_n_8_asm_armv6, 0, 0, 8, /* name; presumably src, mask, dst bpp */ \
   173     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   174     0, /* prefetch distance doesn't apply */ \
   175     src_n_8_init, /* init: replicate the fill value */ \
   176     nop_macro, /* newline */ \
   177     nop_macro, /* cleanup */ \
   178     nop_macro, /* process head */ \
   179     fill_process_tail /* process tail: performs the store */
   181 /******************************************************************************/
   183 .macro src_x888_8888_pixel, cond, reg
               /* Force the top byte of WK<reg> to 0xFF (conditionally on cond),
                * i.e. make one x888 pixel fully opaque. */
   184         orr&cond WK&reg, WK&reg, #0xFF000000
   185 .endm
   187 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head: just the source load of numbytes starting at
                * WK<firstreg>; unaligned_mask and preload are unused. */
   188         pixld   cond, numbytes, firstreg, SRC, unaligned_src
   189 .endm
   191 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
               /* Tail: set the alpha byte on each loaded pixel.  One register
                * is always patched, a second when numbytes >= 8, and the third
                * and fourth only when numbytes == 16. */
   192         src_x888_8888_pixel cond, %(firstreg+0)
   193  .if numbytes >= 8
   194         src_x888_8888_pixel cond, %(firstreg+1)
   195   .if numbytes == 16
   196         src_x888_8888_pixel cond, %(firstreg+2)
   197         src_x888_8888_pixel cond, %(firstreg+3)
   198   .endif
   199  .endif
   200 .endm
   202 generate_composite_function \
   203     pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, /* name; presumably src, mask, dst bpp */ \
   204     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
   205     3, /* prefetch distance */ \
   206     nop_macro, /* init */ \
   207     nop_macro, /* newline */ \
   208     nop_macro, /* cleanup */ \
   209     pixman_composite_src_x888_8888_process_head, \
   210     pixman_composite_src_x888_8888_process_tail /* process tail: sets alpha to 0xFF */
   212 /******************************************************************************/
   214 .macro src_0565_8888_init
               /* One-time setup for r5g6b5 -> a8r8g8b8 conversion.  MASK selects
                * the green field of both packed pixels; STRIDE_M is the opaque
                * alpha constant ORed in by src_0565_8888_2pixels. */
   215         /* Hold loop invariants in MASK and STRIDE_M */
   216         ldr     MASK, =0x07E007E0
   217         mov     STRIDE_M, #0xFF000000
   218         /* Set GE[3:0] to 1010 so SEL instructions do what we want */
               /* (0x80 + 0x80 carries out of bytes 1 and 3 only; the sum left in
                * SCRATCH is not used.) */
   219         ldr     SCRATCH, =0x80008000
   220         uadd8   SCRATCH, SCRATCH, SCRATCH
   221 .endm
   223 .macro src_0565_8888_2pixels, reg1, reg2
               /* Expand two packed r5g6b5 pixels into two a8r8g8b8 pixels.
                *   in:  WK<reg1> = two 565 pixels (upper case = high halfword,
                *        lower case = low halfword in the bit diagrams)
                *   out: WK<reg1> = low-halfword pixel, WK<reg2> = high-halfword
                *        pixel, both with alpha 0xFF
                * Clobbers SCRATCH.  Relies on MASK, STRIDE_M and GE[3:0] = 1010
                * as established by src_0565_8888_init.  Each field is widened by
                * ORing it with a right-shifted copy of itself. */
   224         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
   225         bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
   226         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
   227         mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
   228         mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
   229         bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
   230         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
   231         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
   232         pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
   233         sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
   234         mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
   235         pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
   236         sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
   237         orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
   238         orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
   239 .endm
   241 /* This version doesn't need STRIDE_M, but is one instruction longer.
   242    It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
   243         and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
   244         bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
   245         orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
   246         mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
   247         mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
   248         bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
   249         mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
   250         mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
   251         orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
   252         orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
   253         pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
   254         pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
   255         sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
   256         sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
   257         orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
   258         orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
   259 */
   261 .macro src_0565_8888_1pixel, reg
               /* Expand a single r5g6b5 pixel held in the low halfword of
                * WK<reg> into a8r8g8b8 in the same register (alpha = 0xFF).
                * Red and blue are widened in SCRATCH, green in WK<reg>; SEL then
                * merges them using GE[3:0] = 1010 from src_0565_8888_init.
                * Clobbers SCRATCH. */
   262         bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
   263         and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
   264         mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
   265         mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
   266         orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
   267         orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
   268         pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
   269         sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
   270         orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
   271 .endm
   273 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head: numbytes counts OUTPUT bytes (32bpp), so only half as
                * many source bytes (16bpp) are loaded.  For 16 output bytes the
                * 8 source bytes go to WK<firstreg> and WK<firstreg+2>, leaving
                * the odd registers free for src_0565_8888_2pixels to fill.
                * cond, unaligned_mask and preload are not used. */
   274  .if numbytes == 16
   275         pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
   276  .elseif numbytes == 8
   277         pixld   , 4, firstreg, SRC, unaligned_src
   278  .elseif numbytes == 4
   279         pixld   , 2, firstreg, SRC, unaligned_src
   280  .endif
   281 .endm
   283 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
               /* Tail: expand the loaded 565 pixels in place.  16 output bytes =
                * two pixel pairs, 8 = one pair, anything else = a single pixel.
                * cond is not used. */
   284  .if numbytes == 16
   285         src_0565_8888_2pixels firstreg, %(firstreg+1)
   286         src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
   287  .elseif numbytes == 8
   288         src_0565_8888_2pixels firstreg, %(firstreg+1)
   289  .else
   290         src_0565_8888_1pixel firstreg
   291  .endif
   292 .endm
   294 generate_composite_function \
   295     pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, /* name; presumably src, mask, dst bpp */ \
   296     FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
   297     3, /* prefetch distance */ \
   298     src_0565_8888_init, \
   299     nop_macro, /* newline */ \
   300     nop_macro, /* cleanup */ \
   301     src_0565_8888_process_head, \
   302     src_0565_8888_process_tail /* process tail: 565 -> 8888 expansion */
   304 /******************************************************************************/
   306 .macro add_8_8_8pixels  cond, dst1, dst2
               /* Per-byte unsigned saturating add of the 8 source bytes held in
                * MASK and STRIDE_M (loaded there as WK4/WK5 by
                * add_8_8_process_head) into WK<dst1> and WK<dst2>. */
   307         uqadd8&cond  WK&dst1, WK&dst1, MASK
   308         uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
   309 .endm
   311 .macro add_8_8_4pixels  cond, dst
               /* Per-byte unsigned saturating add of the source bytes held in
                * MASK into WK<dst>. */
   312         uqadd8&cond  WK&dst, WK&dst, MASK
   313 .endm
   315 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head for ADD a8 + a8.  Source bytes are loaded at register
                * index 4, with WK4/WK5 aliased onto MASK/STRIDE_M.  Only two
                * source registers exist, so the 16-byte case is split: load 8
                * source bytes, load all 16 destination bytes, add the first
                * half, then load the remaining 8 source bytes for the tail. */
   316     WK4     .req    MASK
   317     WK5     .req    STRIDE_M
   318  .if numbytes == 16
   319         pixld   cond, 8, 4, SRC, unaligned_src
   320         pixld   cond, 16, firstreg, DST, 0
   321         add_8_8_8pixels cond, firstreg, %(firstreg+1)
   322         pixld   cond, 8, 4, SRC, unaligned_src
   323  .else
   324         pixld   cond, numbytes, 4, SRC, unaligned_src
   325         pixld   cond, numbytes, firstreg, DST, 0
   326  .endif
   327     .unreq  WK4
   328     .unreq  WK5
   329 .endm
   331 .macro add_8_8_process_tail  cond, numbytes, firstreg
               /* Tail for ADD a8 + a8.  In the 16-byte case the first two
                * destination registers were already summed in the head, so only
                * registers firstreg+2 and firstreg+3 remain. */
   332  .if numbytes == 16
   333         add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
   334  .elseif numbytes == 8
   335         add_8_8_8pixels cond, firstreg, %(firstreg+1)
   336  .else
   337         add_8_8_4pixels cond, firstreg
   338  .endif
   339 .endm
   341 generate_composite_function \
   342     pixman_composite_add_8_8_asm_armv6, 8, 0, 8, /* name; presumably src, mask, dst bpp */ \
   343     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
   344     2, /* prefetch distance */ \
   345     nop_macro, /* init */ \
   346     nop_macro, /* newline */ \
   347     nop_macro, /* cleanup */ \
   348     add_8_8_process_head, \
   349     add_8_8_process_tail /* process tail: remaining saturating adds */
   351 /******************************************************************************/
   353 .macro over_8888_8888_init
               /* One-time setup for OVER 8888 on 8888.  MASK holds the rounding
                * constant 0x00800080 used by the MLAs in over_8888_8888_1pixel.
                * The UADD8 result in SCRATCH is discarded; only its effect on
                * the GE flags matters (0x80 + 0x80 carries in bytes 0 and 2). */
   354         /* Hold loop invariant in MASK */
   355         ldr     MASK, =0x00800080
   356         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   357         uadd8   SCRATCH, MASK, MASK
               /* These three are aliased as WK4, WK5 and WK7 by the head/tail. */
   358         line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
   359 .endm
   361 .macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head: load numbytes of source at register index 4+firstreg and
                * the matching destination pixels at index firstreg.  WK4-WK7
                * are aliased onto STRIDE_D, STRIDE_S, STRIDE_M and ORIG_W while
                * the loads are emitted.  cond is not used. */
   362     WK4     .req    STRIDE_D
   363     WK5     .req    STRIDE_S
   364     WK6     .req    STRIDE_M
   365     WK7     .req    ORIG_W
   366         pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
   367         pixld   , numbytes, firstreg, DST, 0
   368     .unreq  WK4
   369     .unreq  WK5
   370     .unreq  WK6
   371     .unreq  WK7
   372 .endm
   374 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
               /* Leaves Z set only if every tested source register is zero.
                * One register is tested for 4 bytes, two for 8, four for more.
                * Each TEQEQ runs only while all earlier tests found zero. */
   375         /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
   376         teq     WK&reg0, #0
   377  .if numbytes > 4
   378         teqeq   WK&reg1, #0
   379   .if numbytes > 8
   380         teqeq   WK&reg2, #0
   381         teqeq   WK&reg3, #0
   382   .endif
   383  .endif
   384 .endm
   386 .macro over_8888_8888_prepare  next
               /* Reduce source pixel WK<next> to its alpha byte (bits 31:24
                * moved down to bits 7:0), as expected on entry to
                * over_8888_8888_1pixel. */
   387         mov     WK&next, WK&next, lsr #24
   388 .endm
   390 .macro over_8888_8888_1pixel src, dst, offset, next
               /* Composite one pixel: dst = src + dst * (255 - src.alpha) / 255,
                * per byte with saturation.
                *   src    index of WK register holding the source ALPHA only
                *          (already shifted down by 24); overwritten with the
                *          full source pixel reloaded from memory
                *   dst    index of WK register holding the destination pixel;
                *          receives the result
                *   offset byte offset of this source pixel relative to SRC
                *          (negative as passed by the tail)
                *   next   index of the next source register, pre-shifted here
                *          unless this is the last pixel of the group
                * Clobbers SCRATCH.  Needs MASK = 0x00800080 and GE[3:0] = 0101
                * from over_8888_8888_init. */
   391         /* src = destination component multiplier */
   392         rsb     WK&src, WK&src, #255
   393         /* Split even/odd bytes of dst into SCRATCH/dst */
   394         uxtb16  SCRATCH, WK&dst
   395         uxtb16  WK&dst, WK&dst, ror #8
   396         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
   397         mla     SCRATCH, SCRATCH, WK&src, MASK
   398         mla     WK&dst, WK&dst, WK&src, MASK
   399         /* Where we would have had a stall between the result of the first MLA and the shifter input,
   400          * reload the complete source pixel */
   401         ldr     WK&src, [SRC, #offset]
   402         /* Multiply by 257/256 to approximate 256/255 */
   403         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
   404         /* In this stall, start processing the next pixel */
               /* (offset == -4 is the last pixel in the group: nothing follows) */
   405  .if offset < -4
   406         mov     WK&next, WK&next, lsr #24
   407  .endif
   408         uxtab16 WK&dst, WK&dst, WK&dst, ror #8
   409         /* Recombine even/odd bytes of multiplied destination */
   410         mov     SCRATCH, SCRATCH, ror #8
   411         sel     WK&dst, SCRATCH, WK&dst
   412         /* Saturated add of source to multiplied destination */
   413         uqadd8  WK&dst, WK&dst, WK&src
   414 .endm
   416 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
               /* Tail: if every source pixel in the group is zero, branch to
                * 10: and skip both the blend and the store.  Otherwise blend
                * numbytes/4 pixels one at a time and store the result.
                * PROCESS_REG / PROCESS_OFF are assembly-time counters stepping
                * the register index and the source offset (-numbytes upwards in
                * steps of 4).  cond is not used. */
   417     WK4     .req    STRIDE_D
   418     WK5     .req    STRIDE_S
   419     WK6     .req    STRIDE_M
   420     WK7     .req    ORIG_W
   421         over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
   422         beq     10f
               /* Only the first pixel needs an explicit prepare; each 1pixel
                * expansion prepares its successor. */
   423         over_8888_8888_prepare  %(4+firstreg)
   424  .set PROCESS_REG, firstreg
   425  .set PROCESS_OFF, -numbytes
   426  .rept numbytes / 4
   427         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
   428   .set PROCESS_REG, PROCESS_REG+1
   429   .set PROCESS_OFF, PROCESS_OFF+4
   430  .endr
   431         pixst   , numbytes, firstreg, DST
   432 10:
   433     .unreq  WK4
   434     .unreq  WK5
   435     .unreq  WK6
   436     .unreq  WK7
   437 .endm
   439 generate_composite_function \
   440     pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, /* name; presumably src, mask, dst bpp */ \
   441     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   442     2, /* prefetch distance */ \
   443     over_8888_8888_init, \
   444     nop_macro, /* newline */ \
   445     nop_macro, /* cleanup */ \
   446     over_8888_8888_process_head, \
   447     over_8888_8888_process_tail /* process tail: blends and stores */
   449 /******************************************************************************/
   451 /* Multiply each byte of a word by a byte.
   452  * Useful when there aren't any obvious ways to fill the stalls with other instructions.
   453  * word  Register containing 4 bytes
   454  * byte  Register containing byte multiplier (bits 8-31 must be 0)
   455  * tmp   Scratch register
   456  * half  Register containing the constant 0x00800080
   457  * GE[3:0] bits must contain 0101
   458  */
   459 .macro mul_8888_8  word, byte, tmp, half
               /* On exit word holds the four scaled bytes; tmp is clobbered. */
   460         /* Split even/odd bytes of word apart */
   461         uxtb16  tmp, word
   462         uxtb16  word, word, ror #8
   463         /* Multiply bytes together with rounding, then by 257/256 */
   464         mla     tmp, tmp, byte, half
   465         mla     word, word, byte, half /* 1 stall follows */
   466         uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
   467         uxtab16 word, word, word, ror #8
   468         /* Recombine bytes */
   469         mov     tmp, tmp, ror #8
   470         sel     word, tmp, word
   471 .endm
   473 /******************************************************************************/
   475 .macro over_8888_n_8888_init
               /* One-time setup for OVER 8888 with a solid mask on 8888.  After
                * this, MASK = alpha byte of the constant mask and STRIDE_M =
                * 0x00800080 (rounding constant for mul_8888_8).  The UADD8
                * result in SCRATCH is discarded; only the GE flags matter. */
   476         /* Mask is constant */
   477         ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
   478         /* Hold loop invariant in STRIDE_M */
   479         ldr     STRIDE_M, =0x00800080
   480         /* We only want the alpha bits of the constant mask */
   481         mov     MASK, MASK, lsr #24
   482         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   483         uadd8   SCRATCH, STRIDE_M, STRIDE_M
               /* These four are aliased as WK4-WK7 by the head/tail. */
   484         line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
   485 .endm
   487 .macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head: load numbytes of source starting at WK4 or WK5 (index
                * 4 + firstreg%2) and the destination pixels at index firstreg.
                * WK4-WK7 are aliased onto Y, STRIDE_D, STRIDE_S and ORIG_W.
                * cond is not used. */
   488     WK4     .req    Y
   489     WK5     .req    STRIDE_D
   490     WK6     .req    STRIDE_S
   491     WK7     .req    ORIG_W
   492         pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
   493         pixld   , numbytes, firstreg, DST, 0
   494     .unreq  WK4
   495     .unreq  WK5
   496     .unreq  WK6
   497     .unreq  WK7
   498 .endm
   500 .macro over_8888_n_8888_1pixel src, dst
               /* Composite one pixel with a constant mask:
                *   src = src * mask_alpha
                *   WK7 = WK6 - src.alpha   (WK6 is set to 255 by the tail)
                *   dst = dst * WK7 + src   (per byte, saturating add)
                * Must be expanded where WK6/WK7 are defined (the tail macro).
                * Clobbers SCRATCH and WK7. */
   501         mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
   502         sub     WK7, WK6, WK&src, lsr #24
   503         mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
   504         uqadd8  WK&dst, WK&dst, WK&src
   505 .endm
   507 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
               /* Tail: if every source pixel in the group is zero, branch to
                * 10: and skip both the blend and the store.  Otherwise blend
                * numbytes/4 pixels, alternating between WK4 and WK5 as the
                * source register, then store.  cond is not used. */
   508     WK4     .req    Y
   509     WK5     .req    STRIDE_D
   510     WK6     .req    STRIDE_S
   511     WK7     .req    ORIG_W
   512         over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
   513         beq     10f
               /* From here on WK6 holds the constant 255 and WK7 is a temporary,
                * so any source pixels that were loaded there are lost. */
   514         mov     WK6, #255
   515  .set PROCESS_REG, firstreg
   516  .rept numbytes / 4
   517   .if numbytes == 16 && PROCESS_REG == 2
   518         /* We're using WK6 and WK7 as temporaries, so half way through
   519          * 4 pixels, reload the second two source pixels but this time
   520          * into WK4 and WK5 */
               /* (LDMDB reads the two words immediately below SRC) */
   521         ldmdb   SRC, {WK4, WK5}
   522   .endif
   523         over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
   524   .set PROCESS_REG, PROCESS_REG+1
   525  .endr
   526         pixst   , numbytes, firstreg, DST
   527 10:
   528     .unreq  WK4
   529     .unreq  WK5
   530     .unreq  WK6
   531     .unreq  WK7
   532 .endm
   534 generate_composite_function \
   535     pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, /* name; presumably src, mask, dst bpp */ \
   536     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   537     2, /* prefetch distance */ \
   538     over_8888_n_8888_init, \
   539     nop_macro, /* newline */ \
   540     nop_macro, /* cleanup */ \
   541     over_8888_n_8888_process_head, \
   542     over_8888_n_8888_process_tail /* process tail: blends and stores */
   544 /******************************************************************************/
   546 .macro over_n_8_8888_init
               /* One-time setup for OVER solid source with a8 mask on 8888.
                * After this, STRIDE_S holds bytes 0 and 2 of the solid colour
                * (one per halfword) and SRC holds bytes 1 and 3.  The UADD8
                * result in SCRATCH is discarded; only the GE flags matter. */
   547         /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
   548         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
   549         /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
   550         ldr     SCRATCH, =0x00800080
   551         uxtb16  STRIDE_S, SRC
   552         uxtb16  SRC, SRC, ror #8
   553         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   554         uadd8   SCRATCH, SCRATCH, SCRATCH
   555         line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
   556 .endm
   558 .macro over_n_8_8888_newline
               /* Per-line hook: load the rounding constant 0x00800080 into
                * STRIDE_D, which the per-pixel code uses as the MLA addend.
                * .ltorg emits the literal pool here, so the code branches over
                * it to label 1. */
   559         ldr     STRIDE_D, =0x00800080
   560         b       1f
   561  .ltorg
   562 1:
   563 .endm
   565 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
               /* Head: load numbytes/4 mask bytes (one a8 byte per 32bpp output
                * pixel) at register index 4, i.e. STRIDE_M, and numbytes of
                * destination at index firstreg.  cond is not used. */
   566     WK4     .req    STRIDE_M
   567         pixld   , numbytes/4, 4, MASK, unaligned_mask
   568         pixld   , numbytes, firstreg, DST, 0
   569     .unreq  WK4
   570 .endm
   572 .macro over_n_8_8888_1pixel src, dst
               /* Composite one pixel of solid source through an a8 mask.
                *   src  index (0-3) of the mask byte within WK4 for this pixel
                *   dst  index of the WK register holding the destination pixel
                * Steps: Y = mask byte; scale the pre-split source colour
                * (STRIDE_S = even bytes, SRC = odd bytes) by it; ORIG_W = 255 -
                * alpha of the scaled source; dst = dst * ORIG_W + scaled source
                * with per-byte saturation.  Clobbers Y, ORIG_W and SCRATCH.
                * Must be expanded where WK4 is defined (the tail macro). */
   573         uxtb    Y, WK4, ror #src*8
   574         /* Trailing part of multiplication of source */
   575         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
   576         mla     Y, SRC, Y, STRIDE_D
   577         mov     ORIG_W, #255
   578         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
   579         uxtab16 Y, Y, Y, ror #8
   580         mov     SCRATCH, SCRATCH, ror #8
               /* Bits 31:24 of Y hold the scaled source alpha at this point */
   581         sub     ORIG_W, ORIG_W, Y, lsr #24
   582         sel     Y, SCRATCH, Y
   583         /* Then multiply the destination */
   584         mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
   585         uqadd8  WK&dst, WK&dst, Y
   586 .endm
   588 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
               /* Tail: if the loaded mask word (WK4 = STRIDE_M) is zero, branch
                * to 10: and skip both the blend and the store.  Otherwise blend
                * numbytes/4 pixels, taking successive mask bytes from WK4, and
                * store.  cond is not used. */
   589     WK4     .req    STRIDE_M
   590         teq     WK4, #0
   591         beq     10f
   592  .set PROCESS_REG, firstreg
   593  .rept numbytes / 4
   594         over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
   595   .set PROCESS_REG, PROCESS_REG+1
   596  .endr
   597         pixst   , numbytes, firstreg, DST
   598 10:
   599     .unreq  WK4
   600 .endm
   602 generate_composite_function \
   603     pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, /* name; presumably src, mask, dst bpp */ \
   604     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   605     2, /* prefetch distance */ \
   606     over_n_8_8888_init, \
   607     over_n_8_8888_newline, \
   608     nop_macro, /* cleanup */ \
   609     over_n_8_8888_process_head, \
   610     over_n_8_8888_process_tail /* process tail: blends and stores */
   612 /******************************************************************************/

mercurial