/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 * cond           ARM condition code for code block
 * numbytes       Number of output bytes that should be generated this time
 * firstreg       First WK register in which to place output
 * unaligned_src  Whether to use non-wordaligned loads of source image
 * unaligned_mask Whether to use non-wordaligned loads of mask image
 * preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/
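
/* For reference, a minimal C model of what the three blits above do for one
 * scanline (illustrative only; the names below are not pixman API, and the
 * real entry points also handle strides, preloads and unaligned heads/tails):
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     // The inner loop above moves 32 bytes (8 words) per iteration with
 *     // LDM/STM; any leading/trailing bytes go through the head/tail macros.
 *     static void blit_scanline(uint8_t *dst, const uint8_t *src, size_t nbytes)
 *     {
 *         memcpy(dst, src, nbytes);
 *     }
 */
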
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
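
/* The fill initialisers above replicate the constant colour across a full
 * word: "orr SRC, SRC, lsl #8" then "orr SRC, SRC, lsl #16" broadcasts a
 * byte, which is the same as multiplying by 0x01010101. A C sketch of the
 * idea (hypothetical helper names, not pixman API):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t replicate_byte(uint8_t v)      // 0xAB -> 0xABABABAB
 *     {
 *         return (uint32_t)v * 0x01010101u;
 *     }
 *
 *     static uint32_t replicate_half(uint16_t v)     // 0xABCD -> 0xABCDABCD
 *     {
 *         return (uint32_t)v | ((uint32_t)v << 16);
 *     }
 */
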
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/

.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/
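
/* The next block converts r5g6b5 to a8r8g8b8. Each field is widened by
 * replicating its top bits into the new low bits (so 0x1F maps to 0xFF),
 * and alpha is forced to 255. A scalar C model of the conversion that the
 * macros below perform two pixels at a time (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t convert_0565_to_8888(uint16_t p)
 *     {
 *         uint32_t r = (p >> 11) & 0x1f;
 *         uint32_t g = (p >> 5)  & 0x3f;
 *         uint32_t b = p & 0x1f;
 *         r = (r << 3) | (r >> 2);   // 5 -> 8 bits
 *         g = (g << 2) | (g >> 4);   // 6 -> 8 bits
 *         b = (b << 3) | (b >> 2);   // 5 -> 8 bits
 *         return 0xff000000u | (r << 16) | (g << 8) | b;
 *     }
 */
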
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/
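
/* The add_8_8 macros below rely on UQADD8, which performs four independent
 * byte-wise saturating additions in one instruction. A C model of one
 * word's worth (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t uqadd8_model(uint32_t a, uint32_t b)
 *     {
 *         uint32_t d = 0;
 *         for (int i = 0; i < 32; i += 8) {
 *             uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
 *             d |= (s > 255 ? 255 : s) << i;   // clamp each byte at 255
 *         }
 *         return d;
 *     }
 */
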
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond WK&dst1, WK&dst1, MASK
        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
        uqadd8&cond WK&dst, WK&dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/
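
/* The over_8888_8888 macros below implement the OVER operator for
 * premultiplied ARGB: dest = src + dest * (255 - alpha(src)) / 255, with
 * the division by 255 rounded. The MLA (+0x80 bias) followed by
 * UXTAB16 ..., ror #8 ("multiply by 257/256") realises the usual rounding
 * form (v + 128 + ((v + 128) >> 8)) >> 8. A scalar C model (helper names
 * are illustrative, not pixman API):
 *
 *     #include <stdint.h>
 *
 *     #define DIV255(v) (((v) + 128 + (((v) + 128) >> 8)) >> 8)
 *
 *     static uint32_t over_pixel(uint32_t src, uint32_t dst)
 *     {
 *         uint32_t ia = 255 - (src >> 24);
 *         uint32_t d = 0;
 *         if (src == 0)              // premultiplied: only 0 is transparent
 *             return dst;
 *         for (int i = 0; i < 32; i += 8) {
 *             uint32_t s = (src >> i) & 0xff;
 *             uint32_t t = DIV255(((dst >> i) & 0xff) * ia) + s;
 *             d |= (t > 255 ? 255 : t) << i;   // UQADD8 saturates per byte
 *         }
 *         return d;
 *     }
 */
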
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm

.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/
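
/* The mul_8888_8 helper defined below multiplies each byte of a pixel by a
 * byte with rounding, processing even and odd bytes in the two 16-bit
 * halves of a register (the UXTB16/UXTAB16 pairing). A C model of the same
 * even/odd-lane arithmetic (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t mul_8888_8_model(uint32_t word, uint32_t byte)
 *     {
 *         uint32_t even = (word & 0x00ff00ffu) * byte + 0x00800080u;
 *         uint32_t odd  = ((word >> 8) & 0x00ff00ffu) * byte + 0x00800080u;
 *         even += (even >> 8) & 0x00ff00ffu;   // lane += lane >> 8
 *         odd  += (odd  >> 8) & 0x00ff00ffu;
 *         // rounded quotients now sit in bits 15:8 of each 16-bit lane
 *         return ((even >> 8) & 0x00ff00ffu) | (odd & 0xff00ff00u);
 *     }
 */
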
/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half  /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8   /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/
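
/* over_8888_n_8888 below composites a source image through a constant mask:
 * the source pixel is first multiplied by the mask's alpha byte, then
 * OVER-composited as usual. A scalar C model, reusing the illustrative
 * helpers sketched earlier (mul_8888_8_model, uqadd8_model):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_8888_n_8888_pixel(uint32_t src, uint32_t dst,
 *                                            uint32_t mask_alpha)
 *     {
 *         src = mul_8888_8_model(src, mask_alpha);        // src IN mask
 *         dst = mul_8888_8_model(dst, 255 - (src >> 24)); // dst * (1 - a)
 *         return uqadd8_model(dst, src);                  // saturating add
 *     }
 */
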
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/
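
/* over_n_8_8888 above is the mirror image of over_8888_n_8888: the source is
 * the constant (pre-split into even/odd bytes in STRIDE_S/SRC as a loop
 * invariant) and the 8-bit mask varies per pixel. In the same illustrative
 * C terms as the earlier sketches:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_n_8_8888_pixel(uint32_t src_n, uint8_t m,
 *                                         uint32_t dst)
 *     {
 *         uint32_t s = mul_8888_8_model(src_n, m);      // constant src IN mask
 *         dst = mul_8888_8_model(dst, 255 - (s >> 24)); // dst * (1 - a)
 *         return uqadd8_model(dst, s);
 *     }
 */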