/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 * cond           ARM condition code for code block
 * numbytes       Number of output bytes that should be generated this time
 * firstreg       First WK register in which to place output
 * unaligned_src  Whether to use non-wordaligned loads of source image
 * unaligned_mask Whether to use non-wordaligned loads of mask image
 * preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/
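
/* For reference, a minimal C model of what the three blits above do for one
 * scanline (illustrative only; the names below are not pixman API, and the
 * real entry points also handle strides, preloads and unaligned heads/tails):
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     // The inner loop above moves 32 bytes (8 words) per iteration with
 *     // LDM/STM; any leading/trailing bytes go through the head/tail macros.
 *     static void blit_scanline(uint8_t *dst, const uint8_t *src, size_t nbytes)
 *     {
 *         memcpy(dst, src, nbytes);
 *     }
 */
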
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
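
/* The fill initialisers above replicate the constant colour across a full
 * word: "orr SRC, SRC, lsl #8" then "orr SRC, SRC, lsl #16" broadcasts a
 * byte, which is the same as multiplying by 0x01010101. A C sketch of the
 * idea (hypothetical helper names, not pixman API):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t replicate_byte(uint8_t v)      // 0xAB -> 0xABABABAB
 *     {
 *         return (uint32_t)v * 0x01010101u;
 *     }
 *
 *     static uint32_t replicate_half(uint16_t v)     // 0xABCD -> 0xABCDABCD
 *     {
 *         return (uint32_t)v | ((uint32_t)v << 16);
 *     }
 */
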
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/

.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/
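
/* The next block converts r5g6b5 to a8r8g8b8. Each field is widened by
 * replicating its top bits into the new low bits (so 0x1F maps to 0xFF),
 * and alpha is forced to 255. A scalar C model of the conversion that the
 * macros below perform two pixels at a time (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t convert_0565_to_8888(uint16_t p)
 *     {
 *         uint32_t r = (p >> 11) & 0x1f;
 *         uint32_t g = (p >> 5)  & 0x3f;
 *         uint32_t b = p & 0x1f;
 *         r = (r << 3) | (r >> 2);   // 5 -> 8 bits
 *         g = (g << 2) | (g >> 4);   // 6 -> 8 bits
 *         b = (b << 3) | (b >> 2);   // 5 -> 8 bits
 *         return 0xff000000u | (r << 16) | (g << 8) | b;
 *     }
 */
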
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/
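
/* The add_8_8 macros below rely on UQADD8, which performs four independent
 * byte-wise saturating additions in one instruction. A C model of one
 * word's worth (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t uqadd8_model(uint32_t a, uint32_t b)
 *     {
 *         uint32_t d = 0;
 *         for (int i = 0; i < 32; i += 8) {
 *             uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
 *             d |= (s > 255 ? 255 : s) << i;   // clamp each byte at 255
 *         }
 *         return d;
 *     }
 */
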
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond WK&dst1, WK&dst1, MASK
        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
        uqadd8&cond WK&dst, WK&dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/
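
/* The over_8888_8888 macros below implement the OVER operator for
 * premultiplied ARGB: dest = src + dest * (255 - alpha(src)) / 255, with
 * the division by 255 rounded. The MLA (+0x80 bias) followed by
 * UXTAB16 ..., ror #8 ("multiply by 257/256") realises the usual rounding
 * form (v + 128 + ((v + 128) >> 8)) >> 8. A scalar C model (helper names
 * are illustrative, not pixman API):
 *
 *     #include <stdint.h>
 *
 *     #define DIV255(v) (((v) + 128 + (((v) + 128) >> 8)) >> 8)
 *
 *     static uint32_t over_pixel(uint32_t src, uint32_t dst)
 *     {
 *         uint32_t ia = 255 - (src >> 24);
 *         uint32_t d = 0;
 *         if (src == 0)              // premultiplied: only 0 is transparent
 *             return dst;
 *         for (int i = 0; i < 32; i += 8) {
 *             uint32_t s = (src >> i) & 0xff;
 *             uint32_t t = DIV255(((dst >> i) & 0xff) * ia) + s;
 *             d |= (t > 255 ? 255 : t) << i;   // UQADD8 saturates per byte
 *         }
 *         return d;
 *     }
 */
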
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm

.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/
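
/* The mul_8888_8 helper defined below multiplies each byte of a pixel by a
 * byte with rounding, processing even and odd bytes in the two 16-bit
 * halves of a register (the UXTB16/UXTAB16 pairing). A C model of the same
 * even/odd-lane arithmetic (helper name is illustrative):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t mul_8888_8_model(uint32_t word, uint32_t byte)
 *     {
 *         uint32_t even = (word & 0x00ff00ffu) * byte + 0x00800080u;
 *         uint32_t odd  = ((word >> 8) & 0x00ff00ffu) * byte + 0x00800080u;
 *         even += (even >> 8) & 0x00ff00ffu;   // lane += lane >> 8
 *         odd  += (odd  >> 8) & 0x00ff00ffu;
 *         // rounded quotients now sit in bits 15:8 of each 16-bit lane
 *         return ((even >> 8) & 0x00ff00ffu) | (odd & 0xff00ff00u);
 *     }
 */
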
/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half  /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8   /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/
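
/* over_8888_n_8888 below composites a source image through a constant mask:
 * the source pixel is first multiplied by the mask's alpha byte, then
 * OVER-composited as usual. A scalar C model, reusing the illustrative
 * helpers sketched earlier (mul_8888_8_model, uqadd8_model):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_8888_n_8888_pixel(uint32_t src, uint32_t dst,
 *                                            uint32_t mask_alpha)
 *     {
 *         src = mul_8888_8_model(src, mask_alpha);        // src IN mask
 *         dst = mul_8888_8_model(dst, 255 - (src >> 24)); // dst * (1 - a)
 *         return uqadd8_model(dst, src);                  // saturating add
 *     }
 */
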
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/
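
/* over_n_8_8888 above is the mirror image of over_8888_n_8888: the source is
 * the constant (pre-split into even/odd bytes in STRIDE_S/SRC as a loop
 * invariant) and the 8-bit mask varies per pixel. In the same illustrative
 * C terms as the earlier sketches:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_n_8_8888_pixel(uint32_t src_n, uint8_t m,
 *                                         uint32_t dst)
 *     {
 *         uint32_t s = mul_8888_8_model(src_n, m);      // constant src IN mask
 *         dst = mul_8888_8_model(dst, 255 - (s >> 24)); // dst * (1 - a)
 *         return uqadd8_model(dst, s);
 *     }
 */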