/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
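
/*
 * Illustrative example (figures chosen purely for illustration, not taken
 * from any particular build): with dst_w_bpp = 32, src_bpp = 32 and
 * prefetch_distance = 3, pix_per_block works out to 8, so the dispatch in
 * generate_composite_function below classifies a row as:
 * * "narrow" when X < 2*16*8/32 - 1 = 7 pixels (no aligned 16-byte write
 *   is guaranteed)
 * * "medium" when 7 <= X < (3+3)*8 - 1 = 47 pixels
 * * "wide"   when X >= 47 pixels (at least one full inner-loop iteration)
 */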

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune the generated function's behavior.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
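
/*
 * Worked example (added for illustration; these invocations are
 * hypothetical): with an empty condition field and the register aliases set
 * up by generate_composite_function below,
 *     pixld , 16, 0, SRC, 0   expands to   ldmia SRC!, {WK0,WK1,WK2,WK3}
 *     pixld , 16, 0, SRC, 1   expands to four post-indexed ldr instructions
 * and pixst , 16, 0, DST expands to stmia DST!, {WK0,WK1,WK2,WK3} for a
 * write-only destination, or to stmdb DST, {WK0,WK1,WK2,WK3} (no writeback,
 * because the earlier destination load already advanced DST) when
 * FLAG_DST_READWRITE is set.
 */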

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
 .set OFFSET, 0
 .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
 .set OFFSET, OFFSET+32
 .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
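
/*
 * Worked example for preload_leading_step2 (numbers assumed purely for
 * illustration): suppose src_bpp = 32, dst_w_bpp = 8, (-DST)&15 = 12 and
 * SRC&31 = 20. Then
 *     leading_bytes     = 12*32/8     = 48
 *     inner_loop_offset = (20+48)&31  = 4
 *     extra_needed      = 48-4        = 44   (> 32)
 * so two cache lines beyond those covered by preload_leading_step1 get
 * preloaded. The "> 32" band can only arise in this 4-source-bytes-per-
 * destination-byte configuration, as the comment above notes.
 */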

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle  bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was strictly greater than 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm


.macro preload_line  narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so (assuming
 * the source pixels are no wider than the destination pixels) they cannot
 * possibly straddle more than 2 32-byte cachelines, meaning there's no
 * need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm
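
/*
 * Worked example for preload_line (figures assumed for illustration): with
 * bpp = 8, X = 25 pixels and base & 31 = 12, the narrow-case branch preloads
 * the cacheline at base & ~31, then computes the cacheline holding the last
 * byte (offset 12 + 24 = 36, i.e. the next cacheline) and, since it differs
 * from the first, issues exactly one further pld - never more than two in
 * total, which is why no loop is needed there.
 */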


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
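
/*
 * Worked example (illustrative values): with dst_w_bpp = 32 and DST & 15 = 4,
 * WK0 holds -DST so its bits 0-3 are 12 - the number of bytes needed to
 * reach 16-byte alignment. test_bits_3_2_ptr (lsl #29) then sets N from
 * bit 2 (= 1) and C from bit 3 (= 1), so conditional_process2 emits a
 * 4-byte ("mi") and then an 8-byte ("cs") conditional pass, consuming
 * exactly those 12 bytes.
 */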

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm
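
/*
 * Illustration (values assumed): for a 32bpp-over-32bpp read-write
 * operation, pix_per_block = 8, so pix_per_block*dst_w_bpp/128 = 2 and each
 * trip round the wide-case inner loop above performs two 16-byte store
 * blocks (SUBBLOCKs 0 and 1); the destination prefetch is issued on the
 * even SUBBLOCK only, giving one 32-byte destination preload per 32 bytes
 * written.
 */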

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
 .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
 .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
 .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
 .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm
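
/*
 * Illustration (bit depths assumed): for a function with src_bpp = 32 and
 * no mask, neither alignment test is emitted and only the (aligned, aligned)
 * variant of "action" is generated. For src_bpp = 16 the macro emits
 * "tst SRC, #3" and generates both an aligned-source and an unaligned-source
 * variant, selected at run time; with an 8- or 16-bit mask as well, all four
 * combinations are generated.
 */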


.macro end_of_line  restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm
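
/*
 * Example of the .word encodings used here and in generate_composite_function
 * below (register choice assumed for illustration): if an operation declares
 * "line_saved_regs Y, ORIG_W", then LINE_SAVED_REGS = (1<<1)|(1<<14) and the
 * emitted words are
 *     0xE92D4002   stmdb sp!, {r1, lr}   (spill at the top of the line loop)
 *     0xE89D4002   ldmia sp, {r1, lr}    (reload in end_of_line)
 * i.e. exactly the registers aliased to Y and ORIG_W.
 */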


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif
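
/*
 * For example (bit depths assumed for illustration): a 32bpp source
 * composited onto a 16bpp destination gives pix_per_block =
 * max(16*8/16, 32*8/32) = 8, whereas a 32bpp source onto an 8bpp destination
 * gives max(16*8/8, 32*8/32) = 16 - i.e. the block size is dictated by
 * whichever channel covers the most pixels per fetch or store unit.
 */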

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

fname:
        .fnstart
        .save   {r4-r11, lr}
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        .pad    #9*4
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
 .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
 .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */
        .fnend

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
.endfunc
.endm

.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm

.macro nop_macro x:vararg
.endm
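
/*
 * Sketch of a typical invocation (the function and helper-macro names here
 * are hypothetical; real invocations live in the .S files that include this
 * header). Assuming a 32bpp source, no mask, a 32bpp read-write destination
 * and per-line spills of Y and ORIG_W:
 *
 *     line_saved_regs Y, ORIG_W
 *
 *     generate_composite_function \
 *         example_op_8888_8888_asm_armv6, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_SPILL_LINE_VARS, \
 *         2, \
 *         nop_macro, nop_macro, nop_macro, \
 *         example_process_head, example_process_tail
 *
 * where 32, 0, 32 are the src/mask/dst bpp, 2 is the prefetch distance, the
 * three nop_macro arguments stand in for init/newline/cleanup, and
 * process_inner_loop is omitted so the generic wide_case_inner_loop is used.
 */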