--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,912 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ *   processed, another cacheline is preloaded (the exact distance ahead is
+ *   determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ *   cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ *   are preloaded to deal with data (if any) that hangs off the end of the
+ *   last iteration of the inner loop, plus any trailing bytes that were not
+ *   enough to make up one whole iteration of the inner loop
+ *
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
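[Reviewer illustration, not part of the patch: the preload pattern described in the comment above, sketched in C under the assumption of 32-byte cachelines. The names (prefetch_row, pld) and the example values are purely illustrative, and the trailing phase is folded into the loop bound rather than modelling the real code's partial final iteration.]

    #include <stdint.h>
    #include <stdio.h>

    #define CACHELINE 32u                 /* preload granularity assumed by the asm */

    static void pld(uintptr_t addr)       /* stand-in for the PLD instruction */
    {
        printf("pld %#lx\n", (unsigned long)addr);
    }

    /* Issue every preload for one scanline exactly once, in the phases above. */
    static void prefetch_row(uintptr_t row, size_t row_bytes, unsigned prefetch_distance)
    {
        uintptr_t next = row & ~(uintptr_t)(CACHELINE - 1);                    /* first cacheline */
        uintptr_t last = (row + row_bytes - 1) & ~(uintptr_t)(CACHELINE - 1);  /* last cacheline */

        /* Leading section: pull in enough lines that the inner loop never outruns them */
        for (unsigned i = 0; i <= prefetch_distance && next <= last; i++, next += CACHELINE)
            pld(next);

        /* Inner loop section: in the real code each of these preloads is interleaved
         * with the processing of one cacheline of pixels, a fixed distance behind */
        for (; next <= last; next += CACHELINE)
            pld(next);

        /* Trailing section: nothing left to do in this simplified model, because the
         * loop bound above already stops at the last cacheline the row touches */
    }

    int main(void)
    {
        prefetch_row(0x8004, 200, 3);     /* arbitrary example row */
        return 0;
    }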
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for the 'generate_composite_function' macro, used to tune
+ * the behavior of the generated functions.
+ */
+.set FLAG_DST_WRITEONLY,             0
+.set FLAG_DST_READWRITE,             1
+.set FLAG_COND_EXEC,                 0
+.set FLAG_BRANCH_OVER,               2
+.set FLAG_PROCESS_PRESERVES_PSR,     0
+.set FLAG_PROCESS_CORRUPTS_PSR,      4
+.set FLAG_PROCESS_DOESNT_STORE,      0
+.set FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
+.set FLAG_NO_SPILL_LINE_VARS,        0
+.set FLAG_SPILL_LINE_VARS_WIDE,      16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
+.set FLAG_SPILL_LINE_VARS,           48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET, (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET, (9*4)
+#endif
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,     0
+.set PREFETCH_TYPE_STANDARD, 1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+        op&r&cond    WK&reg2, [base], #4
+        op&r&cond    WK&reg3, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+  .endif
+ .elseif numbytes == 8
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1}
+  .endif
+ .elseif numbytes == 4
+        op&r&cond    WK&reg0, [base], #4
+ .elseif numbytes == 2
+        op&r&cond&h  WK&reg0, [base], #2
+ .elseif numbytes == 1
+        op&r&cond&b  WK&reg0, [base], #1
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+        stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+        str&cond    WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+        str&cond&h  WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+        str&cond&b  WK&reg0, [base, #-1]
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+        a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1  bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
* between 0 and prefetch_distance (inclusive) cache lines ahead so there 1.168 + * are no gaps when the inner loop starts. 1.169 + */ 1.170 + .if bpp > 0 1.171 + PF bic, ptr, base, #31 1.172 + .set OFFSET, 0 1.173 + .rept prefetch_distance+1 1.174 + PF pld, [ptr, #OFFSET] 1.175 + .set OFFSET, OFFSET+32 1.176 + .endr 1.177 + .endif 1.178 +.endm 1.179 + 1.180 +.macro preload_leading_step2 bpp, bpp_shift, ptr, base 1.181 +/* However, if the destination is not 16-byte aligned, we may need to 1.182 + * preload more cache lines than that. The question we need to ask is: 1.183 + * are the bytes corresponding to the leading pixels more than the amount 1.184 + * by which the source pointer will be rounded down for preloading, and if 1.185 + * so, by how many cache lines? Effectively, we want to calculate 1.186 + * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp 1.187 + * inner_loop_offset = (src+leading_bytes)&31 1.188 + * extra_needed = leading_bytes - inner_loop_offset 1.189 + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only 1.190 + * possible when there are 4 src bytes for every 1 dst byte). 1.191 + */ 1.192 + .if bpp > 0 1.193 + .ifc base,DST 1.194 + /* The test can be simplified further when preloading the destination */ 1.195 + PF tst, base, #16 1.196 + PF beq, 61f 1.197 + .else 1.198 + .if bpp/dst_w_bpp == 4 1.199 + PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift 1.200 + PF and, SCRATCH, SCRATCH, #31 1.201 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift 1.202 + PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ 1.203 + PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */ 1.204 + PF bcs, 61f 1.205 + PF bpl, 60f 1.206 + PF pld, [ptr, #32*(prefetch_distance+2)] 1.207 + .else 1.208 + PF mov, SCRATCH, base, lsl #32-5 1.209 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift 1.210 + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift 1.211 + PF bls, 61f 1.212 + .endif 1.213 + .endif 1.214 +60: PF pld, [ptr, #32*(prefetch_distance+1)] 1.215 +61: 1.216 + .endif 1.217 +.endm 1.218 + 1.219 +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) 1.220 +.macro preload_middle bpp, base, scratch_holds_offset 1.221 + .if bpp > 0 1.222 + /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ 1.223 + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) 1.224 + .if scratch_holds_offset 1.225 + PF pld, [base, SCRATCH] 1.226 + .else 1.227 + PF bic, SCRATCH, base, #31 1.228 + PF pld, [SCRATCH, #32*prefetch_distance] 1.229 + .endif 1.230 + .endif 1.231 + .endif 1.232 +.endm 1.233 + 1.234 +.macro preload_trailing bpp, bpp_shift, base 1.235 + .if bpp > 0 1.236 + .if bpp*pix_per_block > 256 1.237 + /* Calculations are more complex if more than one fetch per block */ 1.238 + PF and, WK1, base, #31 1.239 + PF add, WK1, WK1, WK0, lsl #bpp_shift 1.240 + PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) 1.241 + PF bic, SCRATCH, base, #31 1.242 +80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 1.243 + PF add, SCRATCH, SCRATCH, #32 1.244 + PF subs, WK1, WK1, #32 1.245 + PF bhi, 80b 1.246 + .else 1.247 + /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ 1.248 + PF mov, SCRATCH, base, lsl #32-5 1.249 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift 1.250 + PF adceqs, SCRATCH, SCRATCH, #0 1.251 + /* The instruction above has two effects: ensures Z is only 1.252 + * set if C was clear (so Z indicates that both shifted 
quantities
+ * were 0), and clears C if Z was set (so C indicates that the sum
+ * of the shifted quantities was greater than, and not equal to, 32) */
+        PF  beq,    82f
+        PF  bic,    SCRATCH, base, #31
+        PF  bcc,    81f
+        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
+81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+82:
+ .endif
+ .endif
+.endm
+
+
+.macro preload_line  narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ * code path rather than the "medium" one - because in the narrow case,
+ * the row of pixels is known to output no more than 30 bytes, so
+ * (assuming the source pixels are no wider than the destination
+ * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ * meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ * destination) that's being preloaded, or 0 if this channel is not used
+ * for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+  .if narrow_case && (bpp <= dst_w_bpp)
+        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, LSL #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    90f
+        PF  pld,    [WK1]
+90:
+  .else
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, lsl #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    92f
+91:     PF  add,    WK0, WK0, #32
+        PF  cmp,    WK0, WK1
+        PF  pld,    [WK0]
+        PF  bne,    91b
+92:
+  .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+        sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+        process_tail  cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+  .ifc cond,mi
+        bpl     100f
+  .endif
+  .ifc cond,cs
+        bcc     100f
+  .endif
+  .ifc cond,ne
+        beq     100f
+  .endif
+        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+        /* Can't interleave reads and writes */
+        test
+        conditional_process1  cond1, process_head,
process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx 1.343 + .if (flags) & FLAG_PROCESS_CORRUPTS_PSR 1.344 + test 1.345 + .endif 1.346 + conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx 1.347 + .else 1.348 + /* Can interleave reads and writes for better scheduling */ 1.349 + test 1.350 + process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 1.351 + process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 1.352 + .if decrementx 1.353 + sub&cond1 X, X, #8*numbytes1/dst_w_bpp 1.354 + sub&cond2 X, X, #8*numbytes2/dst_w_bpp 1.355 + .endif 1.356 + process_tail cond1, numbytes1, firstreg1 1.357 + process_tail cond2, numbytes2, firstreg2 1.358 + pixst cond1, numbytes1, firstreg1, DST 1.359 + pixst cond2, numbytes2, firstreg2, DST 1.360 + .endif 1.361 +.endm 1.362 + 1.363 + 1.364 +.macro test_bits_1_0_ptr 1.365 + movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ 1.366 +.endm 1.367 + 1.368 +.macro test_bits_3_2_ptr 1.369 + movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ 1.370 +.endm 1.371 + 1.372 +.macro leading_15bytes process_head, process_tail 1.373 + /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ 1.374 + /* Use unaligned loads in all cases for simplicity */ 1.375 + .if dst_w_bpp == 8 1.376 + conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1 1.377 + .elseif dst_w_bpp == 16 1.378 + test_bits_1_0_ptr 1.379 + conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1 1.380 + .endif 1.381 + conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1 1.382 +.endm 1.383 + 1.384 +.macro test_bits_3_2_pix 1.385 + movs SCRATCH, X, lsl #dst_bpp_shift+32-3 1.386 +.endm 1.387 + 1.388 +.macro test_bits_1_0_pix 1.389 + .if dst_w_bpp == 8 1.390 + movs SCRATCH, X, lsl #dst_bpp_shift+32-1 1.391 + .else 1.392 + movs SCRATCH, X, lsr #1 1.393 + .endif 1.394 +.endm 1.395 + 1.396 +.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 1.397 + conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 1.398 + .if dst_w_bpp == 16 1.399 + test_bits_1_0_pix 1.400 + conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 1.401 + .elseif dst_w_bpp == 8 1.402 + conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 1.403 + .endif 1.404 +.endm 1.405 + 1.406 + 1.407 +.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment 1.408 +110: 1.409 + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ 1.410 + .rept pix_per_block*dst_w_bpp/128 1.411 + process_head , 16, 0, unaligned_src, unaligned_mask, 1 1.412 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 1.413 + preload_middle src_bpp, SRC, 1 1.414 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 1.415 + preload_middle mask_bpp, MASK, 1 1.416 + .else 1.417 + preload_middle src_bpp, SRC, 0 1.418 + preload_middle mask_bpp, MASK, 0 1.419 + .endif 1.420 + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) 1.421 + /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that 1.422 + * destination prefetches are 32-byte aligned. 
It's also the easiest channel to offset 1.423 + * preloads for, to achieve staggered prefetches for multiple channels, because there are 1.424 + * always two STMs per prefetch, so there is always an opposite STM on which to put the 1.425 + * preload. Note, no need to BIC the base register here */ 1.426 + PF pld, [DST, #32*prefetch_distance - dst_alignment] 1.427 + .endif 1.428 + process_tail , 16, 0 1.429 + .if !((flags) & FLAG_PROCESS_DOES_STORE) 1.430 + pixst , 16, 0, DST 1.431 + .endif 1.432 + .set SUBBLOCK, SUBBLOCK+1 1.433 + .endr 1.434 + subs X, X, #pix_per_block 1.435 + bhs 110b 1.436 +.endm 1.437 + 1.438 +.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask 1.439 + /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ 1.440 + .if dst_r_bpp > 0 1.441 + tst DST, #16 1.442 + bne 111f 1.443 + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 1.444 + b 112f 1.445 +111: 1.446 + .endif 1.447 + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 1.448 +112: 1.449 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ 1.450 + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) 1.451 + PF and, WK0, X, #pix_per_block-1 1.452 + .endif 1.453 + preload_trailing src_bpp, src_bpp_shift, SRC 1.454 + preload_trailing mask_bpp, mask_bpp_shift, MASK 1.455 + preload_trailing dst_r_bpp, dst_bpp_shift, DST 1.456 + add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp 1.457 + /* The remainder of the line is handled identically to the medium case */ 1.458 + medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask 1.459 +.endm 1.460 + 1.461 +.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 1.462 +120: 1.463 + process_head , 16, 0, unaligned_src, unaligned_mask, 0 1.464 + process_tail , 16, 0 1.465 + .if !((flags) & FLAG_PROCESS_DOES_STORE) 1.466 + pixst , 16, 0, DST 1.467 + .endif 1.468 + subs X, X, #128/dst_w_bpp 1.469 + bhs 120b 1.470 + /* Trailing pixels */ 1.471 + tst X, #128/dst_w_bpp - 1 1.472 + beq exit_label 1.473 + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 1.474 +.endm 1.475 + 1.476 +.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 1.477 + tst X, #16*8/dst_w_bpp 1.478 + conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 1.479 + /* Trailing pixels */ 1.480 + /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ 1.481 + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 1.482 +.endm 1.483 + 1.484 +.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label 1.485 + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ 1.486 + .if mask_bpp == 8 || mask_bpp == 16 1.487 + tst MASK, #3 1.488 + bne 141f 1.489 + .endif 1.490 + .if src_bpp == 8 || src_bpp == 16 1.491 + tst SRC, #3 1.492 + bne 140f 1.493 + .endif 1.494 + action process_head, process_tail, process_inner_loop, exit_label, 0, 0 1.495 + .if src_bpp == 8 || src_bpp == 16 1.496 + b exit_label 1.497 +140: 1.498 + action process_head, 
process_tail, process_inner_loop, exit_label, 1, 0 1.499 + .endif 1.500 + .if mask_bpp == 8 || mask_bpp == 16 1.501 + b exit_label 1.502 +141: 1.503 + .if src_bpp == 8 || src_bpp == 16 1.504 + tst SRC, #3 1.505 + bne 142f 1.506 + .endif 1.507 + action process_head, process_tail, process_inner_loop, exit_label, 0, 1 1.508 + .if src_bpp == 8 || src_bpp == 16 1.509 + b exit_label 1.510 +142: 1.511 + action process_head, process_tail, process_inner_loop, exit_label, 1, 1 1.512 + .endif 1.513 + .endif 1.514 +.endm 1.515 + 1.516 + 1.517 +.macro end_of_line restore_x, vars_spilled, loop_label, last_one 1.518 + .if vars_spilled 1.519 + /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ 1.520 + /* This is ldmia sp,{} */ 1.521 + .word 0xE89D0000 | LINE_SAVED_REGS 1.522 + .endif 1.523 + subs Y, Y, #1 1.524 + .if vars_spilled 1.525 + .if (LINE_SAVED_REGS) & (1<<1) 1.526 + str Y, [sp] 1.527 + .endif 1.528 + .endif 1.529 + add DST, DST, STRIDE_D 1.530 + .if src_bpp > 0 1.531 + add SRC, SRC, STRIDE_S 1.532 + .endif 1.533 + .if mask_bpp > 0 1.534 + add MASK, MASK, STRIDE_M 1.535 + .endif 1.536 + .if restore_x 1.537 + mov X, ORIG_W 1.538 + .endif 1.539 + bhs loop_label 1.540 + .ifc "last_one","" 1.541 + .if vars_spilled 1.542 + b 197f 1.543 + .else 1.544 + b 198f 1.545 + .endif 1.546 + .else 1.547 + .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) 1.548 + b 198f 1.549 + .endif 1.550 + .endif 1.551 +.endm 1.552 + 1.553 + 1.554 +.macro generate_composite_function fname, \ 1.555 + src_bpp_, \ 1.556 + mask_bpp_, \ 1.557 + dst_w_bpp_, \ 1.558 + flags_, \ 1.559 + prefetch_distance_, \ 1.560 + init, \ 1.561 + newline, \ 1.562 + cleanup, \ 1.563 + process_head, \ 1.564 + process_tail, \ 1.565 + process_inner_loop 1.566 + 1.567 + .func fname 1.568 + .global fname 1.569 + /* For ELF format also set function visibility to hidden */ 1.570 +#ifdef __ELF__ 1.571 + .hidden fname 1.572 + .type fname, %function 1.573 +#endif 1.574 + 1.575 +/* 1.576 + * Make some macro arguments globally visible and accessible 1.577 + * from other macros 1.578 + */ 1.579 + .set src_bpp, src_bpp_ 1.580 + .set mask_bpp, mask_bpp_ 1.581 + .set dst_w_bpp, dst_w_bpp_ 1.582 + .set flags, flags_ 1.583 + .set prefetch_distance, prefetch_distance_ 1.584 + 1.585 +/* 1.586 + * Select prefetch type for this function. 
1.587 + */ 1.588 + .if prefetch_distance == 0 1.589 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 1.590 + .else 1.591 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD 1.592 + .endif 1.593 + 1.594 + .if src_bpp == 32 1.595 + .set src_bpp_shift, 2 1.596 + .elseif src_bpp == 24 1.597 + .set src_bpp_shift, 0 1.598 + .elseif src_bpp == 16 1.599 + .set src_bpp_shift, 1 1.600 + .elseif src_bpp == 8 1.601 + .set src_bpp_shift, 0 1.602 + .elseif src_bpp == 0 1.603 + .set src_bpp_shift, -1 1.604 + .else 1.605 + .error "requested src bpp (src_bpp) is not supported" 1.606 + .endif 1.607 + 1.608 + .if mask_bpp == 32 1.609 + .set mask_bpp_shift, 2 1.610 + .elseif mask_bpp == 24 1.611 + .set mask_bpp_shift, 0 1.612 + .elseif mask_bpp == 8 1.613 + .set mask_bpp_shift, 0 1.614 + .elseif mask_bpp == 0 1.615 + .set mask_bpp_shift, -1 1.616 + .else 1.617 + .error "requested mask bpp (mask_bpp) is not supported" 1.618 + .endif 1.619 + 1.620 + .if dst_w_bpp == 32 1.621 + .set dst_bpp_shift, 2 1.622 + .elseif dst_w_bpp == 24 1.623 + .set dst_bpp_shift, 0 1.624 + .elseif dst_w_bpp == 16 1.625 + .set dst_bpp_shift, 1 1.626 + .elseif dst_w_bpp == 8 1.627 + .set dst_bpp_shift, 0 1.628 + .else 1.629 + .error "requested dst bpp (dst_w_bpp) is not supported" 1.630 + .endif 1.631 + 1.632 + .if (((flags) & FLAG_DST_READWRITE) != 0) 1.633 + .set dst_r_bpp, dst_w_bpp 1.634 + .else 1.635 + .set dst_r_bpp, 0 1.636 + .endif 1.637 + 1.638 + .set pix_per_block, 16*8/dst_w_bpp 1.639 + .if src_bpp != 0 1.640 + .if 32*8/src_bpp > pix_per_block 1.641 + .set pix_per_block, 32*8/src_bpp 1.642 + .endif 1.643 + .endif 1.644 + .if mask_bpp != 0 1.645 + .if 32*8/mask_bpp > pix_per_block 1.646 + .set pix_per_block, 32*8/mask_bpp 1.647 + .endif 1.648 + .endif 1.649 + .if dst_r_bpp != 0 1.650 + .if 32*8/dst_r_bpp > pix_per_block 1.651 + .set pix_per_block, 32*8/dst_r_bpp 1.652 + .endif 1.653 + .endif 1.654 + 1.655 +/* The standard entry conditions set up by pixman-arm-common.h are: 1.656 + * r0 = width (pixels) 1.657 + * r1 = height (rows) 1.658 + * r2 = pointer to top-left pixel of destination 1.659 + * r3 = destination stride (pixels) 1.660 + * [sp] = source pixel value, or pointer to top-left pixel of source 1.661 + * [sp,#4] = 0 or source stride (pixels) 1.662 + * The following arguments are unused for non-mask operations 1.663 + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask 1.664 + * [sp,#12] = 0 or mask stride (pixels) 1.665 + */ 1.666 + 1.667 +/* 1.668 + * Assign symbolic names to registers 1.669 + */ 1.670 + X .req r0 /* pixels to go on this line */ 1.671 + Y .req r1 /* lines to go */ 1.672 + DST .req r2 /* destination pixel pointer */ 1.673 + STRIDE_D .req r3 /* destination stride (bytes, minus width) */ 1.674 + SRC .req r4 /* source pixel pointer */ 1.675 + STRIDE_S .req r5 /* source stride (bytes, minus width) */ 1.676 + MASK .req r6 /* mask pixel pointer (if applicable) */ 1.677 + STRIDE_M .req r7 /* mask stride (bytes, minus width) */ 1.678 + WK0 .req r8 /* pixel data registers */ 1.679 + WK1 .req r9 1.680 + WK2 .req r10 1.681 + WK3 .req r11 1.682 + SCRATCH .req r12 1.683 + ORIG_W .req r14 /* width (pixels) */ 1.684 + 1.685 +fname: 1.686 + .fnstart 1.687 + .save {r4-r11, lr} 1.688 + push {r4-r11, lr} /* save all registers */ 1.689 + 1.690 + subs Y, Y, #1 1.691 + blo 199f 1.692 + 1.693 +#ifdef DEBUG_PARAMS 1.694 + .pad #9*4 1.695 + sub sp, sp, #9*4 1.696 +#endif 1.697 + 1.698 + .if src_bpp > 0 1.699 + ldr SRC, [sp, #ARGS_STACK_OFFSET] 1.700 + ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] 1.701 + .endif 
1.702 + .if mask_bpp > 0 1.703 + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] 1.704 + ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] 1.705 + .endif 1.706 + 1.707 +#ifdef DEBUG_PARAMS 1.708 + add Y, Y, #1 1.709 + stmia sp, {r0-r7,pc} 1.710 + sub Y, Y, #1 1.711 +#endif 1.712 + 1.713 + init 1.714 + 1.715 + lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ 1.716 + sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift 1.717 + .if src_bpp > 0 1.718 + lsl STRIDE_S, #src_bpp_shift 1.719 + sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift 1.720 + .endif 1.721 + .if mask_bpp > 0 1.722 + lsl STRIDE_M, #mask_bpp_shift 1.723 + sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift 1.724 + .endif 1.725 + 1.726 + /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ 1.727 + cmp X, #2*16*8/dst_w_bpp - 1 1.728 + blo 170f 1.729 + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ 1.730 + /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ 1.731 + cmp X, #(prefetch_distance+3)*pix_per_block - 1 1.732 + blo 160f 1.733 + 1.734 + /* Wide case */ 1.735 + /* Adjust X so that the decrement instruction can also test for 1.736 + * inner loop termination. We want it to stop when there are 1.737 + * (prefetch_distance+1) complete blocks to go. */ 1.738 + sub X, X, #(prefetch_distance+2)*pix_per_block 1.739 + mov ORIG_W, X 1.740 + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE 1.741 + /* This is stmdb sp!,{} */ 1.742 + .word 0xE92D0000 | LINE_SAVED_REGS 1.743 + .endif 1.744 +151: /* New line */ 1.745 + newline 1.746 + preload_leading_step1 src_bpp, WK1, SRC 1.747 + preload_leading_step1 mask_bpp, WK2, MASK 1.748 + preload_leading_step1 dst_r_bpp, WK3, DST 1.749 + 1.750 + tst DST, #15 1.751 + beq 154f 1.752 + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ 1.753 + .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp) 1.754 + PF and, WK0, WK0, #15 1.755 + .endif 1.756 + 1.757 + preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC 1.758 + preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK 1.759 + preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST 1.760 + 1.761 + leading_15bytes process_head, process_tail 1.762 + 1.763 +154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ 1.764 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 1.765 + and SCRATCH, SRC, #31 1.766 + rsb SCRATCH, SCRATCH, #32*prefetch_distance 1.767 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 1.768 + and SCRATCH, MASK, #31 1.769 + rsb SCRATCH, SCRATCH, #32*prefetch_distance 1.770 + .endif 1.771 + .ifc "process_inner_loop","" 1.772 + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f 1.773 + .else 1.774 + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f 1.775 + .endif 1.776 + 1.777 +157: /* Check for another line */ 1.778 + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b 1.779 + .endif 1.780 + 1.781 + .ltorg 1.782 + 1.783 +160: /* Medium case */ 1.784 + mov ORIG_W, X 1.785 + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 1.786 + /* This is stmdb sp!,{} */ 1.787 + .word 0xE92D0000 | LINE_SAVED_REGS 1.788 + .endif 1.789 +161: /* New line */ 1.790 + newline 1.791 + preload_line 0, 
src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 1.792 + preload_line 0, mask_bpp, mask_bpp_shift, MASK 1.793 + preload_line 0, dst_r_bpp, dst_bpp_shift, DST 1.794 + 1.795 + sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ 1.796 + tst DST, #15 1.797 + beq 164f 1.798 + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ 1.799 + 1.800 + leading_15bytes process_head, process_tail 1.801 + 1.802 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ 1.803 + switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f 1.804 + 1.805 +167: /* Check for another line */ 1.806 + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b 1.807 + 1.808 + .ltorg 1.809 + 1.810 +170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ 1.811 + .if dst_w_bpp < 32 1.812 + mov ORIG_W, X 1.813 + .endif 1.814 + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 1.815 + /* This is stmdb sp!,{} */ 1.816 + .word 0xE92D0000 | LINE_SAVED_REGS 1.817 + .endif 1.818 +171: /* New line */ 1.819 + newline 1.820 + preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 1.821 + preload_line 1, mask_bpp, mask_bpp_shift, MASK 1.822 + preload_line 1, dst_r_bpp, dst_bpp_shift, DST 1.823 + 1.824 + .if dst_w_bpp == 8 1.825 + tst DST, #3 1.826 + beq 174f 1.827 +172: subs X, X, #1 1.828 + blo 177f 1.829 + process_head , 1, 0, 1, 1, 0 1.830 + process_tail , 1, 0 1.831 + .if !((flags) & FLAG_PROCESS_DOES_STORE) 1.832 + pixst , 1, 0, DST 1.833 + .endif 1.834 + tst DST, #3 1.835 + bne 172b 1.836 + .elseif dst_w_bpp == 16 1.837 + tst DST, #2 1.838 + beq 174f 1.839 + subs X, X, #1 1.840 + blo 177f 1.841 + process_head , 2, 0, 1, 1, 0 1.842 + process_tail , 2, 0 1.843 + .if !((flags) & FLAG_PROCESS_DOES_STORE) 1.844 + pixst , 2, 0, DST 1.845 + .endif 1.846 + .endif 1.847 + 1.848 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ 1.849 + switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f 1.850 + 1.851 +177: /* Check for another line */ 1.852 + end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one 1.853 + 1.854 +197: 1.855 + .if (flags) & FLAG_SPILL_LINE_VARS 1.856 + add sp, sp, #LINE_SAVED_REG_COUNT*4 1.857 + .endif 1.858 +198: 1.859 + cleanup 1.860 + 1.861 +#ifdef DEBUG_PARAMS 1.862 + add sp, sp, #9*4 /* junk the debug copy of arguments */ 1.863 +#endif 1.864 +199: 1.865 + pop {r4-r11, pc} /* exit */ 1.866 + .fnend 1.867 + 1.868 + .ltorg 1.869 + 1.870 + .unreq X 1.871 + .unreq Y 1.872 + .unreq DST 1.873 + .unreq STRIDE_D 1.874 + .unreq SRC 1.875 + .unreq STRIDE_S 1.876 + .unreq MASK 1.877 + .unreq STRIDE_M 1.878 + .unreq WK0 1.879 + .unreq WK1 1.880 + .unreq WK2 1.881 + .unreq WK3 1.882 + .unreq SCRATCH 1.883 + .unreq ORIG_W 1.884 + .endfunc 1.885 +.endm 1.886 + 1.887 +.macro line_saved_regs x:vararg 1.888 + .set LINE_SAVED_REGS, 0 1.889 + .set LINE_SAVED_REG_COUNT, 0 1.890 + .irp SAVED_REG,x 1.891 + .ifc "SAVED_REG","Y" 1.892 + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) 1.893 + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1.894 + .endif 1.895 + .ifc "SAVED_REG","STRIDE_D" 1.896 + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) 1.897 + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1.898 + .endif 1.899 + .ifc "SAVED_REG","STRIDE_S" 1.900 + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) 1.901 + .set LINE_SAVED_REG_COUNT, 
LINE_SAVED_REG_COUNT + 1 1.902 + .endif 1.903 + .ifc "SAVED_REG","STRIDE_M" 1.904 + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) 1.905 + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1.906 + .endif 1.907 + .ifc "SAVED_REG","ORIG_W" 1.908 + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) 1.909 + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1.910 + .endif 1.911 + .endr 1.912 +.endm 1.913 + 1.914 +.macro nop_macro x:vararg 1.915 +.endm
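
[Reviewer illustration, not part of the patch: the leading_bytes / inner_loop_offset / extra_needed calculation that the comment at the top of preload_leading_step2 describes, written out in plain C. The assembly arrives at the same answer using shifts and condition flags; the function name and example operands below are hypothetical.]

    #include <stdint.h>
    #include <stdio.h>

    /* How many extra cachelines (0, 1 or 2) the leading pixels of a source/mask
     * channel occupy beyond what the inner loop's rounded-down preloads cover. */
    static unsigned extra_leading_preloads(uintptr_t dst, uintptr_t src,
                                           unsigned src_bpp, unsigned dst_bpp)
    {
        unsigned leading_bytes     = (unsigned)(-(intptr_t)dst & 15) * src_bpp / dst_bpp;
        unsigned inner_loop_offset = (unsigned)((src + leading_bytes) & 31);
        int      extra_needed      = (int)leading_bytes - (int)inner_loop_offset;

        if (extra_needed <= 0)  return 0;  /* leading pixels already covered */
        if (extra_needed <= 32) return 1;  /* one extra cacheline to preload */
        return 2;                          /* only reachable when src_bpp == 4*dst_bpp */
    }

    int main(void)
    {
        /* e.g. an a8r8g8b8 source composited onto an r5g6b5 destination */
        printf("%u\n", extra_leading_preloads(0x1006, 0x2010, 32, 16));
        return 0;
    }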
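[Also not part of the patch: a C rendering of the entry conditions and stride setup in generate_composite_function. pixman hands these functions strides measured in pixels; the prologue converts them to bytes and subtracts the row width, so that after a row has been processed (with the pointers left just past its last pixel) a single add of STRIDE_D/STRIDE_S/STRIDE_M lands on the next row. The function below is a hypothetical 32bpp copy, using uint32_t pointers so the bpp shift disappears.]

    #include <stdint.h>
    #include <stddef.h>

    static void copy_8888_8888(uint32_t *dst, const uint32_t *src,
                               int width, int height,
                               int dst_stride_px, int src_stride_px)
    {
        /* Equivalent of STRIDE_D/STRIDE_S after the prologue: stride minus width */
        ptrdiff_t stride_d = dst_stride_px - width;
        ptrdiff_t stride_s = src_stride_px - width;

        while (height--) {
            for (int x = 0; x < width; x++)
                *dst++ = *src++;     /* pointers advance as pixels are processed... */
            dst += stride_d;         /* ...so only the reduced stride is added to   */
            src += stride_s;         /*    reach the first pixel of the next row    */
        }
    }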