/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

/* Init hook for the plain copy (blit) operations: passes STRIDE_D and STRIDE_S
 * to line_saved_regs (defined in pixman-arm-simd-asm.h; presumably this lets
 * the inner loop reuse those registers as WK4/WK5 - confirm against the header).
 */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

/* Head for a plain copy: the only work is the load from SRC. The matching tail
 * in the instantiations below is nop_macro.
 */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

/* Custom inner loop for plain copies. WK4-WK7 are temporarily aliased onto
 * STRIDE_D, STRIDE_S, MASK and STRIDE_M so that two 16-byte pixld/pixst pairs
 * (register groups starting at 0 and at 4) move 32 bytes per iteration.
 * X is decremented by the number of pixels in 32 bytes (32*8/src_bpp) and the
 * loop repeats while the subtraction does not borrow (bhs).
 * process_head, process_tail, unaligned_mask and dst_alignment are accepted
 * but not referenced in this body.
 */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Instantiation argument order, as used throughout this file: function name,
 * three bit-depth values (presumably source, mask, destination bpp - confirm
 * against generate_composite_function in the header), flags, prefetch
 * distance, then the init / newline / cleanup / process head / process tail
 * macros and an optional custom inner loop.
 */
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

/* Solid fill, 32bpp: load the colour word from the stack argument area and
 * copy it into STRIDE_S, MASK and STRIDE_M, so that four registers hold the
 * same word for fill_process_tail to store.
 */
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Solid fill, 16bpp: load the colour halfword and replicate it into both
 * halves of the word before copying it to the other three registers.
 */
.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Solid fill, 8bpp: load the colour byte and replicate it into all four bytes
 * of the word (first to 16 bits, then to 32) before copying it to the other
 * three registers.
 */
.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Tail for the solid fills: alias WK4-WK7 onto the four registers prepared by
 * the init macros and store numbytes starting from register 4.
 * Note that firstreg is ignored; the store always starts at WK4.
 */
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

/******************************************************************************/

/* Force the top byte of work register WK<reg> to 0xFF (conditionally on cond),
 * i.e. turn an x888 pixel into an opaque 8888 pixel.
 */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm

/* Head for x888 -> 8888: just load the source pixels. */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

/* Tail for x888 -> 8888: set the alpha byte of every pixel loaded by the head.
 * One register is processed for 4 bytes, two for 8 bytes and four for 16 bytes.
 */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

/* Init for r5g6b5 -> a8r8g8b8 conversion. Clobbers SCRATCH and the flags'
 * GE bits; leaves the green-field mask in MASK and the opaque alpha constant
 * in STRIDE_M.
 */
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

/* Expand the two 0565 pixels packed in WK<reg1> into two 8888 pixels:
 * the low-halfword pixel ends up in WK<reg1>, the high-halfword pixel in
 * WK<reg2>. Uses SCRATCH as a temporary and relies on MASK, STRIDE_M and the
 * GE bits as set up by src_0565_8888_init. The bit diagrams show the register
 * contents after each instruction (upper case = high pixel, lower case = low).
 */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

/* Expand the single 0565 pixel in the low halfword of WK<reg> into an 8888
 * pixel in the same register. Uses SCRATCH as a temporary and relies on MASK
 * and the GE bits as set up by src_0565_8888_init.
 */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

/* Head for 0565 -> 8888. numbytes counts OUTPUT bytes, so only half as many
 * source bytes are loaded. For 16 output bytes the two source words go to
 * WK<firstreg> and WK<firstreg+2>, leaving the odd registers free for the
 * second pixel of each pair. cond is not used (loads are unconditional).
 */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm

/* Tail for 0565 -> 8888: expand the loaded source words in place, two pixels
 * per source word (or a single pixel for the 4-byte case).
 */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

/* Saturating per-byte add of eight source bytes (held in MASK and STRIDE_M,
 * which the head aliases as WK4/WK5) into WK<dst1> and WK<dst2>.
 */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond  WK&dst1, WK&dst1, MASK
        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
.endm

/* Saturating per-byte add of four source bytes (held in MASK) into WK<dst>. */
.macro add_8_8_4pixels  cond, dst
        uqadd8&cond  WK&dst, WK&dst, MASK
.endm

/* Head for ADD 8 -> 8. Source bytes are loaded into WK4/WK5 (aliased onto
 * MASK/STRIDE_M) and destination bytes into WK<firstreg>.. For the 16-byte
 * case only 8 source bytes fit at a time, so the first half is added here and
 * the second 8 source bytes are loaded for the tail to add.
 */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

/* Tail for ADD 8 -> 8: perform the additions the head left outstanding
 * (the second half of a 16-byte group, or the whole of a smaller group).
 */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

/* Init for OVER 8888 -> 8888. Clobbers SCRATCH; leaves the rounding constant
 * in MASK and the GE bits set for the SEL instructions in the pixel macro.
 */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

/* Head for OVER 8888 -> 8888: load source pixels into WK<4+firstreg>.. and
 * destination pixels into WK<firstreg>.. . WK4-WK7 are aliased onto STRIDE_D,
 * STRIDE_S, STRIDE_M and ORIG_W for the duration of the macro. cond is unused.
 */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Leave the Z flag set only if every source pixel in the group is zero.
 * One register is tested for up to 4 bytes, two for up to 8, four otherwise;
 * each later test runs only if the previous ones compared equal.
 */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm

/* Reduce the source pixel in WK<next> to its alpha byte, ready for
 * over_8888_8888_1pixel.
 */
.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm

/* Composite one pixel.
 *   src    WK index holding the source alpha (already shifted down by
 *          over_8888_8888_prepare or by the previous invocation)
 *   dst    WK index holding the destination pixel; receives the result
 *   offset Offset from SRC at which the full source pixel can be reloaded
 *   next   WK index of the following source pixel, pre-shifted here when
 *          offset < -4 (i.e. when another pixel follows in this group)
 * Uses SCRATCH as a temporary; relies on MASK = 0x00800080 and GE = 0101.
 */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm

/* Tail for OVER 8888 -> 8888. Skips both the blend and the store (branch to
 * local label 10) when all source pixels in the group are zero; otherwise
 * blends numbytes/4 pixels one at a time and stores the result to DST.
 * Corrupts the flags. cond is unused.
 */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare  %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 *   word  Register containing 4 bytes
 *   byte  Register containing byte multiplier (bits 8-31 must be 0)
 *   tmp   Scratch register
 *   half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/

/* Init for OVER 8888 x constant-mask -> 8888. Clobbers SCRATCH; leaves the
 * mask alpha (0-255) in MASK, 0x00800080 in STRIDE_M and GE = 0101.
 */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

/* Head for OVER 8888 x n -> 8888: load source pixels starting at
 * WK<4+(firstreg%2)> and destination pixels starting at WK<firstreg>.
 * WK4-WK7 are aliased onto Y, STRIDE_D, STRIDE_S and ORIG_W. cond is unused.
 */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Composite one pixel: scale the source in WK<src> by the mask alpha, derive
 * the destination multiplier (WK6 minus the scaled source alpha; the tail sets
 * WK6 to 255) in WK7, scale the destination in WK<dst> by it, then add with
 * saturation. Must be expanded while WK6/WK7 aliases are in force; uses
 * SCRATCH as a temporary.
 */
.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm
/* Tail for OVER 8888 x n -> 8888. Skips both the blend and the store (branch
 * to local label 10) when all source pixels in the group are zero; otherwise
 * blends numbytes/4 pixels and stores the result to DST. WK6 is loaded with
 * 255 and, together with WK7, serves as a temporary for the per-pixel macro.
 * Corrupts the flags. cond is unused.
 */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

/* Init for OVER constant-source x a8 mask -> 8888. Clobbers SCRATCH; leaves
 * the even bytes of the source colour in STRIDE_S, the odd bytes in SRC, and
 * GE = 0101.
 */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

/* Newline hook: reload the rounding constant 0x00800080 into STRIDE_D, then
 * branch over the literal pool that .ltorg emits at this point.
 */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

/* Head for OVER n x 8 -> 8888: load numbytes/4 mask bytes (one per output
 * pixel) into WK4, aliased onto STRIDE_M, and the destination pixels into
 * WK<firstreg>.. . cond is unused.
 */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm

/* Composite one pixel.
 *   src  Index (0-3) of the mask byte within WK4 to use
 *   dst  WK index holding the destination pixel; receives the result
 * The constant source (split across STRIDE_S / SRC) is scaled by the mask
 * byte into Y, the destination is scaled by 255 minus the resulting alpha,
 * and the two are added with saturation. Uses SCRATCH, Y and ORIG_W as
 * temporaries; relies on STRIDE_D = 0x00800080 and GE = 0101. Must be expanded
 * while the WK4 alias is in force.
 */
.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm

/* Tail for OVER n x 8 -> 8888. Skips both the blend and the store (branch to
 * local label 10) when the loaded mask word is zero; otherwise blends
 * numbytes/4 pixels and stores the result to DST. Corrupts the flags.
 * cond is unused.
 */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/