gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

Thu, 22 Jan 2015 13:21:57 +0100

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright © 2012 Raspberry Pi Foundation
michael@0 3 * Copyright © 2012 RISC OS Open Ltd
michael@0 4 *
michael@0 5 * Permission to use, copy, modify, distribute, and sell this software and its
michael@0 6 * documentation for any purpose is hereby granted without fee, provided that
michael@0 7 * the above copyright notice appear in all copies and that both that
michael@0 8 * copyright notice and this permission notice appear in supporting
michael@0 9 * documentation, and that the name of the copyright holders not be used in
michael@0 10 * advertising or publicity pertaining to distribution of the software without
michael@0 11 * specific, written prior permission. The copyright holders make no
michael@0 12 * representations about the suitability of this software for any purpose. It
michael@0 13 * is provided "as is" without express or implied warranty.
michael@0 14 *
michael@0 15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
michael@0 16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
michael@0 17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
michael@0 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
michael@0 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
michael@0 20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
michael@0 21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
michael@0 22 * SOFTWARE.
michael@0 23 *
michael@0 24 * Author: Ben Avison (bavison@riscosopen.org)
michael@0 25 *
michael@0 26 */
michael@0 27
michael@0 28 /* Prevent the stack from becoming executable */
michael@0 29 #if defined(__linux__) && defined(__ELF__)
michael@0 30 .section .note.GNU-stack,"",%progbits
michael@0 31 #endif
michael@0 32
michael@0 33 .text
michael@0 34 .arch armv6
michael@0 35 .object_arch armv4
michael@0 36 .arm
michael@0 37 .altmacro
michael@0 38 .p2align 2
michael@0 39
michael@0 40 #include "pixman-arm-simd-asm.h"
michael@0 41
michael@0 42 /* A head macro should do all processing which results in an output of up to
michael@0 43 * 16 bytes, as far as the final load instruction. The corresponding tail macro
michael@0 44 * should complete the processing of the up-to-16 bytes. The calling macro will
michael@0 45 * sometimes choose to insert a preload or a decrement of X between them.
michael@0 46 * cond ARM condition code for code block
michael@0 47 * numbytes Number of output bytes that should be generated this time
michael@0 48 * firstreg First WK register in which to place output
michael@0 49 * unaligned_src Whether to use non-wordaligned loads of source image
michael@0 50 * unaligned_mask Whether to use non-wordaligned loads of mask image
michael@0 51 * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
michael@0 52 */
michael@0 53
michael@0 54 .macro blit_init
michael@0 55 line_saved_regs STRIDE_D, STRIDE_S
michael@0 56 .endm
michael@0 57
michael@0 58 .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 59 pixld cond, numbytes, firstreg, SRC, unaligned_src
michael@0 60 .endm
michael@0 61
michael@0 62 .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
michael@0 63 WK4 .req STRIDE_D
michael@0 64 WK5 .req STRIDE_S
michael@0 65 WK6 .req MASK
michael@0 66 WK7 .req STRIDE_M
michael@0 67 110: pixld , 16, 0, SRC, unaligned_src
michael@0 68 pixld , 16, 4, SRC, unaligned_src
michael@0 69 pld [SRC, SCRATCH]
michael@0 70 pixst , 16, 0, DST
michael@0 71 pixst , 16, 4, DST
michael@0 72 subs X, X, #32*8/src_bpp
michael@0 73 bhs 110b
michael@0 74 .unreq WK4
michael@0 75 .unreq WK5
michael@0 76 .unreq WK6
michael@0 77 .unreq WK7
michael@0 78 .endm
michael@0 79
michael@0 80 generate_composite_function \
michael@0 81 pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
michael@0 82 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
michael@0 83 4, /* prefetch distance */ \
michael@0 84 blit_init, \
michael@0 85 nop_macro, /* newline */ \
michael@0 86 nop_macro, /* cleanup */ \
michael@0 87 blit_process_head, \
michael@0 88 nop_macro, /* process tail */ \
michael@0 89 blit_inner_loop
michael@0 90
michael@0 91 generate_composite_function \
michael@0 92 pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
michael@0 93 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
michael@0 94 4, /* prefetch distance */ \
michael@0 95 blit_init, \
michael@0 96 nop_macro, /* newline */ \
michael@0 97 nop_macro, /* cleanup */ \
michael@0 98 blit_process_head, \
michael@0 99 nop_macro, /* process tail */ \
michael@0 100 blit_inner_loop
michael@0 101
michael@0 102 generate_composite_function \
michael@0 103 pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
michael@0 104 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
michael@0 105 3, /* prefetch distance */ \
michael@0 106 blit_init, \
michael@0 107 nop_macro, /* newline */ \
michael@0 108 nop_macro, /* cleanup */ \
michael@0 109 blit_process_head, \
michael@0 110 nop_macro, /* process tail */ \
michael@0 111 blit_inner_loop
michael@0 112
michael@0 113 /******************************************************************************/
michael@0 114
michael@0 115 .macro src_n_8888_init
michael@0 116 ldr SRC, [sp, #ARGS_STACK_OFFSET]
michael@0 117 mov STRIDE_S, SRC
michael@0 118 mov MASK, SRC
michael@0 119 mov STRIDE_M, SRC
michael@0 120 .endm
michael@0 121
michael@0 122 .macro src_n_0565_init
michael@0 123 ldrh SRC, [sp, #ARGS_STACK_OFFSET]
michael@0 124 orr SRC, SRC, lsl #16
michael@0 125 mov STRIDE_S, SRC
michael@0 126 mov MASK, SRC
michael@0 127 mov STRIDE_M, SRC
michael@0 128 .endm
michael@0 129
michael@0 130 .macro src_n_8_init
michael@0 131 ldrb SRC, [sp, #ARGS_STACK_OFFSET]
michael@0 132 orr SRC, SRC, lsl #8
michael@0 133 orr SRC, SRC, lsl #16
michael@0 134 mov STRIDE_S, SRC
michael@0 135 mov MASK, SRC
michael@0 136 mov STRIDE_M, SRC
michael@0 137 .endm
michael@0 138
michael@0 139 .macro fill_process_tail cond, numbytes, firstreg
michael@0 140 WK4 .req SRC
michael@0 141 WK5 .req STRIDE_S
michael@0 142 WK6 .req MASK
michael@0 143 WK7 .req STRIDE_M
michael@0 144 pixst cond, numbytes, 4, DST
michael@0 145 .unreq WK4
michael@0 146 .unreq WK5
michael@0 147 .unreq WK6
michael@0 148 .unreq WK7
michael@0 149 .endm
michael@0 150
michael@0 151 generate_composite_function \
michael@0 152 pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
michael@0 153 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
michael@0 154 0, /* prefetch distance doesn't apply */ \
michael@0 155 src_n_8888_init \
michael@0 156 nop_macro, /* newline */ \
michael@0 157 nop_macro /* cleanup */ \
michael@0 158 nop_macro /* process head */ \
michael@0 159 fill_process_tail
michael@0 160
michael@0 161 generate_composite_function \
michael@0 162 pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
michael@0 163 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
michael@0 164 0, /* prefetch distance doesn't apply */ \
michael@0 165 src_n_0565_init \
michael@0 166 nop_macro, /* newline */ \
michael@0 167 nop_macro /* cleanup */ \
michael@0 168 nop_macro /* process head */ \
michael@0 169 fill_process_tail
michael@0 170
michael@0 171 generate_composite_function \
michael@0 172 pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
michael@0 173 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
michael@0 174 0, /* prefetch distance doesn't apply */ \
michael@0 175 src_n_8_init \
michael@0 176 nop_macro, /* newline */ \
michael@0 177 nop_macro /* cleanup */ \
michael@0 178 nop_macro /* process head */ \
michael@0 179 fill_process_tail
michael@0 180
michael@0 181 /******************************************************************************/
michael@0 182
michael@0 183 .macro src_x888_8888_pixel, cond, reg
michael@0 184 orr&cond WK&reg, WK&reg, #0xFF000000
michael@0 185 .endm
michael@0 186
michael@0 187 .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 188 pixld cond, numbytes, firstreg, SRC, unaligned_src
michael@0 189 .endm
michael@0 190
michael@0 191 .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
michael@0 192 src_x888_8888_pixel cond, %(firstreg+0)
michael@0 193 .if numbytes >= 8
michael@0 194 src_x888_8888_pixel cond, %(firstreg+1)
michael@0 195 .if numbytes == 16
michael@0 196 src_x888_8888_pixel cond, %(firstreg+2)
michael@0 197 src_x888_8888_pixel cond, %(firstreg+3)
michael@0 198 .endif
michael@0 199 .endif
michael@0 200 .endm
michael@0 201
michael@0 202 generate_composite_function \
michael@0 203 pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
michael@0 204 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
michael@0 205 3, /* prefetch distance */ \
michael@0 206 nop_macro, /* init */ \
michael@0 207 nop_macro, /* newline */ \
michael@0 208 nop_macro, /* cleanup */ \
michael@0 209 pixman_composite_src_x888_8888_process_head, \
michael@0 210 pixman_composite_src_x888_8888_process_tail
michael@0 211
michael@0 212 /******************************************************************************/
michael@0 213
michael@0 214 .macro src_0565_8888_init
michael@0 215 /* Hold loop invariants in MASK and STRIDE_M */
michael@0 216 ldr MASK, =0x07E007E0
michael@0 217 mov STRIDE_M, #0xFF000000
michael@0 218 /* Set GE[3:0] to 1010 so SEL instructions do what we want */
michael@0 219 ldr SCRATCH, =0x80008000
michael@0 220 uadd8 SCRATCH, SCRATCH, SCRATCH
michael@0 221 .endm
michael@0 222
michael@0 223 .macro src_0565_8888_2pixels, reg1, reg2
michael@0 224 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
michael@0 225 bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
michael@0 226 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
michael@0 227 mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
michael@0 228 mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
michael@0 229 bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
michael@0 230 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
michael@0 231 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
michael@0 232 pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
michael@0 233 sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
michael@0 234 mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
michael@0 235 pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
michael@0 236 sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
michael@0 237 orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
michael@0 238 orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
michael@0 239 .endm
michael@0 240
michael@0 241 /* This version doesn't need STRIDE_M, but is one instruction longer.
michael@0 242 It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
michael@0 243 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
michael@0 244 bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
michael@0 245 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
michael@0 246 mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
michael@0 247 mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
michael@0 248 bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
michael@0 249 mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
michael@0 250 mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
michael@0 251 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
michael@0 252 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
michael@0 253 pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
michael@0 254 pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
michael@0 255 sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
michael@0 256 sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
michael@0 257 orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
michael@0 258 orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
michael@0 259 */
michael@0 260
michael@0 261 .macro src_0565_8888_1pixel, reg
michael@0 262 bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
michael@0 263 and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
michael@0 264 mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
michael@0 265 mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
michael@0 266 orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
michael@0 267 orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
michael@0 268 pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
michael@0 269 sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
michael@0 270 orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
michael@0 271 .endm
michael@0 272
michael@0 273 .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 274 .if numbytes == 16
michael@0 275 pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
michael@0 276 .elseif numbytes == 8
michael@0 277 pixld , 4, firstreg, SRC, unaligned_src
michael@0 278 .elseif numbytes == 4
michael@0 279 pixld , 2, firstreg, SRC, unaligned_src
michael@0 280 .endif
michael@0 281 .endm
michael@0 282
michael@0 283 .macro src_0565_8888_process_tail cond, numbytes, firstreg
michael@0 284 .if numbytes == 16
michael@0 285 src_0565_8888_2pixels firstreg, %(firstreg+1)
michael@0 286 src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
michael@0 287 .elseif numbytes == 8
michael@0 288 src_0565_8888_2pixels firstreg, %(firstreg+1)
michael@0 289 .else
michael@0 290 src_0565_8888_1pixel firstreg
michael@0 291 .endif
michael@0 292 .endm
michael@0 293
michael@0 294 generate_composite_function \
michael@0 295 pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
michael@0 296 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
michael@0 297 3, /* prefetch distance */ \
michael@0 298 src_0565_8888_init, \
michael@0 299 nop_macro, /* newline */ \
michael@0 300 nop_macro, /* cleanup */ \
michael@0 301 src_0565_8888_process_head, \
michael@0 302 src_0565_8888_process_tail
michael@0 303
michael@0 304 /******************************************************************************/
michael@0 305
michael@0 306 .macro add_8_8_8pixels cond, dst1, dst2
michael@0 307 uqadd8&cond WK&dst1, WK&dst1, MASK
michael@0 308 uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
michael@0 309 .endm
michael@0 310
michael@0 311 .macro add_8_8_4pixels cond, dst
michael@0 312 uqadd8&cond WK&dst, WK&dst, MASK
michael@0 313 .endm
michael@0 314
michael@0 315 .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 316 WK4 .req MASK
michael@0 317 WK5 .req STRIDE_M
michael@0 318 .if numbytes == 16
michael@0 319 pixld cond, 8, 4, SRC, unaligned_src
michael@0 320 pixld cond, 16, firstreg, DST, 0
michael@0 321 add_8_8_8pixels cond, firstreg, %(firstreg+1)
michael@0 322 pixld cond, 8, 4, SRC, unaligned_src
michael@0 323 .else
michael@0 324 pixld cond, numbytes, 4, SRC, unaligned_src
michael@0 325 pixld cond, numbytes, firstreg, DST, 0
michael@0 326 .endif
michael@0 327 .unreq WK4
michael@0 328 .unreq WK5
michael@0 329 .endm
michael@0 330
michael@0 331 .macro add_8_8_process_tail cond, numbytes, firstreg
michael@0 332 .if numbytes == 16
michael@0 333 add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
michael@0 334 .elseif numbytes == 8
michael@0 335 add_8_8_8pixels cond, firstreg, %(firstreg+1)
michael@0 336 .else
michael@0 337 add_8_8_4pixels cond, firstreg
michael@0 338 .endif
michael@0 339 .endm
michael@0 340
michael@0 341 generate_composite_function \
michael@0 342 pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
michael@0 343 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
michael@0 344 2, /* prefetch distance */ \
michael@0 345 nop_macro, /* init */ \
michael@0 346 nop_macro, /* newline */ \
michael@0 347 nop_macro, /* cleanup */ \
michael@0 348 add_8_8_process_head, \
michael@0 349 add_8_8_process_tail
michael@0 350
michael@0 351 /******************************************************************************/
michael@0 352
michael@0 353 .macro over_8888_8888_init
michael@0 354 /* Hold loop invariant in MASK */
michael@0 355 ldr MASK, =0x00800080
michael@0 356 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
michael@0 357 uadd8 SCRATCH, MASK, MASK
michael@0 358 line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
michael@0 359 .endm
michael@0 360
michael@0 361 .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 362 WK4 .req STRIDE_D
michael@0 363 WK5 .req STRIDE_S
michael@0 364 WK6 .req STRIDE_M
michael@0 365 WK7 .req ORIG_W
michael@0 366 pixld , numbytes, %(4+firstreg), SRC, unaligned_src
michael@0 367 pixld , numbytes, firstreg, DST, 0
michael@0 368 .unreq WK4
michael@0 369 .unreq WK5
michael@0 370 .unreq WK6
michael@0 371 .unreq WK7
michael@0 372 .endm
michael@0 373
michael@0 374 .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
michael@0 375 /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
michael@0 376 teq WK&reg0, #0
michael@0 377 .if numbytes > 4
michael@0 378 teqeq WK&reg1, #0
michael@0 379 .if numbytes > 8
michael@0 380 teqeq WK&reg2, #0
michael@0 381 teqeq WK&reg3, #0
michael@0 382 .endif
michael@0 383 .endif
michael@0 384 .endm
michael@0 385
michael@0 386 .macro over_8888_8888_prepare next
michael@0 387 mov WK&next, WK&next, lsr #24
michael@0 388 .endm
michael@0 389
michael@0 390 .macro over_8888_8888_1pixel src, dst, offset, next
michael@0 391 /* src = destination component multiplier */
michael@0 392 rsb WK&src, WK&src, #255
michael@0 393 /* Split even/odd bytes of dst into SCRATCH/dst */
michael@0 394 uxtb16 SCRATCH, WK&dst
michael@0 395 uxtb16 WK&dst, WK&dst, ror #8
michael@0 396 /* Multiply through, adding 0.5 to the upper byte of result for rounding */
michael@0 397 mla SCRATCH, SCRATCH, WK&src, MASK
michael@0 398 mla WK&dst, WK&dst, WK&src, MASK
michael@0 399 /* Where we would have had a stall between the result of the first MLA and the shifter input,
michael@0 400 * reload the complete source pixel */
michael@0 401 ldr WK&src, [SRC, #offset]
michael@0 402 /* Multiply by 257/256 to approximate 256/255 */
michael@0 403 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
michael@0 404 /* In this stall, start processing the next pixel */
michael@0 405 .if offset < -4
michael@0 406 mov WK&next, WK&next, lsr #24
michael@0 407 .endif
michael@0 408 uxtab16 WK&dst, WK&dst, WK&dst, ror #8
michael@0 409 /* Recombine even/odd bytes of multiplied destination */
michael@0 410 mov SCRATCH, SCRATCH, ror #8
michael@0 411 sel WK&dst, SCRATCH, WK&dst
michael@0 412 /* Saturated add of source to multiplied destination */
michael@0 413 uqadd8 WK&dst, WK&dst, WK&src
michael@0 414 .endm
michael@0 415
michael@0 416 .macro over_8888_8888_process_tail cond, numbytes, firstreg
michael@0 417 WK4 .req STRIDE_D
michael@0 418 WK5 .req STRIDE_S
michael@0 419 WK6 .req STRIDE_M
michael@0 420 WK7 .req ORIG_W
michael@0 421 over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
michael@0 422 beq 10f
michael@0 423 over_8888_8888_prepare %(4+firstreg)
michael@0 424 .set PROCESS_REG, firstreg
michael@0 425 .set PROCESS_OFF, -numbytes
michael@0 426 .rept numbytes / 4
michael@0 427 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
michael@0 428 .set PROCESS_REG, PROCESS_REG+1
michael@0 429 .set PROCESS_OFF, PROCESS_OFF+4
michael@0 430 .endr
michael@0 431 pixst , numbytes, firstreg, DST
michael@0 432 10:
michael@0 433 .unreq WK4
michael@0 434 .unreq WK5
michael@0 435 .unreq WK6
michael@0 436 .unreq WK7
michael@0 437 .endm
michael@0 438
michael@0 439 generate_composite_function \
michael@0 440 pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
michael@0 441 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
michael@0 442 2, /* prefetch distance */ \
michael@0 443 over_8888_8888_init, \
michael@0 444 nop_macro, /* newline */ \
michael@0 445 nop_macro, /* cleanup */ \
michael@0 446 over_8888_8888_process_head, \
michael@0 447 over_8888_8888_process_tail
michael@0 448
michael@0 449 /******************************************************************************/
michael@0 450
michael@0 451 /* Multiply each byte of a word by a byte.
michael@0 452 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
michael@0 453 * word Register containing 4 bytes
michael@0 454 * byte Register containing byte multiplier (bits 8-31 must be 0)
michael@0 455 * tmp Scratch register
michael@0 456 * half Register containing the constant 0x00800080
michael@0 457 * GE[3:0] bits must contain 0101
michael@0 458 */
michael@0 459 .macro mul_8888_8 word, byte, tmp, half
michael@0 460 /* Split even/odd bytes of word apart */
michael@0 461 uxtb16 tmp, word
michael@0 462 uxtb16 word, word, ror #8
michael@0 463 /* Multiply bytes together with rounding, then by 257/256 */
michael@0 464 mla tmp, tmp, byte, half
michael@0 465 mla word, word, byte, half /* 1 stall follows */
michael@0 466 uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
michael@0 467 uxtab16 word, word, word, ror #8
michael@0 468 /* Recombine bytes */
michael@0 469 mov tmp, tmp, ror #8
michael@0 470 sel word, tmp, word
michael@0 471 .endm
michael@0 472
michael@0 473 /******************************************************************************/
michael@0 474
michael@0 475 .macro over_8888_n_8888_init
michael@0 476 /* Mask is constant */
michael@0 477 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
michael@0 478 /* Hold loop invariant in STRIDE_M */
michael@0 479 ldr STRIDE_M, =0x00800080
michael@0 480 /* We only want the alpha bits of the constant mask */
michael@0 481 mov MASK, MASK, lsr #24
michael@0 482 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
michael@0 483 uadd8 SCRATCH, STRIDE_M, STRIDE_M
michael@0 484 line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
michael@0 485 .endm
michael@0 486
michael@0 487 .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 488 WK4 .req Y
michael@0 489 WK5 .req STRIDE_D
michael@0 490 WK6 .req STRIDE_S
michael@0 491 WK7 .req ORIG_W
michael@0 492 pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
michael@0 493 pixld , numbytes, firstreg, DST, 0
michael@0 494 .unreq WK4
michael@0 495 .unreq WK5
michael@0 496 .unreq WK6
michael@0 497 .unreq WK7
michael@0 498 .endm
michael@0 499
michael@0 500 .macro over_8888_n_8888_1pixel src, dst
michael@0 501 mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
michael@0 502 sub WK7, WK6, WK&src, lsr #24
michael@0 503 mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
michael@0 504 uqadd8 WK&dst, WK&dst, WK&src
michael@0 505 .endm
michael@0 506
michael@0 507 .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
michael@0 508 WK4 .req Y
michael@0 509 WK5 .req STRIDE_D
michael@0 510 WK6 .req STRIDE_S
michael@0 511 WK7 .req ORIG_W
michael@0 512 over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
michael@0 513 beq 10f
michael@0 514 mov WK6, #255
michael@0 515 .set PROCESS_REG, firstreg
michael@0 516 .rept numbytes / 4
michael@0 517 .if numbytes == 16 && PROCESS_REG == 2
michael@0 518 /* We're using WK6 and WK7 as temporaries, so half way through
michael@0 519 * 4 pixels, reload the second two source pixels but this time
michael@0 520 * into WK4 and WK5 */
michael@0 521 ldmdb SRC, {WK4, WK5}
michael@0 522 .endif
michael@0 523 over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
michael@0 524 .set PROCESS_REG, PROCESS_REG+1
michael@0 525 .endr
michael@0 526 pixst , numbytes, firstreg, DST
michael@0 527 10:
michael@0 528 .unreq WK4
michael@0 529 .unreq WK5
michael@0 530 .unreq WK6
michael@0 531 .unreq WK7
michael@0 532 .endm
michael@0 533
michael@0 534 generate_composite_function \
michael@0 535 pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
michael@0 536 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
michael@0 537 2, /* prefetch distance */ \
michael@0 538 over_8888_n_8888_init, \
michael@0 539 nop_macro, /* newline */ \
michael@0 540 nop_macro, /* cleanup */ \
michael@0 541 over_8888_n_8888_process_head, \
michael@0 542 over_8888_n_8888_process_tail
michael@0 543
michael@0 544 /******************************************************************************/
michael@0 545
michael@0 546 .macro over_n_8_8888_init
michael@0 547 /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
michael@0 548 ldr SRC, [sp, #ARGS_STACK_OFFSET]
michael@0 549 /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
michael@0 550 ldr SCRATCH, =0x00800080
michael@0 551 uxtb16 STRIDE_S, SRC
michael@0 552 uxtb16 SRC, SRC, ror #8
michael@0 553 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
michael@0 554 uadd8 SCRATCH, SCRATCH, SCRATCH
michael@0 555 line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
michael@0 556 .endm
michael@0 557
michael@0 558 .macro over_n_8_8888_newline
michael@0 559 ldr STRIDE_D, =0x00800080
michael@0 560 b 1f
michael@0 561 .ltorg
michael@0 562 1:
michael@0 563 .endm
michael@0 564
michael@0 565 .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
michael@0 566 WK4 .req STRIDE_M
michael@0 567 pixld , numbytes/4, 4, MASK, unaligned_mask
michael@0 568 pixld , numbytes, firstreg, DST, 0
michael@0 569 .unreq WK4
michael@0 570 .endm
michael@0 571
michael@0 572 .macro over_n_8_8888_1pixel src, dst
michael@0 573 uxtb Y, WK4, ror #src*8
michael@0 574 /* Trailing part of multiplication of source */
michael@0 575 mla SCRATCH, STRIDE_S, Y, STRIDE_D
michael@0 576 mla Y, SRC, Y, STRIDE_D
michael@0 577 mov ORIG_W, #255
michael@0 578 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
michael@0 579 uxtab16 Y, Y, Y, ror #8
michael@0 580 mov SCRATCH, SCRATCH, ror #8
michael@0 581 sub ORIG_W, ORIG_W, Y, lsr #24
michael@0 582 sel Y, SCRATCH, Y
michael@0 583 /* Then multiply the destination */
michael@0 584 mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
michael@0 585 uqadd8 WK&dst, WK&dst, Y
michael@0 586 .endm
michael@0 587
michael@0 588 .macro over_n_8_8888_process_tail cond, numbytes, firstreg
michael@0 589 WK4 .req STRIDE_M
michael@0 590 teq WK4, #0
michael@0 591 beq 10f
michael@0 592 .set PROCESS_REG, firstreg
michael@0 593 .rept numbytes / 4
michael@0 594 over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
michael@0 595 .set PROCESS_REG, PROCESS_REG+1
michael@0 596 .endr
michael@0 597 pixst , numbytes, firstreg, DST
michael@0 598 10:
michael@0 599 .unreq WK4
michael@0 600 .endm
michael@0 601
michael@0 602 generate_composite_function \
michael@0 603 pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
michael@0 604 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
michael@0 605 2, /* prefetch distance */ \
michael@0 606 over_n_8_8888_init, \
michael@0 607 over_n_8_8888_newline, \
michael@0 608 nop_macro, /* cleanup */ \
michael@0 609 over_n_8_8888_process_head, \
michael@0 610 over_n_8_8888_process_tail
michael@0 611
michael@0 612 /******************************************************************************/
michael@0 613

mercurial