gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,613 @@
     1.4 +/*
     1.5 + * Copyright © 2012 Raspberry Pi Foundation
     1.6 + * Copyright © 2012 RISC OS Open Ltd
     1.7 + *
     1.8 + * Permission to use, copy, modify, distribute, and sell this software and its
     1.9 + * documentation for any purpose is hereby granted without fee, provided that
    1.10 + * the above copyright notice appear in all copies and that both that
    1.11 + * copyright notice and this permission notice appear in supporting
    1.12 + * documentation, and that the name of the copyright holders not be used in
    1.13 + * advertising or publicity pertaining to distribution of the software without
    1.14 + * specific, written prior permission.  The copyright holders make no
    1.15 + * representations about the suitability of this software for any purpose.  It
    1.16 + * is provided "as is" without express or implied warranty.
    1.17 + *
    1.18 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    1.19 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.20 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    1.21 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    1.22 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    1.23 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    1.24 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    1.25 + * SOFTWARE.
    1.26 + *
    1.27 + * Author:  Ben Avison (bavison@riscosopen.org)
    1.28 + *
    1.29 + */
    1.30 +
    1.31 +/* Prevent the stack from becoming executable */
    1.32 +#if defined(__linux__) && defined(__ELF__)
    1.33 +.section .note.GNU-stack,"",%progbits
    1.34 +#endif
    1.35 +
    1.36 +	.text
    1.37 +	.arch armv6 /* accept ARMv6 instructions (uxtb16, sel, uqadd8, pkhtb are used below) */
    1.38 +	.object_arch armv4 /* but record armv4 as the architecture in the object file attributes */
    1.39 +	.arm /* ARM (not Thumb) instruction encoding */
    1.40 +	.altmacro /* alternate macro mode: needed for the %(expr) argument evaluation used throughout */
    1.41 +	.p2align 2 /* align to 2^2 = 4 bytes */
    1.42 +
    1.43 +#include "pixman-arm-simd-asm.h"
    1.44 +
    1.45 +/* A head macro should do all processing which results in an output of up to
    1.46 + * 16 bytes, as far as the final load instruction. The corresponding tail macro
    1.47 + * should complete the processing of the up-to-16 bytes. The calling macro will
    1.48 + * sometimes choose to insert a preload or a decrement of X between them.
    1.49 + *   cond           ARM condition code for code block
    1.50 + *   numbytes       Number of output bytes that should be generated this time
    1.51 + *   firstreg       First WK register in which to place output
    1.52 + *   unaligned_src  Whether to use non-wordaligned loads of source image
    1.53 + *   unaligned_mask Whether to use non-wordaligned loads of mask image
    1.54 + *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
    1.55 + */
    1.56 +
    1.57 +.macro blit_init /* init hook shared by the three plain-copy functions below */
    1.58 +        line_saved_regs STRIDE_D, STRIDE_S /* presumably declares these as per-line saved registers, since blit_inner_loop reuses them as WK4/WK5 -- confirm in pixman-arm-simd-asm.h */
    1.59 +.endm
    1.60 +
    1.61 +.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head for plain copies: only a load; the matching tail slot is nop_macro in the instantiations below */
    1.62 +        pixld   cond, numbytes, firstreg, SRC, unaligned_src /* load numbytes source bytes into WK registers starting at firstreg; unaligned_mask and preload are not used */
    1.63 +.endm
    1.64 +
    1.65 +.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment /* custom inner loop for plain copies, 32 bytes per iteration; only unaligned_src is referenced */
    1.66 +    WK4     .req    STRIDE_D /* alias four extra registers as WK4-WK7 so that eight work registers are available */
    1.67 +    WK5     .req    STRIDE_S
    1.68 +    WK6     .req    MASK
    1.69 +    WK7     .req    STRIDE_M
    1.70 +110:    pixld   , 16, 0, SRC, unaligned_src /* empty cond = unconditional; 16 bytes into WK0 onwards */
    1.71 +        pixld   , 16, 4, SRC, unaligned_src /* next 16 bytes into WK4 onwards */
    1.72 +        pld     [SRC, SCRATCH] /* prefetch at SRC + SCRATCH; SCRATCH is set up outside this macro */
    1.73 +        pixst   , 16, 0, DST
    1.74 +        pixst   , 16, 4, DST
    1.75 +        subs    X, X, #32*8/src_bpp /* subtract the number of pixels contained in 32 bytes */
    1.76 +        bhs     110b /* repeat while the subtraction did not borrow */
    1.77 +    .unreq  WK4 /* release the aliases so other macros can redefine WK4-WK7 */
    1.78 +    .unreq  WK5
    1.79 +    .unreq  WK6
    1.80 +    .unreq  WK7
    1.81 +.endm
    1.82 +
    1.83 +generate_composite_function /* instantiate the 32bpp plain copy from the blit_* hooks (generator macro comes from the included header) */ \
    1.84 +    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
    1.85 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    1.86 +    4, /* prefetch distance */ \
    1.87 +    blit_init, \
    1.88 +    nop_macro, /* newline */ \
    1.89 +    nop_macro, /* cleanup */ \
    1.90 +    blit_process_head, \
    1.91 +    nop_macro, /* process tail */ \
    1.92 +    blit_inner_loop /* custom inner loop; a trailing argument the later instantiations omit */
    1.93 +
    1.94 +generate_composite_function /* instantiate the 16bpp plain copy from the same blit_* hooks */ \
    1.95 +    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
    1.96 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    1.97 +    4, /* prefetch distance */ \
    1.98 +    blit_init, \
    1.99 +    nop_macro, /* newline */ \
   1.100 +    nop_macro, /* cleanup */ \
   1.101 +    blit_process_head, \
   1.102 +    nop_macro, /* process tail */ \
   1.103 +    blit_inner_loop /* custom inner loop */
   1.104 +
   1.105 +generate_composite_function /* instantiate the 8bpp plain copy from the same blit_* hooks */ \
   1.106 +    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.107 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.108 +    3, /* prefetch distance */ \
   1.109 +    blit_init, \
   1.110 +    nop_macro, /* newline */ \
   1.111 +    nop_macro, /* cleanup */ \
   1.112 +    blit_process_head, \
   1.113 +    nop_macro, /* process tail */ \
   1.114 +    blit_inner_loop /* custom inner loop */
   1.115 +
   1.116 +/******************************************************************************/
   1.117 +
   1.118 +.macro src_n_8888_init /* init hook for the 32bpp solid fill: puts the same word in four registers */
   1.119 +        ldr     SRC, [sp, #ARGS_STACK_OFFSET] /* fetch a 32-bit stack argument, presumably the solid colour -- confirm against the C prototype */
   1.120 +        mov     STRIDE_S, SRC /* replicate it: fill_process_tail aliases SRC, STRIDE_S, MASK, STRIDE_M as WK4-WK7 */
   1.121 +        mov     MASK, SRC
   1.122 +        mov     STRIDE_M, SRC
   1.123 +.endm
   1.124 +
   1.125 +.macro src_n_0565_init /* init hook for the 16bpp solid fill: builds a word of two identical halfwords in four registers */
   1.126 +        ldrh    SRC, [sp, #ARGS_STACK_OFFSET] /* fetch a 16-bit stack argument, presumably the solid colour -- confirm against the C prototype */
   1.127 +        orr     SRC, SRC, lsl #16 /* two-operand shorthand: SRC |= SRC << 16, duplicating the halfword */
   1.128 +        mov     STRIDE_S, SRC /* replicate: fill_process_tail aliases these four registers as WK4-WK7 */
   1.129 +        mov     MASK, SRC
   1.130 +        mov     STRIDE_M, SRC
   1.131 +.endm
   1.132 +
   1.133 +.macro src_n_8_init /* init hook for the 8bpp solid fill: builds a word of four identical bytes in four registers */
   1.134 +        ldrb    SRC, [sp, #ARGS_STACK_OFFSET] /* fetch an 8-bit stack argument, presumably the solid value -- confirm against the C prototype */
   1.135 +        orr     SRC, SRC, lsl #8 /* SRC |= SRC << 8: byte now in bits 0-15 */
   1.136 +        orr     SRC, SRC, lsl #16 /* SRC |= SRC << 16: byte now in all four lanes */
   1.137 +        mov     STRIDE_S, SRC /* replicate: fill_process_tail aliases these four registers as WK4-WK7 */
   1.138 +        mov     MASK, SRC
   1.139 +        mov     STRIDE_M, SRC
   1.140 +.endm
   1.141 +
   1.142 +.macro fill_process_tail  cond, numbytes, firstreg /* tail for solid fills: stores the pre-filled registers; firstreg is ignored */
   1.143 +    WK4     .req    SRC /* the four registers loaded by the src_n_*_init macros */
   1.144 +    WK5     .req    STRIDE_S
   1.145 +    WK6     .req    MASK
   1.146 +    WK7     .req    STRIDE_M
   1.147 +        pixst   cond, numbytes, 4, DST /* always store from WK4 upwards, whatever firstreg was requested */
   1.148 +    .unreq  WK4
   1.149 +    .unreq  WK5
   1.150 +    .unreq  WK6
   1.151 +    .unreq  WK7
   1.152 +.endm
   1.153 +
   1.154 +generate_composite_function /* instantiate the 32bpp solid fill; arguments are comma-separated throughout, matching the copy instantiations above */ \
   1.155 +    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, /* presumably src, mask and dst bits per pixel (0 = solid/none) -- confirm in header */ \
   1.156 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.157 +    0, /* prefetch distance doesn't apply */ \
   1.158 +    src_n_8888_init, \
   1.159 +    nop_macro, /* newline */ \
   1.160 +    nop_macro, /* cleanup */ \
   1.161 +    nop_macro, /* process head */ \
   1.162 +    fill_process_tail
   1.163 +
   1.164 +generate_composite_function /* instantiate the 16bpp solid fill; arguments are comma-separated throughout, matching the copy instantiations above */ \
   1.165 +    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, /* presumably src, mask and dst bits per pixel (0 = solid/none) -- confirm in header */ \
   1.166 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.167 +    0, /* prefetch distance doesn't apply */ \
   1.168 +    src_n_0565_init, \
   1.169 +    nop_macro, /* newline */ \
   1.170 +    nop_macro, /* cleanup */ \
   1.171 +    nop_macro, /* process head */ \
   1.172 +    fill_process_tail
   1.173 +
   1.174 +generate_composite_function /* instantiate the 8bpp solid fill; arguments are comma-separated throughout, matching the copy instantiations above */ \
   1.175 +    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, /* presumably src, mask and dst bits per pixel (0 = solid/none) -- confirm in header */ \
   1.176 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.177 +    0, /* prefetch distance doesn't apply */ \
   1.178 +    src_n_8_init, \
   1.179 +    nop_macro, /* newline */ \
   1.180 +    nop_macro, /* cleanup */ \
   1.181 +    nop_macro, /* process head */ \
   1.182 +    fill_process_tail
   1.183 +
   1.184 +/******************************************************************************/
   1.185 +
   1.186 +.macro src_x888_8888_pixel, cond, reg /* convert one x8r8g8b8 pixel held in WK<reg> to a8r8g8b8 */
   1.187 +        orr&cond WK&reg, WK&reg, #0xFF000000 /* force the top byte to 0xFF; cond is pasted on as the condition-code suffix */
   1.188 +.endm
   1.189 +
   1.190 +.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head: load only; the top bytes are set in the tail */
   1.191 +        pixld   cond, numbytes, firstreg, SRC, unaligned_src /* numbytes source bytes into WK<firstreg> upwards; unaligned_mask and preload are not used */
   1.192 +.endm
   1.193 +
   1.194 +.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg /* tail: set the top byte of each loaded pixel (one pixel per 4 bytes) */
   1.195 +        src_x888_8888_pixel cond, %(firstreg+0) /* first pixel is always present; %(...) is evaluated to a number by altmacro mode */
   1.196 + .if numbytes >= 8 /* at least two pixels */
   1.197 +        src_x888_8888_pixel cond, %(firstreg+1)
   1.198 +  .if numbytes == 16 /* four pixels */
   1.199 +        src_x888_8888_pixel cond, %(firstreg+2)
   1.200 +        src_x888_8888_pixel cond, %(firstreg+3)
   1.201 +  .endif
   1.202 + .endif
   1.203 +.endm
   1.204 +
   1.205 +generate_composite_function /* instantiate the x8r8g8b8 -> a8r8g8b8 copy; no custom inner loop is supplied */ \
   1.206 +    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.207 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.208 +    3, /* prefetch distance */ \
   1.209 +    nop_macro, /* init */ \
   1.210 +    nop_macro, /* newline */ \
   1.211 +    nop_macro, /* cleanup */ \
   1.212 +    pixman_composite_src_x888_8888_process_head, \
   1.213 +    pixman_composite_src_x888_8888_process_tail
   1.214 +
   1.215 +/******************************************************************************/
   1.216 +
   1.217 +.macro src_0565_8888_init /* one-time setup for the r5g6b5 -> a8r8g8b8 conversion macros that follow */
   1.218 +        /* Hold loop invariants in MASK and STRIDE_M */
   1.219 +        ldr     MASK, =0x07E007E0 /* selects the 6-bit green field in each of two packed 565 pixels */
   1.220 +        mov     STRIDE_M, #0xFF000000 /* opaque top byte, ORed into the results by src_0565_8888_2pixels */
   1.221 +        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
   1.222 +        ldr     SCRATCH, =0x80008000
   1.223 +        uadd8   SCRATCH, SCRATCH, SCRATCH /* byte lanes 3 and 1 (0x80+0x80) carry out, lanes 2 and 0 do not; the conversion macros overwrite SCRATCH before reading it */
   1.224 +.endm
   1.225 +
   1.226 +.macro src_0565_8888_2pixels, reg1, reg2 /* expand two 565 pixels packed in WK<reg1>: low halfword -> WK<reg1>, high halfword -> WK<reg2>, both with top byte 0xFF; clobbers SCRATCH; needs MASK, STRIDE_M and GE=1010 from src_0565_8888_init */
   1.227 +        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
   1.228 +        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
   1.229 +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
   1.230 +        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
   1.231 +        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
   1.232 +        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
   1.233 +        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
   1.234 +        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
   1.235 +        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
   1.236 +        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
   1.237 +        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
   1.238 +        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
   1.239 +        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
   1.240 +        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
   1.241 +        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
   1.242 +.endm
   1.243 +
   1.244 +/* This version doesn't need STRIDE_M, but is one instruction longer.
   1.245 +   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
   1.246 +        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
   1.247 +        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
   1.248 +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
   1.249 +        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
   1.250 +        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
   1.251 +        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
   1.252 +        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
   1.253 +        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
   1.254 +        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
   1.255 +        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
   1.256 +        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
   1.257 +        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
   1.258 +        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
   1.259 +        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
   1.260 +        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
   1.261 +        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
   1.262 +*/
   1.263 +
   1.264 +.macro src_0565_8888_1pixel, reg /* expand one 565 pixel in the low halfword of WK<reg> to a8r8g8b8 in place; clobbers SCRATCH; needs MASK and GE=1010 from src_0565_8888_init */
   1.265 +        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
   1.266 +        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
   1.267 +        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
   1.268 +        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
   1.269 +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
   1.270 +        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
   1.271 +        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
   1.272 +        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
   1.273 +        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
   1.274 +.endm
   1.275 +
   1.276 +.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head: load numbytes/2 bytes of 16bpp source for numbytes bytes of 32bpp output; cond is not used, the loads are unconditional */
   1.277 + .if numbytes == 16
   1.278 +        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src /* 8 source bytes into WK<firstreg> and WK<firstreg+2>; the tail expands each into itself and the register after it */
   1.279 + .elseif numbytes == 8
   1.280 +        pixld   , 4, firstreg, SRC, unaligned_src /* two packed source pixels */
   1.281 + .elseif numbytes == 4
   1.282 +        pixld   , 2, firstreg, SRC, unaligned_src /* one source pixel */
   1.283 + .endif
   1.284 +.endm
   1.285 +
   1.286 +.macro src_0565_8888_process_tail   cond, numbytes, firstreg /* tail: expand the pixels loaded by the head; cond is not used */
   1.287 + .if numbytes == 16
   1.288 +        src_0565_8888_2pixels firstreg, %(firstreg+1) /* pair held in WK<firstreg> */
   1.289 +        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) /* pair held in WK<firstreg+2> */
   1.290 + .elseif numbytes == 8
   1.291 +        src_0565_8888_2pixels firstreg, %(firstreg+1)
   1.292 + .else
   1.293 +        src_0565_8888_1pixel firstreg /* any other size is handled as a single pixel */
   1.294 + .endif
   1.295 +.endm
   1.296 +
   1.297 +generate_composite_function /* instantiate the r5g6b5 -> a8r8g8b8 conversion */ \
   1.298 +    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.299 +    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, /* BRANCH_OVER rather than COND_EXEC: the process macros here ignore cond */ \
   1.300 +    3, /* prefetch distance */ \
   1.301 +    src_0565_8888_init, \
   1.302 +    nop_macro, /* newline */ \
   1.303 +    nop_macro, /* cleanup */ \
   1.304 +    src_0565_8888_process_head, \
   1.305 +    src_0565_8888_process_tail
   1.306 +
   1.307 +/******************************************************************************/
   1.308 +
   1.309 +.macro add_8_8_8pixels  cond, dst1, dst2 /* saturating per-byte add of eight source bytes (in MASK and STRIDE_M) into WK<dst1> and WK<dst2> */
   1.310 +        uqadd8&cond  WK&dst1, WK&dst1, MASK /* MASK and STRIDE_M hold source data here: add_8_8_process_head loads into them as WK4/WK5 */
   1.311 +        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
   1.312 +.endm
   1.313 +
   1.314 +.macro add_8_8_4pixels  cond, dst /* saturating per-byte add of up to four source bytes (in MASK) into WK<dst> */
   1.315 +        uqadd8&cond  WK&dst, WK&dst, MASK
   1.316 +.endm
   1.317 +
   1.318 +.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head for 8bpp ADD: loads source and destination; for 16 bytes it also adds the first half so the two source registers can be reused */
   1.319 +    WK4     .req    MASK /* source data is loaded into MASK and STRIDE_M via these aliases */
   1.320 +    WK5     .req    STRIDE_M
   1.321 + .if numbytes == 16
   1.322 +        pixld   cond, 8, 4, SRC, unaligned_src /* first 8 source bytes */
   1.323 +        pixld   cond, 16, firstreg, DST, 0 /* all 16 destination bytes (last argument 0, unlike the source loads) */
   1.324 +        add_8_8_8pixels cond, firstreg, %(firstreg+1) /* combine the first 8 bytes now */
   1.325 +        pixld   cond, 8, 4, SRC, unaligned_src /* second 8 source bytes, added by the tail */
   1.326 + .else
   1.327 +        pixld   cond, numbytes, 4, SRC, unaligned_src
   1.328 +        pixld   cond, numbytes, firstreg, DST, 0
   1.329 + .endif
   1.330 +    .unreq  WK4
   1.331 +    .unreq  WK5
   1.332 +.endm
   1.333 +
   1.334 +.macro add_8_8_process_tail  cond, numbytes, firstreg /* tail for 8bpp ADD: perform whatever additions the head left outstanding */
   1.335 + .if numbytes == 16
   1.336 +        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) /* second half only; the head already added the first half */
   1.337 + .elseif numbytes == 8
   1.338 +        add_8_8_8pixels cond, firstreg, %(firstreg+1)
   1.339 + .else
   1.340 +        add_8_8_4pixels cond, firstreg
   1.341 + .endif
   1.342 +.endm
   1.343 +
   1.344 +generate_composite_function /* instantiate the 8bpp saturating ADD */ \
   1.345 +    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.346 +    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
   1.347 +    2, /* prefetch distance */ \
   1.348 +    nop_macro, /* init */ \
   1.349 +    nop_macro, /* newline */ \
   1.350 +    nop_macro, /* cleanup */ \
   1.351 +    add_8_8_process_head, \
   1.352 +    add_8_8_process_tail
   1.353 +
   1.354 +/******************************************************************************/
   1.355 +
   1.356 +.macro over_8888_8888_init /* one-time setup for 32bpp OVER: rounding constant, GE flags, saved registers */
   1.357 +        /* Hold loop invariant in MASK */
   1.358 +        ldr     MASK, =0x00800080 /* 0x80 (one half) in each 16-bit lane, used as the MLA accumulator in over_8888_8888_1pixel */
   1.359 +        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   1.360 +        uadd8   SCRATCH, MASK, MASK /* byte lanes 2 and 0 (0x80+0x80) carry out; only the GE flags are needed from this */
   1.361 +        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W /* the process macros alias these as WK4, WK5 and WK7 */
   1.362 +.endm
   1.363 +
   1.364 +.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head for OVER: load source and destination pixels; cond is not used, the loads are unconditional */
   1.365 +    WK4     .req    STRIDE_D /* WK4-WK7 receive the source pixels */
   1.366 +    WK5     .req    STRIDE_S
   1.367 +    WK6     .req    STRIDE_M
   1.368 +    WK7     .req    ORIG_W
   1.369 +        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src /* source into WK<4+firstreg> upwards */
   1.370 +        pixld   , numbytes, firstreg, DST, 0 /* destination into WK<firstreg> upwards */
   1.371 +    .unreq  WK4
   1.372 +    .unreq  WK5
   1.373 +    .unreq  WK6
   1.374 +    .unreq  WK7
   1.375 +.endm
   1.376 +
   1.377 +.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3 /* leaves the Z flag set (EQ) only if every tested source word is zero; tests numbytes/4 of the given WK registers */
   1.378 +        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
   1.379 +        teq     WK&reg0, #0
   1.380 + .if numbytes > 4
   1.381 +        teqeq   WK&reg1, #0 /* each further test runs only while all earlier words were zero */
   1.382 +  .if numbytes > 8
   1.383 +        teqeq   WK&reg2, #0
   1.384 +        teqeq   WK&reg3, #0
   1.385 +  .endif
   1.386 + .endif
   1.387 +.endm
   1.388 +
   1.389 +.macro over_8888_8888_prepare  next /* reduce the source pixel in WK<next> to its top (alpha) byte, ready for over_8888_8888_1pixel */
   1.390 +        mov     WK&next, WK&next, lsr #24
   1.391 +.endm
   1.392 +
   1.393 +.macro over_8888_8888_1pixel src, dst, offset, next /* OVER for one pixel: WK<dst> = sat(src + dst*(255-alpha)/255). WK<src> must already hold just the source alpha; offset locates the full source pixel relative to SRC; WK<next> is the following source pixel. Clobbers SCRATCH; needs MASK=0x00800080 and GE=0101 */
   1.394 +        /* src = destination component multiplier */
   1.395 +        rsb     WK&src, WK&src, #255 /* 255 - alpha */
   1.396 +        /* Split even/odd bytes of dst into SCRATCH/dst */
   1.397 +        uxtb16  SCRATCH, WK&dst
   1.398 +        uxtb16  WK&dst, WK&dst, ror #8
   1.399 +        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
   1.400 +        mla     SCRATCH, SCRATCH, WK&src, MASK
   1.401 +        mla     WK&dst, WK&dst, WK&src, MASK
   1.402 +        /* Where we would have had a stall between the result of the first MLA and the shifter input,
   1.403 +         * reload the complete source pixel */
   1.404 +        ldr     WK&src, [SRC, #offset]
   1.405 +        /* Multiply by 257/256 to approximate 256/255 */
   1.406 +        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
   1.407 +        /* In this stall, start processing the next pixel */
   1.408 + .if offset < -4 /* false for the pixel at offset -4, i.e. the last one in the tail, so no register beyond the loaded pixels is touched */
   1.409 +        mov     WK&next, WK&next, lsr #24 /* same operation as over_8888_8888_prepare */
   1.410 + .endif
   1.411 +        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
   1.412 +        /* Recombine even/odd bytes of multiplied destination */
   1.413 +        mov     SCRATCH, SCRATCH, ror #8
   1.414 +        sel     WK&dst, SCRATCH, WK&dst /* with GE=0101, lanes 0 and 2 come from SCRATCH and lanes 1 and 3 from WK<dst> */
   1.415 +        /* Saturated add of source to multiplied destination */
   1.416 +        uqadd8  WK&dst, WK&dst, WK&src
   1.417 +.endm
   1.418 +
   1.419 +.macro over_8888_8888_process_tail  cond, numbytes, firstreg /* tail for OVER: skip fully transparent groups, otherwise blend each pixel and store; cond is not used */
   1.420 +    WK4     .req    STRIDE_D /* same aliases as in the head: WK4-WK7 hold the source pixels */
   1.421 +    WK5     .req    STRIDE_S
   1.422 +    WK6     .req    STRIDE_M
   1.423 +    WK7     .req    ORIG_W
   1.424 +        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
   1.425 +        beq     10f /* all source words zero: destination is unchanged, so skip the blend and the store */
   1.426 +        over_8888_8888_prepare  %(4+firstreg) /* extract alpha of the first pixel; later pixels are prepared inside the 1pixel macro */
   1.427 + .set PROCESS_REG, firstreg /* assembly-time counters for the unrolled loop below */
   1.428 + .set PROCESS_OFF, -numbytes /* source word offsets relative to SRC are negative, from -numbytes up to -4 */
   1.429 + .rept numbytes / 4 /* one expansion per 32-bit pixel */
   1.430 +        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
   1.431 +  .set PROCESS_REG, PROCESS_REG+1
   1.432 +  .set PROCESS_OFF, PROCESS_OFF+4
   1.433 + .endr
   1.434 +        pixst   , numbytes, firstreg, DST /* the store is done here, matching FLAG_PROCESS_DOES_STORE in the instantiation */
   1.435 +10:
   1.436 +    .unreq  WK4
   1.437 +    .unreq  WK5
   1.438 +    .unreq  WK6
   1.439 +    .unreq  WK7
   1.440 +.endm
   1.441 +
   1.442 +generate_composite_function /* instantiate 32bpp OVER; arguments are comma-separated throughout, matching the copy instantiations earlier in the file */ \
   1.443 +    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.444 +    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   1.445 +    2, /* prefetch distance */ \
   1.446 +    over_8888_8888_init, \
   1.447 +    nop_macro, /* newline */ \
   1.448 +    nop_macro, /* cleanup */ \
   1.449 +    over_8888_8888_process_head, \
   1.450 +    over_8888_8888_process_tail
   1.451 +
   1.452 +/******************************************************************************/
   1.453 +
   1.454 +/* Multiply each byte of a word by a byte.
   1.455 + * Useful when there aren't any obvious ways to fill the stalls with other instructions.
   1.456 + * word  Register containing 4 bytes
   1.457 + * byte  Register containing byte multiplier (bits 8-31 must be 0)
   1.458 + * tmp   Scratch register
   1.459 + * half  Register containing the constant 0x00800080
   1.460 + * GE[3:0] bits must contain 0101
   1.461 + */
   1.462 +.macro mul_8888_8  word, byte, tmp, half /* word = each byte of word times byte, divided by 255 with rounding; result replaces word, tmp is clobbered */
   1.463 +        /* Split even/odd bytes of word apart */
   1.464 +        uxtb16  tmp, word
   1.465 +        uxtb16  word, word, ror #8
   1.466 +        /* Multiply bytes together with rounding, then by 257/256 */
   1.467 +        mla     tmp, tmp, byte, half
   1.468 +        mla     word, word, byte, half /* 1 stall follows */
   1.469 +        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
   1.470 +        uxtab16 word, word, word, ror #8
   1.471 +        /* Recombine bytes */
   1.472 +        mov     tmp, tmp, ror #8
   1.473 +        sel     word, tmp, word /* with GE=0101, lanes 0 and 2 come from tmp and lanes 1 and 3 from word */
   1.474 +.endm
   1.475 +
   1.476 +/******************************************************************************/
   1.477 +
   1.478 +.macro over_8888_n_8888_init /* one-time setup for OVER with a constant mask: mask alpha in MASK, rounding constant in STRIDE_M, GE flags */
   1.479 +        /* Mask is constant */
   1.480 +        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8] /* stack argument two words after the one the solid-fill init macros read */
   1.481 +        /* Hold loop invariant in STRIDE_M */
   1.482 +        ldr     STRIDE_M, =0x00800080 /* passed as the half argument of mul_8888_8 */
   1.483 +        /* We only want the alpha bits of the constant mask */
   1.484 +        mov     MASK, MASK, lsr #24 /* leaves bits 8-31 clear, as mul_8888_8 requires of its byte argument */
   1.485 +        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   1.486 +        uadd8   SCRATCH, STRIDE_M, STRIDE_M
   1.487 +        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W /* the process macros alias these as WK4-WK7 */
   1.488 +.endm
   1.489 +
   1.490 +.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head: load source and destination pixels; cond is not used, the loads are unconditional */
   1.491 +    WK4     .req    Y
   1.492 +    WK5     .req    STRIDE_D
   1.493 +    WK6     .req    STRIDE_S
   1.494 +    WK7     .req    ORIG_W
   1.495 +        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src /* source starts at WK4 or WK5 according to the parity of firstreg, matching the indexing used in the tail */
   1.496 +        pixld   , numbytes, firstreg, DST, 0 /* destination into WK<firstreg> upwards */
   1.497 +    .unreq  WK4
   1.498 +    .unreq  WK5
   1.499 +    .unreq  WK6
   1.500 +    .unreq  WK7
   1.501 +.endm
   1.502 +
   1.503 +.macro over_8888_n_8888_1pixel src, dst /* one pixel of masked OVER: scale source by mask alpha, scale destination by (255 - resulting alpha), add with saturation. Expects WK6 = 255; clobbers WK7 and SCRATCH */
   1.504 +        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M /* source * mask alpha */
   1.505 +        sub     WK7, WK6, WK&src, lsr #24 /* WK7 = 255 - alpha of the masked source (WK6 is set to 255 by the process_tail) */
   1.506 +        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M /* destination * (255 - alpha) */
   1.507 +        uqadd8  WK&dst, WK&dst, WK&src
   1.508 +.endm
   1.509 +
   1.510 +.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg /* tail: skip fully transparent groups, otherwise blend each pixel and store; cond is not used */
   1.511 +    WK4     .req    Y
   1.512 +    WK5     .req    STRIDE_D
   1.513 +    WK6     .req    STRIDE_S
   1.514 +    WK7     .req    ORIG_W
   1.515 +        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) /* the last two registers are only tested when numbytes > 8 */
   1.516 +        beq     10f /* all source words zero: leave the destination untouched */
   1.517 +        mov     WK6, #255 /* constant used by over_8888_n_8888_1pixel; this overwrites any source pixel held in WK6 */
   1.518 + .set PROCESS_REG, firstreg /* assembly-time counter for the unrolled loop below */
   1.519 + .rept numbytes / 4 /* one expansion per 32-bit pixel */
   1.520 +  .if numbytes == 16 && PROCESS_REG == 2
   1.521 +        /* We're using WK6 and WK7 as temporaries, so half way through
   1.522 +         * 4 pixels, reload the second two source pixels but this time
   1.523 +         * into WK4 and WK5 */
   1.524 +        ldmdb   SRC, {WK4, WK5} /* the two words immediately below SRC */
   1.525 +  .endif
   1.526 +        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG) /* source alternates between WK4 and WK5 */
   1.527 +  .set PROCESS_REG, PROCESS_REG+1
   1.528 + .endr
   1.529 +        pixst   , numbytes, firstreg, DST /* the store is done here, matching FLAG_PROCESS_DOES_STORE in the instantiation */
   1.530 +10:
   1.531 +    .unreq  WK4
   1.532 +    .unreq  WK5
   1.533 +    .unreq  WK6
   1.534 +    .unreq  WK7
   1.535 +.endm
   1.536 +
   1.537 +generate_composite_function /* instantiate 32bpp OVER with a constant mask; arguments are comma-separated throughout, matching the copy instantiations earlier in the file */ \
   1.538 +    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, /* presumably src, mask and dst bits per pixel -- confirm in header */ \
   1.539 +    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   1.540 +    2, /* prefetch distance */ \
   1.541 +    over_8888_n_8888_init, \
   1.542 +    nop_macro, /* newline */ \
   1.543 +    nop_macro, /* cleanup */ \
   1.544 +    over_8888_n_8888_process_head, \
   1.545 +    over_8888_n_8888_process_tail
   1.546 +
   1.547 +/******************************************************************************/
   1.548 +
   1.549 +.macro over_n_8_8888_init /* one-time setup for solid-source OVER through an 8bpp mask: pre-split source colour and GE flags */
   1.550 +        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
   1.551 +        ldr     SRC, [sp, #ARGS_STACK_OFFSET] /* same stack slot the solid-fill init macros read */
   1.552 +        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
   1.553 +        ldr     SCRATCH, =0x00800080
   1.554 +        uxtb16  STRIDE_S, SRC /* even bytes (0 and 2) of the colour */
   1.555 +        uxtb16  SRC, SRC, ror #8 /* odd bytes (1 and 3) of the colour, replacing the original value */
   1.556 +        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
   1.557 +        uadd8   SCRATCH, SCRATCH, SCRATCH
   1.558 +        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W /* all four are overwritten by the newline and per-pixel macros below */
   1.559 +.endm
   1.560 +
   1.561 +.macro over_n_8_8888_newline /* newline hook: reload the rounding constant into STRIDE_D */
   1.562 +        ldr     STRIDE_D, =0x00800080 /* STRIDE_D serves as the half argument in over_n_8_8888_1pixel */
   1.563 +        b       1f /* jump over the literal pool emitted next */
   1.564 + .ltorg /* place pending literals here, in the middle of the code */
   1.565 +1:
   1.566 +.endm
   1.567 +
   1.568 +.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head: load mask bytes and destination pixels; cond is not used, the loads are unconditional */
   1.569 +    WK4     .req    STRIDE_M /* the mask bytes are collected in STRIDE_M */
   1.570 +        pixld   , numbytes/4, 4, MASK, unaligned_mask /* one mask byte per 4-byte destination pixel */
   1.571 +        pixld   , numbytes, firstreg, DST, 0 /* destination into WK<firstreg> upwards */
   1.572 +    .unreq  WK4
   1.573 +.endm
   1.574 +
   1.575 +.macro over_n_8_8888_1pixel src, dst /* one pixel: src is the index of the mask byte within WK4, dst the WK register holding the destination pixel. Clobbers Y, ORIG_W and SCRATCH; WK4 must be defined by the caller */
   1.576 +        uxtb    Y, WK4, ror #src*8 /* Y = mask byte number src */
   1.577 +        /* Trailing part of multiplication of source */
   1.578 +        mla     SCRATCH, STRIDE_S, Y, STRIDE_D /* even colour bytes * mask + 0x00800080 */
   1.579 +        mla     Y, SRC, Y, STRIDE_D /* odd colour bytes * mask + 0x00800080 */
   1.580 +        mov     ORIG_W, #255
   1.581 +        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 /* the 257/256 step, as in mul_8888_8 */
   1.582 +        uxtab16 Y, Y, Y, ror #8
   1.583 +        mov     SCRATCH, SCRATCH, ror #8
   1.584 +        sub     ORIG_W, ORIG_W, Y, lsr #24 /* 255 - alpha of the masked source, taken before Y is recombined */
   1.585 +        sel     Y, SCRATCH, Y /* Y = masked source pixel */
   1.586 +        /* Then multiply the destination */
   1.587 +        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
   1.588 +        uqadd8  WK&dst, WK&dst, Y
   1.589 +.endm
   1.590 +
   1.591 +.macro over_n_8_8888_process_tail  cond, numbytes, firstreg /* tail: skip groups whose loaded mask word is zero, otherwise blend each pixel and store; cond is not used */
   1.592 +    WK4     .req    STRIDE_M /* mask bytes loaded by the head; over_n_8_8888_1pixel reads WK4 directly */
   1.593 +        teq     WK4, #0
   1.594 +        beq     10f /* mask word is zero: leave the destination untouched */
   1.595 + .set PROCESS_REG, firstreg /* assembly-time counter for the unrolled loop below */
   1.596 + .rept numbytes / 4 /* one expansion per 32-bit destination pixel */
   1.597 +        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG) /* mask byte index counts from 0, destination register from firstreg */
   1.598 +  .set PROCESS_REG, PROCESS_REG+1
   1.599 + .endr
   1.600 +        pixst   , numbytes, firstreg, DST /* the store is done here, matching FLAG_PROCESS_DOES_STORE in the instantiation */
   1.601 +10:
   1.602 +    .unreq  WK4
   1.603 +.endm
   1.604 +
   1.605 +generate_composite_function /* instantiate solid-source OVER through an 8bpp mask; arguments are comma-separated throughout, matching the copy instantiations earlier in the file */ \
   1.606 +    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, /* presumably src, mask and dst bits per pixel (0 = solid) -- confirm in header */ \
   1.607 +    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
   1.608 +    2, /* prefetch distance */ \
   1.609 +    over_n_8_8888_init, \
   1.610 +    over_n_8_8888_newline, \
   1.611 +    nop_macro, /* cleanup */ \
   1.612 +    over_n_8_8888_process_head, \
   1.613 +    over_n_8_8888_process_tail
   1.614 +
   1.615 +/******************************************************************************/
   1.616 +

mercurial