Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright © 2012 Raspberry Pi Foundation |
michael@0 | 3 | * Copyright © 2012 RISC OS Open Ltd |
michael@0 | 4 | * |
michael@0 | 5 | * Permission to use, copy, modify, distribute, and sell this software and its |
michael@0 | 6 | * documentation for any purpose is hereby granted without fee, provided that |
michael@0 | 7 | * the above copyright notice appear in all copies and that both that |
michael@0 | 8 | * copyright notice and this permission notice appear in supporting |
michael@0 | 9 | * documentation, and that the name of the copyright holders not be used in |
michael@0 | 10 | * advertising or publicity pertaining to distribution of the software without |
michael@0 | 11 | * specific, written prior permission. The copyright holders make no |
michael@0 | 12 | * representations about the suitability of this software for any purpose. It |
michael@0 | 13 | * is provided "as is" without express or implied warranty. |
michael@0 | 14 | * |
michael@0 | 15 | * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
michael@0 | 16 | * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
michael@0 | 17 | * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
michael@0 | 18 | * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
michael@0 | 19 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
michael@0 | 20 | * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
michael@0 | 21 | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
michael@0 | 22 | * SOFTWARE. |
michael@0 | 23 | * |
michael@0 | 24 | * Author: Ben Avison (bavison@riscosopen.org) |
michael@0 | 25 | * |
michael@0 | 26 | */ |
michael@0 | 27 | |
michael@0 | 28 | /* Prevent the stack from becoming executable */ |
michael@0 | 29 | #if defined(__linux__) && defined(__ELF__) |
michael@0 | 30 | .section .note.GNU-stack,"",%progbits |
michael@0 | 31 | #endif |
michael@0 | 32 | |
@ Classic-ARM (not Thumb) ARMv6 code. ".object_arch armv4" keeps the object
@ file's architecture tag relaxed for older consumers, and ".altmacro" enables
@ the "&" name-concatenation syntax used by the WK&… macros below.
michael@0 | 33 | .text |
michael@0 | 34 | .arch armv6 |
michael@0 | 35 | .object_arch armv4 |
michael@0 | 36 | .arm |
michael@0 | 37 | .altmacro |
michael@0 | 38 | .p2align 2 |
michael@0 | 39 | |
@ Supplies generate_composite_function, pixld/pixst, line_saved_regs, the WK*
@ working registers, SRC/DST/MASK/STRIDE_*/SCRATCH aliases, FLAG_* values, etc.
michael@0 | 40 | #include "pixman-arm-simd-asm.h" |
michael@0 | 42 | /* A head macro should do all processing which results in an output of up to |
michael@0 | 43 | * 16 bytes, as far as the final load instruction. The corresponding tail macro |
michael@0 | 44 | * should complete the processing of the up-to-16 bytes. The calling macro will |
michael@0 | 45 | * sometimes choose to insert a preload or a decrement of X between them. |
michael@0 | 46 | * cond ARM condition code for code block |
michael@0 | 47 | * numbytes Number of output bytes that should be generated this time |
michael@0 | 48 | * firstreg First WK register in which to place output |
michael@0 | 49 | * unaligned_src Whether to use non-wordaligned loads of source image |
michael@0 | 50 | * unaligned_mask Whether to use non-wordaligned loads of mask image |
michael@0 | 51 | * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output |
michael@0 | 52 | */ |
michael@0 | 53 | |
@ Per-function init for plain blits: mark STRIDE_D/STRIDE_S as needing to be
@ saved/restored around each line (they are reused as WK4/WK5 in the loop).
michael@0 | 54 | .macro blit_init |
michael@0 | 55 | line_saved_regs STRIDE_D, STRIDE_S |
michael@0 | 56 | .endm |
michael@0 | 57 | |
@ Head: just load up to "numbytes" of source into WK registers; the
@ (empty) tail macro and the caller's pixst do the rest.
michael@0 | 58 | .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 59 | pixld cond, numbytes, firstreg, SRC, unaligned_src |
michael@0 | 60 | .endm |
michael@0 | 61 | |
@ Unrolled inner loop moving 32 bytes per iteration through WK0-WK7
@ (WK4-WK7 temporarily aliased onto the stride/mask registers). X counts
@ pixels, hence the #32*8/src_bpp decrement. Label 110 is a local numeric
@ label; "bhs 110b" branches back while X stays non-negative.
michael@0 | 62 | .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment |
michael@0 | 63 | WK4 .req STRIDE_D |
michael@0 | 64 | WK5 .req STRIDE_S |
michael@0 | 65 | WK6 .req MASK |
michael@0 | 66 | WK7 .req STRIDE_M |
michael@0 | 67 | 110: pixld , 16, 0, SRC, unaligned_src |
michael@0 | 68 | pixld , 16, 4, SRC, unaligned_src |
michael@0 | 69 | pld [SRC, SCRATCH] |
michael@0 | 70 | pixst , 16, 0, DST |
michael@0 | 71 | pixst , 16, 4, DST |
michael@0 | 72 | subs X, X, #32*8/src_bpp |
michael@0 | 73 | bhs 110b |
michael@0 | 74 | .unreq WK4 |
michael@0 | 75 | .unreq WK5 |
michael@0 | 76 | .unreq WK6 |
michael@0 | 77 | .unreq WK7 |
michael@0 | 78 | .endm |
michael@0 | 79 | |
@ 32bpp -> 32bpp copy.
michael@0 | 80 | generate_composite_function \ |
michael@0 | 81 | pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \ |
michael@0 | 82 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
michael@0 | 83 | 4, /* prefetch distance */ \ |
michael@0 | 84 | blit_init, \ |
michael@0 | 85 | nop_macro, /* newline */ \ |
michael@0 | 86 | nop_macro, /* cleanup */ \ |
michael@0 | 87 | blit_process_head, \ |
michael@0 | 88 | nop_macro, /* process tail */ \ |
michael@0 | 89 | blit_inner_loop |
michael@0 | 90 | |
@ 16bpp -> 16bpp copy.
michael@0 | 91 | generate_composite_function \ |
michael@0 | 92 | pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \ |
michael@0 | 93 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
michael@0 | 94 | 4, /* prefetch distance */ \ |
michael@0 | 95 | blit_init, \ |
michael@0 | 96 | nop_macro, /* newline */ \ |
michael@0 | 97 | nop_macro, /* cleanup */ \ |
michael@0 | 98 | blit_process_head, \ |
michael@0 | 99 | nop_macro, /* process tail */ \ |
michael@0 | 100 | blit_inner_loop |
michael@0 | 101 | |
@ 8bpp -> 8bpp copy (shorter prefetch distance for the narrower format).
michael@0 | 102 | generate_composite_function \ |
michael@0 | 103 | pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \ |
michael@0 | 104 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
michael@0 | 105 | 3, /* prefetch distance */ \ |
michael@0 | 106 | blit_init, \ |
michael@0 | 107 | nop_macro, /* newline */ \ |
michael@0 | 108 | nop_macro, /* cleanup */ \ |
michael@0 | 109 | blit_process_head, \ |
michael@0 | 110 | nop_macro, /* process tail */ \ |
michael@0 | 111 | blit_inner_loop |
michael@0 | 112 | |
michael@0 | 113 | /******************************************************************************/ |
michael@0 | 114 | |
@ Solid-fill init: load the constant source colour from the stack and
@ replicate it into the four registers that fill_process_tail aliases as
@ WK4-WK7, so stores of up to 16 bytes need no per-iteration setup.
michael@0 | 115 | .macro src_n_8888_init |
michael@0 | 116 | ldr SRC, [sp, #ARGS_STACK_OFFSET] |
michael@0 | 117 | mov STRIDE_S, SRC |
michael@0 | 118 | mov MASK, SRC |
michael@0 | 119 | mov STRIDE_M, SRC |
michael@0 | 120 | .endm |
michael@0 | 121 | |
@ As above for 16bpp: duplicate the halfword colour into both halves of the
@ word before replicating it.
michael@0 | 122 | .macro src_n_0565_init |
michael@0 | 123 | ldrh SRC, [sp, #ARGS_STACK_OFFSET] |
michael@0 | 124 | orr SRC, SRC, lsl #16 |
michael@0 | 125 | mov STRIDE_S, SRC |
michael@0 | 126 | mov MASK, SRC |
michael@0 | 127 | mov STRIDE_M, SRC |
michael@0 | 128 | .endm |
michael@0 | 129 | |
@ As above for 8bpp: splat the byte across all four byte lanes.
michael@0 | 130 | .macro src_n_8_init |
michael@0 | 131 | ldrb SRC, [sp, #ARGS_STACK_OFFSET] |
michael@0 | 132 | orr SRC, SRC, lsl #8 |
michael@0 | 133 | orr SRC, SRC, lsl #16 |
michael@0 | 134 | mov STRIDE_S, SRC |
michael@0 | 135 | mov MASK, SRC |
michael@0 | 136 | mov STRIDE_M, SRC |
michael@0 | 137 | .endm |
michael@0 | 138 | |
@ Tail for fills: store the pre-splatted colour (held in the registers
@ aliased WK4-WK7 by the init macros above) straight to the destination.
michael@0 | 139 | .macro fill_process_tail cond, numbytes, firstreg |
michael@0 | 140 | WK4 .req SRC |
michael@0 | 141 | WK5 .req STRIDE_S |
michael@0 | 142 | WK6 .req MASK |
michael@0 | 143 | WK7 .req STRIDE_M |
michael@0 | 144 | pixst cond, numbytes, 4, DST |
michael@0 | 145 | .unreq WK4 |
michael@0 | 146 | .unreq WK5 |
michael@0 | 147 | .unreq WK6 |
michael@0 | 148 | .unreq WK7 |
michael@0 | 149 | .endm |
michael@0 | 150 | |
@ NOTE(review): in these three invocations some argument separators are
@ spaces rather than commas (gas macro calls accept either); verify this
@ matches the upstream pixman source before "normalising" it.
michael@0 | 151 | generate_composite_function \ |
michael@0 | 152 | pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ |
michael@0 | 153 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ |
michael@0 | 154 | 0, /* prefetch distance doesn't apply */ \ |
michael@0 | 155 | src_n_8888_init \ |
michael@0 | 156 | nop_macro, /* newline */ \ |
michael@0 | 157 | nop_macro /* cleanup */ \ |
michael@0 | 158 | nop_macro /* process head */ \ |
michael@0 | 159 | fill_process_tail |
michael@0 | 160 | |
michael@0 | 161 | generate_composite_function \ |
michael@0 | 162 | pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \ |
michael@0 | 163 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ |
michael@0 | 164 | 0, /* prefetch distance doesn't apply */ \ |
michael@0 | 165 | src_n_0565_init \ |
michael@0 | 166 | nop_macro, /* newline */ \ |
michael@0 | 167 | nop_macro /* cleanup */ \ |
michael@0 | 168 | nop_macro /* process head */ \ |
michael@0 | 169 | fill_process_tail |
michael@0 | 170 | |
michael@0 | 171 | generate_composite_function \ |
michael@0 | 172 | pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \ |
michael@0 | 173 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ |
michael@0 | 174 | 0, /* prefetch distance doesn't apply */ \ |
michael@0 | 175 | src_n_8_init \ |
michael@0 | 176 | nop_macro, /* newline */ \ |
michael@0 | 177 | nop_macro /* cleanup */ \ |
michael@0 | 178 | nop_macro /* process head */ \ |
michael@0 | 179 | fill_process_tail |
michael@0 | 180 | |
michael@0 | 181 | /******************************************************************************/ |
michael@0 | 182 | |
@ Convert one x8r8g8b8 pixel held in WK<reg> to a8r8g8b8 by forcing the
@ alpha byte to 0xFF. "cond" is an ARM condition-code suffix appended to ORR
@ via .altmacro "&" concatenation.
@ NOTE(review): "WK®" in the dump was mojibake for "WK&reg" ("&reg" is the
@ HTML entity for ®); restored so the macro assembles.
michael@0 | 183 | .macro src_x888_8888_pixel, cond, reg |
michael@0 | 184 | orr&cond WK&reg, WK&reg, #0xFF000000 |
michael@0 | 185 | .endm |
michael@0 | 186 | |
@ Head: load up to 16 bytes of x888 source into WK registers.
michael@0 | 187 | .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 188 | pixld cond, numbytes, firstreg, SRC, unaligned_src |
michael@0 | 189 | .endm |
michael@0 | 190 | |
@ Tail: set the alpha byte on each loaded pixel (1, 2 or 4 pixels depending
@ on numbytes). %(...) is .altmacro expression evaluation.
michael@0 | 191 | .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg |
michael@0 | 192 | src_x888_8888_pixel cond, %(firstreg+0) |
michael@0 | 193 | .if numbytes >= 8 |
michael@0 | 194 | src_x888_8888_pixel cond, %(firstreg+1) |
michael@0 | 195 | .if numbytes == 16 |
michael@0 | 196 | src_x888_8888_pixel cond, %(firstreg+2) |
michael@0 | 197 | src_x888_8888_pixel cond, %(firstreg+3) |
michael@0 | 198 | .endif |
michael@0 | 199 | .endif |
michael@0 | 200 | .endm |
michael@0 | 201 | |
michael@0 | 202 | generate_composite_function \ |
michael@0 | 203 | pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ |
michael@0 | 204 | FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
michael@0 | 205 | 3, /* prefetch distance */ \ |
michael@0 | 206 | nop_macro, /* init */ \ |
michael@0 | 207 | nop_macro, /* newline */ \ |
michael@0 | 208 | nop_macro, /* cleanup */ \ |
michael@0 | 209 | pixman_composite_src_x888_8888_process_head, \ |
michael@0 | 210 | pixman_composite_src_x888_8888_process_tail |
michael@0 | 211 | |
michael@0 | 212 | /******************************************************************************/ |
michael@0 | 213 | |
@ Loop invariants for 0565->8888 conversion: MASK = green-field mask,
@ STRIDE_M = opaque alpha. uadd8 of 0x80008000 with itself carries out of
@ bytes 1 and 3, setting GE[3:0] = 1010 so later SEL picks the byte lanes
@ the conversion macros expect.
michael@0 | 214 | .macro src_0565_8888_init |
michael@0 | 215 | /* Hold loop invariants in MASK and STRIDE_M */ |
michael@0 | 216 | ldr MASK, =0x07E007E0 |
michael@0 | 217 | mov STRIDE_M, #0xFF000000 |
michael@0 | 218 | /* Set GE[3:0] to 1010 so SEL instructions do what we want */ |
michael@0 | 219 | ldr SCRATCH, =0x80008000 |
michael@0 | 220 | uadd8 SCRATCH, SCRATCH, SCRATCH |
michael@0 | 221 | .endm |
michael@0 | 222 | |
@ Expand two packed r5g6b5 pixels (the pair arrives in WK<reg1>) into two
@ a8r8g8b8 pixels in WK<reg1>/WK<reg2>. Relies on src_0565_8888_init:
@ MASK = 0x07E007E0, STRIDE_M = 0xFF000000, GE[3:0] = 1010 for SEL.
@ The trailing @-comments show the bit layout after each instruction.
@ NOTE(review): "WK®N" in the dump was mojibake for "WK&regN" (.altmacro
@ "&" concatenation of the macro args); restored so the macro assembles.
michael@0 | 223 | .macro src_0565_8888_2pixels, reg1, reg2 |
michael@0 | 224 | and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000 |
michael@0 | 225 | bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb |
michael@0 | 226 | orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg |
michael@0 | 227 | mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000 |
michael@0 | 228 | mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG |
michael@0 | 229 | bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000 |
michael@0 | 230 | orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 |
michael@0 | 231 | orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 |
michael@0 | 232 | pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- |
michael@0 | 233 | sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- |
michael@0 | 234 | mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg |
michael@0 | 235 | pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- |
michael@0 | 236 | sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- |
michael@0 | 237 | orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb |
michael@0 | 238 | orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB |
michael@0 | 239 | .endm |
michael@0 | 240 | |
michael@0 | 241 | /* This version doesn't need STRIDE_M, but is one instruction longer. |
michael@0 | 242 | It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? |
michael@0 | 243 | and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000 |
michael@0 | 244 | bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb |
michael@0 | 245 | orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg |
michael@0 | 246 | mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB |
michael@0 | 247 | mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 |
michael@0 | 248 | bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb |
michael@0 | 249 | mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 |
michael@0 | 250 | mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 |
michael@0 | 251 | orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB |
michael@0 | 252 | orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb |
michael@0 | 253 | pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB |
michael@0 | 254 | pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb |
michael@0 | 255 | sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB |
michael@0 | 256 | sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb |
michael@0 | 257 | orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB |
michael@0 | 258 | orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb |
michael@0 | 259 | */ |
michael@0 | 260 | |
@ Expand a single r5g6b5 pixel in WK<reg> to a8r8g8b8 in place. Same
@ invariants as the 2-pixel version (MASK, GE bits); layout shown in the
@ trailing @-comments.
@ NOTE(review): "WK®" in the dump was mojibake for "WK&reg"; restored.
michael@0 | 261 | .macro src_0565_8888_1pixel, reg |
michael@0 | 262 | bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb |
michael@0 | 263 | and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000 |
michael@0 | 264 | mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 |
michael@0 | 265 | mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000 |
michael@0 | 266 | orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb |
michael@0 | 267 | orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000 |
michael@0 | 268 | pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb |
michael@0 | 269 | sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb |
michael@0 | 270 | orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb |
michael@0 | 271 | .endm |
michael@0 | 272 | |
@ Head: load half as many source bytes as will be written (16bpp in,
@ 32bpp out). The 16-byte case uses pixldst directly to load into two
@ non-contiguous register pairs.
michael@0 | 273 | .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 274 | .if numbytes == 16 |
michael@0 | 275 | pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src |
michael@0 | 276 | .elseif numbytes == 8 |
michael@0 | 277 | pixld , 4, firstreg, SRC, unaligned_src |
michael@0 | 278 | .elseif numbytes == 4 |
michael@0 | 279 | pixld , 2, firstreg, SRC, unaligned_src |
michael@0 | 280 | .endif |
michael@0 | 281 | .endm |
michael@0 | 282 | |
@ Tail: expand the loaded 0565 data to 8888, two pixels at a time where
@ possible.
michael@0 | 283 | .macro src_0565_8888_process_tail cond, numbytes, firstreg |
michael@0 | 284 | .if numbytes == 16 |
michael@0 | 285 | src_0565_8888_2pixels firstreg, %(firstreg+1) |
michael@0 | 286 | src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) |
michael@0 | 287 | .elseif numbytes == 8 |
michael@0 | 288 | src_0565_8888_2pixels firstreg, %(firstreg+1) |
michael@0 | 289 | .else |
michael@0 | 290 | src_0565_8888_1pixel firstreg |
michael@0 | 291 | .endif |
michael@0 | 292 | .endm |
michael@0 | 293 | |
michael@0 | 294 | generate_composite_function \ |
michael@0 | 295 | pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ |
michael@0 | 296 | FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ |
michael@0 | 297 | 3, /* prefetch distance */ \ |
michael@0 | 298 | src_0565_8888_init, \ |
michael@0 | 299 | nop_macro, /* newline */ \ |
michael@0 | 300 | nop_macro, /* cleanup */ \ |
michael@0 | 301 | src_0565_8888_process_head, \ |
michael@0 | 302 | src_0565_8888_process_tail |
michael@0 | 303 | |
michael@0 | 304 | /******************************************************************************/ |
michael@0 | 305 | |
@ Saturating add of 8 source bytes (held in MASK/STRIDE_M by the head
@ macro) onto 8 destination bytes. "&cond" appends a condition suffix.
michael@0 | 306 | .macro add_8_8_8pixels cond, dst1, dst2 |
michael@0 | 307 | uqadd8&cond WK&dst1, WK&dst1, MASK |
michael@0 | 308 | uqadd8&cond WK&dst2, WK&dst2, STRIDE_M |
michael@0 | 309 | .endm |
michael@0 | 310 | |
@ Same for a single word (4 bytes) of source in MASK.
michael@0 | 311 | .macro add_8_8_4pixels cond, dst |
michael@0 | 312 | uqadd8&cond WK&dst, WK&dst, MASK |
michael@0 | 313 | .endm |
michael@0 | 314 | |
@ Head: load source into WK4/WK5 (aliased onto MASK/STRIDE_M) and dest into
@ the WK output registers. In the 16-byte case the first 8 bytes are added
@ here so WK4/WK5 can be reused for the second source load; the tail adds
@ the remainder.
michael@0 | 315 | .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 316 | WK4 .req MASK |
michael@0 | 317 | WK5 .req STRIDE_M |
michael@0 | 318 | .if numbytes == 16 |
michael@0 | 319 | pixld cond, 8, 4, SRC, unaligned_src |
michael@0 | 320 | pixld cond, 16, firstreg, DST, 0 |
michael@0 | 321 | add_8_8_8pixels cond, firstreg, %(firstreg+1) |
michael@0 | 322 | pixld cond, 8, 4, SRC, unaligned_src |
michael@0 | 323 | .else |
michael@0 | 324 | pixld cond, numbytes, 4, SRC, unaligned_src |
michael@0 | 325 | pixld cond, numbytes, firstreg, DST, 0 |
michael@0 | 326 | .endif |
michael@0 | 327 | .unreq WK4 |
michael@0 | 328 | .unreq WK5 |
michael@0 | 329 | .endm |
michael@0 | 330 | |
michael@0 | 331 | .macro add_8_8_process_tail cond, numbytes, firstreg |
michael@0 | 332 | .if numbytes == 16 |
michael@0 | 333 | add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) |
michael@0 | 334 | .elseif numbytes == 8 |
michael@0 | 335 | add_8_8_8pixels cond, firstreg, %(firstreg+1) |
michael@0 | 336 | .else |
michael@0 | 337 | add_8_8_4pixels cond, firstreg |
michael@0 | 338 | .endif |
michael@0 | 339 | .endm |
michael@0 | 340 | |
michael@0 | 341 | generate_composite_function \ |
michael@0 | 342 | pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ |
michael@0 | 343 | FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
michael@0 | 344 | 2, /* prefetch distance */ \ |
michael@0 | 345 | nop_macro, /* init */ \ |
michael@0 | 346 | nop_macro, /* newline */ \ |
michael@0 | 347 | nop_macro, /* cleanup */ \ |
michael@0 | 348 | add_8_8_process_head, \ |
michael@0 | 349 | add_8_8_process_tail |
michael@0 | 350 | |
michael@0 | 351 | /******************************************************************************/ |
michael@0 | 352 | |
@ OVER init: MASK = 0x00800080 (the +0.5 rounding constant for the byte
@ multiplies); uadd8 of it with itself sets GE[3:0] = 0101 for SEL.
michael@0 | 353 | .macro over_8888_8888_init |
michael@0 | 354 | /* Hold loop invariant in MASK */ |
michael@0 | 355 | ldr MASK, =0x00800080 |
michael@0 | 356 | /* Set GE[3:0] to 0101 so SEL instructions do what we want */ |
michael@0 | 357 | uadd8 SCRATCH, MASK, MASK |
michael@0 | 358 | line_saved_regs STRIDE_D, STRIDE_S, ORIG_W |
michael@0 | 359 | .endm |
michael@0 | 360 | |
@ Head: source pixels into WK4.. (aliased stride/width regs), destination
@ pixels into the WK output registers.
michael@0 | 361 | .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 362 | WK4 .req STRIDE_D |
michael@0 | 363 | WK5 .req STRIDE_S |
michael@0 | 364 | WK6 .req STRIDE_M |
michael@0 | 365 | WK7 .req ORIG_W |
michael@0 | 366 | pixld , numbytes, %(4+firstreg), SRC, unaligned_src |
michael@0 | 367 | pixld , numbytes, firstreg, DST, 0 |
michael@0 | 368 | .unreq WK4 |
michael@0 | 369 | .unreq WK5 |
michael@0 | 370 | .unreq WK6 |
michael@0 | 371 | .unreq WK7 |
michael@0 | 372 | .endm |
michael@0 | 373 | |
@ Set Z if every source pixel in the given WK registers is fully transparent
@ (== 0), so the caller can skip the whole OVER computation.
@ NOTE(review): "WK®N" in the dump was mojibake for "WK&regN"; restored.
michael@0 | 374 | .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 |
michael@0 | 375 | /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */ |
michael@0 | 376 | teq WK&reg0, #0 |
michael@0 | 377 | .if numbytes > 4 |
michael@0 | 378 | teqeq WK&reg1, #0 |
michael@0 | 379 | .if numbytes > 8 |
michael@0 | 380 | teqeq WK&reg2, #0 |
michael@0 | 381 | teqeq WK&reg3, #0 |
michael@0 | 382 | .endif |
michael@0 | 383 | .endif |
michael@0 | 384 | .endm |
michael@0 | 385 | |
@ Pre-extract the alpha byte of the next source pixel into the low byte.
michael@0 | 386 | .macro over_8888_8888_prepare next |
michael@0 | 387 | mov WK&next, WK&next, lsr #24 |
michael@0 | 388 | .endm |
michael@0 | 389 | |
@ One pixel of OVER: dst = src + dst * (255 - src_alpha) / 255, computed per
@ byte lane via even/odd split (uxtb16), MLA with the 0x00800080 rounding
@ constant in MASK, and the *257/256 trick. Instruction order is pipeline
@ scheduling for ARM11-class cores — the source reload and the next pixel's
@ alpha extraction are placed in multiply-result stalls. Do not reorder.
michael@0 | 390 | .macro over_8888_8888_1pixel src, dst, offset, next |
michael@0 | 391 | /* src = destination component multiplier */ |
michael@0 | 392 | rsb WK&src, WK&src, #255 |
michael@0 | 393 | /* Split even/odd bytes of dst into SCRATCH/dst */ |
michael@0 | 394 | uxtb16 SCRATCH, WK&dst |
michael@0 | 395 | uxtb16 WK&dst, WK&dst, ror #8 |
michael@0 | 396 | /* Multiply through, adding 0.5 to the upper byte of result for rounding */ |
michael@0 | 397 | mla SCRATCH, SCRATCH, WK&src, MASK |
michael@0 | 398 | mla WK&dst, WK&dst, WK&src, MASK |
michael@0 | 399 | /* Where we would have had a stall between the result of the first MLA and the shifter input, |
michael@0 | 400 | * reload the complete source pixel */ |
michael@0 | 401 | ldr WK&src, [SRC, #offset] |
michael@0 | 402 | /* Multiply by 257/256 to approximate 256/255 */ |
michael@0 | 403 | uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 |
michael@0 | 404 | /* In this stall, start processing the next pixel */ |
michael@0 | 405 | .if offset < -4 |
michael@0 | 406 | mov WK&next, WK&next, lsr #24 |
michael@0 | 407 | .endif |
michael@0 | 408 | uxtab16 WK&dst, WK&dst, WK&dst, ror #8 |
michael@0 | 409 | /* Recombine even/odd bytes of multiplied destination */ |
michael@0 | 410 | mov SCRATCH, SCRATCH, ror #8 |
michael@0 | 411 | sel WK&dst, SCRATCH, WK&dst |
michael@0 | 412 | /* Saturated add of source to multiplied destination */ |
michael@0 | 413 | uqadd8 WK&dst, WK&dst, WK&src |
michael@0 | 414 | .endm |
michael@0 | 415 | |
@ Tail: skip entirely (branch to local label 10) when all source pixels are
@ transparent, otherwise run the 1-pixel OVER for each loaded word and
@ store the results.
michael@0 | 416 | .macro over_8888_8888_process_tail cond, numbytes, firstreg |
michael@0 | 417 | WK4 .req STRIDE_D |
michael@0 | 418 | WK5 .req STRIDE_S |
michael@0 | 419 | WK6 .req STRIDE_M |
michael@0 | 420 | WK7 .req ORIG_W |
michael@0 | 421 | over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) |
michael@0 | 422 | beq 10f |
michael@0 | 423 | over_8888_8888_prepare %(4+firstreg) |
michael@0 | 424 | .set PROCESS_REG, firstreg |
michael@0 | 425 | .set PROCESS_OFF, -numbytes |
michael@0 | 426 | .rept numbytes / 4 |
michael@0 | 427 | over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG) |
michael@0 | 428 | .set PROCESS_REG, PROCESS_REG+1 |
michael@0 | 429 | .set PROCESS_OFF, PROCESS_OFF+4 |
michael@0 | 430 | .endr |
michael@0 | 431 | pixst , numbytes, firstreg, DST |
michael@0 | 432 | 10: |
michael@0 | 433 | .unreq WK4 |
michael@0 | 434 | .unreq WK5 |
michael@0 | 435 | .unreq WK6 |
michael@0 | 436 | .unreq WK7 |
michael@0 | 437 | .endm |
michael@0 | 438 | |
@ NOTE(review): some separators below are spaces rather than commas (legal
@ in gas macro calls); confirm against upstream pixman before normalising.
michael@0 | 439 | generate_composite_function \ |
michael@0 | 440 | pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \ |
michael@0 | 441 | FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ |
michael@0 | 442 | 2, /* prefetch distance */ \ |
michael@0 | 443 | over_8888_8888_init, \ |
michael@0 | 444 | nop_macro, /* newline */ \ |
michael@0 | 445 | nop_macro, /* cleanup */ \ |
michael@0 | 446 | over_8888_8888_process_head, \ |
michael@0 | 447 | over_8888_8888_process_tail |
michael@0 | 448 | |
michael@0 | 449 | /******************************************************************************/ |
michael@0 | 450 | |
michael@0 | 451 | /* Multiply each byte of a word by a byte. |
michael@0 | 452 | * Useful when there aren't any obvious ways to fill the stalls with other instructions. |
michael@0 | 453 | * word Register containing 4 bytes |
michael@0 | 454 | * byte Register containing byte multiplier (bits 8-31 must be 0) |
michael@0 | 455 | * tmp Scratch register |
michael@0 | 456 | * half Register containing the constant 0x00800080 |
michael@0 | 457 | * GE[3:0] bits must contain 0101 |
michael@0 | 458 | */ |
@ word = (word * byte + 0x80, per byte lane) * 257/256 — i.e. an
@ approximate divide-by-255 with rounding, see the header comment above.
michael@0 | 459 | .macro mul_8888_8 word, byte, tmp, half |
michael@0 | 460 | /* Split even/odd bytes of word apart */ |
michael@0 | 461 | uxtb16 tmp, word |
michael@0 | 462 | uxtb16 word, word, ror #8 |
michael@0 | 463 | /* Multiply bytes together with rounding, then by 257/256 */ |
michael@0 | 464 | mla tmp, tmp, byte, half |
michael@0 | 465 | mla word, word, byte, half /* 1 stall follows */ |
michael@0 | 466 | uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ |
michael@0 | 467 | uxtab16 word, word, word, ror #8 |
michael@0 | 468 | /* Recombine bytes */ |
michael@0 | 469 | mov tmp, tmp, ror #8 |
michael@0 | 470 | sel word, tmp, word |
michael@0 | 471 | .endm |
michael@0 | 472 | |
michael@0 | 473 | /******************************************************************************/ |
michael@0 | 474 | |
@ over_8888_n_8888: OVER with a constant-alpha mask. Only the alpha byte of
@ the mask argument is used; 0x00800080 (rounding constant) lives in
@ STRIDE_M and also seeds GE[3:0] = 0101 for SEL.
michael@0 | 475 | .macro over_8888_n_8888_init |
michael@0 | 476 | /* Mask is constant */ |
michael@0 | 477 | ldr MASK, [sp, #ARGS_STACK_OFFSET+8] |
michael@0 | 478 | /* Hold loop invariant in STRIDE_M */ |
michael@0 | 479 | ldr STRIDE_M, =0x00800080 |
michael@0 | 480 | /* We only want the alpha bits of the constant mask */ |
michael@0 | 481 | mov MASK, MASK, lsr #24 |
michael@0 | 482 | /* Set GE[3:0] to 0101 so SEL instructions do what we want */ |
michael@0 | 483 | uadd8 SCRATCH, STRIDE_M, STRIDE_M |
michael@0 | 484 | line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W |
michael@0 | 485 | .endm |
michael@0 | 486 | |
@ Head: source lands in WK4/WK5 only (note the firstreg%2 addressing — the
@ tail reuses WK6/WK7 as temporaries and reloads source halfway through).
michael@0 | 487 | .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 488 | WK4 .req Y |
michael@0 | 489 | WK5 .req STRIDE_D |
michael@0 | 490 | WK6 .req STRIDE_S |
michael@0 | 491 | WK7 .req ORIG_W |
michael@0 | 492 | pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src |
michael@0 | 493 | pixld , numbytes, firstreg, DST, 0 |
michael@0 | 494 | .unreq WK4 |
michael@0 | 495 | .unreq WK5 |
michael@0 | 496 | .unreq WK6 |
michael@0 | 497 | .unreq WK7 |
michael@0 | 498 | .endm |
michael@0 | 499 | |
@ One pixel: scale source by the constant mask alpha, then standard OVER
@ against dst. WK6 holds 255, WK7 is a temporary for (255 - alpha).
michael@0 | 500 | .macro over_8888_n_8888_1pixel src, dst |
michael@0 | 501 | mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M |
michael@0 | 502 | sub WK7, WK6, WK&src, lsr #24 |
michael@0 | 503 | mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M |
michael@0 | 504 | uqadd8 WK&dst, WK&dst, WK&src |
michael@0 | 505 | .endm |
michael@0 | 506 | |
michael@0 | 507 | .macro over_8888_n_8888_process_tail cond, numbytes, firstreg |
michael@0 | 508 | WK4 .req Y |
michael@0 | 509 | WK5 .req STRIDE_D |
michael@0 | 510 | WK6 .req STRIDE_S |
michael@0 | 511 | WK7 .req ORIG_W |
michael@0 | 512 | over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) |
michael@0 | 513 | beq 10f |
michael@0 | 514 | mov WK6, #255 |
michael@0 | 515 | .set PROCESS_REG, firstreg |
michael@0 | 516 | .rept numbytes / 4 |
michael@0 | 517 | .if numbytes == 16 && PROCESS_REG == 2 |
michael@0 | 518 | /* We're using WK6 and WK7 as temporaries, so half way through |
michael@0 | 519 | * 4 pixels, reload the second two source pixels but this time |
michael@0 | 520 | * into WK4 and WK5 */ |
michael@0 | 521 | ldmdb SRC, {WK4, WK5} |
michael@0 | 522 | .endif |
michael@0 | 523 | over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) |
michael@0 | 524 | .set PROCESS_REG, PROCESS_REG+1 |
michael@0 | 525 | .endr |
michael@0 | 526 | pixst , numbytes, firstreg, DST |
michael@0 | 527 | 10: |
michael@0 | 528 | .unreq WK4 |
michael@0 | 529 | .unreq WK5 |
michael@0 | 530 | .unreq WK6 |
michael@0 | 531 | .unreq WK7 |
michael@0 | 532 | .endm |
michael@0 | 533 | |
@ NOTE(review): separator style (spaces vs commas) as in the other
@ generate_composite_function calls; both are accepted by gas macro calls.
michael@0 | 534 | generate_composite_function \ |
michael@0 | 535 | pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \ |
michael@0 | 536 | FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ |
michael@0 | 537 | 2, /* prefetch distance */ \ |
michael@0 | 538 | over_8888_n_8888_init, \ |
michael@0 | 539 | nop_macro, /* newline */ \ |
michael@0 | 540 | nop_macro, /* cleanup */ \ |
michael@0 | 541 | over_8888_n_8888_process_head, \ |
michael@0 | 542 | over_8888_n_8888_process_tail |
michael@0 | 543 | |
michael@0 | 544 | /******************************************************************************/ |
michael@0 | 545 | |
@ over_n_8_8888: constant source colour, 8-bit mask. The source's even/odd
@ byte lanes are pre-split into STRIDE_S/SRC once per composite (a loop
@ invariant); the 0x00800080 rounding constant cannot be kept in a register
@ here, so it is reloaded into STRIDE_D at each new line (see newline macro).
michael@0 | 546 | .macro over_n_8_8888_init |
michael@0 | 547 | /* Source is constant, but splitting it into even/odd bytes is a loop invariant */ |
michael@0 | 548 | ldr SRC, [sp, #ARGS_STACK_OFFSET] |
michael@0 | 549 | /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */ |
michael@0 | 550 | ldr SCRATCH, =0x00800080 |
michael@0 | 551 | uxtb16 STRIDE_S, SRC |
michael@0 | 552 | uxtb16 SRC, SRC, ror #8 |
michael@0 | 553 | /* Set GE[3:0] to 0101 so SEL instructions do what we want */ |
michael@0 | 554 | uadd8 SCRATCH, SCRATCH, SCRATCH |
michael@0 | 555 | line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W |
michael@0 | 556 | .endm |
michael@0 | 557 | |
@ Reload the rounding constant each line; the branch over .ltorg keeps the
@ literal pool reachable by the ldr without falling into the code path.
michael@0 | 558 | .macro over_n_8_8888_newline |
michael@0 | 559 | ldr STRIDE_D, =0x00800080 |
michael@0 | 560 | b 1f |
michael@0 | 561 | .ltorg |
michael@0 | 562 | 1: |
michael@0 | 563 | .endm |
michael@0 | 564 | |
@ Head: one mask byte per output pixel (numbytes/4) into WK4, plus the
@ destination pixels.
michael@0 | 565 | .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload |
michael@0 | 566 | WK4 .req STRIDE_M |
michael@0 | 567 | pixld , numbytes/4, 4, MASK, unaligned_mask |
michael@0 | 568 | pixld , numbytes, firstreg, DST, 0 |
michael@0 | 569 | .unreq WK4 |
michael@0 | 570 | .endm |
michael@0 | 571 | |
@ One pixel: finish multiplying the pre-split constant source by this
@ pixel's mask byte (extracted from WK4 by lane index "src"), then OVER it
@ onto the destination. Y, ORIG_W and SCRATCH are all temporaries here.
michael@0 | 572 | .macro over_n_8_8888_1pixel src, dst |
michael@0 | 573 | uxtb Y, WK4, ror #src*8 |
michael@0 | 574 | /* Trailing part of multiplication of source */ |
michael@0 | 575 | mla SCRATCH, STRIDE_S, Y, STRIDE_D |
michael@0 | 576 | mla Y, SRC, Y, STRIDE_D |
michael@0 | 577 | mov ORIG_W, #255 |
michael@0 | 578 | uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 |
michael@0 | 579 | uxtab16 Y, Y, Y, ror #8 |
michael@0 | 580 | mov SCRATCH, SCRATCH, ror #8 |
michael@0 | 581 | sub ORIG_W, ORIG_W, Y, lsr #24 |
michael@0 | 582 | sel Y, SCRATCH, Y |
michael@0 | 583 | /* Then multiply the destination */ |
michael@0 | 584 | mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D |
michael@0 | 585 | uqadd8 WK&dst, WK&dst, Y |
michael@0 | 586 | .endm |
michael@0 | 587 | |
@ Tail: skip the block (branch to 10) when all mask bytes are zero.
michael@0 | 588 | .macro over_n_8_8888_process_tail cond, numbytes, firstreg |
michael@0 | 589 | WK4 .req STRIDE_M |
michael@0 | 590 | teq WK4, #0 |
michael@0 | 591 | beq 10f |
michael@0 | 592 | .set PROCESS_REG, firstreg |
michael@0 | 593 | .rept numbytes / 4 |
michael@0 | 594 | over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG) |
michael@0 | 595 | .set PROCESS_REG, PROCESS_REG+1 |
michael@0 | 596 | .endr |
michael@0 | 597 | pixst , numbytes, firstreg, DST |
michael@0 | 598 | 10: |
michael@0 | 599 | .unreq WK4 |
michael@0 | 600 | .endm |
michael@0 | 601 | |
@ NOTE(review): separator style (spaces vs commas) as in the other
@ generate_composite_function calls; both are accepted by gas macro calls.
michael@0 | 602 | generate_composite_function \ |
michael@0 | 603 | pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ |
michael@0 | 604 | FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ |
michael@0 | 605 | 2, /* prefetch distance */ \ |
michael@0 | 606 | over_n_8_8888_init, \ |
michael@0 | 607 | over_n_8_8888_newline, \ |
michael@0 | 608 | nop_macro, /* cleanup */ \ |
michael@0 | 609 | over_n_8_8888_process_head, \ |
michael@0 | 610 | over_n_8_8888_process_tail |
michael@0 | 611 | |
michael@0 | 612 | /******************************************************************************/ |
michael@0 | 613 |