Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row, can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected according to
 * how long the pixel row is. If it is long enough that there is at least
 * one iteration of the inner loop (as described above) then this is
 * described as the "wide" case. If it is shorter than that, but there are
 * still enough bytes output that there is at least one 16-byte-long,
 * 16-byte-aligned write to the destination (the optimum type of write),
 * then this is the "medium" case. If it is not even this long, then this
 * is the "narrow" case, and there is no attempt to align writes to 16-byte
 * boundaries. In the "medium" and "narrow" cases, all the cachelines
 * containing data from the pixel row are prefetched up-front.
 */
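
/*
 * To make the case split concrete, here is an illustrative example
 * (assuming a 32bpp-over-32bpp operation with prefetch_distance = 2, for
 * which pix_per_block works out to 8): the dispatch logic in
 * generate_composite_function below takes the "narrow" path when
 * X < 2*16*8/32 - 1 = 7 pixels, the "medium" path when
 * 7 <= X < (prefetch_distance+3)*pix_per_block - 1 = 39 pixels, and the
 * "wide" path for anything longer.
 */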

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro, which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE, 0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
.if numbytes == 16
.if unaligned == 1
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&r&cond WK&reg2, [base], #4
op&r&cond WK&reg3, [base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.endif
.elseif numbytes == 8
.if unaligned == 1
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1}
.endif
.elseif numbytes == 4
op&r&cond WK&reg0, [base], #4
.elseif numbytes == 2
op&r&cond&h WK&reg0, [base], #2
.elseif numbytes == 1
op&r&cond&b WK&reg0, [base], #1
.else
.error "unsupported size: numbytes"
.endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
.if numbytes == 16
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
stm&cond&db base, {WK&reg0,WK&reg1}
.elseif numbytes == 4
str&cond WK&reg0, [base, #-4]
.elseif numbytes == 2
str&cond&h WK&reg0, [base, #-2]
.elseif numbytes == 1
str&cond&b WK&reg0, [base, #-1]
.else
.error "unsupported size: numbytes"
.endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
.else
pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
.endif
.endm
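
/*
 * For illustration, two expansions implied by the macros above:
 * "pixld , 16, 0, SRC, 0" becomes
 *     ldmia SRC!, {WK0,WK1,WK2,WK3}
 * while the unaligned form "pixld , 16, 0, SRC, 1" becomes four separate
 * "ldr WKn, [SRC], #4" loads, because ldm requires a word-aligned base.
 */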

.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
a x
.endif
.endm
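
/*
 * For example, "PF pld, [ptr]" emits "pld [ptr]" when PREFETCH_TYPE_CURRENT
 * is PREFETCH_TYPE_STANDARD, and nothing at all when it is
 * PREFETCH_TYPE_NONE, so all prefetch sequences below can be written
 * unconditionally and are assembled out of functions generated with
 * prefetch_distance == 0.
 */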


.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
.if bpp > 0
PF bic, ptr, base, #31
.set OFFSET, 0
.rept prefetch_distance+1
PF pld, [ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
.endm

.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 * inner_loop_offset = (src+leading_bytes)&31
 * extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
.if bpp > 0
.ifc base,DST
/* The test can be simplified further when preloading the destination */
PF tst, base, #16
PF beq, 61f
.else
.if bpp/dst_w_bpp == 4
PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
PF pld, [ptr, #32*(prefetch_distance+2)]
.else
PF mov, SCRATCH, base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
60: PF pld, [ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
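
/*
 * A worked example of the arithmetic above (illustrative values): suppose
 * src_bpp = 32, dst_w_bpp = 16, the destination needs 12 leading bytes to
 * reach 16-byte alignment, and SRC & 31 = 20. Then
 *     leading_bytes     = 12 * 32/16     = 24 source bytes
 *     inner_loop_offset = (20 + 24) & 31 = 12
 *     extra_needed      = 24 - 12        = 12
 * which falls in the 0..32 band, so exactly one extra cacheline is
 * preloaded (the pld at label 60).
 */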

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
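
/* A note on the bit trick above: INDEX & ~(INDEX+1) isolates the trailing
 * run of 1 bits in INDEX, so ANDing it with SIZE/2 is non-zero exactly when
 * INDEX ends in at least log2(SIZE) one bits - that is, when INDEX is the
 * last index in a group of SIZE (SIZE being a power of 2). For example,
 * with SIZE = 4 this matches SUBBLOCK = 3, 7, 11, ... */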
.macro preload_middle bpp, base, scratch_holds_offset
.if bpp > 0
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
.if scratch_holds_offset
PF pld, [base, SCRATCH]
.else
PF bic, SCRATCH, base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.endif
.endif
.endif
.endm

.macro preload_trailing bpp, bpp_shift, base
.if bpp > 0
.if bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
PF and, WK1, base, #31
PF add, WK1, WK1, WK0, lsl #bpp_shift
PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
PF adceqs, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: it ensures Z is only
 * set if C was clear (so Z indicates that both shifted quantities
 * were 0), and it clears C if Z was set (so C indicates that the sum
 * of the shifted quantities was strictly greater than 32) */
PF beq, 82f
PF bic, SCRATCH, base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
82:
.endif
.endif
.endm
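
/*
 * Spelling out the four cases of the flag trick above, writing
 * A = base & 31 and B = (X << bpp_shift) & 31 (an illustrative summary):
 *     A + B == 0      -> Z set                     -> no extra preload
 *     0 < A + B < 32  -> Z and C clear             -> 1 extra preload
 *     A + B == 32     -> C cleared by the adceqs   -> 1 extra preload
 *     A + B > 32      -> C still set               -> 2 extra preloads
 */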


.macro preload_line narrow_case, bpp, bpp_shift, base
michael@0 | 264 | /* "narrow_case" - just means that the macro was invoked from the "narrow" |
michael@0 | 265 | * code path rather than the "medium" one - because in the narrow case, |
michael@0 | 266 | * the row of pixels is known to output no more than 30 bytes, then |
michael@0 | 267 | * (assuming the source pixels are no wider than the the destination |
michael@0 | 268 | * pixels) they cannot possibly straddle more than 2 32-byte cachelines, |
michael@0 | 269 | * meaning there's no need for a loop. |
michael@0 | 270 | * "bpp" - number of bits per pixel in the channel (source, mask or |
michael@0 | 271 | * destination) that's being preloaded, or 0 if this channel is not used |
michael@0 | 272 | * for reading |
michael@0 | 273 | * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) |
michael@0 | 274 | * "base" - base address register of channel to preload (SRC, MASK or DST) |
michael@0 | 275 | */ |
.if bpp > 0
.if narrow_case && (bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
PF bic, WK0, base, #31
PF pld, [WK0]
PF add, WK1, base, X, lsl #bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
PF beq, 90f
PF pld, [WK1]
90:
.else
PF bic, WK0, base, #31
PF pld, [WK0]
PF add, WK1, base, X, lsl #bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
PF beq, 92f
91: PF add, WK0, WK0, #32
PF cmp, WK0, WK1
PF pld, [WK0]
PF bne, 91b
92:
.endif
.endif
.endm


.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
.if decrementx
sub&cond X, X, #8*numbytes/dst_w_bpp
.endif
process_tail cond, numbytes, firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst cond, numbytes, firstreg, DST
.endif
.endm

.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
.ifc cond,mi
bpl 100f
.endif
.ifc cond,cs
bcc 100f
.endif
.ifc cond,ne
beq 100f
.endif
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
.else
conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.endif
.endm

.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
test
conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
test
.endif
conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
.else
/* Can interleave reads and writes for better scheduling */
test
process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
.if decrementx
sub&cond1 X, X, #8*numbytes1/dst_w_bpp
sub&cond2 X, X, #8*numbytes2/dst_w_bpp
.endif
process_tail cond1, numbytes1, firstreg1
process_tail cond2, numbytes2, firstreg2
pixst cond1, numbytes1, firstreg1, DST
pixst cond2, numbytes2, firstreg2, DST
.endif
.endm


.macro test_bits_1_0_ptr
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
.endif
conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
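
/*
 * Worked example (illustrative): with dst_w_bpp = 8 and WK0 & 15 = 13
 * (0b1101) leading bytes required, test_bits_1_0_ptr puts bit 0 in N and
 * bit 1 in C, so 1 byte is processed (mi) and the 2-byte step is skipped
 * (cs fails); test_bits_3_2_ptr then selects the 4-byte (mi, bit 2) and
 * 8-byte (cs, bit 3) steps, giving the required total of 1 + 4 + 8 = 13.
 */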

.macro test_bits_3_2_pix
movs SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
.if dst_w_bpp == 8
movs SCRATCH, X, lsl #dst_bpp_shift+32-1
.else
movs SCRATCH, X, lsr #1
.endif
.endm

.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
.if dst_w_bpp == 16
test_bits_1_0_pix
conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
.elseif dst_w_bpp == 8
conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
.endif
.endm


.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
process_head , 16, 0, unaligned_src, unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle mask_bpp, MASK, 1
.else
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.endif
.if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
 * preloads for, to achieve staggered prefetches for multiple channels, because there are
 * always two STMs per prefetch, so there is always an opposite STM on which to put the
 * preload. Note, no need to BIC the base register here */
PF pld, [DST, #32*prefetch_distance - dst_alignment]
.endif
process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
.set SUBBLOCK, SUBBLOCK+1
.endr
subs X, X, #pix_per_block
bhs 110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
.if dst_r_bpp > 0
tst DST, #16
bne 111f
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
b 112f
111:
.endif
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
preload_trailing dst_r_bpp, dst_bpp_shift, DST
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
process_head , 16, 0, unaligned_src, unaligned_mask, 0
process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
subs X, X, #128/dst_w_bpp
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
beq exit_label
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
.if mask_bpp == 8 || mask_bpp == 16
tst MASK, #3
bne 141f
.endif
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 140f
.endif
action process_head, process_tail, process_inner_loop, exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
b exit_label
140:
action process_head, process_tail, process_inner_loop, exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
b exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
action process_head, process_tail, process_inner_loop, exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
b exit_label
142:
action process_head, process_tail, process_inner_loop, exit_label, 1, 1
.endif
.endif
.endm


.macro end_of_line restore_x, vars_spilled, loop_label, last_one
.if vars_spilled
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
subs Y, Y, #1
.if vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
.endif
add DST, DST, STRIDE_D
.if src_bpp > 0
add SRC, SRC, STRIDE_S
.endif
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
.if restore_x
mov X, ORIG_W
.endif
bhs loop_label
.ifc "last_one",""
.if vars_spilled
b 197f
.else
b 198f
.endif
.else
.if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
.endm


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set flags, flags_
.set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
.if prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.else
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
.endif

.if src_bpp == 32
.set src_bpp_shift, 2
.elseif src_bpp == 24
.set src_bpp_shift, 0
.elseif src_bpp == 16
.set src_bpp_shift, 1
.elseif src_bpp == 8
.set src_bpp_shift, 0
.elseif src_bpp == 0
.set src_bpp_shift, -1
.else
.error "requested src bpp (src_bpp) is not supported"
.endif

.if mask_bpp == 32
.set mask_bpp_shift, 2
.elseif mask_bpp == 24
.set mask_bpp_shift, 0
.elseif mask_bpp == 8
.set mask_bpp_shift, 0
.elseif mask_bpp == 0
.set mask_bpp_shift, -1
.else
.error "requested mask bpp (mask_bpp) is not supported"
.endif

.if dst_w_bpp == 32
.set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
.set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
.set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
.set dst_bpp_shift, 0
.else
.error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif

.set pix_per_block, 16*8/dst_w_bpp
.if src_bpp != 0
.if 32*8/src_bpp > pix_per_block
.set pix_per_block, 32*8/src_bpp
.endif
.endif
.if mask_bpp != 0
.if 32*8/mask_bpp > pix_per_block
.set pix_per_block, 32*8/mask_bpp
.endif
.endif
.if dst_r_bpp != 0
.if 32*8/dst_r_bpp > pix_per_block
.set pix_per_block, 32*8/dst_r_bpp
.endif
.endif
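
/* For example, with a 32bpp destination pix_per_block starts at
 * 16*8/32 = 4, but an 8bpp source channel needs 32*8/8 = 32 pixels to
 * cover a whole 32-byte cacheline, so pix_per_block is raised to 32:
 * a block is the smallest unit that advances every channel by at least
 * one whole cacheline (or one 16-byte write on the destination). */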

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
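
/* Expressed as a C prototype, the entry conditions above correspond to
 * something like the following sketch (the function name and exact types
 * are illustrative assumptions, not taken from pixman-arm-common.h):
 *
 *     void composite_asm(int32_t width, int32_t height,
 *                        uint32_t *dst, int32_t dst_stride,
 *                        uint32_t src, int32_t src_stride,
 *                        uint32_t mask, int32_t mask_stride);
 *
 * where src/mask hold either a solid pixel value (with stride 0) or a
 * pointer to the top-left pixel of the source/mask image. */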

/*
 * Assign symbolic names to registers
 */
X .req r0 /* pixels to go on this line */
Y .req r1 /* lines to go */
DST .req r2 /* destination pixel pointer */
STRIDE_D .req r3 /* destination stride (bytes, minus width) */
SRC .req r4 /* source pixel pointer */
STRIDE_S .req r5 /* source stride (bytes, minus width) */
MASK .req r6 /* mask pixel pointer (if applicable) */
STRIDE_M .req r7 /* mask stride (bytes, minus width) */
WK0 .req r8 /* pixel data registers */
WK1 .req r9
WK2 .req r10
WK3 .req r11
SCRATCH .req r12
ORIG_W .req r14 /* width (pixels) */

fname:
.fnstart
.save {r4-r11, lr}
push {r4-r11, lr} /* save all registers */

subs Y, Y, #1
blo 199f

#ifdef DEBUG_PARAMS
.pad #9*4
sub sp, sp, #9*4
#endif

.if src_bpp > 0
ldr SRC, [sp, #ARGS_STACK_OFFSET]
ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
.endif
.if mask_bpp > 0
ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
.endif

#ifdef DEBUG_PARAMS
add Y, Y, #1
stmia sp, {r0-r7,pc}
sub Y, Y, #1
#endif

init

lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
.if src_bpp > 0
lsl STRIDE_S, #src_bpp_shift
sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
.endif
.if mask_bpp > 0
lsl STRIDE_M, #mask_bpp_shift
sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
.endif

/* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
cmp X, #2*16*8/dst_w_bpp - 1
blo 170f
.if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
/* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
cmp X, #(prefetch_distance+3)*pix_per_block - 1
blo 160f

/* Wide case */
/* Adjust X so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub X, X, #(prefetch_distance+2)*pix_per_block
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
151: /* New line */
newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
preload_leading_step1 dst_r_bpp, WK3, DST

tst DST, #15
beq 154f
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
.if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
PF and, WK0, WK0, #15
.endif

preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST

leading_15bytes process_head, process_tail

154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
.ifc "process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
.else
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
.endif

157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
.endif

.ltorg

160: /* Medium case */
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
161: /* New line */
newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
preload_line 0, dst_r_bpp, dst_bpp_shift, DST

sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
tst DST, #15
beq 164f
rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

leading_15bytes process_head, process_tail

164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

.ltorg

170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if dst_w_bpp < 32
mov ORIG_W, X
.endif
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
preload_line 1, dst_r_bpp, dst_bpp_shift, DST

.if dst_w_bpp == 8
tst DST, #3
beq 174f
172: subs X, X, #1
blo 177f
process_head , 1, 0, 1, 1, 0
process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
tst DST, #3
bne 172b
.elseif dst_w_bpp == 16
tst DST, #2
beq 174f
subs X, X, #1
blo 177f
process_head , 2, 0, 1, 1, 0
process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif

174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
.endif
198:
cleanup

#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
pop {r4-r11, pc} /* exit */
.fnend

.ltorg

.unreq X
.unreq Y
.unreq DST
.unreq STRIDE_D
.unreq SRC
.unreq STRIDE_S
.unreq MASK
.unreq STRIDE_M
.unreq WK0
.unreq WK1
.unreq WK2
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
.endfunc
.endm

.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
.irp SAVED_REG,x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","STRIDE_D"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","STRIDE_S"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","STRIDE_M"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","ORIG_W"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.endr
.endm
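
/* Example: "line_saved_regs STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to
 * (1<<3)|(1<<14) and LINE_SAVED_REG_COUNT to 2, so the hand-encoded
 * stmdb/ldmia words used above spill and reload {r3, lr}. */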

.macro nop_macro x:vararg
.endm
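
/*
 * A minimal usage sketch (the function and process-macro names here are
 * hypothetical; real invocations live in the .S files that include this
 * header):
 *
 *     generate_composite_function \
 *         example_src_8888_8888_asm_armv6, 32, 0, 32, \
 *         FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
 *         2, \
 *         nop_macro, nop_macro, nop_macro, \
 *         example_process_head, \
 *         example_process_tail
 *
 * (the arguments being: name, src/mask/dst bpp, flags, prefetch distance,
 * then the init, newline and cleanup hooks - nop_macro for all three -
 * and finally the process macros, which take the argument lists used
 * throughout this file: "cond, numbytes, firstreg, unaligned_src,
 * unaligned_mask, preload" for the head and "cond, numbytes, firstreg"
 * for the tail).
 */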