gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright © 2012 Raspberry Pi Foundation
michael@0 3 * Copyright © 2012 RISC OS Open Ltd
michael@0 4 *
michael@0 5 * Permission to use, copy, modify, distribute, and sell this software and its
michael@0 6 * documentation for any purpose is hereby granted without fee, provided that
michael@0 7 * the above copyright notice appear in all copies and that both that
michael@0 8 * copyright notice and this permission notice appear in supporting
michael@0 9 * documentation, and that the name of the copyright holders not be used in
michael@0 10 * advertising or publicity pertaining to distribution of the software without
michael@0 11 * specific, written prior permission. The copyright holders make no
michael@0 12 * representations about the suitability of this software for any purpose. It
michael@0 13 * is provided "as is" without express or implied warranty.
michael@0 14 *
michael@0 15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
michael@0 16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
michael@0 17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
michael@0 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
michael@0 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
michael@0 20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
michael@0 21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
michael@0 22 * SOFTWARE.
michael@0 23 *
michael@0 24 * Author: Ben Avison (bavison@riscosopen.org)
michael@0 25 *
michael@0 26 */
michael@0 27
michael@0 28 /*
michael@0 29 * Because the alignment of pixel data to cachelines, and even the number of
michael@0 30 * cachelines per row, can vary from row to row, and because of the need to
michael@0 31 * preload each scanline once and only once, this prefetch strategy treats
michael@0 32 * each row of pixels independently. When a pixel row is long enough, there
michael@0 33 * are three distinct phases of prefetch:
michael@0 34 * * an inner loop section, where each time a cacheline of data is
michael@0 35 * processed, another cacheline is preloaded (the exact distance ahead is
michael@0 36 * determined empirically using profiling results from lowlevel-blt-bench)
michael@0 37 * * a leading section, where enough cachelines are preloaded to ensure no
michael@0 38 * cachelines escape being preloaded when the inner loop starts
michael@0 39 * * a trailing section, where a limited number (0 or more) of cachelines
michael@0 40 * are preloaded to deal with data (if any) that hangs off the end of the
michael@0 41 * last iteration of the inner loop, plus any trailing bytes that were not
michael@0 42 * enough to make up one whole iteration of the inner loop
michael@0 43 *
michael@0 44 * There are (in general) three distinct code paths, selected between
michael@0 45 * depending upon how long the pixel row is. If it is long enough that there
michael@0 46 * is at least one iteration of the inner loop (as described above) then
michael@0 47 * this is described as the "wide" case. If it is shorter than that, but
michael@0 48 * there are still enough bytes output that there is at least one 16-byte-
michael@0 49 * long, 16-byte-aligned write to the destination (the optimum type of
michael@0 50 * write), then this is the "medium" case. If it is not even this long, then
michael@0 51 * this is the "narrow" case, and there is no attempt to align writes to
michael@0 52 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
michael@0 53 * cachelines containing data from the pixel row are prefetched up-front.
michael@0 54 */
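/* As a worked example of how the three paths are chosen (the figures are
 * purely illustrative): with a 32bpp source and destination and a
 * hypothetical prefetch_distance of 3, pix_per_block works out as 8, and
 * the selection code near the start of the generated function reduces to
 *
 *     cmp     X, #2*16*8/32 - 1        @ i.e. #7
 *     blo     170f                     @ narrow case: 0..6 pixels
 *     cmp     X, #(3+3)*8 - 1          @ i.e. #47
 *     blo     160f                     @ medium case: 7..46 pixels
 *                                      @ wide case:   47 or more pixels
 */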
michael@0 55
michael@0 56 /*
michael@0 57 * Determine whether we put the arguments on the stack for debugging.
michael@0 58 */
michael@0 59 #undef DEBUG_PARAMS
michael@0 60
michael@0 61 /*
michael@0 62 * Bit flags for 'generate_composite_function' macro which are used
michael@0 63 * to tune the behavior of the generated functions.
michael@0 64 */
michael@0 65 .set FLAG_DST_WRITEONLY, 0
michael@0 66 .set FLAG_DST_READWRITE, 1
michael@0 67 .set FLAG_COND_EXEC, 0
michael@0 68 .set FLAG_BRANCH_OVER, 2
michael@0 69 .set FLAG_PROCESS_PRESERVES_PSR, 0
michael@0 70 .set FLAG_PROCESS_CORRUPTS_PSR, 4
michael@0 71 .set FLAG_PROCESS_DOESNT_STORE, 0
michael@0 72 .set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
michael@0 73 .set FLAG_NO_SPILL_LINE_VARS, 0
michael@0 74 .set FLAG_SPILL_LINE_VARS_WIDE, 16
michael@0 75 .set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
michael@0 76 .set FLAG_SPILL_LINE_VARS, 48
michael@0 77 .set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
michael@0 78 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
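/* These flags are OR-ed together and passed as the 'flags' argument of
 * 'generate_composite_function' below. For example, a (hypothetical)
 * operation that reads the destination, prefers branches over conditional
 * execution and spills its line variables in the wide case would pass
 *
 *     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_SPILL_LINE_VARS_WIDE
 *
 * which evaluates to 1 + 2 + 16 = 19.
 */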
michael@0 79
michael@0 80 /*
michael@0 81 * Offset into stack where mask and source pointer/stride can be accessed.
michael@0 82 */
michael@0 83 #ifdef DEBUG_PARAMS
michael@0 84 .set ARGS_STACK_OFFSET, (9*4+9*4)
michael@0 85 #else
michael@0 86 .set ARGS_STACK_OFFSET, (9*4)
michael@0 87 #endif
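/* The 9*4 corresponds to the nine registers pushed on entry ("push
 * {r4-r11, lr}" below); when DEBUG_PARAMS is defined, a further nine-word
 * debug copy of the arguments ("sub sp, sp, #9*4" / "stmia sp, {r0-r7,pc}")
 * sits between that and the caller's stack arguments, hence the extra 9*4.
 */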
michael@0 88
michael@0 89 /*
michael@0 90 * Constants for selecting preferable prefetch type.
michael@0 91 */
michael@0 92 .set PREFETCH_TYPE_NONE, 0
michael@0 93 .set PREFETCH_TYPE_STANDARD, 1
michael@0 94
michael@0 95 /*
michael@0 96 * Definitions of macros for load/store of pixel data.
michael@0 97 */
michael@0 98
michael@0 99 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
michael@0 100 .if numbytes == 16
michael@0 101 .if unaligned == 1
michael@0 102 op&r&cond WK&reg0, [base], #4
michael@0 103 op&r&cond WK&reg1, [base], #4
michael@0 104 op&r&cond WK&reg2, [base], #4
michael@0 105 op&r&cond WK&reg3, [base], #4
michael@0 106 .else
michael@0 107 op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
michael@0 108 .endif
michael@0 109 .elseif numbytes == 8
michael@0 110 .if unaligned == 1
michael@0 111 op&r&cond WK&reg0, [base], #4
michael@0 112 op&r&cond WK&reg1, [base], #4
michael@0 113 .else
michael@0 114 op&m&cond&ia base!, {WK&reg0,WK&reg1}
michael@0 115 .endif
michael@0 116 .elseif numbytes == 4
michael@0 117 op&r&cond WK&reg0, [base], #4
michael@0 118 .elseif numbytes == 2
michael@0 119 op&r&cond&h WK&reg0, [base], #2
michael@0 120 .elseif numbytes == 1
michael@0 121 op&r&cond&b WK&reg0, [base], #1
michael@0 122 .else
michael@0 123 .error "unsupported size: numbytes"
michael@0 124 .endif
michael@0 125 .endm
michael@0 126
michael@0 127 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
michael@0 128 .if numbytes == 16
michael@0 129 stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
michael@0 130 .elseif numbytes == 8
michael@0 131 stm&cond&db base, {WK&reg0,WK&reg1}
michael@0 132 .elseif numbytes == 4
michael@0 133 str&cond WK&reg0, [base, #-4]
michael@0 134 .elseif numbytes == 2
michael@0 135 str&cond&h WK&reg0, [base, #-2]
michael@0 136 .elseif numbytes == 1
michael@0 137 str&cond&b WK&reg0, [base, #-1]
michael@0 138 .else
michael@0 139 .error "unsupported size: numbytes"
michael@0 140 .endif
michael@0 141 .endm
michael@0 142
michael@0 143 .macro pixld cond, numbytes, firstreg, base, unaligned
michael@0 144 pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
michael@0 145 .endm
michael@0 146
michael@0 147 .macro pixst cond, numbytes, firstreg, base
michael@0 148 .if (flags) & FLAG_DST_READWRITE
michael@0 149 pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
michael@0 150 .else
michael@0 151 pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
michael@0 152 .endif
michael@0 153 .endm
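/* Illustrative expansions (with the condition argument left blank, as at
 * most call sites):
 *
 *     pixld , 16, 0, SRC, 0
 *
 * expands to a single load-multiple of WK0-WK3 with writeback of SRC, while
 *
 *     pixld , 16, 0, SRC, 1
 *
 * expands to four post-indexed "ldr WKn, [SRC], #4" loads, which (on ARMv6
 * and later, with unaligned access enabled) tolerate a source pointer that
 * is not word-aligned, something a load-multiple cannot do.
 */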
michael@0 154
michael@0 155 .macro PF a, x:vararg
michael@0 156 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
michael@0 157 a x
michael@0 158 .endif
michael@0 159 .endm
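/* Every prefetch-related instruction in this file is wrapped in PF so that
 * the whole prefetch machinery can be compiled out. For example,
 *
 *     PF pld, [SRC, #32*prefetch_distance]
 *
 * emits the pld only when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_STANDARD,
 * i.e. when the function was generated with a non-zero prefetch_distance;
 * otherwise it emits nothing at all.
 */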
michael@0 160
michael@0 161
michael@0 162 .macro preload_leading_step1 bpp, ptr, base
michael@0 163 /* If the destination is already 16-byte aligned, then we need to preload
michael@0 164 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
michael@0 165 * are no gaps when the inner loop starts.
michael@0 166 */
michael@0 167 .if bpp > 0
michael@0 168 PF bic, ptr, base, #31
michael@0 169 .set OFFSET, 0
michael@0 170 .rept prefetch_distance+1
michael@0 171 PF pld, [ptr, #OFFSET]
michael@0 172 .set OFFSET, OFFSET+32
michael@0 173 .endr
michael@0 174 .endif
michael@0 175 .endm
michael@0 176
michael@0 177 .macro preload_leading_step2 bpp, bpp_shift, ptr, base
michael@0 178 /* However, if the destination is not 16-byte aligned, we may need to
michael@0 179 * preload more cache lines than that. The question we need to ask is:
michael@0 180 * are the bytes corresponding to the leading pixels more than the amount
michael@0 181 * by which the source pointer will be rounded down for preloading, and if
michael@0 182 * so, by how many cache lines? Effectively, we want to calculate
michael@0 183 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
michael@0 184 * inner_loop_offset = (src+leading_bytes)&31
michael@0 185 * extra_needed = leading_bytes - inner_loop_offset
michael@0 186 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
michael@0 187 * possible when there are 4 src bytes for every 1 dst byte).
michael@0 188 */
michael@0 189 .if bpp > 0
michael@0 190 .ifc base,DST
michael@0 191 /* The test can be simplified further when preloading the destination */
michael@0 192 PF tst, base, #16
michael@0 193 PF beq, 61f
michael@0 194 .else
michael@0 195 .if bpp/dst_w_bpp == 4
michael@0 196 PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
michael@0 197 PF and, SCRATCH, SCRATCH, #31
michael@0 198 PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
michael@0 199 PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
michael@0 200 PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
michael@0 201 PF bcs, 61f
michael@0 202 PF bpl, 60f
michael@0 203 PF pld, [ptr, #32*(prefetch_distance+2)]
michael@0 204 .else
michael@0 205 PF mov, SCRATCH, base, lsl #32-5
michael@0 206 PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
michael@0 207 PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
michael@0 208 PF bls, 61f
michael@0 209 .endif
michael@0 210 .endif
michael@0 211 60: PF pld, [ptr, #32*(prefetch_distance+1)]
michael@0 212 61:
michael@0 213 .endif
michael@0 214 .endm
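/* Worked example (figures hypothetical): preloading a 32bpp source for an
 * 8bpp destination (4 source bytes per destination byte), with the
 * destination 15 bytes short of 16-byte alignment and (SRC+leading_bytes)
 * sitting 12 bytes into its cacheline:
 *
 *     leading_bytes     = 15 * 32/8       = 60
 *     inner_loop_offset = (SRC + 60) & 31 = 12
 *     extra_needed      = 60 - 12         = 48   (> 32)
 *
 * so two further cachelines are preloaded, at 32*(prefetch_distance+2) and
 * 32*(prefetch_distance+1) from the rounded-down pointer.
 */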
michael@0 215
michael@0 216 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
michael@0 217 .macro preload_middle bpp, base, scratch_holds_offset
michael@0 218 .if bpp > 0
michael@0 219 /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
michael@0 220 .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
michael@0 221 .if scratch_holds_offset
michael@0 222 PF pld, [base, SCRATCH]
michael@0 223 .else
michael@0 224 PF bic, SCRATCH, base, #31
michael@0 225 PF pld, [SCRATCH, #32*prefetch_distance]
michael@0 226 .endif
michael@0 227 .endif
michael@0 228 .endif
michael@0 229 .endm
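/* Worked example: for a 32bpp source and a 32bpp destination the group size
 * is 256/128*32/32 = 2, and IS_END_OF_GROUP(SUBBLOCK,2) is true for
 * SUBBLOCK = 1, 3, 5, ... - one preload for every two 16-byte STMs, i.e.
 * one 32-byte prefetch per 32 bytes of source consumed. For a 16bpp source
 * and a 32bpp destination the group size is 4 and the preload fires only on
 * SUBBLOCK = 3, 7, ...
 */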
michael@0 230
michael@0 231 .macro preload_trailing bpp, bpp_shift, base
michael@0 232 .if bpp > 0
michael@0 233 .if bpp*pix_per_block > 256
michael@0 234 /* Calculations are more complex if more than one fetch per block */
michael@0 235 PF and, WK1, base, #31
michael@0 236 PF add, WK1, WK1, WK0, lsl #bpp_shift
michael@0 237 PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
michael@0 238 PF bic, SCRATCH, base, #31
michael@0 239 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
michael@0 240 PF add, SCRATCH, SCRATCH, #32
michael@0 241 PF subs, WK1, WK1, #32
michael@0 242 PF bhi, 80b
michael@0 243 .else
michael@0 244 /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
michael@0 245 PF mov, SCRATCH, base, lsl #32-5
michael@0 246 PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
michael@0 247 PF adceqs, SCRATCH, SCRATCH, #0
michael@0 248 /* The instruction above has two effects: ensures Z is only
michael@0 249 * set if C was clear (so Z indicates that both shifted quantities
michael@0 250 * were 0), and clears C if Z was set (so C indicates that the sum
michael@0 251 * of the shifted quantities was strictly greater than 32) */
michael@0 252 PF beq, 82f
michael@0 253 PF bic, SCRATCH, base, #31
michael@0 254 PF bcc, 81f
michael@0 255 PF pld, [SCRATCH, #32*(prefetch_distance+2)]
michael@0 256 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
michael@0 257 82:
michael@0 258 .endif
michael@0 259 .endif
michael@0 260 .endm
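/* Informally, in the single-fetch-per-block case: let s = (base & 31) +
 * ((X << bpp_shift) & 31). If s == 0 no extra preload is issued, if
 * 1 <= s <= 32 one extra cacheline is preloaded (at
 * 32*(prefetch_distance+1)), and if s > 32 two are (at +2 and then +1) -
 * which is exactly what the beq/bcc pair above selects.
 */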
michael@0 261
michael@0 262
michael@0 263 .macro preload_line narrow_case, bpp, bpp_shift, base
michael@0 264 /* "narrow_case" - just means that the macro was invoked from the "narrow"
michael@0 265 * code path rather than the "medium" one - because in the narrow case,
michael@0 266 * the row of pixels is known to output no more than 30 bytes, so
michael@0 267 * (assuming the source pixels are no wider than the destination
michael@0 268 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
michael@0 269 * meaning there's no need for a loop.
michael@0 270 * "bpp" - number of bits per pixel in the channel (source, mask or
michael@0 271 * destination) that's being preloaded, or 0 if this channel is not used
michael@0 272 * for reading
michael@0 273 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
michael@0 274 * "base" - base address register of channel to preload (SRC, MASK or DST)
michael@0 275 */
michael@0 276 .if bpp > 0
michael@0 277 .if narrow_case && (bpp <= dst_w_bpp)
michael@0 278 /* In these cases, each line for each channel is in either 1 or 2 cache lines */
michael@0 279 PF bic, WK0, base, #31
michael@0 280 PF pld, [WK0]
michael@0 281 PF add, WK1, base, X, LSL #bpp_shift
michael@0 282 PF sub, WK1, WK1, #1
michael@0 283 PF bic, WK1, WK1, #31
michael@0 284 PF cmp, WK1, WK0
michael@0 285 PF beq, 90f
michael@0 286 PF pld, [WK1]
michael@0 287 90:
michael@0 288 .else
michael@0 289 PF bic, WK0, base, #31
michael@0 290 PF pld, [WK0]
michael@0 291 PF add, WK1, base, X, lsl #bpp_shift
michael@0 292 PF sub, WK1, WK1, #1
michael@0 293 PF bic, WK1, WK1, #31
michael@0 294 PF cmp, WK1, WK0
michael@0 295 PF beq, 92f
michael@0 296 91: PF add, WK0, WK0, #32
michael@0 297 PF cmp, WK0, WK1
michael@0 298 PF pld, [WK0]
michael@0 299 PF bne, 91b
michael@0 300 92:
michael@0 301 .endif
michael@0 302 .endif
michael@0 303 .endm
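/* Example (illustrative): in the narrow case with a 32bpp channel, 6 pixels
 * starting 20 bytes into a cacheline span bytes 20..43 of that
 * cacheline-aligned block, so WK0 and WK1 round down to two different
 * cachelines and both pld instructions are issued; had the 24 bytes sat
 * entirely within one cacheline, the beq would have skipped the second pld.
 */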
michael@0 304
michael@0 305
michael@0 306 .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
michael@0 307 process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
michael@0 308 .if decrementx
michael@0 309 sub&cond X, X, #8*numbytes/dst_w_bpp
michael@0 310 .endif
michael@0 311 process_tail cond, numbytes, firstreg
michael@0 312 .if !((flags) & FLAG_PROCESS_DOES_STORE)
michael@0 313 pixst cond, numbytes, firstreg, DST
michael@0 314 .endif
michael@0 315 .endm
michael@0 316
michael@0 317 .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
michael@0 318 .if (flags) & FLAG_BRANCH_OVER
michael@0 319 .ifc cond,mi
michael@0 320 bpl 100f
michael@0 321 .endif
michael@0 322 .ifc cond,cs
michael@0 323 bcc 100f
michael@0 324 .endif
michael@0 325 .ifc cond,ne
michael@0 326 beq 100f
michael@0 327 .endif
michael@0 328 conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
michael@0 329 100:
michael@0 330 .else
michael@0 331 conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
michael@0 332 .endif
michael@0 333 .endm
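/* For example, "conditional_process1 cs, ..." either runs the whole
 * head/decrement/tail/store sequence with a "cs" condition attached to each
 * instruction, or - when FLAG_BRANCH_OVER is set - emits a single "bcc 100f"
 * followed by an unconditional sequence, whichever the function was tuned
 * for.
 */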
michael@0 334
michael@0 335 .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
michael@0 336 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
michael@0 337 /* Can't interleave reads and writes */
michael@0 338 test
michael@0 339 conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
michael@0 340 .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
michael@0 341 test
michael@0 342 .endif
michael@0 343 conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
michael@0 344 .else
michael@0 345 /* Can interleave reads and writes for better scheduling */
michael@0 346 test
michael@0 347 process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
michael@0 348 process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
michael@0 349 .if decrementx
michael@0 350 sub&cond1 X, X, #8*numbytes1/dst_w_bpp
michael@0 351 sub&cond2 X, X, #8*numbytes2/dst_w_bpp
michael@0 352 .endif
michael@0 353 process_tail cond1, numbytes1, firstreg1
michael@0 354 process_tail cond2, numbytes2, firstreg2
michael@0 355 pixst cond1, numbytes1, firstreg1, DST
michael@0 356 pixst cond2, numbytes2, firstreg2, DST
michael@0 357 .endif
michael@0 358 .endm
michael@0 359
michael@0 360
michael@0 361 .macro test_bits_1_0_ptr
michael@0 362 movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
michael@0 363 .endm
michael@0 364
michael@0 365 .macro test_bits_3_2_ptr
michael@0 366 movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
michael@0 367 .endm
michael@0 368
michael@0 369 .macro leading_15bytes process_head, process_tail
michael@0 370 /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
michael@0 371 /* Use unaligned loads in all cases for simplicity */
michael@0 372 .if dst_w_bpp == 8
michael@0 373 conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
michael@0 374 .elseif dst_w_bpp == 16
michael@0 375 test_bits_1_0_ptr
michael@0 376 conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
michael@0 377 .endif
michael@0 378 conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
michael@0 379 .endm
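/* Worked example: with an 8bpp destination and 7 bytes needed to reach
 * 16-byte alignment (WK0 low bits = 0b0111), test_bits_1_0_ptr sets N from
 * bit 0 and C from bit 1, so 1 byte (mi) and 2 bytes (cs) are processed;
 * test_bits_3_2_ptr then sets N from bit 2 and C from bit 3, so a further
 * 4 bytes (mi) but not 8 (cs) are processed - 7 bytes in total, as
 * required.
 */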
michael@0 380
michael@0 381 .macro test_bits_3_2_pix
michael@0 382 movs SCRATCH, X, lsl #dst_bpp_shift+32-3
michael@0 383 .endm
michael@0 384
michael@0 385 .macro test_bits_1_0_pix
michael@0 386 .if dst_w_bpp == 8
michael@0 387 movs SCRATCH, X, lsl #dst_bpp_shift+32-1
michael@0 388 .else
michael@0 389 movs SCRATCH, X, lsr #1
michael@0 390 .endif
michael@0 391 .endm
michael@0 392
michael@0 393 .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
michael@0 394 conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
michael@0 395 .if dst_w_bpp == 16
michael@0 396 test_bits_1_0_pix
michael@0 397 conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
michael@0 398 .elseif dst_w_bpp == 8
michael@0 399 conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
michael@0 400 .endif
michael@0 401 .endm
michael@0 402
michael@0 403
michael@0 404 .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
michael@0 405 110:
michael@0 406 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
michael@0 407 .rept pix_per_block*dst_w_bpp/128
michael@0 408 process_head , 16, 0, unaligned_src, unaligned_mask, 1
michael@0 409 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
michael@0 410 preload_middle src_bpp, SRC, 1
michael@0 411 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
michael@0 412 preload_middle mask_bpp, MASK, 1
michael@0 413 .else
michael@0 414 preload_middle src_bpp, SRC, 0
michael@0 415 preload_middle mask_bpp, MASK, 0
michael@0 416 .endif
michael@0 417 .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
michael@0 418 /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
michael@0 419 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
michael@0 420 * preloads for, to achieve staggered prefetches for multiple channels, because there are
michael@0 421 * always two STMs per prefetch, so there is always an opposite STM on which to put the
michael@0 422 * preload. Note, no need to BIC the base register here */
michael@0 423 PF pld, [DST, #32*prefetch_distance - dst_alignment]
michael@0 424 .endif
michael@0 425 process_tail , 16, 0
michael@0 426 .if !((flags) & FLAG_PROCESS_DOES_STORE)
michael@0 427 pixst , 16, 0, DST
michael@0 428 .endif
michael@0 429 .set SUBBLOCK, SUBBLOCK+1
michael@0 430 .endr
michael@0 431 subs X, X, #pix_per_block
michael@0 432 bhs 110b
michael@0 433 .endm
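/* The .rept count is the number of 16-byte head/tail/store groups per
 * prefetch block. As an illustration, with a 32bpp source and a 32bpp
 * destination pix_per_block is 8, so each pass of the 110 loop performs
 * 8*32/128 = 2 such groups, issuing the source preload on SUBBLOCK 1 (per
 * preload_middle) and, when the destination is also read, the destination
 * preload on SUBBLOCK 0.
 */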
michael@0 434
michael@0 435 .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
michael@0 436 /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
michael@0 437 .if dst_r_bpp > 0
michael@0 438 tst DST, #16
michael@0 439 bne 111f
michael@0 440 process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
michael@0 441 b 112f
michael@0 442 111:
michael@0 443 .endif
michael@0 444 process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
michael@0 445 112:
michael@0 446 /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
michael@0 447 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
michael@0 448 PF and, WK0, X, #pix_per_block-1
michael@0 449 .endif
michael@0 450 preload_trailing src_bpp, src_bpp_shift, SRC
michael@0 451 preload_trailing mask_bpp, mask_bpp_shift, MASK
michael@0 452 preload_trailing dst_r_bpp, dst_bpp_shift, DST
michael@0 453 add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
michael@0 454 /* The remainder of the line is handled identically to the medium case */
michael@0 455 medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
michael@0 456 .endm
michael@0 457
michael@0 458 .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
michael@0 459 120:
michael@0 460 process_head , 16, 0, unaligned_src, unaligned_mask, 0
michael@0 461 process_tail , 16, 0
michael@0 462 .if !((flags) & FLAG_PROCESS_DOES_STORE)
michael@0 463 pixst , 16, 0, DST
michael@0 464 .endif
michael@0 465 subs X, X, #128/dst_w_bpp
michael@0 466 bhs 120b
michael@0 467 /* Trailing pixels */
michael@0 468 tst X, #128/dst_w_bpp - 1
michael@0 469 beq exit_label
michael@0 470 trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
michael@0 471 .endm
michael@0 472
michael@0 473 .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
michael@0 474 tst X, #16*8/dst_w_bpp
michael@0 475 conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
michael@0 476 /* Trailing pixels */
michael@0 477 /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
michael@0 478 trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
michael@0 479 .endm
michael@0 480
michael@0 481 .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
michael@0 482 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
michael@0 483 .if mask_bpp == 8 || mask_bpp == 16
michael@0 484 tst MASK, #3
michael@0 485 bne 141f
michael@0 486 .endif
michael@0 487 .if src_bpp == 8 || src_bpp == 16
michael@0 488 tst SRC, #3
michael@0 489 bne 140f
michael@0 490 .endif
michael@0 491 action process_head, process_tail, process_inner_loop, exit_label, 0, 0
michael@0 492 .if src_bpp == 8 || src_bpp == 16
michael@0 493 b exit_label
michael@0 494 140:
michael@0 495 action process_head, process_tail, process_inner_loop, exit_label, 1, 0
michael@0 496 .endif
michael@0 497 .if mask_bpp == 8 || mask_bpp == 16
michael@0 498 b exit_label
michael@0 499 141:
michael@0 500 .if src_bpp == 8 || src_bpp == 16
michael@0 501 tst SRC, #3
michael@0 502 bne 142f
michael@0 503 .endif
michael@0 504 action process_head, process_tail, process_inner_loop, exit_label, 0, 1
michael@0 505 .if src_bpp == 8 || src_bpp == 16
michael@0 506 b exit_label
michael@0 507 142:
michael@0 508 action process_head, process_tail, process_inner_loop, exit_label, 1, 1
michael@0 509 .endif
michael@0 510 .endif
michael@0 511 .endm
michael@0 512
michael@0 513
michael@0 514 .macro end_of_line restore_x, vars_spilled, loop_label, last_one
michael@0 515 .if vars_spilled
michael@0 516 /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive. */
michael@0 517 /* This is ldmia sp,{} */
michael@0 518 .word 0xE89D0000 | LINE_SAVED_REGS
michael@0 519 .endif
michael@0 520 subs Y, Y, #1
michael@0 521 .if vars_spilled
michael@0 522 .if (LINE_SAVED_REGS) & (1<<1)
michael@0 523 str Y, [sp]
michael@0 524 .endif
michael@0 525 .endif
michael@0 526 add DST, DST, STRIDE_D
michael@0 527 .if src_bpp > 0
michael@0 528 add SRC, SRC, STRIDE_S
michael@0 529 .endif
michael@0 530 .if mask_bpp > 0
michael@0 531 add MASK, MASK, STRIDE_M
michael@0 532 .endif
michael@0 533 .if restore_x
michael@0 534 mov X, ORIG_W
michael@0 535 .endif
michael@0 536 bhs loop_label
michael@0 537 .ifc "last_one",""
michael@0 538 .if vars_spilled
michael@0 539 b 197f
michael@0 540 .else
michael@0 541 b 198f
michael@0 542 .endif
michael@0 543 .else
michael@0 544 .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
michael@0 545 b 198f
michael@0 546 .endif
michael@0 547 .endif
michael@0 548 .endm
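/* The hand-assembled .word values are needed because the register list is
 * only available as the LINE_SAVED_REGS bitmask. As an example, if
 * line_saved_regs had been given STRIDE_D and ORIG_W (r3 and r14),
 * LINE_SAVED_REGS is (1<<3)|(1<<14) = 0x4008, the restore above becomes
 * 0xE89D4008, i.e. "ldmia sp, {r3, lr}", and the matching spill emitted
 * before the line loop (0xE92D0000 | LINE_SAVED_REGS) becomes 0xE92D4008,
 * i.e. "stmdb sp!, {r3, lr}".
 */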
michael@0 549
michael@0 550
michael@0 551 .macro generate_composite_function fname, \
michael@0 552 src_bpp_, \
michael@0 553 mask_bpp_, \
michael@0 554 dst_w_bpp_, \
michael@0 555 flags_, \
michael@0 556 prefetch_distance_, \
michael@0 557 init, \
michael@0 558 newline, \
michael@0 559 cleanup, \
michael@0 560 process_head, \
michael@0 561 process_tail, \
michael@0 562 process_inner_loop
michael@0 563
michael@0 564 .func fname
michael@0 565 .global fname
michael@0 566 /* For ELF format also set function visibility to hidden */
michael@0 567 #ifdef __ELF__
michael@0 568 .hidden fname
michael@0 569 .type fname, %function
michael@0 570 #endif
michael@0 571
michael@0 572 /*
michael@0 573 * Make some macro arguments globally visible and accessible
michael@0 574 * from other macros
michael@0 575 */
michael@0 576 .set src_bpp, src_bpp_
michael@0 577 .set mask_bpp, mask_bpp_
michael@0 578 .set dst_w_bpp, dst_w_bpp_
michael@0 579 .set flags, flags_
michael@0 580 .set prefetch_distance, prefetch_distance_
michael@0 581
michael@0 582 /*
michael@0 583 * Select prefetch type for this function.
michael@0 584 */
michael@0 585 .if prefetch_distance == 0
michael@0 586 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
michael@0 587 .else
michael@0 588 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
michael@0 589 .endif
michael@0 590
michael@0 591 .if src_bpp == 32
michael@0 592 .set src_bpp_shift, 2
michael@0 593 .elseif src_bpp == 24
michael@0 594 .set src_bpp_shift, 0
michael@0 595 .elseif src_bpp == 16
michael@0 596 .set src_bpp_shift, 1
michael@0 597 .elseif src_bpp == 8
michael@0 598 .set src_bpp_shift, 0
michael@0 599 .elseif src_bpp == 0
michael@0 600 .set src_bpp_shift, -1
michael@0 601 .else
michael@0 602 .error "requested src bpp (src_bpp) is not supported"
michael@0 603 .endif
michael@0 604
michael@0 605 .if mask_bpp == 32
michael@0 606 .set mask_bpp_shift, 2
michael@0 607 .elseif mask_bpp == 24
michael@0 608 .set mask_bpp_shift, 0
michael@0 609 .elseif mask_bpp == 8
michael@0 610 .set mask_bpp_shift, 0
michael@0 611 .elseif mask_bpp == 0
michael@0 612 .set mask_bpp_shift, -1
michael@0 613 .else
michael@0 614 .error "requested mask bpp (mask_bpp) is not supported"
michael@0 615 .endif
michael@0 616
michael@0 617 .if dst_w_bpp == 32
michael@0 618 .set dst_bpp_shift, 2
michael@0 619 .elseif dst_w_bpp == 24
michael@0 620 .set dst_bpp_shift, 0
michael@0 621 .elseif dst_w_bpp == 16
michael@0 622 .set dst_bpp_shift, 1
michael@0 623 .elseif dst_w_bpp == 8
michael@0 624 .set dst_bpp_shift, 0
michael@0 625 .else
michael@0 626 .error "requested dst bpp (dst_w_bpp) is not supported"
michael@0 627 .endif
michael@0 628
michael@0 629 .if (((flags) & FLAG_DST_READWRITE) != 0)
michael@0 630 .set dst_r_bpp, dst_w_bpp
michael@0 631 .else
michael@0 632 .set dst_r_bpp, 0
michael@0 633 .endif
michael@0 634
michael@0 635 .set pix_per_block, 16*8/dst_w_bpp
michael@0 636 .if src_bpp != 0
michael@0 637 .if 32*8/src_bpp > pix_per_block
michael@0 638 .set pix_per_block, 32*8/src_bpp
michael@0 639 .endif
michael@0 640 .endif
michael@0 641 .if mask_bpp != 0
michael@0 642 .if 32*8/mask_bpp > pix_per_block
michael@0 643 .set pix_per_block, 32*8/mask_bpp
michael@0 644 .endif
michael@0 645 .endif
michael@0 646 .if dst_r_bpp != 0
michael@0 647 .if 32*8/dst_r_bpp > pix_per_block
michael@0 648 .set pix_per_block, 32*8/dst_r_bpp
michael@0 649 .endif
michael@0 650 .endif
michael@0 651
michael@0 652 /* The standard entry conditions set up by pixman-arm-common.h are:
michael@0 653 * r0 = width (pixels)
michael@0 654 * r1 = height (rows)
michael@0 655 * r2 = pointer to top-left pixel of destination
michael@0 656 * r3 = destination stride (pixels)
michael@0 657 * [sp] = source pixel value, or pointer to top-left pixel of source
michael@0 658 * [sp,#4] = 0 or source stride (pixels)
michael@0 659 * The following arguments are unused for non-mask operations
michael@0 660 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
michael@0 661 * [sp,#12] = 0 or mask stride (pixels)
michael@0 662 */
michael@0 663
michael@0 664 /*
michael@0 665 * Assign symbolic names to registers
michael@0 666 */
michael@0 667 X .req r0 /* pixels to go on this line */
michael@0 668 Y .req r1 /* lines to go */
michael@0 669 DST .req r2 /* destination pixel pointer */
michael@0 670 STRIDE_D .req r3 /* destination stride (bytes, minus width) */
michael@0 671 SRC .req r4 /* source pixel pointer */
michael@0 672 STRIDE_S .req r5 /* source stride (bytes, minus width) */
michael@0 673 MASK .req r6 /* mask pixel pointer (if applicable) */
michael@0 674 STRIDE_M .req r7 /* mask stride (bytes, minus width) */
michael@0 675 WK0 .req r8 /* pixel data registers */
michael@0 676 WK1 .req r9
michael@0 677 WK2 .req r10
michael@0 678 WK3 .req r11
michael@0 679 SCRATCH .req r12
michael@0 680 ORIG_W .req r14 /* width (pixels) */
michael@0 681
michael@0 682 fname:
michael@0 683 .fnstart
michael@0 684 .save {r4-r11, lr}
michael@0 685 push {r4-r11, lr} /* save all registers */
michael@0 686
michael@0 687 subs Y, Y, #1
michael@0 688 blo 199f
michael@0 689
michael@0 690 #ifdef DEBUG_PARAMS
michael@0 691 .pad #9*4
michael@0 692 sub sp, sp, #9*4
michael@0 693 #endif
michael@0 694
michael@0 695 .if src_bpp > 0
michael@0 696 ldr SRC, [sp, #ARGS_STACK_OFFSET]
michael@0 697 ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
michael@0 698 .endif
michael@0 699 .if mask_bpp > 0
michael@0 700 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
michael@0 701 ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
michael@0 702 .endif
michael@0 703
michael@0 704 #ifdef DEBUG_PARAMS
michael@0 705 add Y, Y, #1
michael@0 706 stmia sp, {r0-r7,pc}
michael@0 707 sub Y, Y, #1
michael@0 708 #endif
michael@0 709
michael@0 710 init
michael@0 711
michael@0 712 lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
michael@0 713 sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
michael@0 714 .if src_bpp > 0
michael@0 715 lsl STRIDE_S, #src_bpp_shift
michael@0 716 sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
michael@0 717 .endif
michael@0 718 .if mask_bpp > 0
michael@0 719 lsl STRIDE_M, #mask_bpp_shift
michael@0 720 sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
michael@0 721 .endif
michael@0 722
michael@0 723 /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
michael@0 724 cmp X, #2*16*8/dst_w_bpp - 1
michael@0 725 blo 170f
michael@0 726 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
michael@0 727 /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
michael@0 728 cmp X, #(prefetch_distance+3)*pix_per_block - 1
michael@0 729 blo 160f
michael@0 730
michael@0 731 /* Wide case */
michael@0 732 /* Adjust X so that the decrement instruction can also test for
michael@0 733 * inner loop termination. We want it to stop when there are
michael@0 734 * (prefetch_distance+1) complete blocks to go. */
michael@0 735 sub X, X, #(prefetch_distance+2)*pix_per_block
michael@0 736 mov ORIG_W, X
michael@0 737 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
michael@0 738 /* This is stmdb sp!,{} */
michael@0 739 .word 0xE92D0000 | LINE_SAVED_REGS
michael@0 740 .endif
michael@0 741 151: /* New line */
michael@0 742 newline
michael@0 743 preload_leading_step1 src_bpp, WK1, SRC
michael@0 744 preload_leading_step1 mask_bpp, WK2, MASK
michael@0 745 preload_leading_step1 dst_r_bpp, WK3, DST
michael@0 746
michael@0 747 tst DST, #15
michael@0 748 beq 154f
michael@0 749 rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
michael@0 750 .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
michael@0 751 PF and, WK0, WK0, #15
michael@0 752 .endif
michael@0 753
michael@0 754 preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
michael@0 755 preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
michael@0 756 preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
michael@0 757
michael@0 758 leading_15bytes process_head, process_tail
michael@0 759
michael@0 760 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
michael@0 761 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
michael@0 762 and SCRATCH, SRC, #31
michael@0 763 rsb SCRATCH, SCRATCH, #32*prefetch_distance
michael@0 764 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
michael@0 765 and SCRATCH, MASK, #31
michael@0 766 rsb SCRATCH, SCRATCH, #32*prefetch_distance
michael@0 767 .endif
michael@0 768 .ifc "process_inner_loop",""
michael@0 769 switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
michael@0 770 .else
michael@0 771 switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
michael@0 772 .endif
michael@0 773
michael@0 774 157: /* Check for another line */
michael@0 775 end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
michael@0 776 .endif
michael@0 777
michael@0 778 .ltorg
michael@0 779
michael@0 780 160: /* Medium case */
michael@0 781 mov ORIG_W, X
michael@0 782 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
michael@0 783 /* This is stmdb sp!,{} */
michael@0 784 .word 0xE92D0000 | LINE_SAVED_REGS
michael@0 785 .endif
michael@0 786 161: /* New line */
michael@0 787 newline
michael@0 788 preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
michael@0 789 preload_line 0, mask_bpp, mask_bpp_shift, MASK
michael@0 790 preload_line 0, dst_r_bpp, dst_bpp_shift, DST
michael@0 791
michael@0 792 sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
michael@0 793 tst DST, #15
michael@0 794 beq 164f
michael@0 795 rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
michael@0 796
michael@0 797 leading_15bytes process_head, process_tail
michael@0 798
michael@0 799 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
michael@0 800 switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
michael@0 801
michael@0 802 167: /* Check for another line */
michael@0 803 end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
michael@0 804
michael@0 805 .ltorg
michael@0 806
michael@0 807 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
michael@0 808 .if dst_w_bpp < 32
michael@0 809 mov ORIG_W, X
michael@0 810 .endif
michael@0 811 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
michael@0 812 /* This is stmdb sp!,{} */
michael@0 813 .word 0xE92D0000 | LINE_SAVED_REGS
michael@0 814 .endif
michael@0 815 171: /* New line */
michael@0 816 newline
michael@0 817 preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
michael@0 818 preload_line 1, mask_bpp, mask_bpp_shift, MASK
michael@0 819 preload_line 1, dst_r_bpp, dst_bpp_shift, DST
michael@0 820
michael@0 821 .if dst_w_bpp == 8
michael@0 822 tst DST, #3
michael@0 823 beq 174f
michael@0 824 172: subs X, X, #1
michael@0 825 blo 177f
michael@0 826 process_head , 1, 0, 1, 1, 0
michael@0 827 process_tail , 1, 0
michael@0 828 .if !((flags) & FLAG_PROCESS_DOES_STORE)
michael@0 829 pixst , 1, 0, DST
michael@0 830 .endif
michael@0 831 tst DST, #3
michael@0 832 bne 172b
michael@0 833 .elseif dst_w_bpp == 16
michael@0 834 tst DST, #2
michael@0 835 beq 174f
michael@0 836 subs X, X, #1
michael@0 837 blo 177f
michael@0 838 process_head , 2, 0, 1, 1, 0
michael@0 839 process_tail , 2, 0
michael@0 840 .if !((flags) & FLAG_PROCESS_DOES_STORE)
michael@0 841 pixst , 2, 0, DST
michael@0 842 .endif
michael@0 843 .endif
michael@0 844
michael@0 845 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
michael@0 846 switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
michael@0 847
michael@0 848 177: /* Check for another line */
michael@0 849 end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
michael@0 850
michael@0 851 197:
michael@0 852 .if (flags) & FLAG_SPILL_LINE_VARS
michael@0 853 add sp, sp, #LINE_SAVED_REG_COUNT*4
michael@0 854 .endif
michael@0 855 198:
michael@0 856 cleanup
michael@0 857
michael@0 858 #ifdef DEBUG_PARAMS
michael@0 859 add sp, sp, #9*4 /* junk the debug copy of arguments */
michael@0 860 #endif
michael@0 861 199:
michael@0 862 pop {r4-r11, pc} /* exit */
michael@0 863 .fnend
michael@0 864
michael@0 865 .ltorg
michael@0 866
michael@0 867 .unreq X
michael@0 868 .unreq Y
michael@0 869 .unreq DST
michael@0 870 .unreq STRIDE_D
michael@0 871 .unreq SRC
michael@0 872 .unreq STRIDE_S
michael@0 873 .unreq MASK
michael@0 874 .unreq STRIDE_M
michael@0 875 .unreq WK0
michael@0 876 .unreq WK1
michael@0 877 .unreq WK2
michael@0 878 .unreq WK3
michael@0 879 .unreq SCRATCH
michael@0 880 .unreq ORIG_W
michael@0 881 .endfunc
michael@0 882 .endm
michael@0 883
michael@0 884 .macro line_saved_regs x:vararg
michael@0 885 .set LINE_SAVED_REGS, 0
michael@0 886 .set LINE_SAVED_REG_COUNT, 0
michael@0 887 .irp SAVED_REG,x
michael@0 888 .ifc "SAVED_REG","Y"
michael@0 889 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
michael@0 890 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
michael@0 891 .endif
michael@0 892 .ifc "SAVED_REG","STRIDE_D"
michael@0 893 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
michael@0 894 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
michael@0 895 .endif
michael@0 896 .ifc "SAVED_REG","STRIDE_S"
michael@0 897 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
michael@0 898 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
michael@0 899 .endif
michael@0 900 .ifc "SAVED_REG","STRIDE_M"
michael@0 901 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
michael@0 902 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
michael@0 903 .endif
michael@0 904 .ifc "SAVED_REG","ORIG_W"
michael@0 905 .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
michael@0 906 .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
michael@0 907 .endif
michael@0 908 .endr
michael@0 909 .endm
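/* Example: a .S file built on these macros might declare
 *
 *     line_saved_regs STRIDE_D, ORIG_W
 *
 * which sets LINE_SAVED_REGS to (1<<3)|(1<<14) (r3 and r14, matching the
 * .req assignments above) and LINE_SAVED_REG_COUNT to 2, so the spill and
 * restore words in end_of_line cover exactly those two registers and the
 * "add sp, sp, #LINE_SAVED_REG_COUNT*4" at label 197 releases 8 bytes.
 */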
michael@0 910
michael@0 911 .macro nop_macro x:vararg
michael@0 912 .endm
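/* A minimal sketch (all names hypothetical) of how a .S file instantiates a
 * function from this header: define process_head/process_tail in terms of
 * pixld/pixst, then invoke generate_composite_function, using nop_macro for
 * the hooks that are not needed:
 *
 *     .macro example_src_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 *         pixld cond, numbytes, firstreg, SRC, unaligned_src
 *     .endm
 *
 *     .macro example_src_8888_8888_process_tail cond, numbytes, firstreg
 *     .endm
 *
 *     generate_composite_function \
 *         example_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
 *         FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
 *         2, \
 *         nop_macro, \
 *         nop_macro, \
 *         nop_macro, \
 *         example_src_8888_8888_process_head, \
 *         example_src_8888_8888_process_tail
 */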
