gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

changeset 6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,912 @@
     1.4 +/*
     1.5 + * Copyright © 2012 Raspberry Pi Foundation
     1.6 + * Copyright © 2012 RISC OS Open Ltd
     1.7 + *
     1.8 + * Permission to use, copy, modify, distribute, and sell this software and its
     1.9 + * documentation for any purpose is hereby granted without fee, provided that
    1.10 + * the above copyright notice appear in all copies and that both that
    1.11 + * copyright notice and this permission notice appear in supporting
    1.12 + * documentation, and that the name of the copyright holders not be used in
    1.13 + * advertising or publicity pertaining to distribution of the software without
    1.14 + * specific, written prior permission.  The copyright holders make no
    1.15 + * representations about the suitability of this software for any purpose.  It
    1.16 + * is provided "as is" without express or implied warranty.
    1.17 + *
    1.18 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    1.19 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.20 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    1.21 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    1.22 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    1.23 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    1.24 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    1.25 + * SOFTWARE.
    1.26 + *
    1.27 + * Author:  Ben Avison (bavison@riscosopen.org)
    1.28 + *
    1.29 + */
    1.30 +
    1.31 +/*
    1.32 + * Because the alignment of pixel data to cachelines, and even the number of
     1.33 + * cachelines per row, can vary from row to row, and because of the need to
    1.34 + * preload each scanline once and only once, this prefetch strategy treats
    1.35 + * each row of pixels independently. When a pixel row is long enough, there
    1.36 + * are three distinct phases of prefetch:
    1.37 + * * an inner loop section, where each time a cacheline of data is
    1.38 + *    processed, another cacheline is preloaded (the exact distance ahead is
    1.39 + *    determined empirically using profiling results from lowlevel-blt-bench)
    1.40 + * * a leading section, where enough cachelines are preloaded to ensure no
    1.41 + *    cachelines escape being preloaded when the inner loop starts
    1.42 + * * a trailing section, where a limited number (0 or more) of cachelines
    1.43 + *    are preloaded to deal with data (if any) that hangs off the end of the
    1.44 + *    last iteration of the inner loop, plus any trailing bytes that were not
    1.45 + *    enough to make up one whole iteration of the inner loop
    1.46 + * 
     1.47 + * There are (in general) three distinct code paths, selected according
     1.48 + * to how long the pixel row is. If it is long enough that there
    1.49 + * is at least one iteration of the inner loop (as described above) then
    1.50 + * this is described as the "wide" case. If it is shorter than that, but
    1.51 + * there are still enough bytes output that there is at least one 16-byte-
    1.52 + * long, 16-byte-aligned write to the destination (the optimum type of
    1.53 + * write), then this is the "medium" case. If it is not even this long, then
    1.54 + * this is the "narrow" case, and there is no attempt to align writes to
    1.55 + * 16-byte boundaries. In the "medium" and "narrow" cases, all the
    1.56 + * cachelines containing data from the pixel row are prefetched up-front.
    1.57 + */
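
/* A worked example of the three-way split, using purely illustrative
 * parameters (a 32bpp source, 32bpp write-only destination and
 * prefetch_distance = 2 - none of these are fixed by this header):
 * pix_per_block is then max(16*8/32, 32*8/32) = 8 pixels, and the
 * comparisons in generate_composite_function below reduce to: */
.set EXAMPLE_NARROW_LIMIT, 2*16*8/32 - 1 /* rows of under 7 pixels take the narrow path */
.set EXAMPLE_WIDE_LIMIT,   (2+3)*8 - 1   /* rows of 39 or more pixels take the wide path */
                                         /* anything in between is the medium case */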
    1.58 +
    1.59 +/*
    1.60 + * Determine whether we put the arguments on the stack for debugging.
    1.61 + */
    1.62 +#undef DEBUG_PARAMS
    1.63 +
    1.64 +/*
     1.65 + * Bit flags for the 'generate_composite_function' macro, which are used
     1.66 + * to tune the generated function's behavior.
    1.67 + */
    1.68 +.set FLAG_DST_WRITEONLY,         0
    1.69 +.set FLAG_DST_READWRITE,         1
    1.70 +.set FLAG_COND_EXEC,             0
    1.71 +.set FLAG_BRANCH_OVER,           2
    1.72 +.set FLAG_PROCESS_PRESERVES_PSR, 0
    1.73 +.set FLAG_PROCESS_CORRUPTS_PSR,  4
    1.74 +.set FLAG_PROCESS_DOESNT_STORE,  0
    1.75 +.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
    1.76 +.set FLAG_NO_SPILL_LINE_VARS,        0
    1.77 +.set FLAG_SPILL_LINE_VARS_WIDE,      16
    1.78 +.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
    1.79 +.set FLAG_SPILL_LINE_VARS,           48
    1.80 +.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
    1.81 +.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
    1.82 +
    1.83 +/*
    1.84 + * Offset into stack where mask and source pointer/stride can be accessed.
    1.85 + */
    1.86 +#ifdef DEBUG_PARAMS
    1.87 +.set ARGS_STACK_OFFSET,        (9*4+9*4)
    1.88 +#else
    1.89 +.set ARGS_STACK_OFFSET,        (9*4)
    1.90 +#endif
    1.91 +
    1.92 +/*
     1.93 + * Constants for selecting the preferred prefetch type.
    1.94 + */
    1.95 +.set PREFETCH_TYPE_NONE,       0
    1.96 +.set PREFETCH_TYPE_STANDARD,   1
    1.97 +
    1.98 +/*
    1.99 + * Definitions of macros for load/store of pixel data.
   1.100 + */
   1.101 +
   1.102 +.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
   1.103 + .if numbytes == 16
   1.104 +  .if unaligned == 1
   1.105 +        op&r&cond    WK&reg0, [base], #4
   1.106 +        op&r&cond    WK&reg1, [base], #4
   1.107 +        op&r&cond    WK&reg2, [base], #4
   1.108 +        op&r&cond    WK&reg3, [base], #4
   1.109 +  .else
   1.110 +        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
   1.111 +  .endif
   1.112 + .elseif numbytes == 8
   1.113 +  .if unaligned == 1
   1.114 +        op&r&cond    WK&reg0, [base], #4
   1.115 +        op&r&cond    WK&reg1, [base], #4
   1.116 +  .else
   1.117 +        op&m&cond&ia base!, {WK&reg0,WK&reg1}
   1.118 +  .endif
   1.119 + .elseif numbytes == 4
   1.120 +        op&r&cond    WK&reg0, [base], #4
   1.121 + .elseif numbytes == 2
   1.122 +        op&r&cond&h  WK&reg0, [base], #2
   1.123 + .elseif numbytes == 1
   1.124 +        op&r&cond&b  WK&reg0, [base], #1
   1.125 + .else
   1.126 +  .error "unsupported size: numbytes"
   1.127 + .endif
   1.128 +.endm
   1.129 +
   1.130 +.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
   1.131 + .if numbytes == 16
   1.132 +        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
   1.133 + .elseif numbytes == 8
   1.134 +        stm&cond&db base, {WK&reg0,WK&reg1}
   1.135 + .elseif numbytes == 4
   1.136 +        str&cond    WK&reg0, [base, #-4]
   1.137 + .elseif numbytes == 2
   1.138 +        str&cond&h  WK&reg0, [base, #-2]
   1.139 + .elseif numbytes == 1
   1.140 +        str&cond&b  WK&reg0, [base, #-1]
   1.141 + .else
   1.142 +  .error "unsupported size: numbytes"
   1.143 + .endif
   1.144 +.endm
   1.145 +
   1.146 +.macro pixld cond, numbytes, firstreg, base, unaligned
   1.147 +        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
   1.148 +.endm
   1.149 +
   1.150 +.macro pixst cond, numbytes, firstreg, base
   1.151 + .if (flags) & FLAG_DST_READWRITE
   1.152 +        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
   1.153 + .else
   1.154 +        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
   1.155 + .endif
   1.156 +.endm
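
/* For illustration only (these expansions are not part of the macro set):
 * with the register names assigned later in this file (WK0-WK3 = r8-r11),
 * an aligned 16-byte load such as
 *         pixld   , 16, 0, SRC, 0
 * expands to a single post-indexed block transfer,
 *         ldmia   SRC!, {r8, r9, r10, r11}
 * while the unaligned form "pixld , 16, 0, SRC, 1" becomes four word
 * loads that rely on ARMv6 unaligned LDR support:
 *         ldr     r8, [SRC], #4
 *         ldr     r9, [SRC], #4
 *         ldr     r10, [SRC], #4
 *         ldr     r11, [SRC], #4
 */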
   1.157 +
   1.158 +.macro PF a, x:vararg
   1.159 + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
   1.160 +        a x
   1.161 + .endif
   1.162 +.endm
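
/* Usage sketch: "PF pld, [SRC, #32]" assembles to "pld [SRC, #32]" in
 * functions generated with a non-zero prefetch distance, and to nothing
 * at all when prefetch_distance == 0, so prefetch sequences can be
 * written unconditionally. */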
   1.163 +
   1.164 +
   1.165 +.macro preload_leading_step1  bpp, ptr, base
   1.166 +/* If the destination is already 16-byte aligned, then we need to preload
   1.167 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
   1.168 + * are no gaps when the inner loop starts.
   1.169 + */
   1.170 + .if bpp > 0
   1.171 +        PF  bic,    ptr, base, #31
   1.172 +  .set OFFSET, 0
   1.173 +  .rept prefetch_distance+1
   1.174 +        PF  pld,    [ptr, #OFFSET]
   1.175 +   .set OFFSET, OFFSET+32
   1.176 +  .endr
   1.177 + .endif
   1.178 +.endm
   1.179 +
   1.180 +.macro preload_leading_step2  bpp, bpp_shift, ptr, base
   1.181 +/* However, if the destination is not 16-byte aligned, we may need to
   1.182 + * preload more cache lines than that. The question we need to ask is:
   1.183 + * are the bytes corresponding to the leading pixels more than the amount
   1.184 + * by which the source pointer will be rounded down for preloading, and if
   1.185 + * so, by how many cache lines? Effectively, we want to calculate
   1.186 + *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
   1.187 + *     inner_loop_offset = (src+leading_bytes)&31
   1.188 + *     extra_needed = leading_bytes - inner_loop_offset
   1.189 + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
   1.190 + * possible when there are 4 src bytes for every 1 dst byte).
   1.191 + */
   1.192 + .if bpp > 0
   1.193 +  .ifc base,DST
   1.194 +        /* The test can be simplified further when preloading the destination */
   1.195 +        PF  tst,    base, #16
   1.196 +        PF  beq,    61f
   1.197 +  .else
   1.198 +   .if bpp/dst_w_bpp == 4
   1.199 +        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
   1.200 +        PF  and,    SCRATCH, SCRATCH, #31
   1.201 +        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
   1.202 +        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
    1.203 +        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets     NC   /  nc   /   Nc   */
   1.204 +        PF  bcs,    61f
   1.205 +        PF  bpl,    60f
   1.206 +        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   1.207 +   .else
   1.208 +        PF  mov,    SCRATCH, base, lsl #32-5
   1.209 +        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
   1.210 +        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
   1.211 +        PF  bls,    61f
   1.212 +   .endif
   1.213 +  .endif
   1.214 +60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
   1.215 +61:
   1.216 + .endif
   1.217 +.endm
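
/* A worked instance of the arithmetic above, with illustrative values:
 * a 32bpp source and 16bpp destination (so bpp/dst_w_bpp == 2), a
 * destination needing (-dst)&15 == 12 leading bytes, and a source with
 * src&31 == 16. Then
 *     leading_bytes     = 12 * 32/16     = 24
 *     inner_loop_offset = (16 + 24) & 31 = 8
 *     extra_needed      = 24 - 8         = 16
 * which lies in (0, 32], so exactly one extra cacheline is preloaded at
 * offset 32*(prefetch_distance+1) - the fall-through to 60: above. */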
   1.218 +
   1.219 +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
   1.220 +.macro preload_middle   bpp, base, scratch_holds_offset
   1.221 + .if bpp > 0
   1.222 +        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
   1.223 +  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   1.224 +   .if scratch_holds_offset
   1.225 +        PF  pld,    [base, SCRATCH]
   1.226 +   .else
   1.227 +        PF  bic,    SCRATCH, base, #31
   1.228 +        PF  pld,    [SCRATCH, #32*prefetch_distance]
   1.229 +   .endif
   1.230 +  .endif
   1.231 + .endif
   1.232 +.endm
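
/* Worked examples of the group-size arithmetic above: with src_bpp equal
 * to dst_w_bpp, SIZE = 256/128*dst_w_bpp/bpp = 2, so IS_END_OF_GROUP is
 * true for SUBBLOCK = 1, 3, 5, 7 - one source preload for every two STMs,
 * one 32-byte source cacheline covering two 16-byte bursts of output.
 * With a source four times as wide as the destination, SIZE evaluates to
 * 0, the "< 2" test fires for every SUBBLOCK, and a preload is issued on
 * every STM. */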
   1.233 +
   1.234 +.macro preload_trailing  bpp, bpp_shift, base
   1.235 + .if bpp > 0
   1.236 +  .if bpp*pix_per_block > 256
   1.237 +        /* Calculations are more complex if more than one fetch per block */
   1.238 +        PF  and,    WK1, base, #31
   1.239 +        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
   1.240 +        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
   1.241 +        PF  bic,    SCRATCH, base, #31
   1.242 +80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
   1.243 +        PF  add,    SCRATCH, SCRATCH, #32
   1.244 +        PF  subs,   WK1, WK1, #32
   1.245 +        PF  bhi,    80b
   1.246 +  .else
   1.247 +        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
   1.248 +        PF  mov,    SCRATCH, base, lsl #32-5
   1.249 +        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
   1.250 +        PF  adceqs, SCRATCH, SCRATCH, #0
    1.251 +        /* The instruction above has two effects: it ensures Z is only
    1.252 +         * set if C was clear (so Z indicates that both shifted quantities
    1.253 +         * were 0), and it clears C if Z was set (so C indicates that the
    1.254 +         * sum of the shifted quantities was strictly greater than 32) */
   1.255 +        PF  beq,    82f
   1.256 +        PF  bic,    SCRATCH, base, #31
   1.257 +        PF  bcc,    81f
   1.258 +        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
   1.259 +81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
   1.260 +82:
   1.261 +  .endif
   1.262 + .endif
   1.263 +.endm
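
/* The three outcomes of the adds/adceqs pair above, enumerated for
 * clarity: if the low five bits of the base address and of the row's
 * byte count are both zero, Z ends up set and both preloads are skipped;
 * if they sum to anywhere from 1 to 32, C ends up clear and only the pld
 * at 81: runs; if they sum to more than 32, C stays set and both plds
 * run. */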
   1.264 +
   1.265 +
   1.266 +.macro preload_line    narrow_case, bpp, bpp_shift, base
    1.267 +/* "narrow_case" - just means that the macro was invoked from the "narrow"
    1.268 + *    code path rather than the "medium" one - because in the narrow case,
    1.269 + *    the row of pixels is known to output no more than 30 bytes, so
    1.270 + *    (assuming the source pixels are no wider than the destination
    1.271 + *    pixels) it cannot possibly straddle more than two 32-byte cachelines,
    1.272 + *    meaning there's no need for a loop.
   1.273 + * "bpp" - number of bits per pixel in the channel (source, mask or
   1.274 + *    destination) that's being preloaded, or 0 if this channel is not used
   1.275 + *    for reading
   1.276 + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
   1.277 + * "base" - base address register of channel to preload (SRC, MASK or DST)
   1.278 + */
   1.279 + .if bpp > 0
   1.280 +  .if narrow_case && (bpp <= dst_w_bpp)
   1.281 +        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
   1.282 +        PF  bic,    WK0, base, #31
   1.283 +        PF  pld,    [WK0]
    1.284 +        PF  add,    WK1, base, X, lsl #bpp_shift
   1.285 +        PF  sub,    WK1, WK1, #1
   1.286 +        PF  bic,    WK1, WK1, #31
   1.287 +        PF  cmp,    WK1, WK0
   1.288 +        PF  beq,    90f
   1.289 +        PF  pld,    [WK1]
   1.290 +90:
   1.291 +  .else
   1.292 +        PF  bic,    WK0, base, #31
   1.293 +        PF  pld,    [WK0]
   1.294 +        PF  add,    WK1, base, X, lsl #bpp_shift
   1.295 +        PF  sub,    WK1, WK1, #1
   1.296 +        PF  bic,    WK1, WK1, #31
   1.297 +        PF  cmp,    WK1, WK0
   1.298 +        PF  beq,    92f
   1.299 +91:     PF  add,    WK0, WK0, #32
   1.300 +        PF  cmp,    WK0, WK1
   1.301 +        PF  pld,    [WK0]
   1.302 +        PF  bne,    91b
   1.303 +92:
   1.304 +  .endif
   1.305 + .endif
   1.306 +.endm
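
/* Illustrative numbers for the general case above: preloading a line of
 * X = 20 pixels at 16bpp (40 bytes) from a base address with base&31 == 22
 * touches byte offsets 22..61 from the start of the first cacheline, so
 * WK0 and WK1 end up one cacheline apart and the loop at 91: issues
 * exactly one pld beyond the initial one. */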
   1.307 +
   1.308 +
   1.309 +.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
   1.310 +        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
   1.311 + .if decrementx
   1.312 +        sub&cond X, X, #8*numbytes/dst_w_bpp
   1.313 + .endif
   1.314 +        process_tail  cond, numbytes, firstreg
   1.315 + .if !((flags) & FLAG_PROCESS_DOES_STORE)
   1.316 +        pixst   cond, numbytes, firstreg, DST
   1.317 + .endif
   1.318 +.endm
   1.319 +
   1.320 +.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
   1.321 + .if (flags) & FLAG_BRANCH_OVER
   1.322 +  .ifc cond,mi
   1.323 +        bpl     100f
   1.324 +  .endif
   1.325 +  .ifc cond,cs
   1.326 +        bcc     100f
   1.327 +  .endif
   1.328 +  .ifc cond,ne
   1.329 +        beq     100f
   1.330 +  .endif
   1.331 +        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
   1.332 +100:
   1.333 + .else
   1.334 +        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
   1.335 + .endif
   1.336 +.endm
   1.337 +
   1.338 +.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
   1.339 + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
   1.340 +        /* Can't interleave reads and writes */
   1.341 +        test
   1.342 +        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
   1.343 +  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
   1.344 +        test
   1.345 +  .endif
   1.346 +        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
   1.347 + .else
   1.348 +        /* Can interleave reads and writes for better scheduling */
   1.349 +        test
   1.350 +        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
   1.351 +        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
   1.352 +  .if decrementx
   1.353 +        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
   1.354 +        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
   1.355 +  .endif
   1.356 +        process_tail  cond1, numbytes1, firstreg1
   1.357 +        process_tail  cond2, numbytes2, firstreg2
   1.358 +        pixst   cond1, numbytes1, firstreg1, DST
   1.359 +        pixst   cond2, numbytes2, firstreg2, DST
   1.360 + .endif
   1.361 +.endm
   1.362 +
   1.363 +
   1.364 +.macro test_bits_1_0_ptr
   1.365 +        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
   1.366 +.endm
   1.367 +
   1.368 +.macro test_bits_3_2_ptr
   1.369 +        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
   1.370 +.endm
   1.371 +
   1.372 +.macro leading_15bytes  process_head, process_tail
   1.373 +        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
   1.374 +        /* Use unaligned loads in all cases for simplicity */
   1.375 + .if dst_w_bpp == 8
   1.376 +        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
   1.377 + .elseif dst_w_bpp == 16
   1.378 +        test_bits_1_0_ptr
   1.379 +        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
   1.380 + .endif
   1.381 +        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
   1.382 +.endm
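
/* Worked example for a 16bpp destination that is 6 bytes below 16-byte
 * alignment (WK0 bits 0-3 = 6 = 0b0110): test_bits_1_0_ptr sets C from
 * bit 1, so one 2-byte pixel is processed first; test_bits_3_2_ptr then
 * sets N from bit 2 and C from bit 3, so the "mi" step handles 4 more
 * bytes and the 8-byte "cs" step is skipped - 6 bytes in total, exactly
 * what is needed to reach alignment. */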
   1.383 +
   1.384 +.macro test_bits_3_2_pix
   1.385 +        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
   1.386 +.endm
   1.387 +
   1.388 +.macro test_bits_1_0_pix
   1.389 + .if dst_w_bpp == 8
   1.390 +        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
   1.391 + .else
   1.392 +        movs    SCRATCH, X, lsr #1
   1.393 + .endif
   1.394 +.endm
   1.395 +
   1.396 +.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
   1.397 +        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
   1.398 + .if dst_w_bpp == 16
   1.399 +        test_bits_1_0_pix
   1.400 +        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
   1.401 + .elseif dst_w_bpp == 8
   1.402 +        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
   1.403 + .endif
   1.404 +.endm
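
/* Worked example: 5 trailing pixels at 16bpp make 10 = 0b1010 bytes.
 * test_bits_3_2_pix sets C from bit 3 and N from bit 2 of the byte
 * count, so the 8-byte "cs" step runs and the 4-byte "mi" step is
 * skipped; then test_bits_1_0_pix (lsr #1) moves bit 0 of X into C, so
 * one final 2-byte pixel is stored - 10 bytes in all. */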
   1.405 +
   1.406 +
   1.407 +.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
   1.408 +110:
   1.409 + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
   1.410 + .rept pix_per_block*dst_w_bpp/128
   1.411 +        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
   1.412 +  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
   1.413 +        preload_middle  src_bpp, SRC, 1
   1.414 +  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
   1.415 +        preload_middle  mask_bpp, MASK, 1
   1.416 +  .else
   1.417 +        preload_middle  src_bpp, SRC, 0
   1.418 +        preload_middle  mask_bpp, MASK, 0
   1.419 +  .endif
   1.420 +  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
   1.421 +        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
   1.422 +         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
   1.423 +         * preloads for, to achieve staggered prefetches for multiple channels, because there are
   1.424 +         * always two STMs per prefetch, so there is always an opposite STM on which to put the
   1.425 +         * preload. Note, no need to BIC the base register here */
   1.426 +        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
   1.427 +  .endif
   1.428 +        process_tail  , 16, 0
   1.429 +  .if !((flags) & FLAG_PROCESS_DOES_STORE)
   1.430 +        pixst   , 16, 0, DST
   1.431 +  .endif
   1.432 +  .set SUBBLOCK, SUBBLOCK+1
   1.433 + .endr
   1.434 +        subs    X, X, #pix_per_block
   1.435 +        bhs     110b
   1.436 +.endm
   1.437 +
   1.438 +.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
   1.439 +        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
   1.440 + .if dst_r_bpp > 0
   1.441 +        tst     DST, #16
   1.442 +        bne     111f
   1.443 +        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
   1.444 +        b       112f
   1.445 +111:
   1.446 + .endif
   1.447 +        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
   1.448 +112:
   1.449 +        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
   1.450 + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
   1.451 +        PF  and,    WK0, X, #pix_per_block-1
   1.452 + .endif
   1.453 +        preload_trailing  src_bpp, src_bpp_shift, SRC
   1.454 +        preload_trailing  mask_bpp, mask_bpp_shift, MASK
   1.455 +        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
   1.456 +        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
   1.457 +        /* The remainder of the line is handled identically to the medium case */
   1.458 +        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
   1.459 +.endm
   1.460 +
   1.461 +.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
   1.462 +120:
   1.463 +        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
   1.464 +        process_tail  , 16, 0
   1.465 + .if !((flags) & FLAG_PROCESS_DOES_STORE)
   1.466 +        pixst   , 16, 0, DST
   1.467 + .endif
   1.468 +        subs    X, X, #128/dst_w_bpp
   1.469 +        bhs     120b
   1.470 +        /* Trailing pixels */
   1.471 +        tst     X, #128/dst_w_bpp - 1
   1.472 +        beq     exit_label
   1.473 +        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
   1.474 +.endm
   1.475 +
   1.476 +.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
   1.477 +        tst     X, #16*8/dst_w_bpp
   1.478 +        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
   1.479 +        /* Trailing pixels */
   1.480 +        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
   1.481 +        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
   1.482 +.endm
   1.483 +
   1.484 +.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
   1.485 + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
   1.486 + .if mask_bpp == 8 || mask_bpp == 16
   1.487 +        tst     MASK, #3
   1.488 +        bne     141f
   1.489 + .endif
   1.490 +  .if src_bpp == 8 || src_bpp == 16
   1.491 +        tst     SRC, #3
   1.492 +        bne     140f
   1.493 +  .endif
   1.494 +        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
   1.495 +  .if src_bpp == 8 || src_bpp == 16
   1.496 +        b       exit_label
   1.497 +140:
   1.498 +        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
   1.499 +  .endif
   1.500 + .if mask_bpp == 8 || mask_bpp == 16
   1.501 +        b       exit_label
   1.502 +141:
   1.503 +  .if src_bpp == 8 || src_bpp == 16
   1.504 +        tst     SRC, #3
   1.505 +        bne     142f
   1.506 +  .endif
   1.507 +        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
   1.508 +  .if src_bpp == 8 || src_bpp == 16
   1.509 +        b       exit_label
   1.510 +142:
   1.511 +        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
   1.512 +  .endif
   1.513 + .endif
   1.514 +.endm
   1.515 +
   1.516 +
   1.517 +.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
   1.518 + .if vars_spilled
    1.519 +        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
   1.520 +        /* This is ldmia sp,{} */
   1.521 +        .word   0xE89D0000 | LINE_SAVED_REGS
   1.522 + .endif
   1.523 +        subs    Y, Y, #1
   1.524 + .if vars_spilled
   1.525 +  .if (LINE_SAVED_REGS) & (1<<1)
   1.526 +        str     Y, [sp]
   1.527 +  .endif
   1.528 + .endif
   1.529 +        add     DST, DST, STRIDE_D
   1.530 + .if src_bpp > 0
   1.531 +        add     SRC, SRC, STRIDE_S
   1.532 + .endif
   1.533 + .if mask_bpp > 0
   1.534 +        add     MASK, MASK, STRIDE_M
   1.535 + .endif
   1.536 + .if restore_x
   1.537 +        mov     X, ORIG_W
   1.538 + .endif
   1.539 +        bhs     loop_label
   1.540 + .ifc "last_one",""
   1.541 +  .if vars_spilled
   1.542 +        b       197f
   1.543 +  .else
   1.544 +        b       198f
   1.545 +  .endif
   1.546 + .else
   1.547 +  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
   1.548 +        b       198f
   1.549 +  .endif
   1.550 + .endif
   1.551 +.endm
   1.552 +
   1.553 +
   1.554 +.macro generate_composite_function fname, \
   1.555 +                                   src_bpp_, \
   1.556 +                                   mask_bpp_, \
   1.557 +                                   dst_w_bpp_, \
   1.558 +                                   flags_, \
   1.559 +                                   prefetch_distance_, \
   1.560 +                                   init, \
   1.561 +                                   newline, \
   1.562 +                                   cleanup, \
   1.563 +                                   process_head, \
   1.564 +                                   process_tail, \
   1.565 +                                   process_inner_loop
   1.566 +
   1.567 + .func fname
   1.568 + .global fname
   1.569 + /* For ELF format also set function visibility to hidden */
   1.570 +#ifdef __ELF__
   1.571 + .hidden fname
   1.572 + .type fname, %function
   1.573 +#endif
   1.574 +
   1.575 +/*
   1.576 + * Make some macro arguments globally visible and accessible
   1.577 + * from other macros
   1.578 + */
   1.579 + .set src_bpp, src_bpp_
   1.580 + .set mask_bpp, mask_bpp_
   1.581 + .set dst_w_bpp, dst_w_bpp_
   1.582 + .set flags, flags_
   1.583 + .set prefetch_distance, prefetch_distance_
   1.584 +
   1.585 +/*
   1.586 + * Select prefetch type for this function.
   1.587 + */
   1.588 + .if prefetch_distance == 0
   1.589 +  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   1.590 + .else
   1.591 +  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
   1.592 + .endif
   1.593 +
   1.594 + .if src_bpp == 32
   1.595 +  .set src_bpp_shift, 2
   1.596 + .elseif src_bpp == 24
   1.597 +  .set src_bpp_shift, 0
   1.598 + .elseif src_bpp == 16
   1.599 +  .set src_bpp_shift, 1
   1.600 + .elseif src_bpp == 8
   1.601 +  .set src_bpp_shift, 0
   1.602 + .elseif src_bpp == 0
   1.603 +  .set src_bpp_shift, -1
   1.604 + .else
   1.605 +  .error "requested src bpp (src_bpp) is not supported"
   1.606 + .endif
   1.607 +
   1.608 + .if mask_bpp == 32
   1.609 +  .set mask_bpp_shift, 2
   1.610 + .elseif mask_bpp == 24
   1.611 +  .set mask_bpp_shift, 0
   1.612 + .elseif mask_bpp == 8
   1.613 +  .set mask_bpp_shift, 0
   1.614 + .elseif mask_bpp == 0
   1.615 +  .set mask_bpp_shift, -1
   1.616 + .else
   1.617 +  .error "requested mask bpp (mask_bpp) is not supported"
   1.618 + .endif
   1.619 +
   1.620 + .if dst_w_bpp == 32
   1.621 +  .set dst_bpp_shift, 2
   1.622 + .elseif dst_w_bpp == 24
   1.623 +  .set dst_bpp_shift, 0
   1.624 + .elseif dst_w_bpp == 16
   1.625 +  .set dst_bpp_shift, 1
   1.626 + .elseif dst_w_bpp == 8
   1.627 +  .set dst_bpp_shift, 0
   1.628 + .else
   1.629 +  .error "requested dst bpp (dst_w_bpp) is not supported"
   1.630 + .endif
   1.631 +
   1.632 + .if (((flags) & FLAG_DST_READWRITE) != 0)
   1.633 +  .set dst_r_bpp, dst_w_bpp
   1.634 + .else
   1.635 +  .set dst_r_bpp, 0
   1.636 + .endif
   1.637 +
   1.638 + .set pix_per_block, 16*8/dst_w_bpp
   1.639 + .if src_bpp != 0
   1.640 +  .if 32*8/src_bpp > pix_per_block
   1.641 +   .set pix_per_block, 32*8/src_bpp
   1.642 +  .endif
   1.643 + .endif
   1.644 + .if mask_bpp != 0
   1.645 +  .if 32*8/mask_bpp > pix_per_block
   1.646 +   .set pix_per_block, 32*8/mask_bpp
   1.647 +  .endif
   1.648 + .endif
   1.649 + .if dst_r_bpp != 0
   1.650 +  .if 32*8/dst_r_bpp > pix_per_block
   1.651 +   .set pix_per_block, 32*8/dst_r_bpp
   1.652 +  .endif
   1.653 + .endif
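
/* Example of the net effect, for an assumed (illustrative) operation with
 * a 32bpp source, a read-write 32bpp destination and no mask: the base
 * figure is 16*8/32 = 4 pixels, but the SRC and DST channels both fetch
 * 32 bytes = 8 pixels per cacheline, so pix_per_block becomes 8 and each
 * inner-loop block emits two 16-byte bursts of output. */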
   1.654 +
   1.655 +/* The standard entry conditions set up by pixman-arm-common.h are:
   1.656 + * r0 = width (pixels)
   1.657 + * r1 = height (rows)
   1.658 + * r2 = pointer to top-left pixel of destination
   1.659 + * r3 = destination stride (pixels)
   1.660 + * [sp] = source pixel value, or pointer to top-left pixel of source
   1.661 + * [sp,#4] = 0 or source stride (pixels)
   1.662 + * The following arguments are unused for non-mask operations
   1.663 + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
   1.664 + * [sp,#12] = 0 or mask stride (pixels)
   1.665 + */
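
/* Viewed from the C side, the generated function is therefore declared
 * along these lines (an illustrative prototype, not one provided by this
 * header; the source and mask arguments may instead carry a solid pixel
 * value, as noted above):
 *     void fname(int32_t width, int32_t height,
 *                uint32_t *dst, int32_t dst_stride,
 *                uint32_t *src, int32_t src_stride,
 *                uint32_t *mask, int32_t mask_stride);
 */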
   1.666 +
   1.667 +/*
   1.668 + * Assign symbolic names to registers
   1.669 + */
   1.670 +    X           .req    r0  /* pixels to go on this line */
   1.671 +    Y           .req    r1  /* lines to go */
   1.672 +    DST         .req    r2  /* destination pixel pointer */
   1.673 +    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
   1.674 +    SRC         .req    r4  /* source pixel pointer */
   1.675 +    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
   1.676 +    MASK        .req    r6  /* mask pixel pointer (if applicable) */
   1.677 +    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
   1.678 +    WK0         .req    r8  /* pixel data registers */
   1.679 +    WK1         .req    r9
   1.680 +    WK2         .req    r10
   1.681 +    WK3         .req    r11
   1.682 +    SCRATCH     .req    r12
   1.683 +    ORIG_W      .req    r14 /* width (pixels) */
   1.684 +
   1.685 +fname:
   1.686 +        .fnstart
   1.687 +	.save   {r4-r11, lr}
   1.688 +        push    {r4-r11, lr}        /* save all registers */
   1.689 +
   1.690 +        subs    Y, Y, #1
   1.691 +        blo     199f
   1.692 +
   1.693 +#ifdef DEBUG_PARAMS
   1.694 +	.pad    #9*4
   1.695 +        sub     sp, sp, #9*4
   1.696 +#endif
   1.697 +
   1.698 + .if src_bpp > 0
   1.699 +        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
   1.700 +        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
   1.701 + .endif
   1.702 + .if mask_bpp > 0
   1.703 +        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
   1.704 +        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
   1.705 + .endif
   1.706 +        
   1.707 +#ifdef DEBUG_PARAMS
   1.708 +        add     Y, Y, #1
   1.709 +        stmia   sp, {r0-r7,pc}
   1.710 +        sub     Y, Y, #1
   1.711 +#endif
   1.712 +
   1.713 +        init
   1.714 +        
   1.715 +        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
   1.716 +        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
   1.717 + .if src_bpp > 0
   1.718 +        lsl     STRIDE_S, #src_bpp_shift
   1.719 +        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
   1.720 + .endif
   1.721 + .if mask_bpp > 0
   1.722 +        lsl     STRIDE_M, #mask_bpp_shift
   1.723 +        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
   1.724 + .endif
   1.725 + 
   1.726 +        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
   1.727 +        cmp     X, #2*16*8/dst_w_bpp - 1
   1.728 +        blo     170f
   1.729 + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
   1.730 +        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
   1.731 +        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
   1.732 +        blo     160f
   1.733 +
   1.734 +        /* Wide case */
   1.735 +        /* Adjust X so that the decrement instruction can also test for
   1.736 +         * inner loop termination. We want it to stop when there are
   1.737 +         * (prefetch_distance+1) complete blocks to go. */
   1.738 +        sub     X, X, #(prefetch_distance+2)*pix_per_block
   1.739 +        mov     ORIG_W, X
   1.740 +  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
   1.741 +        /* This is stmdb sp!,{} */
   1.742 +        .word   0xE92D0000 | LINE_SAVED_REGS
   1.743 +  .endif
   1.744 +151:    /* New line */
   1.745 +        newline
   1.746 +        preload_leading_step1  src_bpp, WK1, SRC
   1.747 +        preload_leading_step1  mask_bpp, WK2, MASK
   1.748 +        preload_leading_step1  dst_r_bpp, WK3, DST
   1.749 +        
   1.750 +        tst     DST, #15
   1.751 +        beq     154f
   1.752 +        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
   1.753 +  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
   1.754 +        PF  and,    WK0, WK0, #15
   1.755 +  .endif
   1.756 +
   1.757 +        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
   1.758 +        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
   1.759 +        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
   1.760 +
   1.761 +        leading_15bytes  process_head, process_tail
   1.762 +        
   1.763 +154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
   1.764 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
   1.765 +        and     SCRATCH, SRC, #31
   1.766 +        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
   1.767 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
   1.768 +        and     SCRATCH, MASK, #31
   1.769 +        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
   1.770 + .endif
   1.771 + .ifc "process_inner_loop",""
   1.772 +        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
   1.773 + .else
   1.774 +        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
   1.775 + .endif
   1.776 +
   1.777 +157:    /* Check for another line */
   1.778 +        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
   1.779 + .endif
   1.780 +
   1.781 + .ltorg
   1.782 +
   1.783 +160:    /* Medium case */
   1.784 +        mov     ORIG_W, X
   1.785 + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
   1.786 +        /* This is stmdb sp!,{} */
   1.787 +        .word   0xE92D0000 | LINE_SAVED_REGS
   1.788 + .endif
   1.789 +161:    /* New line */
   1.790 +        newline
   1.791 +        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
   1.792 +        preload_line 0, mask_bpp, mask_bpp_shift, MASK
   1.793 +        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
   1.794 +        
   1.795 +        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
   1.796 +        tst     DST, #15
   1.797 +        beq     164f
   1.798 +        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
   1.799 +        
   1.800 +        leading_15bytes  process_head, process_tail
   1.801 +        
   1.802 +164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
   1.803 +        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
   1.804 +        
   1.805 +167:    /* Check for another line */
   1.806 +        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
   1.807 +
   1.808 + .ltorg
   1.809 +
   1.810 +170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
   1.811 + .if dst_w_bpp < 32
   1.812 +        mov     ORIG_W, X
   1.813 + .endif
   1.814 + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
   1.815 +        /* This is stmdb sp!,{} */
   1.816 +        .word   0xE92D0000 | LINE_SAVED_REGS
   1.817 + .endif
   1.818 +171:    /* New line */
   1.819 +        newline
   1.820 +        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
   1.821 +        preload_line 1, mask_bpp, mask_bpp_shift, MASK
   1.822 +        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
   1.823 +        
   1.824 + .if dst_w_bpp == 8
   1.825 +        tst     DST, #3
   1.826 +        beq     174f
   1.827 +172:    subs    X, X, #1
   1.828 +        blo     177f
   1.829 +        process_head  , 1, 0, 1, 1, 0
   1.830 +        process_tail  , 1, 0
   1.831 +  .if !((flags) & FLAG_PROCESS_DOES_STORE)
   1.832 +        pixst   , 1, 0, DST
   1.833 +  .endif
   1.834 +        tst     DST, #3
   1.835 +        bne     172b
   1.836 + .elseif dst_w_bpp == 16
   1.837 +        tst     DST, #2
   1.838 +        beq     174f
   1.839 +        subs    X, X, #1
   1.840 +        blo     177f
   1.841 +        process_head  , 2, 0, 1, 1, 0
   1.842 +        process_tail  , 2, 0
   1.843 +  .if !((flags) & FLAG_PROCESS_DOES_STORE)
   1.844 +        pixst   , 2, 0, DST
   1.845 +  .endif
   1.846 + .endif
   1.847 +
   1.848 +174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
   1.849 +        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
   1.850 +
   1.851 +177:    /* Check for another line */
   1.852 +        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
   1.853 +
   1.854 +197:
   1.855 + .if (flags) & FLAG_SPILL_LINE_VARS
   1.856 +        add     sp, sp, #LINE_SAVED_REG_COUNT*4
   1.857 + .endif
   1.858 +198:
   1.859 +        cleanup
   1.860 +
   1.861 +#ifdef DEBUG_PARAMS
   1.862 +        add     sp, sp, #9*4 /* junk the debug copy of arguments */
   1.863 +#endif
   1.864 +199:
   1.865 +        pop     {r4-r11, pc}  /* exit */
   1.866 +	.fnend
   1.867 +
   1.868 + .ltorg
   1.869 +
   1.870 +    .unreq  X
   1.871 +    .unreq  Y
   1.872 +    .unreq  DST
   1.873 +    .unreq  STRIDE_D
   1.874 +    .unreq  SRC
   1.875 +    .unreq  STRIDE_S
   1.876 +    .unreq  MASK
   1.877 +    .unreq  STRIDE_M
   1.878 +    .unreq  WK0
   1.879 +    .unreq  WK1
   1.880 +    .unreq  WK2
   1.881 +    .unreq  WK3
   1.882 +    .unreq  SCRATCH
   1.883 +    .unreq  ORIG_W
   1.884 +    .endfunc
   1.885 +.endm
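
/* A hypothetical invocation, of the kind a companion .S file makes once
 * it has defined its own head/tail macros (every name below is
 * illustrative rather than provided by this header). It would generate an
 * unmasked 32bpp-to-32bpp function that prefetches 2 cachelines ahead,
 * with nop_macro filling the unneeded init/newline/cleanup hooks:
 *
 *      generate_composite_function \
 *          example_src_8888_8888_asm_armv6, 32, 0, 32, \
 *          FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
 *          2, nop_macro, nop_macro, nop_macro, \
 *          example_process_head, example_process_tail
 */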
   1.886 +
   1.887 +.macro line_saved_regs  x:vararg
   1.888 + .set LINE_SAVED_REGS, 0
   1.889 + .set LINE_SAVED_REG_COUNT, 0
   1.890 + .irp SAVED_REG,x
   1.891 +  .ifc "SAVED_REG","Y"
   1.892 +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   1.893 +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
   1.894 +  .endif
   1.895 +  .ifc "SAVED_REG","STRIDE_D"
   1.896 +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   1.897 +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
   1.898 +  .endif
   1.899 +  .ifc "SAVED_REG","STRIDE_S"
   1.900 +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   1.901 +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
   1.902 +  .endif
   1.903 +  .ifc "SAVED_REG","STRIDE_M"
   1.904 +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   1.905 +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
   1.906 +  .endif
   1.907 +  .ifc "SAVED_REG","ORIG_W"
   1.908 +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   1.909 +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
   1.910 +  .endif
   1.911 + .endr
   1.912 +.endm
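
/* Example (illustrative): a generated function whose per-line hooks need
 * Y and ORIG_W to survive from one line to the next would declare
 *         line_saved_regs Y, ORIG_W
 * giving LINE_SAVED_REGS = (1<<1)|(1<<14) and LINE_SAVED_REG_COUNT = 2,
 * which the hand-encoded stmdb/ldmia .word sequences above then spill
 * and reload as r1 and r14. */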
   1.913 +
   1.914 +.macro nop_macro x:vararg
   1.915 +.endm
