Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
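
/*
 * Illustrative example of how the three code paths divide up (the numbers
 * here are assumptions for the sake of the example, not fixed properties of
 * this file): for a 32bpp source and 32bpp destination, pix_per_block works
 * out as 8 pixels, and a typical tuning value might be prefetch_distance = 3.
 * Under those assumptions, the comparisons made on entry to the generated
 * function classify rows as follows:
 *   - fewer than 7 pixels   -> "narrow" path
 *   - 7 to 46 pixels        -> "medium" path
 *   - 47 pixels or more     -> "wide" path
 * The exact boundaries follow from the tests against 2*16*8/dst_w_bpp - 1
 * and (prefetch_distance+3)*pix_per_block - 1 made further down in
 * generate_composite_function.
 */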

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif
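
/*
 * The 9*4 corresponds to the nine registers (r4-r11, lr) pushed in the
 * function prologue below; with DEBUG_PARAMS defined, a further nine words
 * are reserved for the debug copy of the arguments, hence 9*4+9*4. So, for
 * example, "ldr SRC, [sp, #ARGS_STACK_OFFSET]" reloads the argument that was
 * at [sp] when the function was entered (the source pointer or pixel value).
 */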

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE, 0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond WK&reg0, [base], #4
        op&r&cond WK&reg1, [base], #4
        op&r&cond WK&reg2, [base], #4
        op&r&cond WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond WK&reg0, [base], #4
        op&r&cond WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm
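
/*
 * As a concrete illustration of the macro above: "pixld , 16, 0, SRC, 0"
 * (via pixldst ld) produces an LDMIA of {WK0,WK1,WK2,WK3} from SRC with
 * writeback, while the same invocation with the unaligned flag set becomes
 * four "ldr WKn, [SRC], #4" loads; a conditional store such as
 * "pixst cs, 4, 0, DST" (with FLAG_DST_READWRITE clear) becomes
 * "strcs WK0, [DST], #4".
 */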

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
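
/*
 * PF simply suppresses prefetch-related instructions when prefetching is
 * disabled for the current function. For example, "PF pld, [SRC, #96]"
 * assembles to "pld [SRC, #96]" when PREFETCH_TYPE_CURRENT is
 * PREFETCH_TYPE_STANDARD, and to nothing at all when prefetch_distance was
 * given as 0.
 */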

.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF bic, ptr, base, #31
 .set OFFSET, 0
 .rept prefetch_distance+1
        PF pld, [ptr, #OFFSET]
  .set OFFSET, OFFSET+32
 .endr
 .endif
.endm

.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
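/* Worked example (the specific numbers are assumed purely for illustration):
 * with src_bpp = 32 and dst_w_bpp = 8 there are 4 source bytes per
 * destination byte. If the destination needs 7 leading bytes to reach
 * 16-byte alignment, leading_bytes = 7*32/8 = 28. If the source pointer
 * happens to satisfy (src+28)&31 = 6, then extra_needed = 28-6 = 22, which
 * falls in the "<= 32" band, so exactly one cache line beyond the usual
 * prefetch_distance+1 must be preloaded here.
 */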
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF tst, base, #16
        PF beq, 61f
  .else
   .if bpp/dst_w_bpp == 4
        PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF and, SCRATCH, SCRATCH, #31
        PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF bcs, 61f
        PF bpl, 60f
        PF pld, [ptr, #32*(prefetch_distance+2)]
   .else
        PF mov, SCRATCH, base, lsl #32-5
        PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF bls, 61f
   .endif
  .endif
60:     PF pld, [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF pld, [base, SCRATCH]
   .else
        PF bic, SCRATCH, base, #31
        PF pld, [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
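
/*
 * Example of the prefetch cadence this produces: for a 32bpp source under a
 * 32bpp destination, 256/128*dst_w_bpp/bpp evaluates to 2, so
 * IS_END_OF_GROUP fires on every second (odd-numbered) SUBBLOCK - each STM
 * consumes 16 bytes of source, so one pld is issued per 32-byte source
 * cache line. For an 8bpp mask under a 32bpp destination the group size is
 * 8, and the pld fires on every eighth SUBBLOCK instead.
 */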

.macro preload_trailing bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF and, WK1, base, #31
        PF add, WK1, WK1, WK0, lsl #bpp_shift
        PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF bic, SCRATCH, base, #31
80:     PF pld, [SCRATCH, #32*(prefetch_distance+1)]
        PF add, SCRATCH, SCRATCH, #32
        PF subs, WK1, WK1, #32
        PF bhi, 80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF mov, SCRATCH, base, lsl #32-5
        PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was strictly greater than 32) */
        PF beq, 82f
        PF bic, SCRATCH, base, #31
        PF bcc, 81f
        PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81:     PF pld, [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
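
/*
 * Worked example of the flag trick above (offsets assumed for illustration):
 * if base & 31 = 8 and the trailing data adds 24 bytes, the adds produces
 * Z=1, C=1 (8+24 is exactly 32); adceqs then clears both flags, so only the
 * preload at 81: (prefetch_distance+1) is issued. Had the two offsets summed
 * to, say, 38, C would remain set and both preloads would be issued; had
 * both offsets been 0, Z would remain set and neither would be.
 */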

.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, then
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
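/* Illustration of the narrow case (addresses assumed): a 6-pixel 32bpp row
 * starting 20 bytes into a cache line covers bytes 20..43 of that line, so
 * WK0 (the line containing the first byte) and WK1 (the line containing the
 * last byte) differ and both are preloaded; because no more than 30 bytes
 * are ever output, at most these two cache lines can be involved, hence no
 * loop is needed.
 */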
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF bic, WK0, base, #31
        PF pld, [WK0]
        PF add, WK1, base, X, lsl #bpp_shift
        PF sub, WK1, WK1, #1
        PF bic, WK1, WK1, #31
        PF cmp, WK1, WK0
        PF beq, 90f
        PF pld, [WK1]
90:
  .else
        PF bic, WK0, base, #31
        PF pld, [WK0]
        PF add, WK1, base, X, lsl #bpp_shift
        PF sub, WK1, WK1, #1
        PF bic, WK1, WK1, #31
        PF cmp, WK1, WK0
        PF beq, 92f
91:     PF add, WK0, WK0, #32
        PF cmp, WK0, WK1
        PF pld, [WK0]
        PF bne, 91b
92:
  .endif
 .endif
.endm

.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl 100f
  .endif
  .ifc cond,cs
        bcc 100f
  .endif
  .ifc cond,ne
        beq 100f
  .endif
        conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail cond1, numbytes1, firstreg1
        process_tail cond2, numbytes2, firstreg2
        pixst cond1, numbytes1, firstreg1, DST
        pixst cond2, numbytes2, firstreg2, DST
 .endif
.endm

.macro test_bits_1_0_ptr
        movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
.endm

.macro leading_15bytes process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
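
/*
 * Example: with dst_w_bpp == 8 and 13 leading bytes to reach alignment
 * (WK0 low bits = 0b1101), the first conditional_process2 handles 1 byte
 * (bit 0, "mi") and skips the 2-byte step (bit 1 clear, "cs"), and the
 * second handles 4 and then 8 bytes (bits 2 and 3), 13 bytes in total;
 * the final argument of 1 makes each step also decrement X by the number
 * of pixels it consumed.
 */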

.macro test_bits_3_2_pix
        movs SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm

.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle mask_bpp, MASK, 1
  .else
        preload_middle src_bpp, SRC, 0
        preload_middle mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF pld, [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs X, X, #pix_per_block
        bhs 110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst DST, #16
        bne 111f
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
        b 112f
111:
 .endif
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF and, WK0, X, #pix_per_block-1
 .endif
        preload_trailing src_bpp, src_bpp_shift, SRC
        preload_trailing mask_bpp, mask_bpp_shift, MASK
        preload_trailing dst_r_bpp, dst_bpp_shift, DST
        add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 16, 0, DST
 .endif
        subs X, X, #128/dst_w_bpp
        bhs 120b
        /* Trailing pixels */
        tst X, #128/dst_w_bpp - 1
        beq exit_label
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst X, #16*8/dst_w_bpp
        conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
        /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst MASK, #3
        bne 141f
 .endif
 .if src_bpp == 8 || src_bpp == 16
        tst SRC, #3
        bne 140f
 .endif
        action process_head, process_tail, process_inner_loop, exit_label, 0, 0
 .if src_bpp == 8 || src_bpp == 16
        b exit_label
140:
        action process_head, process_tail, process_inner_loop, exit_label, 1, 0
 .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst SRC, #3
        bne 142f
  .endif
        action process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b exit_label
142:
        action process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm

.macro end_of_line restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word 0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str Y, [sp]
  .endif
 .endif
        add DST, DST, STRIDE_D
 .if src_bpp > 0
        add SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov X, ORIG_W
 .endif
        bhs loop_label
 .ifc "last_one",""
  .if vars_spilled
        b 197f
  .else
        b 198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b 198f
  .endif
 .endif
.endm

.macro generate_composite_function fname, \
                                    src_bpp_, \
                                    mask_bpp_, \
                                    dst_w_bpp_, \
                                    flags_, \
                                    prefetch_distance_, \
                                    init, \
                                    newline, \
                                    cleanup, \
                                    process_head, \
                                    process_tail, \
                                    process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
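
/* Viewed from C (illustrative only - the exact prototype lives in
 * pixman-arm-common.h, not here), the entry conditions above correspond
 * roughly to a call of the form
 *     fname (width, height, dst, dst_stride, src, src_stride, mask, mask_stride);
 * where, under the AAPCS, the first four arguments arrive in r0-r3 and the
 * remainder on the stack - which is why SRC, STRIDE_S, MASK and STRIDE_M are
 * loaded from [sp, #ARGS_STACK_OFFSET...] further down.
 */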

/*
 * Assign symbolic names to registers
 */
X        .req r0  /* pixels to go on this line */
Y        .req r1  /* lines to go */
DST      .req r2  /* destination pixel pointer */
STRIDE_D .req r3  /* destination stride (bytes, minus width) */
SRC      .req r4  /* source pixel pointer */
STRIDE_S .req r5  /* source stride (bytes, minus width) */
MASK     .req r6  /* mask pixel pointer (if applicable) */
STRIDE_M .req r7  /* mask stride (bytes, minus width) */
WK0      .req r8  /* pixel data registers */
WK1      .req r9
WK2      .req r10
WK3      .req r11
SCRATCH  .req r12
ORIG_W   .req r14 /* width (pixels) */

fname:
 .fnstart
 .save {r4-r11, lr}
        push {r4-r11, lr} /* save all registers */

        subs Y, Y, #1
        blo 199f

#ifdef DEBUG_PARAMS
 .pad #9*4
        sub sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr SRC, [sp, #ARGS_STACK_OFFSET]
        ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add Y, Y, #1
        stmia sp, {r0-r7,pc}
        sub Y, Y, #1
#endif

        init

        lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl STRIDE_S, #src_bpp_shift
        sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl STRIDE_M, #mask_bpp_shift
        sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp X, #2*16*8/dst_w_bpp - 1
        blo 170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp X, #(prefetch_distance+3)*pix_per_block - 1
        blo 160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub X, X, #(prefetch_distance+2)*pix_per_block
        mov ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1 src_bpp, WK1, SRC
        preload_leading_step1 mask_bpp, WK2, MASK
        preload_leading_step1 dst_r_bpp, WK3, DST

        tst DST, #15
        beq 154f
        rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF and, WK0, WK0, #15
  .endif

        preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and SCRATCH, SRC, #31
        rsb SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and SCRATCH, MASK, #31
        rsb SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
        tst DST, #15
        beq 164f
        rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word 0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst DST, #3
        beq 174f
172:    subs X, X, #1
        blo 177f
        process_head , 1, 0, 1, 1, 0
        process_tail , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 1, 0, DST
  .endif
        tst DST, #3
        bne 172b
 .elseif dst_w_bpp == 16
        tst DST, #2
        beq 174f
        subs X, X, #1
        blo 177f
        process_head , 2, 0, 1, 1, 0
        process_tail , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop {r4-r11, pc} /* exit */
 .fnend

 .ltorg

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
 .endfunc
.endm

.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
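
/*
 * Example: "line_saved_regs STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to
 * (1<<3)|(1<<14) and LINE_SAVED_REG_COUNT to 2, so the hand-encoded
 * stmdb sp!/ldmia sp words in generate_composite_function and end_of_line
 * save and restore r3 and r14 around each line, and the
 * "add sp, sp, #LINE_SAVED_REG_COUNT*4" at the 197: label pops them again.
 */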

.macro nop_macro x:vararg
.endm