/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
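/*
 * An illustrative sketch of the three phases (the numbers here are
 * assumptions, not fixed by this file): with 32-byte cachelines and
 * prefetch_distance = 3, the leading section issues up to
 * prefetch_distance+1 = 4 PLDs covering the start of the row (see
 * preload_leading_step1/2 below), each inner-loop block then issues one PLD
 * roughly 3 cachelines ahead of the data currently being processed (see
 * preload_middle), and the trailing section issues the few remaining PLDs
 * (see preload_trailing), so that every cacheline touched by the row is
 * preloaded exactly once.
 */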

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune the generated functions' behavior.
 */
.set FLAG_DST_WRITEONLY,            0
.set FLAG_DST_READWRITE,            1
.set FLAG_COND_EXEC,                0
.set FLAG_BRANCH_OVER,              2
.set FLAG_PROCESS_PRESERVES_PSR,    0
.set FLAG_PROCESS_CORRUPTS_PSR,     4
.set FLAG_PROCESS_DOESNT_STORE,     0
.set FLAG_PROCESS_DOES_STORE,       8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,       0
.set FLAG_SPILL_LINE_VARS_WIDE,     16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS,          48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
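/*
 * The flags argument of 'generate_composite_function' is typically the
 * bitwise OR of one flag from each group above. A purely hypothetical
 * combination for an operation that reads back the destination and spills
 * its line variables in the wide case might be:
 *
 *      FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_SPILL_LINE_VARS_WIDE
 */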

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif
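/*
 * (The 9*4 corresponds to the 9 registers saved by the function prologue
 * below - push {r4-r11, lr} - so the first stacked argument sits 36 bytes
 * above sp once inside the function; DEBUG_PARAMS reserves a further 9-word
 * debug copy of the arguments below that.)
 */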

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
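/*
 * As an illustrative sketch (the registers only exist inside a generated
 * function): the invocation "pixld , 16, 0, SRC, 0" goes through pixldst
 * and, with an empty condition, assembles to a single block load
 *
 *      ldmia   SRC!, {WK0,WK1,WK2,WK3}
 *
 * i.e. r8-r11 are loaded from the source pointer, which is post-incremented
 * by 16 bytes. The unaligned=1 variant uses four separate ldr instructions
 * instead, because ldm requires a word-aligned base address.
 */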

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
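/*
 * For example (illustrative only): when prefetch_distance is 0 the function
 * is generated with PREFETCH_TYPE_NONE, so "PF pld, [SRC]" expands to
 * nothing at all; otherwise it expands to a plain "pld [SRC]". This lets
 * the same source emit or omit the whole prefetch strategy per function.
 */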


.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm
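/*
 * Illustrative expansion (assuming prefetch_distance = 3 and a non-zero
 * bpp): step1 rounds the channel pointer down to a cacheline boundary and
 * issues
 *
 *      bic     ptr, base, #31
 *      pld     [ptr, #0]
 *      pld     [ptr, #32]
 *      pld     [ptr, #64]
 *      pld     [ptr, #96]
 *
 * i.e. prefetch_distance+1 consecutive cachelines starting at the one
 * containing the first pixel of the row.
 */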

.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1  /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6  /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
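/*
 * A worked example of the grouping above (illustrative, assuming a 32bpp
 * source and a 32bpp destination): one 32-byte cacheline of source covers
 * 256/32 = 8 pixels, while each 16-byte store covers 128/32 = 4 pixels, so
 * IS_END_OF_GROUP(SUBBLOCK, 2) is true on every second SUBBLOCK and exactly
 * one PLD is issued per source cacheline consumed.
 */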

.macro preload_trailing bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was greater than and not equal to 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm


.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, so
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail cond1, numbytes1, firstreg1
        process_tail cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle mask_bpp, MASK, 1
  .else
        preload_middle src_bpp, SRC, 0
        preload_middle mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing src_bpp, src_bpp_shift, SRC
        preload_trailing mask_bpp, mask_bpp_shift, MASK
        preload_trailing dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
        /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
 .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
 .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
 .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
 .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


.macro end_of_line restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */

/*
 * Assign symbolic names to registers
 */
X           .req    r0  /* pixels to go on this line */
Y           .req    r1  /* lines to go */
DST         .req    r2  /* destination pixel pointer */
STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
SRC         .req    r4  /* source pixel pointer */
STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
MASK        .req    r6  /* mask pixel pointer (if applicable) */
STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
WK0         .req    r8  /* pixel data registers */
WK1         .req    r9
WK2         .req    r10
WK3         .req    r11
SCRATCH     .req    r12
ORIG_W      .req    r14 /* width (pixels) */

fname:
        .fnstart
        .save   {r4-r11, lr}
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        .pad    #9*4
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
  .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head , 1, 0, 1, 1, 0
        process_tail , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head , 2, 0, 1, 1, 0
        process_tail , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */
        .fnend

 .ltorg

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
 .endfunc
.endm

.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
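/*
 * Example (illustrative): "line_saved_regs STRIDE_D, ORIG_W" sets
 * LINE_SAVED_REGS to (1<<3)|(1<<14) - the encodings of r3 and r14 in an
 * stm/ldm register list - and LINE_SAVED_REG_COUNT to 2, which is what the
 * hand-encoded stmdb/ldmia words and the stack adjustment above rely on.
 */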

.macro nop_macro x:vararg
.endm
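/*
 * A hypothetical sketch of how a user of this header invokes the top-level
 * macro (the function name and the process_head/process_tail macro names
 * below are placeholders, not things defined by this file):
 *
 *      generate_composite_function \
 *          example_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
 *          FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
 *          3, \
 *          nop_macro, nop_macro, nop_macro, \
 *          example_process_head, \
 *          example_process_tail
 *
 * where the arguments after the flags are, in order: the prefetch distance,
 * then the init, newline and cleanup macros (nop_macro for stages the
 * operation does not need), then the per-16-byte process_head and
 * process_tail macros supplied by the caller; process_inner_loop may be
 * omitted, in which case the default wide_case_inner_loop is used.
 */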