|
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */
|
27 |
|
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 * cond           ARM condition code for code block
 * numbytes       Number of output bytes that should be generated this time
 * firstreg       First WK register in which to place output
 * unaligned_src  Whether to use non-wordaligned loads of source image
 * unaligned_mask Whether to use non-wordaligned loads of mask image
 * preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */
|
53 |
|
/* Blit init: mark the stride registers as spillable so the inner loop can
 * reuse them as extra work registers (WK4/WK5). */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm
|
57 |
|
/* Blit head: just load up to 16 bytes of source; a blit needs no processing,
 * so there is no corresponding tail work. */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
|
61 |
|
/* Blit inner loop: copy 32 bytes per iteration, temporarily aliasing the
 * saved line registers as WK4-WK7 so all 8 work registers are available.
 * X counts remaining pixels; 32 bytes = 32*8/src_bpp pixels. */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]          /* preload ahead of the read pointer */
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
79 |
|
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/
|
114 |
|
/* Fill init: load the solid 32-bit source colour from the stack and replicate
 * it into the four registers that fill_process_tail aliases as WK4-WK7. */
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
|
121 |
|
/* Fill init (16bpp): load the solid colour, duplicate it into both halfwords,
 * then replicate across the four fill registers (WK4-WK7 aliases). */
.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16       /* duplicate into both halfwords */
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
|
129 |
|
/* Fill init (8bpp): load the solid byte, replicate it into all four bytes of
 * the word, then across the four fill registers (WK4-WK7 aliases). */
.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8        /* byte -> halfword */
        orr     SRC, SRC, lsl #16       /* halfword -> word */
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
|
138 |
|
/* Fill tail: store the replicated colour held in SRC/STRIDE_S/MASK/STRIDE_M
 * (aliased here as WK4-WK7); the head is a no-op for solid fills. */
.macro fill_process_tail cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
150 |
|
/* NOTE(review): argument separators normalized to commas, matching the other
 * generate_composite_function invocations in this file. */
generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

/******************************************************************************/
|
182 |
|
/* Force the alpha byte of one x888 pixel to fully opaque.
 * (Restored "WK&reg" — the source had been mis-decoded to "WK®".) */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm
|
186 |
|
/* x888 head: plain load of up to 16 bytes of source pixels. */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
|
190 |
|
/* x888 tail: set the alpha byte on each of the 1, 2 or 4 loaded pixels. */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm
|
201 |
|
generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/
|
213 |
|
/* 0565->8888 init: set up the green-field mask, the opaque-alpha constant,
 * and the GE flags used by the SEL instructions in the per-pixel macros. */
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm
|
222 |
|
/* Convert two 0565 pixels (packed in WK&reg1) into two 8888 pixels in
 * WK&reg1 / WK&reg2. Comments trace the bit layout after each step.
 * (Restored "WK&reg1"/"WK&reg2" — the source had been mis-decoded to "WK®1".) */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
|
240 |
|
/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
|
260 |
|
/* Convert a single 0565 pixel in WK&reg to 8888 in place.
 * (Restored "WK&reg" — the source had been mis-decoded to "WK®".) */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
|
272 |
|
/* 0565->8888 head: load half as many source bytes as output bytes, since the
 * source pixels are 16bpp and the output is 32bpp. */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm
|
282 |
|
/* 0565->8888 tail: expand the loaded pixels, two at a time where possible. */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm
|
293 |
|
generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/
|
305 |
|
/* Saturating add of 8 source bytes (held in MASK/STRIDE_M) to 8 dest bytes. */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond WK&dst1, WK&dst1, MASK
        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
.endm
|
310 |
|
/* Saturating add of 4 source bytes (held in MASK) to 4 destination bytes. */
.macro add_8_8_4pixels  cond, dst
        uqadd8&cond WK&dst, WK&dst, MASK
.endm
|
314 |
|
/* add_8_8 head: load source into WK4/WK5 (aliasing MASK/STRIDE_M) and
 * destination into the output registers. For 16-byte runs, the first 8 bytes
 * are added here so the source registers can be refilled before the tail. */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm
|
330 |
|
/* add_8_8 tail: saturating-add the (remaining) source bytes to destination. */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm
|
340 |
|
generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/
|
352 |
|
/* OVER 8888/8888 init: load the 0x80 rounding constant and set the GE flags
 * for SEL; mark line registers that the head/tail aliases as spillable. */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm
|
360 |
|
/* OVER head: source pixels go in WK4-WK7 (aliased line registers),
 * destination pixels in the ordinary output registers. */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
373 |
|
/* Set Z if every source pixel is fully transparent, so the caller can skip
 * the blend. (Restored "WK&reg0" etc. — mis-decoded to "WK®0" in the source.) */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm
|
385 |
|
/* Extract the alpha byte of the first source pixel, ready for the blend. */
.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm
|
389 |
|
/* Blend one premultiplied source pixel over one destination pixel:
 * dst = src + dst * (255 - src.alpha), with rounding, scheduled to hide
 * multiplier and shifter result latencies. */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm
|
415 |
|
/* OVER tail: skip the whole blend when all source pixels are transparent;
 * otherwise blend each pixel and store the result. */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes    /* src reads are relative to the advanced SRC pointer */
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
438 |
|
/* NOTE(review): argument separators normalized to commas, matching the other
 * generate_composite_function invocations in this file. */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/
|
450 |
|
/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half  /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8   /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/
|
474 |
|
/* OVER 8888/n/8888 init: load the constant mask's alpha, the 0x80 rounding
 * constant, and set the GE flags for SEL. */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm
|
486 |
|
/* OVER 8888/n/8888 head: only WK4/WK5 are free for source pixels (WK6/WK7 are
 * needed as temporaries in the tail), hence the firstreg%2 addressing. */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
499 |
|
/* Blend one source pixel, scaled by the constant mask alpha (in MASK), over
 * one destination pixel. WK6 holds 255; WK7 is a temporary. */
.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm
|
506 |
|
/* OVER 8888/n/8888 tail: skip when all source pixels are transparent, else
 * blend pixel by pixel, reloading source into WK4/WK5 half way through a
 * 16-byte run because WK6/WK7 double as temporaries. */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
|
533 |
|
/* NOTE(review): argument separators normalized to commas, matching the other
 * generate_composite_function invocations in this file. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/
|
545 |
|
/* OVER n/8/8888 init: pre-split the constant source colour into even/odd
 * bytes (a loop invariant) and set the GE flags for SEL. */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm
|
557 |
|
/* Per-line setup: reload the 0x00800080 constant (STRIDE_D is clobbered each
 * line); the branch over .ltorg keeps the literal pool within range. */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm
|
564 |
|
/* OVER n/8/8888 head: load one mask byte per output pixel into WK4 (aliasing
 * STRIDE_M) and the destination pixels into the output registers. */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm
|
571 |
|
/* Blend the constant source (pre-split in SRC/STRIDE_S) scaled by one mask
 * byte (byte #src of WK4) over one destination pixel. */
.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm
|
587 |
|
/* OVER n/8/8888 tail: skip when all mask bytes are zero, else blend each
 * pixel against its mask byte and store the result. */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm
|
601 |
|
/* NOTE(review): argument separators normalized to commas, matching the other
 * generate_composite_function invocations in this file. */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/
|
613 |