/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions, based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which implement the basic code chunks responsible for pixel
 * processing. See the 'pixman-arm-neon-asm.S' file for usage examples
 * (a sketch of one invocation is also shown right below).
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */
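
/*
 * Illustrative invocation sketch (not part of this template; names and
 * numeric values follow the style of pixman-arm-neon-asm.S and are only
 * examples). The *_process_pixblock_* macros are user-provided; 8 is the
 * pixblock size and 5 the prefetch distance in this example:
 *
 *   generate_composite_function \
 *       pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
 *       FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *       8, \
 *       5, \
 *       default_init, \
 *       default_cleanup, \
 *       pixman_composite_over_8888_0565_process_pixblock_head, \
 *       pixman_composite_over_8888_0565_process_pixblock_tail, \
 *       pixman_composite_over_8888_0565_process_pixblock_tail_head
 */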

/*
 * Bit flags for the 'generate_composite_function' macro, used to tune
 * the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,      0
.set FLAG_DST_READWRITE,      1
.set FLAG_DEINTERLEAVE_32BPP, 2

/*
 * Offset in the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special handling
 * for a solid mask or source (see the sketch below).
 */
.set ARGS_STACK_OFFSET,       40
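
/*
 * For illustration only: a sketch of the kind of 'init' macro a user might
 * write for a solid color passed in place of the source pointer (the real
 * versions live in pixman-arm-neon-asm.S; register choices here are
 * hypothetical). DUMMY is the scratch register defined later in this file:
 *
 *   .macro composite_solid_init_example
 *       add     DUMMY, sp, #ARGS_STACK_OFFSET
 *       vpush   {d8-d15}
 *       vld1.32 {d11[0]}, [DUMMY]
 *       vdup.8  d8, d11[0]
 *       vdup.8  d9, d11[1]
 *       vdup.8  d10, d11[2]
 *       vdup.8  d11, d11[3]
 *   .endm
 */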

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,   1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
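
/*
 * The file including this template is expected to pick a default prefetch
 * type (PREFETCH_TYPE_DEFAULT) and, for the SIMPLE variant, a prefetch
 * distance in bytes (PREFETCH_DISTANCE_SIMPLE); both symbols are referenced
 * but not defined here. A plausible configuration, shown purely as an
 * example, would be:
 *
 *   .set PREFETCH_TYPE_DEFAULT,    PREFETCH_TYPE_ADVANCED
 *   .set PREFETCH_DISTANCE_SIMPLE, 64
 */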

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                            %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
.if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
    pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
.elseif elem_size == 16
    pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
    pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
.else
    pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
    pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
    pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
    pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
.endif
.elseif numbytes == 2
.if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
    pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
.else
    pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
    pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
.endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
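
/*
 * Usage sketch (values are illustrative, matching the default register
 * allocation documented further below): with pixblock_size = 8 and a 32bpp
 * source, 'fetch_src_pixblock' expands (in the non-scaling case) to
 *
 *   pixld 8, 32, (0 - 8 * 32 / 64), SRC
 *
 * i.e. a 32-byte load dispatched through 'pixldst4' into registers
 * %(basereg+4)..%(basereg+7) = d0-d3, post-incrementing SRC.
 */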

/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
 * SRC_WIDTH_FIXED register aliases to be defined).
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    mov TMP1, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP1, mem_operand, TMP1, asl #1
    mov TMP2, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[0]}, [TMP1, :16]
    mov TMP1, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[1]}, [TMP2, :16]
    mov TMP2, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[2]}, [TMP1, :16]
    vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
    mov TMP1, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP1, mem_operand, TMP1, asl #2
    mov TMP2, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    vld1.32 {d&reg1&[1]}, [TMP2, :32]
.else
    .error "unsupported"
.endif
.endm

.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* elem_size == 32 */
    mov TMP1, VX, asr #16
    add VX, VX, UNIT_X, asl #1
    add TMP1, mem_operand, TMP1, asl #2
    mov TMP2, VX, asr #16
    sub VX, VX, UNIT_X
    add TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    mov TMP1, VX, asr #16
    add VX, VX, UNIT_X, asl #1
    add TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg2&[0]}, [TMP2, :32]
    mov TMP2, VX, asr #16
    add VX, VX, UNIT_X
    add TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[1]}, [TMP1, :32]
    vld1.32 {d&reg2&[1]}, [TMP2, :32]
.else
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endif
.endm

.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    mov TMP1, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
    mov TMP1, VX, asr #16
    adds VX, VX, UNIT_X
5:  subpls VX, VX, SRC_WIDTH_FIXED
    bpl 5b
    add TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
.endm

.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
.if elem_size == 32
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.elseif elem_size == 16
    pixld0_s elem_size, %(basereg+0), 2, mem_operand
    pixld0_s elem_size, %(basereg+0), 3, mem_operand
.else
    pixld0_s elem_size, %(basereg+0), 4, mem_operand
    pixld0_s elem_size, %(basereg+0), 5, mem_operand
    pixld0_s elem_size, %(basereg+0), 6, mem_operand
    pixld0_s elem_size, %(basereg+0), 7, mem_operand
.endif
.elseif numbytes == 2
.if elem_size == 16
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    pixld0_s elem_size, %(basereg+0), 2, mem_operand
    pixld0_s elem_size, %(basereg+0), 3, mem_operand
.endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm
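
/*
 * For example (illustrative; only takes effect when
 * DEINTERLEAVE_32BPP_ENABLED is set, and assuming basereg = 0 with eight
 * a8r8g8b8 pixels held in interleaved memory byte order), 'pixdeinterleave
 * 32, 0' leaves the data planar: d0 = 8 x B, d1 = 8 x G, d2 = 8 x R,
 * d3 = 8 x A. 'pixinterleave 32, 0' performs the inverse permutation
 * before the data is stored back.
 */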

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent from the rest of the pixel
 * processing code. It starts at the top left pixel and moves forward
 * across pixels, and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch distance,
 * it is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (but it still can prefetch a bit
 * more data on the right side of the image - not a big issue, and it may
 * actually be helpful when rendering text glyphs). An additional trick is
 * the use of an LDR instruction for prefetch instead of PLD when moving to
 * the next line; the point is that we have a high chance of getting a TLB
 * miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working with
 * fully cached data). But in reality, due to having a separate pipeline and
 * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
 * execute simultaneously with NEON and be completely shadowed by it. Thus
 * we get no performance overhead at all (*). This looks like a very nice
 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
 * but still can implement some rather advanced prefetch logic in software
 * for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 * pixel processing like a simple copy. Anyway, having prefetch is a must
 * when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

.macro fetch_mask_pixblock
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16 byte boundary). When the destination
 * buffer uses a 16bpp format, this is unnecessary, or even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst DST_R, #0xF
    beq 2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_R, #lowbit
    beq 1f
.endif
    pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add DST_R, DST_R, #lowbit
.endif
    PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_W, #lowbit
    beq 1f
.endif
    pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operations on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally are unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - if set to 0, suppresses prefetch
 * dst_aligned_flag   - selects whether the destination buffer is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst W, #(pixblock_size - 1)
    beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
    pixld_src chunk_size, src_bpp, src_basereg, SRC
    pixld chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
.if dst_aligned_flag != 0
    pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd W, [sp] /* load W and H (width and height) from stack */
.else
    mov W, ORIG_W
.endif
    add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs H, H, #1
    mov DST_R, DST_W
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.endif
    bge start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_ = 0, \
                                   mask_basereg_ = 24

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif
fname:
.fnstart
.save {r4-r12, lr}
    push {r4-r12, lr} /* save all registers */

/*
 * Select prefetch type for this function. If prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_

.macro pixld_src x:vararg
    pixld x
.endm
.macro fetch_src_pixblock
    pixld_src pixblock_size, src_bpp, \
              (src_basereg - pixblock_size * src_bpp / 64), SRC
.endm
/*
 * Assign symbolic names to registers
 */
    W           .req r0  /* width (is updated during processing) */
    H           .req r1  /* height (is updated during processing) */
    DST_W       .req r2  /* destination buffer pointer for writes */
    DST_STRIDE  .req r3  /* destination image stride */
    SRC         .req r4  /* source buffer pointer */
    SRC_STRIDE  .req r5  /* source image stride */
    DST_R       .req r6  /* destination buffer pointer for reads */

    MASK        .req r7  /* mask pointer */
    MASK_STRIDE .req r8  /* mask stride */

    PF_CTL      .req r9  /* combined lines counter and prefetch */
                         /* distance increment counter */
    PF_X        .req r10 /* pixel index in a scanline for current */
                         /* prefetch position */
    PF_SRC      .req r11 /* pointer to source scanline start */
                         /* for prefetch purposes */
    PF_DST      .req r12 /* pointer to destination scanline start */
                         /* for prefetch purposes */
    PF_MASK     .req r14 /* pointer to mask scanline start */
                         /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req r10 /* saved original width */
    DUMMY       .req r12 /* temporary register */
    .set regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req r7  /* saved original width */
    DUMMY       .req r8  /* temporary register */
    .set regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req r4  /* saved original width */
    DUMMY       .req r5  /* temporary register */
    .set regs_shortage, 0
.else
    ORIG_W      .req r1  /* saved original width */
    DUMMY       .req r1  /* temporary register */
    .set regs_shortage, 1
.endif

.set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr MASK, [sp, #48]
.endif
    PF mov PF_X, #0
.if src_bpp > 0
    ldr SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr MASK_STRIDE, [sp, #52]
.endif
    mov DST_R, DST_W

.if src_bpp == 24
    sub SRC_STRIDE, SRC_STRIDE, W
    sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub MASK_STRIDE, MASK_STRIDE, W
    sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub DST_STRIDE, DST_STRIDE, W
    sub DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov PF_SRC, SRC
    PF mov PF_DST, DST_R
    PF mov PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov PF_CTL, H, lsl #4
    PF add PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    .save {r0, r1}
    push {r0, r1}
.endif
    subs H, H, #1
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.else
    mov ORIG_W, W
.endif
    blt 9f
    cmp W, #(pixblock_size * 2)
    blt 8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs W, W, #(pixblock_size * 2)
    blt 2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch are used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst W, #pixblock_size
    beq 1f
    pixld pixblock_size, dst_r_bpp, \
          (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst pixblock_size, dst_w_bpp, \
          (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */
.fnend

.purgem fetch_src_pixblock
.purgem pixld_src

.unreq SRC
.unreq MASK
.unreq DST_R
.unreq DST_W
.unreq ORIG_W
.unreq W
.unreq H
.unreq SRC_STRIDE
.unreq DST_STRIDE
.unreq MASK_STRIDE
.unreq PF_CTL
.unreq PF_X
.unreq PF_SRC
.unreq PF_DST
.unreq PF_MASK
.unreq DUMMY
.endfunc
.endm

/*
 * A simplified variant of the function generation template, used for
 * single scanline processing (for implementing pixman combine functions).
 */
.macro generate_composite_function_scanline use_nearest_scaling, \
                                             fname, \
                                             src_bpp_, \
                                             mask_bpp_, \
                                             dst_w_bpp_, \
                                             flags, \
                                             pixblock_size_, \
                                             init, \
                                             cleanup, \
                                             process_pixblock_head, \
                                             process_pixblock_tail, \
                                             process_pixblock_tail_head, \
                                             dst_w_basereg_ = 28, \
                                             dst_r_basereg_ = 4, \
                                             src_basereg_ = 0, \
                                             mask_basereg_ = 24

.func fname
.global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif
fname:
.fnstart
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_

.if use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers for nearest scaling
     */
    W               .req r0
    DST_W           .req r1
    SRC             .req r2
    VX              .req r3
    UNIT_X          .req ip
    MASK            .req lr
    TMP1            .req r4
    TMP2            .req r5
    DST_R           .req r6
    SRC_WIDTH_FIXED .req r7

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    ldr UNIT_X, [sp]
    .save {r4-r8, lr}
    push {r4-r8, lr}
    ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    .if mask_bpp != 0
        ldr MASK, [sp, #(24 + 8)]
    .endif
.else
    /*
     * Assign symbolic names to registers
     */
    W           .req r0  /* width (is updated during processing) */
    DST_W       .req r1  /* destination buffer pointer for writes */
    SRC         .req r2  /* source buffer pointer */
    DST_R       .req ip  /* destination buffer pointer for reads */
    MASK        .req r3  /* mask pointer */

    .macro pixld_src x:vararg
        pixld x
    .endm
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.macro fetch_src_pixblock
    pixld_src pixblock_size, src_bpp, \
              (src_basereg - pixblock_size * src_bpp / 64), SRC
.endm

    init
    mov DST_R, DST_W

    cmp W, #pixblock_size
    blt 8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs W, W, #pixblock_size
    blt 7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs W, W, #pixblock_size
    blt 2f
1:
    process_pixblock_tail_head
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    pop {r4-r8, pc} /* exit */
.else
    bx lr /* exit */
.endif
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup

.if use_nearest_scaling != 0
    pop {r4-r8, pc} /* exit */

    .unreq DST_R
    .unreq SRC
    .unreq W
    .unreq VX
    .unreq UNIT_X
    .unreq TMP1
    .unreq TMP2
    .unreq DST_W
    .unreq MASK
    .unreq SRC_WIDTH_FIXED

.else
    bx lr /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq W
.endif

.purgem fetch_src_pixblock
.purgem pixld_src

.fnend
.endfunc
.endm

.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, x
.endm

.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, x
.endm
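
/*
 * Illustrative invocation sketch for the single scanline variant (in the
 * style of pixman-arm-neon-asm.S; the function name and the
 * *_process_pixblock_* macros are user-provided and only examples here;
 * 8 is the pixblock size):
 *
 *   generate_composite_function_single_scanline \
 *       pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *       FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *       8, \
 *       default_init, \
 *       default_cleanup, \
 *       pixman_composite_over_8888_8888_process_pixblock_head, \
 *       pixman_composite_over_8888_8888_process_pixblock_tail, \
 *       pixman_composite_over_8888_8888_process_pixblock_tail_head
 */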

/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant which additionally saves/restores d8-d15
 * registers (they need to be saved/restored by callee according to ABI).
 * This is required if the code needs to use all the NEON registers.
 */

.macro default_init_need_all_regs
.vsave {d8-d15}
    vpush {d8-d15}
.endm

.macro default_cleanup_need_all_regs
    vpop {d8-d15}
.endm

/******************************************************************************/

/*
 * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
 * into a planar a8r8g8b8 format (with a, r, g, b color components
 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 * value (in) is lost.
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    vshrn.u16 out_r, in, #8
    vshrn.u16 out_g, in, #3
    vsli.u16  in, in, #5
    vmov.u8   out_a, #255
    vsri.u8   out_r, out_r, #5
    vsri.u8   out_g, out_g, #6
    vshrn.u16 out_b, in, #2
.endm

.macro convert_0565_to_x888 in, out_r, out_g, out_b
    vshrn.u16 out_r, in, #8
    vshrn.u16 out_g, in, #3
    vsli.u16  in, in, #5
    vsri.u8   out_r, out_r, #5
    vsri.u8   out_g, out_g, #6
    vshrn.u16 out_b, in, #2
.endm

/*
 * Conversion from planar a8r8g8b8 format (with the r, g, b color components
 * in 64-bit registers in_r, in_g, in_b respectively; alpha is dropped) into
 * 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two temporary
 * 128-bit registers (tmp1, tmp2).
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    vshll.u8  tmp1, in_g, #8
    vshll.u8  out, in_r, #8
    vshll.u8  tmp2, in_b, #8
    vsri.u16  out, tmp1, #5
    vsri.u16  out, tmp2, #11
.endm
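
/*
 * Example with illustrative register choices (not prescribed by the macros
 * themselves): unpacking eight r5g6b5 pixels held in q15 into planar d4-d7
 * and packing them back, with q8/q9 as scratch, could be written as:
 *
 *   convert_0565_to_8888 q15, d7, d6, d5, d4
 *   convert_8888_to_0565 d6, d5, d4, q15, q8, q9
 *
 * After the first call d7 = A, d6 = R, d5 = G, d4 = B; note that q15 is
 * clobbered by the unpack, as warned above.
 */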

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in the (out0, out1) register pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
 * value from 'in' is lost.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    vshl.u16  out0, in, #5    /* G top 6 bits */
    vshl.u16  tmp, in, #11    /* B top 5 bits */
    vsri.u16  in, in, #5      /* R is ready in top bits */
    vsri.u16  out0, out0, #6  /* G is ready in top bits */
    vsri.u16  tmp, tmp, #5    /* B is ready in top bits */
    vshr.u16  out1, in, #8    /* R is in place */
    vsri.u16  out0, tmp, #8   /* G & B is in place */
    vzip.u16  out0, out1      /* everything is in place */
.endm