/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */
|

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which are exposing some new or interesting features) are
 * extensively commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 * - pixman_composite_over_8888_0565_asm_neon
 * - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * the performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0
 * (for example on Linux) if unaligned memory accesses are not configured
 * to generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to
 * work as a NOP to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores
 * where advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously, so that the extra
 * ARM instructions do not add (many) extra cycles, but improve prefetch
 * efficiency)
 *
 * Note: some types of functions can't support advanced prefetch and fall
 * back to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs the OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need
 * to deal with the data in NEON registers.
 *
 * The recommended NEON register allocation in general is the following:
 * d0, d1, d2, d3     - contain loaded source pixel data
 * d4, d5, d6, d7     - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0, d1, d2, d3     - contain loaded source pixel data
 * d4, d5             - contain loaded destination pixels (they are needed)
 * d28, d29           - place for storing the result (destination pixels)
 */
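
/*
 * As an illustration, here is a rough scalar C sketch of the per-pixel
 * operation this function performs. This is only a model of the
 * arithmetic, not the actual pixman code; the real C reference is
 * fast_composite_over_8888_0565 in pixman-fast-path.c. Note that pixman
 * uses premultiplied alpha, so the source channels are added as-is:
 *
 *     uint16_t
 *     over_8888_0565_pixel (uint32_t src, uint16_t dst)
 *     {
 *         uint32_t ia = 255 - (src >> 24);          // inverted source alpha
 *         uint32_t r = ((dst >> 11) & 0x1f) << 3;   // expand r5 to 8 bits
 *         uint32_t g = ((dst >> 5) & 0x3f) << 2;    // expand g6 to 8 bits
 *         uint32_t b = (dst & 0x1f) << 3;           // expand b5 to 8 bits
 *         r = ((src >> 16) & 0xff) + (r * ia + 128) / 255;
 *         g = ((src >> 8) & 0xff) + (g * ia + 128) / 255;
 *         b = (src & 0xff) + (b * ia + 128) / 255;
 *         return ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
 *     }
 *
 * The NEON code additionally replicates the top bits when expanding the
 * 5/6-bit channels and saturates the final per-channel sums; both details
 * are omitted above for brevity.
 */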

/*
 * Step one. We need to have some code to do the arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by doing R, G, B, A color component
 * deinterleaving for 32bpp pixel formats (and this feature is used in the
 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
 * actually use d0 register for blue channel (a vector of eight 8-bit
 * values), d1 register for green, d2 for red and d3 for alpha. This
 * simple conversion can be also done with a few NEON instructions:
 *
 * Packed to planar conversion:
 *     vuzp.8 d0, d1
 *     vuzp.8 d2, d3
 *     vuzp.8 d1, d3
 *     vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *     vzip.8 d0, d2
 *     vzip.8 d1, d3
 *     vzip.8 d2, d3
 *     vzip.8 d0, d1
 *
 * But pixels can be loaded directly in planar format using the VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm
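
/*
 * A note on the vrshr.u16/vraddhn.u16 pairs above: together they divide
 * each 16-bit product by 255 with correct rounding. As a C sketch of the
 * idiom (for illustration only):
 *
 *     uint8_t
 *     div255 (uint16_t t)
 *     {
 *         return (t + ((t + 128) >> 8) + 128) >> 8;   // == round (t / 255)
 *     }
 *
 * The vrshr.u16 computes the inner (t + 128) >> 8 term, and vraddhn.u16
 * adds it back to t while taking the rounded high byte of the sum. The
 * result is exact for any t in the [0, 255 * 255] range produced by an
 * 8x8-bit multiply. This is the standard NEON pattern for normalizing
 * such products back to the 8-bit range.
 */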

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we got almost everything that we need. Using the above two
 * macros, the work can be done. But now we want to optimize it a bit.
 * ARM Cortex-A8 is an in-order core, and benefits really a lot from
 * good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *
 *     head
 *     while (...) {
 *         tail
 *         head
 *     }
 *     tail
 *
 * It may look a bit weird, but this setup allows us to hide instruction
 * latencies better and also utilize dual-issue capability more
 * efficiently (make pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *     pixman_composite_over_8888_0565_process_pixblock_tail
 *     vst1.16     {d28, d29}, [DST_W, :128]!
 *     vld1.16     {d4, d5}, [DST_R, :128]!
 *     vld4.32     {d0, d1, d2, d3}, [SRC]!
 *     pixman_composite_over_8888_0565_process_pixblock_head
 *     cache_preload 8, 8
 *
 * Now it also got some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetic.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, the destination buffer is already
 * 16 bytes aligned here (which is good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * to the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used for
 * prefetching data into the CPU L2 cache and improves performance when
 * dealing with images which are far larger than the cache size. It uses
 * one argument (actually two, but they need to be the same here) - the
 * number of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can
 * provide some details about this macro. Moreover, if good performance
 * is needed, the code from this macro needs to be copied into the
 * '*_tail_head' macro and mixed with the rest of the code for optimal
 * instruction scheduling. We are actually doing it below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from the '*_head', '*_tail' or
 * 'cache_preload' macro!
 */
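
/*
 * To see why the 'tail + head' pairing helps, here is the same loop with
 * the loads/stores made explicit, in C-like pseudo-code (an illustration
 * of the schedule, not real code): while block N is being finished
 * ('tail'), the loads and first computations for block N+1 ('head') are
 * already in flight.
 *
 *     load (block[0]); head (block[0]);
 *     for (i = 0; i < nblocks - 1; i++) {
 *         tail (block[i]);     store (block[i]);
 *         load (block[i + 1]); head (block[i + 1]);
 *     }
 *     tail (block[nblocks - 1]); store (block[nblocks - 1]);
 */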

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
        vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!
.endm
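
/*
 * A short note on the interleaved PF instruction stream above (the
 * authoritative description of the advanced prefetcher lives in
 * 'pixman-arm-neon-asm.h'): PF_X tracks how many pixels ahead of the
 * current position we are prefetching, and the PLD instructions request
 * the source/destination data that far ahead. PF_CTL packs a couple of
 * small counters that control how fast PF_X advances and how many
 * scanlines remain. When PF_X runs past the end of the scanline
 * (PF_X >= ORIG_W) it wraps around, and the dummy LDRB loads with
 * address writeback advance PF_SRC/PF_DST to the next scanline, so the
 * prefetch stream smoothly crosses scanline boundaries.
 */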

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We are specifying the name of the
 * function which we want to get, the number of bits per pixel for the
 * source, mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both
 *                             read and written; for a write-only buffer we
 *                             would use the FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for 32bpp formats.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - the prefetch distance, measured in pixel blocks. In this case it is
 *    5 times by 8 pixels. That would be 40 pixels, or up to 160 bytes. The
 *    optimal prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros. These are 'default_init' and
 * 'default_cleanup' here, which are empty (but it is possible to have
 * custom init/cleanup macros to be able to save/restore some extra NEON
 * registers like d8-d15 or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON register allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */
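
/*
 * On the C side the generated function is reached through the usual
 * pixman fast path tables. Its prototype is declared by the binding
 * macros in pixman-arm-common.h and looks roughly like this (shown for
 * illustration only; the C headers are authoritative):
 *
 *     void
 *     pixman_composite_over_8888_0565_asm_neon (int32_t   w,
 *                                               int32_t   h,
 *                                               uint16_t *dst,
 *                                               int32_t   dst_stride,
 *                                               uint32_t *src,
 *                                               int32_t   src_stride);
 *
 * where the strides are measured in elements of the respective buffers,
 * not in bytes.
 */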

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]    /* load solid source color */
    vdup.8      d0, d3[0]           /* replicate blue */
    vdup.8      d1, d3[1]           /* replicate green */
    vdup.8      d2, d3[2]           /* replicate red */
    vdup.8      d3, d3[3]           /* replicate alpha */
    vmvn.8      d3, d3              /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    /* pack planar a8r8g8b8 to r5g6b5: first move each channel into
       bits 15:8 of a 16-bit lane ... */
    vshll.u8    q8, d1, #8      /* green */
    vshll.u8    q14, d2, #8     /* red */
    vshll.u8    q9, d0, #8      /* blue */
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    /* ... then merge: r5 stays in bits 15:11, g6 and b5 are inserted below */
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    /* expand 8 r5g6b5 pixels to planar 8-bit format: d28 - blue,
       d29 - green, d30 - red; alpha in d31 is set to 255 */
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    /* saturated add of 32 8-bit source and destination pixels */
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm
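
/*
 * As a quick reference: per channel, and with the rounded division by 255
 * described earlier, the OUT_REVERSE operator implemented here computes
 *
 *     dst = dst * (255 - src_alpha) / 255
 *
 * and the OVER operator (implemented below by reusing these macros) just
 * adds the source on top of that:
 *
 *     dst = src + dst * (255 - src_alpha) / 255
 */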

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* OVER = OUT_REVERSE + saturated add of the source on top */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vqadd.u8    q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vqadd.u8    q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]    /* load solid source color */
    vdup.8      d0, d3[0]           /* replicate blue */
    vdup.8      d1, d3[1]           /* replicate green */
    vdup.8      d2, d3[2]           /* replicate red */
    vdup.8      d3, d3[3]           /* replicate alpha */
    vmvn.8      d24, d3             /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]    /* load solid source color */
    vdup.8      d4, d7[0]           /* replicate blue */
    vdup.8      d5, d7[1]           /* replicate green */
    vdup.8      d6, d7[2]           /* replicate red */
    vdup.8      d7, d7[3]           /* replicate alpha */
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg */ \
    24  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0, d24, d8     /* IN for SRC pixels (part1) */
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vshrn.u16   d6, q2, #8      /* convert DST_R data to 32-bpp (part1) */
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8      /* IN for SRC pixels (part2) */
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5      /* convert DST_R data to 32-bpp (part2) */
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vmull.u8    q8, d3, d6      /* now do alpha blending */
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    /* 1 cycle bubble */
    vqadd.u8    q9, q0, q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9, #11
.endm
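
/*
 * Per pixel, this pair of macros computes "source IN mask, OVER
 * destination". As a C-level sketch of the channel arithmetic (an
 * illustration only, with the rounded divisions by 255 as described
 * earlier):
 *
 *     s = src * mask_alpha / 255;                // IN
 *     dst = s + dst * (255 - alpha (s)) / 255;   // OVER
 *
 * with the destination unpacked from r5g6b5 before, and repacked into
 * r5g6b5 after, the blending.
 */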

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6, q2, #8
    fetch_mask_pixblock
    vshrn.u16   d7, q2, #3
    fetch_src_pixblock
    vmull.u8    q6, d24, d10
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    vmull.u8    q1, d24, d9
    vqadd.u8    q9, q0, q13
    vshll.u8    q14, d16, #8
    vmull.u8    q0, d24, d8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vmull.u8    q7, d24, d11
    vsri.u16    q14, q9, #11

    cache_preload 8, 8

    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8, d3, d6
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    24  /* mask_basereg */

/******************************************************************************/

/*
 * This function needs a special initialization of the solid source.
 * Solid source pixel data is fetched from the stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save the d8-d15 registers which are callee saved according
 * to the ABI. These registers are restored in the 'cleanup' macro. All the
 * other NEON registers are caller saved, so they can be clobbered freely
 * without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d24[0]}, [DUMMY]   /* load solid mask value */
    vdup.8      d24, d24[3]         /* replicate mask alpha */
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    24  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm
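
/*
 * The vsli.u64 chain in the init above is a branch-free way to replicate
 * the low byte of the solid color across the whole 64-bit register; as a
 * C sketch (for illustration, with 'color' being the 8-bit solid value):
 *
 *     uint64_t v = color;
 *     v |= v << 8;      // vsli.u64 d0, d0, #8
 *     v |= v << 16;     // vsli.u64 d0, d0, #16
 *     v |= v << 32;     // vsli.u64 d0, d0, #32
 *
 * after which the byte fills all eight lanes, and the vorr instructions
 * copy the result to d1 and q1 so that a full 32-byte block can be
 * stored per iteration.
 */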

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2     /* set the alpha bytes to 0xff */
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr        q0, q0, q2
    vorr        q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    vmov.u8     q2, #0xFF
    vshl.u32    q2, q2, #24    /* build 0xff000000 in every 32-bit lane */
.endm
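
/*
 * With q2 holding 0xff000000 in each lane, the vorr in the head is the
 * whole x8r8g8b8 -> a8r8g8b8 conversion; as a C sketch, per pixel:
 *
 *     dst = src | 0xff000000;   // force the alpha channel to 255
 */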

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
.endm
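
/*
 * Note that the vrsra.u16 in the head together with the vrshrn.u16 in
 * the tail performs the same rounded division by 255 as the
 * vrshr.u16/vraddhn.u16 pairs described near the top of this file
 * (t + ((t + 128) >> 8), followed by a rounding narrowing shift by 8);
 * it is merely split differently across the head/tail boundary.
 */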

.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
                                    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q8, #8
                                    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q9, #8
                                    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q10, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q11, #8
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d0
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q9, d24, d1
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d2
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d3
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]    /* load solid source color */
    vdup.8      d0, d3[0]           /* replicate blue */
    vdup.8      d1, d3[1]           /* replicate green */
    vdup.8      d2, d3[2]           /* replicate red */
    vdup.8      d3, d3[3]           /* replicate alpha */
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d16
    vmull.u8    q1, d25, d16
    vmull.u8    q2, d26, d16
    vmull.u8    q3, d27, d16
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail
    vrshrn.u16  d28, q0, #8
    vrshrn.u16  d29, q1, #8
    vrshrn.u16  d30, q2, #8
    vrshrn.u16  d31, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
    fetch_mask_pixblock
                                    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q0, #8
                                    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q1, #8
                                    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q2, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q3, #8
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d24, d16
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q1, d25, d16
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q2, d26, d16
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q3, d27, d16
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d16[0]}, [DUMMY]   /* load solid source color */
    vdup.8      d16, d16[3]         /* replicate source alpha */
.endm

.macro pixman_composite_src_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8_init, \
    pixman_composite_src_n_8_8_cleanup, \
    pixman_composite_src_n_8_8_process_pixblock_head, \
    pixman_composite_src_n_8_8_process_pixblock_tail, \
    pixman_composite_src_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q6, d24, d8
    vmull.u8    q7, d24, d9
    vmull.u8    q8, d24, d10
    vmull.u8    q9, d24, d11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vmvn.8      d25, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
    vrshr.u16   q6, q10, #8
    PF add PF_X, PF_X, #8
    vrshr.u16   q7, q11, #8
    PF tst PF_CTL, #0x0F
    vraddhn.u16 d28, q14, q8
    PF addne PF_X, PF_X, #8
    vraddhn.u16 d29, q15, q9
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d30, q6, q10
    PF cmp PF_X, ORIG_W
    vraddhn.u16 d31, q7, q11
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q14, q0, q14
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vmvn.8      d25, d3
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm
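
/*
 * Note: d8-d15 are callee-saved under the AAPCS, hence the vpush/vpop
 * pair (with .vsave emitted for the unwind tables). The init macro splats
 * the packed 32-bit solid color into one d register per channel, roughly
 * (C sketch, illustration only; splat8() is shorthand for vdup.8):
 *
 *     uint32_t n = *(uint32_t *)args;           // solid source argument
 *     b = splat8(n & 0xff);  g = splat8((n >> 8) & 0xff);
 *     r = splat8((n >> 16) & 0xff);  a = splat8(n >> 24);
 */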

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d8
    vmull.u8    q6, d26, d8
    vmull.u8    q7, d27, d8
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      q12, q0
    vmvn.8      q13, q1
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q7, d27, d11
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vrshr.u16   q10, q7, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3, q7, q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12
    vmvn.8      d26, d26
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm
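
/*
 * Note: a rough C sketch of the component-alpha OVER done by this
 * head/tail pair, per channel c (illustration only; div255() and
 * sat_add_u8() are shorthand as before):
 *
 *     sm.c = div255(s.c * m.c);            // 'combine_mask_ca'
 *     ma.c = div255(s.a * m.c);            // per-channel effective alpha
 *     d.c  = sat_add_u8(sm.c, div255(d.c * (255 - ma.c)));
 */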

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8, d9, d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}     [B, G, R]
     * output: updated src in   {d0, d1, d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}     [B, G, R]
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
     * and put data into d16 - blue, d17 - green, d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d18, d18, #5
    vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q6, d16, d24
    vmull.u8    q7, d17, d25
    vmull.u8    q11, d18, d26
.endm
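
/*
 * Note: the vshrn/vsli/vsri sequence above widens r5g6b5 to 8 bits per
 * channel with bit replication, roughly (C sketch per 16-bit pixel p,
 * illustration only):
 *
 *     r8 = (p >> 8) & 0xf8;  r8 |= r8 >> 5;   // vshrn #8, vsri #5
 *     g8 = (p >> 3) & 0xfc;  g8 |= g8 >> 6;   // vshrn #3, vsri #6
 *     b8 = (p << 3) & 0xf8;  b8 |= b8 >> 5;   // vsli #5, vshrn #2
 *
 * Bit replication keeps full intensity exact (0x1f widens to 0xff,
 * not 0xf8).
 */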

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8, q0, q8
    vqadd.u8    d18, d2, d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d22, q15, q11
    /* process_pixblock_head */
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8, d9, d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}     [B, G, R]
     * output: updated src in   {d0, d1, d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}     [B, G, R]
     */
    vmull.u8    q6, d26, d10
    vqadd.u8    q8, q0, q8
    vmull.u8    q0, d24, d8
    vqadd.u8    d22, d2, d22
    vmull.u8    q1, d25, d9
    /*
     * convert the result in d16, d17, d22 to r5g6b5 and store
     * it into {d28, d29}
     */
    vshll.u8    q14, d22, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vmull.u8    q9, d11, d25
    vsri.u16    q14, q10, #5
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vsri.u16    q14, q15, #11
    cache_preload 8, 8
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar
     * 8-bit format and put data into d16 - blue, d17 - green,
     * d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d17, d17, #6
    vsri.u8     d18, d18, #5
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q7, d17, d25
    vmull.u8    q6, d16, d24
    vmull.u8    q11, d18, d26
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_0565_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* and destination data in {d4, d5, d6, d7} */
    vmull.u8    q8, d4, d3
    vmull.u8    q9, d5, d3
    vmull.u8    q10, d6, d3
    vmull.u8    q11, d7, d3
.endm
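
/*
 * Note: a rough C sketch of the IN operator computed above (illustration
 * only; a is the solid alpha splatted into d3, d the 8-bit destination):
 *
 *     d = div255(d * a);   // vmull.u8 + the usual rounded narrowing
 */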

.macro pixman_composite_in_n_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q8, q14
    vraddhn.u16 d29, q9, q15
    vraddhn.u16 d30, q10, q12
    vraddhn.u16 d31, q11, q13
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm
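
/*
 * Note: a rough C sketch of the masked ADD above (illustration only;
 * a is the splatted solid alpha, m the 8-bit mask, d the destination):
 *
 *     d = sat_add_u8(d, div255(m * a));   // vmull/vrshr/vraddhn + vqadd.u8
 */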

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    fetch_src_pixblock
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d27, d0
    vmull.u8    q9, d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    /* 1 cycle bubble */
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm
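
/*
 * Note: this head uses vrsra.u16 (rounding shift right and accumulate)
 * to fold the division-by-255 correction into a single in-place
 * instruction per register, leaving only a final vrshrn #8 for the tail.
 * In C, roughly (illustration only):
 *
 *     t += (t + 128) >> 8;        // vrsra.u16 t, t, #8
 *     r  = (t + 128) >> 8;        // vrshrn.u16 in the tail
 */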

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
    /* 2 cycle bubble */
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
    vqadd.u8    q14, q2, q14
    /* 1 cycle bubble */
    vqadd.u8    q15, q3, q15
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
    vrshrn.u16  d28, q8, #8
    fetch_mask_pixblock
    vrshrn.u16  d29, q9, #8
    vmull.u8    q8, d27, d0
    vrshrn.u16  d30, q10, #8
    vmull.u8    q9, d27, d1
    vrshrn.u16  d31, q11, #8
    vmull.u8    q10, d27, d2
    vqadd.u8    q14, q2, q14
    vmull.u8    q11, d27, d3
    vqadd.u8    q15, q3, q15
    vrsra.u16   q8, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrsra.u16   q9, q9, #8
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q10, q10, #8

    cache_preload 8, 8

    vrsra.u16   q11, q11, #8
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/******************************************************************************/

generate_composite_function \
    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    27  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_add_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_add_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8888_init, \
    pixman_composite_add_n_8_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    27  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_add_8888_n_8888_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d27[0]}, [DUMMY]
    vdup.8      d27, d27[3]
.endm

.macro pixman_composite_add_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8888_n_8888_init, \
    pixman_composite_add_8888_n_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    27  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in' */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm
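
/*
 * Note: a rough C sketch of OUT_REVERSE with a solid mask (illustration
 * only; s is the source pixel, m the solid 8-bit mask, d the destination):
 *
 *     sm.c = div255(s.c * m);              // 'in' on all four channels
 *     d.c  = div255(d.c * (255 - sm.a));   // keep dest outside the source
 *
 * The OVER macros below reuse this head and simply add sm.c back with
 * saturation in their tail.
 */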

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #48
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    vst3.8      {d0, d1, d2}, [DST_W]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    vswp        d0, d2
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    vst4.8      {d0, d1, d2, d3}, [DST_W]!
    fetch_src_pixblock
    vswp        d0, d2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_0888_8888_rev_init
    veor        d3, d3, d3
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q9, d2, #8
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    vshll.u8    q14, d0, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm
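
/*
 * Note: a rough C sketch of the 8-bit -> r5g6b5 packing used above and in
 * the other 0565 paths (illustration only): vshll.u8 ... #8 places each
 * channel in the top byte of a 16-bit lane, and vsri then stacks the
 * channels into one pixel:
 *
 *     uint16_t p = (r8 & 0xf8) << 8;       // vshll red
 *     p |= (g8 & 0xfc) << 3;               // vsri #5 of green << 8
 *     p |= b8 >> 3;                        // vsri #11 of blue << 8
 */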

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
    vshll.u8    q14, d0, #8
    fetch_src_pixblock
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
    vshll.u8    q9, d2, #8
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d30, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    fetch_src_pixblock
    vraddhn.u16 d30, q11, q8
    PF add PF_X, PF_X, #8
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #8
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    PF cmp PF_X, ORIG_W
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */
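
/*
 * Note: the pixbuf/rpixbuf paths premultiply the color channels by alpha
 * while converting; the vswp of d3 and d31 moves alpha into the output
 * block so the store picks it up unmodified. Roughly, in C (illustration
 * only):
 *
 *     out.r = div255(in.r * in.a);   // likewise g and b
 *     out.a = in.a;
 *
 * The pixbuf and rpixbuf variants differ only in the order the narrowed
 * results are written back (an R/B swap), so the two heads are identical.
 */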

/******************************************************************************/

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d28, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d30, q13, q10
.endm

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    fetch_src_pixblock
    vraddhn.u16 d28, q11, q8
    PF add PF_X, PF_X, #8
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #8
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d30, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    PF cmp PF_X, ORIG_W
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_over_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in      {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    vmvn.8      d7, d15
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vmull.u8    q8, d7, d4
    vmull.u8    q9, d7, d5
    vmull.u8    q13, d7, d6
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm

.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q13, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q13
    vqadd.u8    q0, q0, q14
    vqadd.u8    q1, q1, q15
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm
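
/*
 * Note: the 0565 compositing paths here and below all follow the same
 * pattern: widen both operands to planar 8-bit with convert_0565_to_x888,
 * blend exactly as in the 8888 variants, then pack the result back with
 * convert_8888_to_0565 (see the bit-replication and packing sketches
 * earlier in this file).
 */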

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
    fetch_mask_pixblock
    pixman_composite_over_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_over_0565_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_over_0565_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_0565_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_0565_n_0565_init, \
    pixman_composite_over_0565_n_0565_cleanup, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_add_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in      {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm

.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
    vqadd.u8    q0, q0, q2
    vqadd.u8    q1, q1, q3
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
    fetch_mask_pixblock
    pixman_composite_add_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_add_0565_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_add_0565_8_0565_process_pixblock_head, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q5, d6, d5, d4
    /* destination pixel data is in {d4, d5, d6, xx} */
    vmvn.8      d24, d15  /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
.endm

.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vraddhn.u16 d0, q14, q8
    vraddhn.u16 d1, q15, q9
    vraddhn.u16 d2, q12, q10
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_0565_process_pixblock_tail
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    15, /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
    /* src is in d0 */
    /* destination pixel data is in {d4, d5, d6, d7} */
    vmvn.8      d1, d0  /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d1, d4
    vmull.u8    q9, d1, d5
    vmull.u8    q10, d1, d6
    vmull.u8    q11, d1, d7
.endm

.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    /* 32bpp result is in {d28, d29, d30, d31} */
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    24  /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
    .func fname
    .global fname
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.endm

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

.macro bilinear_load_8888 reg1, reg2, tmp
    mov         TMP1, X, asr #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, asl #2
    vld1.32     {reg1}, [TMP1], STRIDE
    vld1.32     {reg2}, [TMP1]
.endm
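
/*
 * Note: X is a 16.16 fixed-point source coordinate and UX its per-pixel
 * increment; 'asr #16' extracts the integer pixel index and 'asl #2'
 * scales it to a byte offset (4 bytes per a8r8g8b8 pixel; the 0565 loader
 * below uses 'asl #1'). The two loads fetch the same adjacent pixel pair
 * from the top and bottom scanlines, STRIDE bytes apart. A rough C sketch
 * (illustration only):
 *
 *     int xi = x >> 16;                           // mov TMP1, X, asr #16
 *     top2 = *(uint64_t *)(top + 4 * xi);         // vld1.32 {reg1}
 *     bot2 = *(uint64_t *)(top + 4 * xi + stride);// vld1.32 {reg2}
 *     x += ux;                                    // add X, X, UX
 */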
2866 |
|
2867 .macro bilinear_load_0565 reg1, reg2, tmp |
|
2868 mov TMP1, X, asr #16 |
|
2869 add X, X, UX |
|
2870 add TMP1, TOP, TMP1, asl #1 |
|
2871 vld1.32 {reg2[0]}, [TMP1], STRIDE |
|
2872 vld1.32 {reg2[1]}, [TMP1] |
|
2873 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp |
|
2874 .endm |
|
2875 |
|
2876 .macro bilinear_load_and_vertical_interpolate_two_8888 \ |
|
2877 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 |
|
2878 |
|
2879 bilinear_load_8888 reg1, reg2, tmp1 |
|
2880 vmull.u8 acc1, reg1, d28 |
|
2881 vmlal.u8 acc1, reg2, d29 |
|
2882 bilinear_load_8888 reg3, reg4, tmp2 |
|
2883 vmull.u8 acc2, reg3, d28 |
|
2884 vmlal.u8 acc2, reg4, d29 |
|
2885 .endm |
|
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vzip.u8   reg1, reg3
    vzip.u8   reg2, reg4
    vzip.u8   reg3, reg4
    vzip.u8   reg1, reg2
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm
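
/*
 * In the 0565 variants the R5G6B5 pixels are first widened to x888 form by
 * convert_0565_to_x888, and the vzip.u8 sequence then rearranges the
 * channels so that the top/bottom pixel pairs end up in the register layout
 * expected by the weighted vmull.u8/vmlal.u8 vertical pass.
 */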
|
.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   xreg1, xreg3
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   xreg2, xreg4
    vld1.32   {yacc2lo[1]}, [TMP1]
    vzip.u8   xreg3, xreg4
    vld1.32   {yacc2hi[1]}, [TMP2]
    vzip.u8   xreg1, xreg2
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vzip.u8   yreg1, yreg3
    vmlal.u8  xacc1, xreg2, d29
    vzip.u8   yreg2, yreg4
    vmull.u8  xacc2, xreg3, d28
    vzip.u8   yreg3, yreg4
    vmlal.u8  xacc2, xreg4, d29
    vzip.u8   yreg1, yreg2
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.endm
|
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT, :128]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT, :64]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error "bilinear_store_8888 numpix is unsupported"
.endif
.endm
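
/*
 * The ":128"/":64"/":32" address qualifiers promise the CPU that OUT has the
 * stated alignment; the scanline template below guarantees this by retiring
 * 1 and 2 pixel chunks first until OUT is suitably aligned.
 */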
|
.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8   d0, d1
    vuzp.u8   d2, d3
    vuzp.u8   d1, d3
    vuzp.u8   d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
    vst1.16   {d2}, [OUT, :64]!
.elseif numpix == 2
    vst1.32   {d2[0]}, [OUT, :32]!
.elseif numpix == 1
    vst1.16   {d2[0]}, [OUT, :16]!
.else
    .error "bilinear_store_0565 numpix is unsupported"
.endif
.endm
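
/*
 * For reference, the 8888 -> 0565 packing performed by convert_8888_to_0565
 * is equivalent to the following C expression (per pixel):
 *
 *     out = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
 *
 * the vuzp.u8 sequence first deinterleaves the interpolated pixels into
 * separate channel planes so that all pixels can be packed in parallel.
 */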
|
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_&src_fmt d0, d1, d2
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_store_&dst_fmt 1, q2, q3
.endm
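
/*
 * The "bubble" comments above mark pipeline stalls caused by NEON result
 * latency: each annotated instruction consumes the result of the immediately
 * preceding one, so on an in-order core such as ARM Cortex-A8 the quoted
 * number of cycles is wasted. The two/four/eight pixel variants below hide
 * most of this latency by interleaving independent work.
 */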
|
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9, d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    pld       [TMP2, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_store_&dst_fmt 4, q2, q3
.endm
|
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
.else
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.endif
.endm
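
/*
 * The .ifdef dispatch above allows a software pipelined implementation to be
 * plugged in for a particular format pair: defining
 * 'have_bilinear_interpolate_four_pixels_<src>_<dst>' (see the 8888_8888
 * case further below) makes the generic macros defer to hand-scheduled
 * _head/_tail/_tail_head variants, where _tail_head overlaps the final
 * stage of one pixel block with the first stage of the next one.
 */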
|
.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
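
/*
 * These flags form a bitmask: BILINEAR_FLAG_UNROLL_8 selects the 8 pixels
 * per iteration main loop instead of the default 4, and
 * BILINEAR_FLAG_USE_ALL_NEON_REGS additionally makes the generated function
 * save/restore the callee saved NEON registers d8-d15 so that they can be
 * used by the pixel processing code.
 */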
|
/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * Bilinear scanline scaler macro template uses the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
 *                      in bytes
 *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
 *                      pixel in bytes
 *  prefetch_distance - prefetch in the source image by that many
 *                      pixels ahead
 *  flags             - bitwise combination of the BILINEAR_FLAG_* options
 *                      defined above
 */
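
/*
 * Judging from the register/stack layout below, the generated functions are
 * called from C with a prototype along these lines (a sketch only; the
 * pointer types follow the chosen source/destination formats):
 *
 *     void fname (uint32_t       *out,    // r0: destination scanline
 *                 const uint32_t *top,    // r1: upper source scanline
 *                 const uint32_t *bottom, // r2: lower source scanline
 *                 int             wt,     // r3: top row weight
 *                 int             wb,     // stack: bottom row weight
 *                 pixman_fixed_t  x,      // stack: 16.16 start coordinate
 *                 pixman_fixed_t  ux,     // stack: 16.16 coordinate step
 *                 int             width); // stack: pixels to produce
 */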
|
.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function fname
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    .fnstart
    mov       ip, sp
    .save     {r4, r5, r6, r7, r8, r9}
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    .vsave    {d8-d15}
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1:
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
/****************************************************/
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr
    .fnend

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.endfunc

.endm
|
/*****************************************************************************/
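
/*
 * What follows is a hand-scheduled (software pipelined) replacement for the
 * generic 4-pixel 8888 -> 8888 block; the .set below registers it with the
 * dispatch macros defined earlier.
 */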
|
.set have_bilinear_interpolate_four_pixels_8888_8888, 1

.macro bilinear_interpolate_four_pixels_8888_8888_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_four_pixels_8888_8888_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vst1.32   {d6, d7}, [OUT, :128]!
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
|
/*****************************************************************************/

.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
|
.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8   d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.u8   d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8   d9, d11
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8   d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32   {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm
|
/*****************************************************************************/
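
/*
 * Instantiate the concrete scanline functions. The bpp shift arguments
 * encode the pixel sizes (2 -> 32bpp, 1 -> 16bpp), the prefetch distance is
 * 28 pixels, and only the 8888 -> 0565 variant uses the 8 pixel unrolled
 * loop, which needs the extra d8-d15 registers.
 */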
|
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4