gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

1 /*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26 /*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which expose some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for the following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
40 #endif
41
42 .text
43 .fpu neon
44 .arch armv7a
45 .object_arch armv4
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
48 .arm
49 .altmacro
50 .p2align 2
51
52 #include "pixman-private.h"
53 #include "pixman-arm-neon-asm.h"
54
55 /* Global configuration options and preferences */
56
57 /*
58 * The code can optionally make use of unaligned memory accesses to improve
59 * performance of handling leading/trailing pixels for each scanline.
60 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
61 * example on Linux, if unaligned memory accesses are not configured
62 * to generate exceptions.
63 */
64 .set RESPECT_STRICT_ALIGNMENT, 1
65
66 /*
67 * Set default prefetch type. There is a choice between the following options:
68 *
69 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
70 * as a NOP to work around some HW bugs, or for whatever other reason)
71 *
72 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
73 * advanced prefetch introduces heavy overhead)
74 *
75 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
76 * which can run ARM and NEON instructions simultaneously so that extra ARM
77 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
78 *
79 * Note: some types of function can't support advanced prefetch and fall back
80 * to the simple one (those which handle 24bpp pixels).
81 */
82 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
83
84 /* Prefetch distance in pixels for simple prefetch */
85 .set PREFETCH_DISTANCE_SIMPLE, 64
86
87 /*
88 * Implementation of pixman_composite_over_8888_0565_asm_neon
89 *
90 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
91 * buffer and performs the OVER compositing operation. The C function
92 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same and can be used as a reference.
93 *
94 * First we need to have some NEON assembly code which can do the actual
95 * operation on the pixels and provide it to the template macro.
96 *
97 * The template macro quite conveniently takes care of emitting all the necessary
98 * code for memory reading and writing (including quite tricky cases of
99 * handling unaligned leading/trailing pixels), so we only need to deal with
100 * the data in NEON registers.
101 *
102 * The recommended NEON register allocation is, in general, the following:
103 * d0, d1, d2, d3 - contain loaded source pixel data
104 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
105 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
106 * d28, d29, d30, d31 - place for storing the result (destination pixels)
107 *
108 * As can be seen above, four 64-bit NEON registers are used for keeping
109 * intermediate pixel data and up to 8 pixels can be processed in one step
110 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
111 *
112 * This particular function uses the following register allocation:
113 * d0, d1, d2, d3 - contain loaded source pixel data
114 * d4, d5 - contain loaded destination pixels (they are needed)
115 * d28, d29 - place for storing the result (destination pixels)
116 */
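
/*
 * For reference, here is a minimal scalar C model of the per-pixel work
 * done by this function (a hedged sketch, not pixman's actual code;
 * helper names are ours, and premultiplied alpha is assumed, as in pixman):
 *
 *   static uint8_t div255 (uint16_t x) // rounded x / 255
 *   {
 *       return (uint8_t)((x + 128 + ((x + 128) >> 8)) >> 8);
 *   }
 *
 *   static uint16_t over_8888_0565 (uint32_t s, uint16_t d)
 *   {
 *       uint8_t sa = s >> 24,         sr = (s >> 16) & 0xff;
 *       uint8_t sg = (s >> 8) & 0xff, sb = s & 0xff;
 *       // expand r5g6b5 to 8 bits per channel, replicating the top bits
 *       uint8_t dr = ((d >> 8) & 0xf8) | ((d >> 13) & 0x07);
 *       uint8_t dg = ((d >> 3) & 0xfc) | ((d >> 9)  & 0x03);
 *       uint8_t db = ((d << 3) & 0xf8) | ((d >> 2)  & 0x07);
 *       uint8_t ia = 255 - sa; // inverted source alpha (the VMVN below)
 *       // the NEON code uses saturating VQADD for these additions
 *       uint8_t r = sr + div255 (ia * dr);
 *       uint8_t g = sg + div255 (ia * dg);
 *       uint8_t b = sb + div255 (ia * db);
 *       return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
 *   }
 */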
117
118 /*
119 * Step one. We need to have some code to do the arithmetic on pixel data.
120 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
121 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
122 * perform all the needed calculations and write the result to {d28, d29}.
123 * The rationale for having two macros and not just one will be explained
124 * later. In practice, any single monolithic function which does the work can
125 * be split into two parts in any arbitrary way without affecting correctness.
126 *
127 * There is one special trick here too. The common template macro can
128 * optionally make our life a bit easier by deinterleaving the R, G, B, A
129 * color components for 32bpp pixel formats (and this feature is used in
130 * the 'pixman_composite_over_8888_0565_asm_neon' function). This means that
131 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
132 * actually use d0 register for blue channel (a vector of eight 8-bit
133 * values), d1 register for green, d2 for red and d3 for alpha. This
134 * simple conversion can also be done with a few NEON instructions:
135 *
136 * Packed to planar conversion:
137 * vuzp.8 d0, d1
138 * vuzp.8 d2, d3
139 * vuzp.8 d1, d3
140 * vuzp.8 d0, d2
141 *
142 * Planar to packed conversion:
143 * vzip.8 d0, d2
144 * vzip.8 d1, d3
145 * vzip.8 d2, d3
146 * vzip.8 d0, d1
147 *
148 * But pixels can be loaded directly in planar format using the VLD4.8 NEON
149 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
150 * desirable; that's why deinterleaving is optional.
151 *
152 * But anyway, here is the code:
153 */
154 .macro pixman_composite_over_8888_0565_process_pixblock_head
155 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
156 and put data into d6 - red, d7 - green, d30 - blue */
157 vshrn.u16 d6, q2, #8
158 vshrn.u16 d7, q2, #3
159 vsli.u16 q2, q2, #5
160 vsri.u8 d6, d6, #5
161 vmvn.8 d3, d3 /* invert source alpha */
162 vsri.u8 d7, d7, #6
163 vshrn.u16 d30, q2, #2
164 /* now do alpha blending, storing results in 8-bit planar format
165 into d16 - red, d19 - green, d18 - blue */
166 vmull.u8 q10, d3, d6
167 vmull.u8 q11, d3, d7
168 vmull.u8 q12, d3, d30
169 vrshr.u16 q13, q10, #8
170 vrshr.u16 q3, q11, #8
171 vrshr.u16 q15, q12, #8
172 vraddhn.u16 d20, q10, q13
173 vraddhn.u16 d23, q11, q3
174 vraddhn.u16 d22, q12, q15
175 .endm
176
177 .macro pixman_composite_over_8888_0565_process_pixblock_tail
178 /* ... continue alpha blending */
179 vqadd.u8 d16, d2, d20
180 vqadd.u8 q9, q0, q11
181 /* convert the result to r5g6b5 and store it into {d28, d29} */
182 vshll.u8 q14, d16, #8
183 vshll.u8 q8, d19, #8
184 vshll.u8 q9, d18, #8
185 vsri.u16 q14, q8, #5
186 vsri.u16 q14, q9, #11
187 .endm
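
/*
 * A note on the VMULL + VRSHR + VRADDHN triple seen in the head macro:
 * together these instructions compute a correctly rounded x / 255 of each
 * 16-bit product. A hedged scalar C equivalent (the helper name is ours,
 * not pixman API):
 *
 *   static uint8_t mul_div255 (uint8_t a, uint8_t b)
 *   {
 *       uint16_t t = (uint16_t)a * b;          // VMULL.U8
 *       uint16_t r = (t + 128) >> 8;           // VRSHR.U16  #8
 *       return (uint8_t)((t + r + 128) >> 8);  // VRADDHN.U16
 *   }
 *
 * This is an exact round-to-nearest of a * b / 255 for all 8-bit inputs.
 */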
188
189 /*
190 * OK, now we have almost everything that we need. Using the above two
191 * macros, the job can already be done correctly. But we want to optimize
192 * it a bit. The ARM Cortex-A8 is an in-order core, and it benefits
193 * a lot from good code scheduling and software pipelining.
194 *
195 * Let's construct some code, which will run in the core main loop.
196 * Some pseudo-code of the main loop will look like this:
197 * head
198 * while (...) {
199 * tail
200 * head
201 * }
202 * tail
203 *
204 * It may look a bit weird, but this setup makes it possible to hide
205 * instruction latencies better and to use the dual-issue capability more
206 * efficiently (pairing load-store and ALU instructions).
207 *
208 * So what we need now is a '*_tail_head' macro, which will be used
209 * in the core main loop. A trivial straightforward implementation
210 * of this macro would look like this:
211 *
212 * pixman_composite_over_8888_0565_process_pixblock_tail
213 * vst1.16 {d28, d29}, [DST_W, :128]!
214 * vld1.16 {d4, d5}, [DST_R, :128]!
215 * vld4.32 {d0, d1, d2, d3}, [SRC]!
216 * pixman_composite_over_8888_0565_process_pixblock_head
217 * cache_preload 8, 8
218 *
219 * Note that it now also contains some VLD/VST instructions. We simply
220 * can't move from one block of pixels to the next with just arithmetic.
221 * The previously processed data needs to be written to memory and new
222 * data needs to be fetched. Fortunately, this main loop does not deal
223 * with partial leading/trailing pixels and can load/store a full block
224 * of pixels in bulk. Additionally, the destination buffer is already
225 * 16-byte aligned here (which is good for performance).
226 *
227 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
228 * are aliases for the ARM registers which are used as pointers for
229 * accessing data. We maintain separate pointers for reading and writing
230 * the destination buffer (DST_R and DST_W).
231 *
232 * Another new thing is the 'cache_preload' macro. It prefetches
233 * data into the CPU L2 cache and improves performance when dealing with
234 * large images which are far larger than the cache. It takes one argument
235 * (actually two, but they need to be the same here) - the number of pixels
236 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
237 * details about this macro. Moreover, if good performance is needed,
238 * the code from this macro needs to be copied into the '*_tail_head' macro
239 * and mixed with the rest of the code for optimal instruction scheduling.
240 * We are actually doing that below.
241 *
242 * Now after all the explanations, here is the optimized code.
243 * Different instruction streams (originating from '*_head', '*_tail'
244 * and 'cache_preload' macro) use different indentation levels for
245 * better readability. Actually taking the code from one of these
246 * indentation levels and ignoring a few VLD/VST instructions would
247 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
248 * macro!
249 */
250
251 #if 1
252
253 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
254 vqadd.u8 d16, d2, d20
255 vld1.16 {d4, d5}, [DST_R, :128]!
256 vqadd.u8 q9, q0, q11
257 vshrn.u16 d6, q2, #8
258 fetch_src_pixblock
259 vshrn.u16 d7, q2, #3
260 vsli.u16 q2, q2, #5
261 vshll.u8 q14, d16, #8
262 PF add PF_X, PF_X, #8
263 vshll.u8 q8, d19, #8
264 PF tst PF_CTL, #0xF
265 vsri.u8 d6, d6, #5
266 PF addne PF_X, PF_X, #8
267 vmvn.8 d3, d3
268 PF subne PF_CTL, PF_CTL, #1
269 vsri.u8 d7, d7, #6
270 vshrn.u16 d30, q2, #2
271 vmull.u8 q10, d3, d6
272 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
273 vmull.u8 q11, d3, d7
274 vmull.u8 q12, d3, d30
275 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
276 vsri.u16 q14, q8, #5
277 PF cmp PF_X, ORIG_W
278 vshll.u8 q9, d18, #8
279 vrshr.u16 q13, q10, #8
280 PF subge PF_X, PF_X, ORIG_W
281 vrshr.u16 q3, q11, #8
282 vrshr.u16 q15, q12, #8
283 PF subges PF_CTL, PF_CTL, #0x10
284 vsri.u16 q14, q9, #11
285 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
286 vraddhn.u16 d20, q10, q13
287 vraddhn.u16 d23, q11, q3
288 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
289 vraddhn.u16 d22, q12, q15
290 vst1.16 {d28, d29}, [DST_W, :128]!
291 .endm
292
293 #else
294
295 /* If we did not care much about the performance, we would just use this... */
296 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
297 pixman_composite_over_8888_0565_process_pixblock_tail
298 vst1.16 {d28, d29}, [DST_W, :128]!
299 vld1.16 {d4, d5}, [DST_R, :128]!
300 fetch_src_pixblock
301 pixman_composite_over_8888_0565_process_pixblock_head
302 cache_preload 8, 8
303 .endm
304
305 #endif
306
307 /*
308 * And now the final part. We are using the 'generate_composite_function'
309 * macro to put all the stuff together. We specify the name of the function
310 * which we want to get, the number of bits per pixel for the source, mask
311 * and destination (0 if unused, like the mask in this case). Next come some
312 * bit flags:
313 * FLAG_DST_READWRITE - tells that the destination buffer is both read
314 * and written; for a write-only buffer we would use
315 * the FLAG_DST_WRITEONLY flag instead
316 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
317 * and separate color channels for 32bpp format.
318 * The next things are:
319 * - the number of pixels processed per iteration (8 in this case, because
320 * that's the maximum that can fit into four 64-bit NEON registers).
321 * - prefetch distance, measured in pixel blocks. In this case it is 5
322 * blocks of 8 pixels, i.e. 40 pixels, or up to 160 bytes. The optimal
323 * prefetch distance can be selected by running some benchmarks.
324 *
325 * After that we specify some macros; these are 'default_init' and
326 * 'default_cleanup' here, which are empty (but it is possible to have custom
327 * init/cleanup macros to save/restore some extra NEON registers
328 * like d8-d15, or do anything else), followed by
329 * 'pixman_composite_over_8888_0565_process_pixblock_head',
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
331 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
332 * which we got implemented above.
333 *
334 * The last part is the NEON register allocation scheme.
335 */
336 generate_composite_function \
337 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
338 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
339 8, /* number of pixels, processed in a single block */ \
340 5, /* prefetch distance */ \
341 default_init, \
342 default_cleanup, \
343 pixman_composite_over_8888_0565_process_pixblock_head, \
344 pixman_composite_over_8888_0565_process_pixblock_tail, \
345 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
346 28, /* dst_w_basereg */ \
347 4, /* dst_r_basereg */ \
348 0, /* src_basereg */ \
349 24 /* mask_basereg */
350
351 /******************************************************************************/
352
353 .macro pixman_composite_over_n_0565_process_pixblock_head
354 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
355 and put data into d6 - red, d7 - green, d30 - blue */
356 vshrn.u16 d6, q2, #8
357 vshrn.u16 d7, q2, #3
358 vsli.u16 q2, q2, #5
359 vsri.u8 d6, d6, #5
360 vsri.u8 d7, d7, #6
361 vshrn.u16 d30, q2, #2
362 /* now do alpha blending, storing results in 8-bit planar format
363 into d16 - red, d19 - green, d18 - blue */
364 vmull.u8 q10, d3, d6
365 vmull.u8 q11, d3, d7
366 vmull.u8 q12, d3, d30
367 vrshr.u16 q13, q10, #8
368 vrshr.u16 q3, q11, #8
369 vrshr.u16 q15, q12, #8
370 vraddhn.u16 d20, q10, q13
371 vraddhn.u16 d23, q11, q3
372 vraddhn.u16 d22, q12, q15
373 .endm
374
375 .macro pixman_composite_over_n_0565_process_pixblock_tail
376 /* ... continue alpha blending */
377 vqadd.u8 d16, d2, d20
378 vqadd.u8 q9, q0, q11
379 /* convert the result to r5g6b5 and store it into {d28, d29} */
380 vshll.u8 q14, d16, #8
381 vshll.u8 q8, d19, #8
382 vshll.u8 q9, d18, #8
383 vsri.u16 q14, q8, #5
384 vsri.u16 q14, q9, #11
385 .endm
386
387 /* TODO: expand macros and do better instruction scheduling */
388 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
389 pixman_composite_over_n_0565_process_pixblock_tail
390 vld1.16 {d4, d5}, [DST_R, :128]!
391 vst1.16 {d28, d29}, [DST_W, :128]!
392 pixman_composite_over_n_0565_process_pixblock_head
393 cache_preload 8, 8
394 .endm
395
396 .macro pixman_composite_over_n_0565_init
397 add DUMMY, sp, #ARGS_STACK_OFFSET
398 vld1.32 {d3[0]}, [DUMMY]
399 vdup.8 d0, d3[0]
400 vdup.8 d1, d3[1]
401 vdup.8 d2, d3[2]
402 vdup.8 d3, d3[3]
403 vmvn.8 d3, d3 /* invert source alpha */
404 .endm
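
/*
 * What the init macro above computes, in hedged C terms (little-endian
 * a8r8g8b8, so byte lane 0 of d3 is blue after the VLD1.32):
 *
 *   uint32_t s  = solid_src;        // VLD1.32 {d3[0]}, from the stack
 *   uint8_t  b  = s & 0xff;         // VDUP.8 d0, d3[0]
 *   uint8_t  g  = (s >> 8)  & 0xff; // VDUP.8 d1, d3[1]
 *   uint8_t  r  = (s >> 16) & 0xff; // VDUP.8 d2, d3[2]
 *   uint8_t  ia = 255 - (s >> 24);  // VDUP.8 d3, d3[3]; VMVN.8 d3, d3
 *
 * Each value is broadcast to all 8 lanes of its d register, so the main
 * loop can treat the solid color exactly like deinterleaved source data.
 */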
405
406 generate_composite_function \
407 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
408 FLAG_DST_READWRITE, \
409 8, /* number of pixels, processed in a single block */ \
410 5, /* prefetch distance */ \
411 pixman_composite_over_n_0565_init, \
412 default_cleanup, \
413 pixman_composite_over_n_0565_process_pixblock_head, \
414 pixman_composite_over_n_0565_process_pixblock_tail, \
415 pixman_composite_over_n_0565_process_pixblock_tail_head, \
416 28, /* dst_w_basereg */ \
417 4, /* dst_r_basereg */ \
418 0, /* src_basereg */ \
419 24 /* mask_basereg */
420
421 /******************************************************************************/
422
423 .macro pixman_composite_src_8888_0565_process_pixblock_head
424 vshll.u8 q8, d1, #8
425 vshll.u8 q14, d2, #8
426 vshll.u8 q9, d0, #8
427 .endm
428
429 .macro pixman_composite_src_8888_0565_process_pixblock_tail
430 vsri.u16 q14, q8, #5
431 vsri.u16 q14, q9, #11
432 .endm
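
/*
 * The head/tail pair above is just an a8r8g8b8 -> r5g6b5 repack, done by
 * the VSHLL/VSRI chain. In hedged scalar C:
 *
 *   static uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
 *   {
 *       return (uint16_t)(((r & 0xf8) << 8) |
 *                         ((g & 0xfc) << 3) |
 *                         ( b         >> 3));
 *   }
 */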
433
434 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
435 vsri.u16 q14, q8, #5
436 PF add PF_X, PF_X, #8
437 PF tst PF_CTL, #0xF
438 fetch_src_pixblock
439 PF addne PF_X, PF_X, #8
440 PF subne PF_CTL, PF_CTL, #1
441 vsri.u16 q14, q9, #11
442 PF cmp PF_X, ORIG_W
443 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
444 vshll.u8 q8, d1, #8
445 vst1.16 {d28, d29}, [DST_W, :128]!
446 PF subge PF_X, PF_X, ORIG_W
447 PF subges PF_CTL, PF_CTL, #0x10
448 vshll.u8 q14, d2, #8
449 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
450 vshll.u8 q9, d0, #8
451 .endm
452
453 generate_composite_function \
454 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
455 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
456 8, /* number of pixels, processed in a single block */ \
457 10, /* prefetch distance */ \
458 default_init, \
459 default_cleanup, \
460 pixman_composite_src_8888_0565_process_pixblock_head, \
461 pixman_composite_src_8888_0565_process_pixblock_tail, \
462 pixman_composite_src_8888_0565_process_pixblock_tail_head
463
464 /******************************************************************************/
465
466 .macro pixman_composite_src_0565_8888_process_pixblock_head
467 vshrn.u16 d30, q0, #8
468 vshrn.u16 d29, q0, #3
469 vsli.u16 q0, q0, #5
470 vmov.u8 d31, #255
471 vsri.u8 d30, d30, #5
472 vsri.u8 d29, d29, #6
473 vshrn.u16 d28, q0, #2
474 .endm
475
476 .macro pixman_composite_src_0565_8888_process_pixblock_tail
477 .endm
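
/*
 * The head macro above does the whole r5g6b5 -> a8r8g8b8 conversion: each
 * 5/6-bit channel is widened to 8 bits with its top bits replicated into
 * the low bits, and alpha is forced to 255 (the VMOV d31, #255). A hedged
 * C model:
 *
 *   static uint32_t expand_0565 (uint16_t p)
 *   {
 *       uint32_t r5 = (p >> 11) & 0x1f, g6 = (p >> 5) & 0x3f, b5 = p & 0x1f;
 *       uint32_t r = (r5 << 3) | (r5 >> 2);
 *       uint32_t g = (g6 << 2) | (g6 >> 4);
 *       uint32_t b = (b5 << 3) | (b5 >> 2);
 *       return 0xff000000u | (r << 16) | (g << 8) | b;
 *   }
 */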
478
479 /* TODO: expand macros and do better instruction scheduling */
480 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
481 pixman_composite_src_0565_8888_process_pixblock_tail
482 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
483 fetch_src_pixblock
484 pixman_composite_src_0565_8888_process_pixblock_head
485 cache_preload 8, 8
486 .endm
487
488 generate_composite_function \
489 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
490 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
491 8, /* number of pixels, processed in a single block */ \
492 10, /* prefetch distance */ \
493 default_init, \
494 default_cleanup, \
495 pixman_composite_src_0565_8888_process_pixblock_head, \
496 pixman_composite_src_0565_8888_process_pixblock_tail, \
497 pixman_composite_src_0565_8888_process_pixblock_tail_head
498
499 /******************************************************************************/
500
501 .macro pixman_composite_add_8_8_process_pixblock_head
502 vqadd.u8 q14, q0, q2
503 vqadd.u8 q15, q1, q3
504 .endm
505
506 .macro pixman_composite_add_8_8_process_pixblock_tail
507 .endm
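
/*
 * ADD is a per-byte saturating addition (VQADD.U8). The scalar equivalent
 * (a hedged sketch):
 *
 *   static uint8_t add_sat_u8 (uint8_t a, uint8_t b)
 *   {
 *       unsigned t = (unsigned)a + b;
 *       return (uint8_t)(t > 255 ? 255 : t);
 *   }
 */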
508
509 .macro pixman_composite_add_8_8_process_pixblock_tail_head
510 fetch_src_pixblock
511 PF add PF_X, PF_X, #32
512 PF tst PF_CTL, #0xF
513 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
514 PF addne PF_X, PF_X, #32
515 PF subne PF_CTL, PF_CTL, #1
516 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
517 PF cmp PF_X, ORIG_W
518 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
519 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
520 PF subge PF_X, PF_X, ORIG_W
521 PF subges PF_CTL, PF_CTL, #0x10
522 vqadd.u8 q14, q0, q2
523 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
524 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
525 vqadd.u8 q15, q1, q3
526 .endm
527
528 generate_composite_function \
529 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
530 FLAG_DST_READWRITE, \
531 32, /* number of pixels, processed in a single block */ \
532 10, /* prefetch distance */ \
533 default_init, \
534 default_cleanup, \
535 pixman_composite_add_8_8_process_pixblock_head, \
536 pixman_composite_add_8_8_process_pixblock_tail, \
537 pixman_composite_add_8_8_process_pixblock_tail_head
538
539 /******************************************************************************/
540
541 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
542 fetch_src_pixblock
543 PF add PF_X, PF_X, #8
544 PF tst PF_CTL, #0xF
545 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
546 PF addne PF_X, PF_X, #8
547 PF subne PF_CTL, PF_CTL, #1
548 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
549 PF cmp PF_X, ORIG_W
550 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
551 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
552 PF subge PF_X, PF_X, ORIG_W
553 PF subges PF_CTL, PF_CTL, #0x10
554 vqadd.u8 q14, q0, q2
555 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
556 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
557 vqadd.u8 q15, q1, q3
558 .endm
559
560 generate_composite_function \
561 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
562 FLAG_DST_READWRITE, \
563 8, /* number of pixels, processed in a single block */ \
564 10, /* prefetch distance */ \
565 default_init, \
566 default_cleanup, \
567 pixman_composite_add_8_8_process_pixblock_head, \
568 pixman_composite_add_8_8_process_pixblock_tail, \
569 pixman_composite_add_8888_8888_process_pixblock_tail_head
570
571 generate_composite_function_single_scanline \
572 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
573 FLAG_DST_READWRITE, \
574 8, /* number of pixels, processed in a single block */ \
575 default_init, \
576 default_cleanup, \
577 pixman_composite_add_8_8_process_pixblock_head, \
578 pixman_composite_add_8_8_process_pixblock_tail, \
579 pixman_composite_add_8888_8888_process_pixblock_tail_head
580
581 /******************************************************************************/
582
583 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
584 vmvn.8 d24, d3 /* get inverted alpha */
585 /* do alpha blending */
586 vmull.u8 q8, d24, d4
587 vmull.u8 q9, d24, d5
588 vmull.u8 q10, d24, d6
589 vmull.u8 q11, d24, d7
590 .endm
591
592 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
593 vrshr.u16 q14, q8, #8
594 vrshr.u16 q15, q9, #8
595 vrshr.u16 q12, q10, #8
596 vrshr.u16 q13, q11, #8
597 vraddhn.u16 d28, q14, q8
598 vraddhn.u16 d29, q15, q9
599 vraddhn.u16 d30, q12, q10
600 vraddhn.u16 d31, q13, q11
601 .endm
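
/*
 * OUT_REVERSE keeps the destination only where the source is transparent:
 * d' = d * (255 - sa) / 255 per channel. In hedged C, reusing the
 * mul_div255 helper sketched earlier in this file:
 *
 *   d[i] = mul_div255 (d[i], 255 - sa);  // VMVN + VMULL + VRSHR + VRADDHN
 */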
602
603 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
604 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
605 vrshr.u16 q14, q8, #8
606 PF add PF_X, PF_X, #8
607 PF tst PF_CTL, #0xF
608 vrshr.u16 q15, q9, #8
609 vrshr.u16 q12, q10, #8
610 vrshr.u16 q13, q11, #8
611 PF addne PF_X, PF_X, #8
612 PF subne PF_CTL, PF_CTL, #1
613 vraddhn.u16 d28, q14, q8
614 vraddhn.u16 d29, q15, q9
615 PF cmp PF_X, ORIG_W
616 vraddhn.u16 d30, q12, q10
617 vraddhn.u16 d31, q13, q11
618 fetch_src_pixblock
619 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
620 vmvn.8 d22, d3
621 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
622 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
623 PF subge PF_X, PF_X, ORIG_W
624 vmull.u8 q8, d22, d4
625 PF subges PF_CTL, PF_CTL, #0x10
626 vmull.u8 q9, d22, d5
627 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
628 vmull.u8 q10, d22, d6
629 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
630 vmull.u8 q11, d22, d7
631 .endm
632
633 generate_composite_function_single_scanline \
634 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
635 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
636 8, /* number of pixels, processed in a single block */ \
637 default_init, \
638 default_cleanup, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
640 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
641 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
642
643 /******************************************************************************/
644
645 .macro pixman_composite_over_8888_8888_process_pixblock_head
646 pixman_composite_out_reverse_8888_8888_process_pixblock_head
647 .endm
648
649 .macro pixman_composite_over_8888_8888_process_pixblock_tail
650 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
651 vqadd.u8 q14, q0, q14
652 vqadd.u8 q15, q1, q15
653 .endm
654
655 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
656 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
657 vrshr.u16 q14, q8, #8
658 PF add PF_X, PF_X, #8
659 PF tst PF_CTL, #0xF
660 vrshr.u16 q15, q9, #8
661 vrshr.u16 q12, q10, #8
662 vrshr.u16 q13, q11, #8
663 PF addne PF_X, PF_X, #8
664 PF subne PF_CTL, PF_CTL, #1
665 vraddhn.u16 d28, q14, q8
666 vraddhn.u16 d29, q15, q9
667 PF cmp PF_X, ORIG_W
668 vraddhn.u16 d30, q12, q10
669 vraddhn.u16 d31, q13, q11
670 vqadd.u8 q14, q0, q14
671 vqadd.u8 q15, q1, q15
672 fetch_src_pixblock
673 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
674 vmvn.8 d22, d3
675 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
676 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
677 PF subge PF_X, PF_X, ORIG_W
678 vmull.u8 q8, d22, d4
679 PF subges PF_CTL, PF_CTL, #0x10
680 vmull.u8 q9, d22, d5
681 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
682 vmull.u8 q10, d22, d6
683 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
684 vmull.u8 q11, d22, d7
685 .endm
686
687 generate_composite_function \
688 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
689 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
690 8, /* number of pixels, processed in a single block */ \
691 5, /* prefetch distance */ \
692 default_init, \
693 default_cleanup, \
694 pixman_composite_over_8888_8888_process_pixblock_head, \
695 pixman_composite_over_8888_8888_process_pixblock_tail, \
696 pixman_composite_over_8888_8888_process_pixblock_tail_head
697
698 generate_composite_function_single_scanline \
699 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
700 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
701 8, /* number of pixels, processed in a single block */ \
702 default_init, \
703 default_cleanup, \
704 pixman_composite_over_8888_8888_process_pixblock_head, \
705 pixman_composite_over_8888_8888_process_pixblock_tail, \
706 pixman_composite_over_8888_8888_process_pixblock_tail_head
707
708 /******************************************************************************/
709
710 .macro pixman_composite_over_n_8888_process_pixblock_head
711 /* deinterleaved source pixels in {d0, d1, d2, d3} */
712 /* inverted alpha in {d24} */
713 /* destination pixels in {d4, d5, d6, d7} */
714 vmull.u8 q8, d24, d4
715 vmull.u8 q9, d24, d5
716 vmull.u8 q10, d24, d6
717 vmull.u8 q11, d24, d7
718 .endm
719
720 .macro pixman_composite_over_n_8888_process_pixblock_tail
721 vrshr.u16 q14, q8, #8
722 vrshr.u16 q15, q9, #8
723 vrshr.u16 q2, q10, #8
724 vrshr.u16 q3, q11, #8
725 vraddhn.u16 d28, q14, q8
726 vraddhn.u16 d29, q15, q9
727 vraddhn.u16 d30, q2, q10
728 vraddhn.u16 d31, q3, q11
729 vqadd.u8 q14, q0, q14
730 vqadd.u8 q15, q1, q15
731 .endm
732
733 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
734 vrshr.u16 q14, q8, #8
735 vrshr.u16 q15, q9, #8
736 vrshr.u16 q2, q10, #8
737 vrshr.u16 q3, q11, #8
738 vraddhn.u16 d28, q14, q8
739 vraddhn.u16 d29, q15, q9
740 vraddhn.u16 d30, q2, q10
741 vraddhn.u16 d31, q3, q11
742 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
743 vqadd.u8 q14, q0, q14
744 PF add PF_X, PF_X, #8
745 PF tst PF_CTL, #0x0F
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vqadd.u8 q15, q1, q15
749 PF cmp PF_X, ORIG_W
750 vmull.u8 q8, d24, d4
751 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
752 vmull.u8 q9, d24, d5
753 PF subge PF_X, PF_X, ORIG_W
754 vmull.u8 q10, d24, d6
755 PF subges PF_CTL, PF_CTL, #0x10
756 vmull.u8 q11, d24, d7
757 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
759 .endm
760
761 .macro pixman_composite_over_n_8888_init
762 add DUMMY, sp, #ARGS_STACK_OFFSET
763 vld1.32 {d3[0]}, [DUMMY]
764 vdup.8 d0, d3[0]
765 vdup.8 d1, d3[1]
766 vdup.8 d2, d3[2]
767 vdup.8 d3, d3[3]
768 vmvn.8 d24, d3 /* get inverted alpha */
769 .endm
770
771 generate_composite_function \
772 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
773 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
774 8, /* number of pixels, processed in a single block */ \
775 5, /* prefetch distance */ \
776 pixman_composite_over_n_8888_init, \
777 default_cleanup, \
778 pixman_composite_over_8888_8888_process_pixblock_head, \
779 pixman_composite_over_8888_8888_process_pixblock_tail, \
780 pixman_composite_over_n_8888_process_pixblock_tail_head
781
782 /******************************************************************************/
783
784 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
785 vrshr.u16 q14, q8, #8
786 PF add PF_X, PF_X, #8
787 PF tst PF_CTL, #0xF
788 vrshr.u16 q15, q9, #8
789 vrshr.u16 q12, q10, #8
790 vrshr.u16 q13, q11, #8
791 PF addne PF_X, PF_X, #8
792 PF subne PF_CTL, PF_CTL, #1
793 vraddhn.u16 d28, q14, q8
794 vraddhn.u16 d29, q15, q9
795 PF cmp PF_X, ORIG_W
796 vraddhn.u16 d30, q12, q10
797 vraddhn.u16 d31, q13, q11
798 vqadd.u8 q14, q0, q14
799 vqadd.u8 q15, q1, q15
800 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
801 vmvn.8 d22, d3
802 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
803 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
804 PF subge PF_X, PF_X, ORIG_W
805 vmull.u8 q8, d22, d4
806 PF subges PF_CTL, PF_CTL, #0x10
807 vmull.u8 q9, d22, d5
808 vmull.u8 q10, d22, d6
809 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
810 vmull.u8 q11, d22, d7
811 .endm
812
813 .macro pixman_composite_over_reverse_n_8888_init
814 add DUMMY, sp, #ARGS_STACK_OFFSET
815 vld1.32 {d7[0]}, [DUMMY]
816 vdup.8 d4, d7[0]
817 vdup.8 d5, d7[1]
818 vdup.8 d6, d7[2]
819 vdup.8 d7, d7[3]
820 .endm
821
822 generate_composite_function \
823 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
824 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
825 8, /* number of pixels, processed in a single block */ \
826 5, /* prefetch distance */ \
827 pixman_composite_over_reverse_n_8888_init, \
828 default_cleanup, \
829 pixman_composite_over_8888_8888_process_pixblock_head, \
830 pixman_composite_over_8888_8888_process_pixblock_tail, \
831 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
832 28, /* dst_w_basereg */ \
833 0, /* dst_r_basereg */ \
834 4, /* src_basereg */ \
835 24 /* mask_basereg */
836
837 /******************************************************************************/
838
839 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
840 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
841 vmull.u8 q1, d24, d9
842 vmull.u8 q6, d24, d10
843 vmull.u8 q7, d24, d11
844 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
845 vshrn.u16 d7, q2, #3
846 vsli.u16 q2, q2, #5
847 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
848 vrshr.u16 q9, q1, #8
849 vrshr.u16 q10, q6, #8
850 vrshr.u16 q11, q7, #8
851 vraddhn.u16 d0, q0, q8
852 vraddhn.u16 d1, q1, q9
853 vraddhn.u16 d2, q6, q10
854 vraddhn.u16 d3, q7, q11
855 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
856 vsri.u8 d7, d7, #6
857 vmvn.8 d3, d3
858 vshrn.u16 d30, q2, #2
859 vmull.u8 q8, d3, d6 /* now do alpha blending */
860 vmull.u8 q9, d3, d7
861 vmull.u8 q10, d3, d30
862 .endm
863
864 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
865 /* 3 cycle bubble (after vmull.u8) */
866 vrshr.u16 q13, q8, #8
867 vrshr.u16 q11, q9, #8
868 vrshr.u16 q15, q10, #8
869 vraddhn.u16 d16, q8, q13
870 vraddhn.u16 d27, q9, q11
871 vraddhn.u16 d26, q10, q15
872 vqadd.u8 d16, d2, d16
873 /* 1 cycle bubble */
874 vqadd.u8 q9, q0, q13
875 vshll.u8 q14, d16, #8 /* convert to 16bpp */
876 vshll.u8 q8, d19, #8
877 vshll.u8 q9, d18, #8
878 vsri.u16 q14, q8, #5
879 /* 1 cycle bubble */
880 vsri.u16 q14, q9, #11
881 .endm
882
883 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
884 vld1.16 {d4, d5}, [DST_R, :128]!
885 vshrn.u16 d6, q2, #8
886 fetch_mask_pixblock
887 vshrn.u16 d7, q2, #3
888 fetch_src_pixblock
889 vmull.u8 q6, d24, d10
890 vrshr.u16 q13, q8, #8
891 vrshr.u16 q11, q9, #8
892 vrshr.u16 q15, q10, #8
893 vraddhn.u16 d16, q8, q13
894 vraddhn.u16 d27, q9, q11
895 vraddhn.u16 d26, q10, q15
896 vqadd.u8 d16, d2, d16
897 vmull.u8 q1, d24, d9
898 vqadd.u8 q9, q0, q13
899 vshll.u8 q14, d16, #8
900 vmull.u8 q0, d24, d8
901 vshll.u8 q8, d19, #8
902 vshll.u8 q9, d18, #8
903 vsri.u16 q14, q8, #5
904 vmull.u8 q7, d24, d11
905 vsri.u16 q14, q9, #11
906
907 cache_preload 8, 8
908
909 vsli.u16 q2, q2, #5
910 vrshr.u16 q8, q0, #8
911 vrshr.u16 q9, q1, #8
912 vrshr.u16 q10, q6, #8
913 vrshr.u16 q11, q7, #8
914 vraddhn.u16 d0, q0, q8
915 vraddhn.u16 d1, q1, q9
916 vraddhn.u16 d2, q6, q10
917 vraddhn.u16 d3, q7, q11
918 vsri.u8 d6, d6, #5
919 vsri.u8 d7, d7, #6
920 vmvn.8 d3, d3
921 vshrn.u16 d30, q2, #2
922 vst1.16 {d28, d29}, [DST_W, :128]!
923 vmull.u8 q8, d3, d6
924 vmull.u8 q9, d3, d7
925 vmull.u8 q10, d3, d30
926 .endm
927
928 generate_composite_function \
929 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
930 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
931 8, /* number of pixels, processed in a single block */ \
932 5, /* prefetch distance */ \
933 default_init_need_all_regs, \
934 default_cleanup_need_all_regs, \
935 pixman_composite_over_8888_8_0565_process_pixblock_head, \
936 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
937 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
938 28, /* dst_w_basereg */ \
939 4, /* dst_r_basereg */ \
940 8, /* src_basereg */ \
941 24 /* mask_basereg */
942
943 /******************************************************************************/
944
945 /*
946 * This function needs a special initialization of the solid mask.
947 * Solid source pixel data is fetched from the stack at ARGS_STACK_OFFSET,
948 * split into color components and replicated in the d8-d11
949 * registers. Additionally, this function needs all the NEON registers,
950 * so it has to save the d8-d15 registers, which are callee-saved according
951 * to the ABI. These registers are restored in the 'cleanup' macro. All the
952 * other NEON registers are caller-saved, so they can be clobbered freely
953 * without introducing any problems.
954 */
955 .macro pixman_composite_over_n_8_0565_init
956 add DUMMY, sp, #ARGS_STACK_OFFSET
957 .vsave {d8-d15}
958 vpush {d8-d15}
959 vld1.32 {d11[0]}, [DUMMY]
960 vdup.8 d8, d11[0]
961 vdup.8 d9, d11[1]
962 vdup.8 d10, d11[2]
963 vdup.8 d11, d11[3]
964 .endm
965
966 .macro pixman_composite_over_n_8_0565_cleanup
967 vpop {d8-d15}
968 .endm
969
970 generate_composite_function \
971 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
972 FLAG_DST_READWRITE, \
973 8, /* number of pixels, processed in a single block */ \
974 5, /* prefetch distance */ \
975 pixman_composite_over_n_8_0565_init, \
976 pixman_composite_over_n_8_0565_cleanup, \
977 pixman_composite_over_8888_8_0565_process_pixblock_head, \
978 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
979 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
980
981 /******************************************************************************/
982
983 .macro pixman_composite_over_8888_n_0565_init
984 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
985 .vsave {d8-d15}
986 vpush {d8-d15}
987 vld1.32 {d24[0]}, [DUMMY]
988 vdup.8 d24, d24[3]
989 .endm
990
991 .macro pixman_composite_over_8888_n_0565_cleanup
992 vpop {d8-d15}
993 .endm
994
995 generate_composite_function \
996 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
997 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
998 8, /* number of pixels, processed in a single block */ \
999 5, /* prefetch distance */ \
1000 pixman_composite_over_8888_n_0565_init, \
1001 pixman_composite_over_8888_n_0565_cleanup, \
1002 pixman_composite_over_8888_8_0565_process_pixblock_head, \
1003 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1004 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1005 28, /* dst_w_basereg */ \
1006 4, /* dst_r_basereg */ \
1007 8, /* src_basereg */ \
1008 24 /* mask_basereg */
1009
1010 /******************************************************************************/
1011
1012 .macro pixman_composite_src_0565_0565_process_pixblock_head
1013 .endm
1014
1015 .macro pixman_composite_src_0565_0565_process_pixblock_tail
1016 .endm
1017
1018 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1019 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1020 fetch_src_pixblock
1021 cache_preload 16, 16
1022 .endm
1023
1024 generate_composite_function \
1025 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
1026 FLAG_DST_WRITEONLY, \
1027 16, /* number of pixels, processed in a single block */ \
1028 10, /* prefetch distance */ \
1029 default_init, \
1030 default_cleanup, \
1031 pixman_composite_src_0565_0565_process_pixblock_head, \
1032 pixman_composite_src_0565_0565_process_pixblock_tail, \
1033 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
1034 0, /* dst_w_basereg */ \
1035 0, /* dst_r_basereg */ \
1036 0, /* src_basereg */ \
1037 0 /* mask_basereg */
1038
1039 /******************************************************************************/
1040
1041 .macro pixman_composite_src_n_8_process_pixblock_head
1042 .endm
1043
1044 .macro pixman_composite_src_n_8_process_pixblock_tail
1045 .endm
1046
1047 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1048 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
1049 .endm
1050
1051 .macro pixman_composite_src_n_8_init
1052 add DUMMY, sp, #ARGS_STACK_OFFSET
1053 vld1.32 {d0[0]}, [DUMMY]
1054 vsli.u64 d0, d0, #8
1055 vsli.u64 d0, d0, #16
1056 vsli.u64 d0, d0, #32
1057 vorr d1, d0, d0
1058 vorr q1, q0, q0
1059 .endm
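
/*
 * The three VSLI steps in the init macro replicate the low byte across all
 * 64 bits (VSLI shifts left and keeps the low 'shift' bits of the
 * destination). In hedged C terms:
 *
 *   uint64_t v = solid & 0xff;
 *   v = (v << 8)  | (v & 0xff);        // VSLI.U64 #8
 *   v = (v << 16) | (v & 0xffff);      // VSLI.U64 #16
 *   v = (v << 32) | (v & 0xffffffff);  // VSLI.U64 #32
 *
 * After the VORRs, d0-d3 all hold the 8-bit color replicated 32 times, so
 * a whole 32-pixel block can be stored per iteration.
 */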
1060
1061 .macro pixman_composite_src_n_8_cleanup
1062 .endm
1063
1064 generate_composite_function \
1065 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1066 FLAG_DST_WRITEONLY, \
1067 32, /* number of pixels, processed in a single block */ \
1068 0, /* prefetch distance */ \
1069 pixman_composite_src_n_8_init, \
1070 pixman_composite_src_n_8_cleanup, \
1071 pixman_composite_src_n_8_process_pixblock_head, \
1072 pixman_composite_src_n_8_process_pixblock_tail, \
1073 pixman_composite_src_n_8_process_pixblock_tail_head, \
1074 0, /* dst_w_basereg */ \
1075 0, /* dst_r_basereg */ \
1076 0, /* src_basereg */ \
1077 0 /* mask_basereg */
1078
1079 /******************************************************************************/
1080
1081 .macro pixman_composite_src_n_0565_process_pixblock_head
1082 .endm
1083
1084 .macro pixman_composite_src_n_0565_process_pixblock_tail
1085 .endm
1086
1087 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1088 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1089 .endm
1090
1091 .macro pixman_composite_src_n_0565_init
1092 add DUMMY, sp, #ARGS_STACK_OFFSET
1093 vld1.32 {d0[0]}, [DUMMY]
1094 vsli.u64 d0, d0, #16
1095 vsli.u64 d0, d0, #32
1096 vorr d1, d0, d0
1097 vorr q1, q0, q0
1098 .endm
1099
1100 .macro pixman_composite_src_n_0565_cleanup
1101 .endm
1102
1103 generate_composite_function \
1104 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1105 FLAG_DST_WRITEONLY, \
1106 16, /* number of pixels, processed in a single block */ \
1107 0, /* prefetch distance */ \
1108 pixman_composite_src_n_0565_init, \
1109 pixman_composite_src_n_0565_cleanup, \
1110 pixman_composite_src_n_0565_process_pixblock_head, \
1111 pixman_composite_src_n_0565_process_pixblock_tail, \
1112 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1113 0, /* dst_w_basereg */ \
1114 0, /* dst_r_basereg */ \
1115 0, /* src_basereg */ \
1116 0 /* mask_basereg */
1117
1118 /******************************************************************************/
1119
1120 .macro pixman_composite_src_n_8888_process_pixblock_head
1121 .endm
1122
1123 .macro pixman_composite_src_n_8888_process_pixblock_tail
1124 .endm
1125
1126 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1127 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1128 .endm
1129
1130 .macro pixman_composite_src_n_8888_init
1131 add DUMMY, sp, #ARGS_STACK_OFFSET
1132 vld1.32 {d0[0]}, [DUMMY]
1133 vsli.u64 d0, d0, #32
1134 vorr d1, d0, d0
1135 vorr q1, q0, q0
1136 .endm
1137
1138 .macro pixman_composite_src_n_8888_cleanup
1139 .endm
1140
1141 generate_composite_function \
1142 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1143 FLAG_DST_WRITEONLY, \
1144 8, /* number of pixels, processed in a single block */ \
1145 0, /* prefetch distance */ \
1146 pixman_composite_src_n_8888_init, \
1147 pixman_composite_src_n_8888_cleanup, \
1148 pixman_composite_src_n_8888_process_pixblock_head, \
1149 pixman_composite_src_n_8888_process_pixblock_tail, \
1150 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1151 0, /* dst_w_basereg */ \
1152 0, /* dst_r_basereg */ \
1153 0, /* src_basereg */ \
1154 0 /* mask_basereg */
1155
1156 /******************************************************************************/
1157
1158 .macro pixman_composite_src_8888_8888_process_pixblock_head
1159 .endm
1160
1161 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1162 .endm
1163
1164 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1165 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1166 fetch_src_pixblock
1167 cache_preload 8, 8
1168 .endm
1169
1170 generate_composite_function \
1171 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1172 FLAG_DST_WRITEONLY, \
1173 8, /* number of pixels, processed in a single block */ \
1174 10, /* prefetch distance */ \
1175 default_init, \
1176 default_cleanup, \
1177 pixman_composite_src_8888_8888_process_pixblock_head, \
1178 pixman_composite_src_8888_8888_process_pixblock_tail, \
1179 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1180 0, /* dst_w_basereg */ \
1181 0, /* dst_r_basereg */ \
1182 0, /* src_basereg */ \
1183 0 /* mask_basereg */
1184
1185 /******************************************************************************/
1186
1187 .macro pixman_composite_src_x888_8888_process_pixblock_head
1188 vorr q0, q0, q2
1189 vorr q1, q1, q2
1190 .endm
1191
1192 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1193 .endm
1194
1195 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1196 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1197 fetch_src_pixblock
1198 vorr q0, q0, q2
1199 vorr q1, q1, q2
1200 cache_preload 8, 8
1201 .endm
1202
1203 .macro pixman_composite_src_x888_8888_init
1204 vmov.u8 q2, #0xFF
1205 vshl.u32 q2, q2, #24
1206 .endm
1207
1208 generate_composite_function \
1209 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1210 FLAG_DST_WRITEONLY, \
1211 8, /* number of pixels, processed in a single block */ \
1212 10, /* prefetch distance */ \
1213 pixman_composite_src_x888_8888_init, \
1214 default_cleanup, \
1215 pixman_composite_src_x888_8888_process_pixblock_head, \
1216 pixman_composite_src_x888_8888_process_pixblock_tail, \
1217 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1218 0, /* dst_w_basereg */ \
1219 0, /* dst_r_basereg */ \
1220 0, /* src_basereg */ \
1221 0 /* mask_basereg */
1222
1223 /******************************************************************************/
1224
1225 .macro pixman_composite_src_n_8_8888_process_pixblock_head
1226 /* expecting solid source in {d0, d1, d2, d3} */
1227 /* mask is in d24 (d25, d26, d27 are unused) */
1228
1229 /* in */
1230 vmull.u8 q8, d24, d0
1231 vmull.u8 q9, d24, d1
1232 vmull.u8 q10, d24, d2
1233 vmull.u8 q11, d24, d3
1234 vrsra.u16 q8, q8, #8
1235 vrsra.u16 q9, q9, #8
1236 vrsra.u16 q10, q10, #8
1237 vrsra.u16 q11, q11, #8
1238 .endm
1239
1240 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
1241 vrshrn.u16 d28, q8, #8
1242 vrshrn.u16 d29, q9, #8
1243 vrshrn.u16 d30, q10, #8
1244 vrshrn.u16 d31, q11, #8
1245 .endm
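
/*
 * The VMULL + VRSRA + VRSHRN sequence used by this function is another
 * spelling of the same rounded x / 255 as the VRSHR + VRADDHN variant used
 * elsewhere in this file; both compute (t + ((t + 128) >> 8) + 128) >> 8.
 * In hedged C:
 *
 *   uint16_t t = (uint16_t)m * c;           // VMULL.U8
 *   t += (t + 128) >> 8;                    // VRSRA.U16  #8
 *   uint8_t r = (uint8_t)((t + 128) >> 8);  // VRSHRN.U16 #8
 */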
1246
1247 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1248 fetch_mask_pixblock
1249 PF add PF_X, PF_X, #8
1250 vrshrn.u16 d28, q8, #8
1251 PF tst PF_CTL, #0x0F
1252 vrshrn.u16 d29, q9, #8
1253 PF addne PF_X, PF_X, #8
1254 vrshrn.u16 d30, q10, #8
1255 PF subne PF_CTL, PF_CTL, #1
1256 vrshrn.u16 d31, q11, #8
1257 PF cmp PF_X, ORIG_W
1258 vmull.u8 q8, d24, d0
1259 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1260 vmull.u8 q9, d24, d1
1261 PF subge PF_X, PF_X, ORIG_W
1262 vmull.u8 q10, d24, d2
1263 PF subges PF_CTL, PF_CTL, #0x10
1264 vmull.u8 q11, d24, d3
1265 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1266 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1267 vrsra.u16 q8, q8, #8
1268 vrsra.u16 q9, q9, #8
1269 vrsra.u16 q10, q10, #8
1270 vrsra.u16 q11, q11, #8
1271 .endm
1272
1273 .macro pixman_composite_src_n_8_8888_init
1274 add DUMMY, sp, #ARGS_STACK_OFFSET
1275 vld1.32 {d3[0]}, [DUMMY]
1276 vdup.8 d0, d3[0]
1277 vdup.8 d1, d3[1]
1278 vdup.8 d2, d3[2]
1279 vdup.8 d3, d3[3]
1280 .endm
1281
1282 .macro pixman_composite_src_n_8_8888_cleanup
1283 .endm
1284
1285 generate_composite_function \
1286 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
1287 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1288 8, /* number of pixels, processed in a single block */ \
1289 5, /* prefetch distance */ \
1290 pixman_composite_src_n_8_8888_init, \
1291 pixman_composite_src_n_8_8888_cleanup, \
1292 pixman_composite_src_n_8_8888_process_pixblock_head, \
1293 pixman_composite_src_n_8_8888_process_pixblock_tail, \
1294 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1295
1296 /******************************************************************************/
1297
1298 .macro pixman_composite_src_n_8_8_process_pixblock_head
1299 vmull.u8 q0, d24, d16
1300 vmull.u8 q1, d25, d16
1301 vmull.u8 q2, d26, d16
1302 vmull.u8 q3, d27, d16
1303 vrsra.u16 q0, q0, #8
1304 vrsra.u16 q1, q1, #8
1305 vrsra.u16 q2, q2, #8
1306 vrsra.u16 q3, q3, #8
1307 .endm
1308
1309 .macro pixman_composite_src_n_8_8_process_pixblock_tail
1310 vrshrn.u16 d28, q0, #8
1311 vrshrn.u16 d29, q1, #8
1312 vrshrn.u16 d30, q2, #8
1313 vrshrn.u16 d31, q3, #8
1314 .endm
1315
1316 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1317 fetch_mask_pixblock
1318 PF add PF_X, PF_X, #8
1319 vrshrn.u16 d28, q0, #8
1320 PF tst PF_CTL, #0x0F
1321 vrshrn.u16 d29, q1, #8
1322 PF addne PF_X, PF_X, #8
1323 vrshrn.u16 d30, q2, #8
1324 PF subne PF_CTL, PF_CTL, #1
1325 vrshrn.u16 d31, q3, #8
1326 PF cmp PF_X, ORIG_W
1327 vmull.u8 q0, d24, d16
1328 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1329 vmull.u8 q1, d25, d16
1330 PF subge PF_X, PF_X, ORIG_W
1331 vmull.u8 q2, d26, d16
1332 PF subges PF_CTL, PF_CTL, #0x10
1333 vmull.u8 q3, d27, d16
1334 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1335 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1336 vrsra.u16 q0, q0, #8
1337 vrsra.u16 q1, q1, #8
1338 vrsra.u16 q2, q2, #8
1339 vrsra.u16 q3, q3, #8
1340 .endm
1341
1342 .macro pixman_composite_src_n_8_8_init
1343 add DUMMY, sp, #ARGS_STACK_OFFSET
1344 vld1.32 {d16[0]}, [DUMMY]
1345 vdup.8 d16, d16[3]
1346 .endm
1347
1348 .macro pixman_composite_src_n_8_8_cleanup
1349 .endm
1350
1351 generate_composite_function \
1352 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1353 FLAG_DST_WRITEONLY, \
1354 32, /* number of pixels, processed in a single block */ \
1355 5, /* prefetch distance */ \
1356 pixman_composite_src_n_8_8_init, \
1357 pixman_composite_src_n_8_8_cleanup, \
1358 pixman_composite_src_n_8_8_process_pixblock_head, \
1359 pixman_composite_src_n_8_8_process_pixblock_tail, \
1360 pixman_composite_src_n_8_8_process_pixblock_tail_head
1361
1362 /******************************************************************************/
1363
1364 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1365 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1366 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1367 /* and destination data in {d4, d5, d6, d7} */
1368 /* mask is in d24 (d25, d26, d27 are unused) */
1369
1370 /* in */
1371 vmull.u8 q6, d24, d8
1372 vmull.u8 q7, d24, d9
1373 vmull.u8 q8, d24, d10
1374 vmull.u8 q9, d24, d11
1375 vrshr.u16 q10, q6, #8
1376 vrshr.u16 q11, q7, #8
1377 vrshr.u16 q12, q8, #8
1378 vrshr.u16 q13, q9, #8
1379 vraddhn.u16 d0, q6, q10
1380 vraddhn.u16 d1, q7, q11
1381 vraddhn.u16 d2, q8, q12
1382 vraddhn.u16 d3, q9, q13
1383 vmvn.8 d25, d3 /* get inverted alpha */
1384 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1385 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1386 /* now do alpha blending */
1387 vmull.u8 q8, d25, d4
1388 vmull.u8 q9, d25, d5
1389 vmull.u8 q10, d25, d6
1390 vmull.u8 q11, d25, d7
1391 .endm
1392
1393 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1394 vrshr.u16 q14, q8, #8
1395 vrshr.u16 q15, q9, #8
1396 vrshr.u16 q6, q10, #8
1397 vrshr.u16 q7, q11, #8
1398 vraddhn.u16 d28, q14, q8
1399 vraddhn.u16 d29, q15, q9
1400 vraddhn.u16 d30, q6, q10
1401 vraddhn.u16 d31, q7, q11
1402 vqadd.u8 q14, q0, q14
1403 vqadd.u8 q15, q1, q15
1404 .endm
1405
1406 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1407 vrshr.u16 q14, q8, #8
1408 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1409 vrshr.u16 q15, q9, #8
1410 fetch_mask_pixblock
1411 vrshr.u16 q6, q10, #8
1412 PF add PF_X, PF_X, #8
1413 vrshr.u16 q7, q11, #8
1414 PF tst PF_CTL, #0x0F
1415 vraddhn.u16 d28, q14, q8
1416 PF addne PF_X, PF_X, #8
1417 vraddhn.u16 d29, q15, q9
1418 PF subne PF_CTL, PF_CTL, #1
1419 vraddhn.u16 d30, q6, q10
1420 PF cmp PF_X, ORIG_W
1421 vraddhn.u16 d31, q7, q11
1422 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1423 vmull.u8 q6, d24, d8
1424 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1425 vmull.u8 q7, d24, d9
1426 PF subge PF_X, PF_X, ORIG_W
1427 vmull.u8 q8, d24, d10
1428 PF subges PF_CTL, PF_CTL, #0x10
1429 vmull.u8 q9, d24, d11
1430 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1431 vqadd.u8 q14, q0, q14
1432 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1433 vqadd.u8 q15, q1, q15
1434 vrshr.u16 q10, q6, #8
1435 vrshr.u16 q11, q7, #8
1436 vrshr.u16 q12, q8, #8
1437 vrshr.u16 q13, q9, #8
1438 vraddhn.u16 d0, q6, q10
1439 vraddhn.u16 d1, q7, q11
1440 vraddhn.u16 d2, q8, q12
1441 vraddhn.u16 d3, q9, q13
1442 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1443 vmvn.8 d25, d3
1444 vmull.u8 q8, d25, d4
1445 vmull.u8 q9, d25, d5
1446 vmull.u8 q10, d25, d6
1447 vmull.u8 q11, d25, d7
1448 .endm
1449
1450 .macro pixman_composite_over_n_8_8888_init
1451 add DUMMY, sp, #ARGS_STACK_OFFSET
1452 .vsave {d8-d15}
1453 vpush {d8-d15}
1454 vld1.32 {d11[0]}, [DUMMY]
1455 vdup.8 d8, d11[0]
1456 vdup.8 d9, d11[1]
1457 vdup.8 d10, d11[2]
1458 vdup.8 d11, d11[3]
1459 .endm
1460
1461 .macro pixman_composite_over_n_8_8888_cleanup
1462 vpop {d8-d15}
1463 .endm
1464
1465 generate_composite_function \
1466 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1467 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1468 8, /* number of pixels, processed in a single block */ \
1469 5, /* prefetch distance */ \
1470 pixman_composite_over_n_8_8888_init, \
1471 pixman_composite_over_n_8_8888_cleanup, \
1472 pixman_composite_over_n_8_8888_process_pixblock_head, \
1473 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1474 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1475
1476 /******************************************************************************/
1477
1478 .macro pixman_composite_over_n_8_8_process_pixblock_head
1479 vmull.u8 q0, d24, d8
1480 vmull.u8 q1, d25, d8
1481 vmull.u8 q6, d26, d8
1482 vmull.u8 q7, d27, d8
1483 vrshr.u16 q10, q0, #8
1484 vrshr.u16 q11, q1, #8
1485 vrshr.u16 q12, q6, #8
1486 vrshr.u16 q13, q7, #8
1487 vraddhn.u16 d0, q0, q10
1488 vraddhn.u16 d1, q1, q11
1489 vraddhn.u16 d2, q6, q12
1490 vraddhn.u16 d3, q7, q13
1491 vmvn.8 q12, q0
1492 vmvn.8 q13, q1
1493 vmull.u8 q8, d24, d4
1494 vmull.u8 q9, d25, d5
1495 vmull.u8 q10, d26, d6
1496 vmull.u8 q11, d27, d7
1497 .endm
1498
1499 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1500 vrshr.u16 q14, q8, #8
1501 vrshr.u16 q15, q9, #8
1502 vrshr.u16 q12, q10, #8
1503 vrshr.u16 q13, q11, #8
1504 vraddhn.u16 d28, q14, q8
1505 vraddhn.u16 d29, q15, q9
1506 vraddhn.u16 d30, q12, q10
1507 vraddhn.u16 d31, q13, q11
1508 vqadd.u8 q14, q0, q14
1509 vqadd.u8 q15, q1, q15
1510 .endm
1511
1512 /* TODO: expand macros and do better instruction scheduling */
1513 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1514 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1515 pixman_composite_over_n_8_8_process_pixblock_tail
1516 fetch_mask_pixblock
1517 cache_preload 32, 32
1518 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1519 pixman_composite_over_n_8_8_process_pixblock_head
1520 .endm
1521
1522 .macro pixman_composite_over_n_8_8_init
1523 add DUMMY, sp, #ARGS_STACK_OFFSET
1524 .vsave {d8-d15}
1525 vpush {d8-d15}
1526 vld1.32 {d8[0]}, [DUMMY]
1527 vdup.8 d8, d8[3]
1528 .endm
1529
1530 .macro pixman_composite_over_n_8_8_cleanup
1531 vpop {d8-d15}
1532 .endm
1533
1534 generate_composite_function \
1535 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1536 FLAG_DST_READWRITE, \
1537 32, /* number of pixels, processed in a single block */ \
1538 5, /* prefetch distance */ \
1539 pixman_composite_over_n_8_8_init, \
1540 pixman_composite_over_n_8_8_cleanup, \
1541 pixman_composite_over_n_8_8_process_pixblock_head, \
1542 pixman_composite_over_n_8_8_process_pixblock_tail, \
1543 pixman_composite_over_n_8_8_process_pixblock_tail_head
1544
1545 /******************************************************************************/
1546
1547 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1548 /*
1549 * 'combine_mask_ca' replacement
1550 *
1551 * input: solid src (n) in {d8, d9, d10, d11}
1552 * dest in {d4, d5, d6, d7 }
1553 * mask in {d24, d25, d26, d27}
1554 * output: updated src in {d0, d1, d2, d3 }
1555 * updated mask in {d24, d25, d26, d3 }
1556 */
1557 vmull.u8 q0, d24, d8
1558 vmull.u8 q1, d25, d9
1559 vmull.u8 q6, d26, d10
1560 vmull.u8 q7, d27, d11
1561 vmull.u8 q9, d11, d25
1562 vmull.u8 q12, d11, d24
1563 vmull.u8 q13, d11, d26
1564 vrshr.u16 q8, q0, #8
1565 vrshr.u16 q10, q1, #8
1566 vrshr.u16 q11, q6, #8
1567 vraddhn.u16 d0, q0, q8
1568 vraddhn.u16 d1, q1, q10
1569 vraddhn.u16 d2, q6, q11
1570 vrshr.u16 q11, q12, #8
1571 vrshr.u16 q8, q9, #8
1572 vrshr.u16 q6, q13, #8
1573 vrshr.u16 q10, q7, #8
1574 vraddhn.u16 d24, q12, q11
1575 vraddhn.u16 d25, q9, q8
1576 vraddhn.u16 d26, q13, q6
1577 vraddhn.u16 d3, q7, q10
1578 /*
1579 * 'combine_over_ca' replacement
1580 *
1581 * output: updated dest in {d28, d29, d30, d31}
1582 */
1583 vmvn.8 q12, q12
1584 vmvn.8 d26, d26
1585 vmull.u8 q8, d24, d4
1586 vmull.u8 q9, d25, d5
1587 vmvn.8 d27, d3
1588 vmull.u8 q10, d26, d6
1589 vmull.u8 q11, d27, d7
1590 .endm
1591
1592 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1593 /* ... continue 'combine_over_ca' replacement */
1594 vrshr.u16 q14, q8, #8
1595 vrshr.u16 q15, q9, #8
1596 vrshr.u16 q6, q10, #8
1597 vrshr.u16 q7, q11, #8
1598 vraddhn.u16 d28, q14, q8
1599 vraddhn.u16 d29, q15, q9
1600 vraddhn.u16 d30, q6, q10
1601 vraddhn.u16 d31, q7, q11
1602 vqadd.u8 q14, q0, q14
1603 vqadd.u8 q15, q1, q15
1604 .endm
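
/*
 * Component alpha in scalar terms (a hedged C model, reusing the
 * mul_div255 and add_sat_u8 helpers sketched earlier): each channel c has
 * its own effective alpha,
 *
 *   s2[c] = mul_div255 (s[c], m[c]);                      // 'combine_mask_ca'
 *   m2[c] = mul_div255 (sa,   m[c]);                      // per-channel alpha
 *   d [c] = add_sat_u8 (s2[c],
 *                       mul_div255 (d[c], 255 - m2[c]));  // 'combine_over_ca'
 *
 * which is exactly the pair of operations that the head/tail macros above
 * implement with q6-q13 as scratch registers.
 */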
1605
1606 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1607 vrshr.u16 q14, q8, #8
1608 vrshr.u16 q15, q9, #8
1609 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1610 vrshr.u16 q6, q10, #8
1611 vrshr.u16 q7, q11, #8
1612 vraddhn.u16 d28, q14, q8
1613 vraddhn.u16 d29, q15, q9
1614 vraddhn.u16 d30, q6, q10
1615 vraddhn.u16 d31, q7, q11
1616 fetch_mask_pixblock
1617 vqadd.u8 q14, q0, q14
1618 vqadd.u8 q15, q1, q15
1619 cache_preload 8, 8
1620 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1621 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1622 .endm
1623
1624 .macro pixman_composite_over_n_8888_8888_ca_init
1625 add DUMMY, sp, #ARGS_STACK_OFFSET
1626 .vsave {d8-d15}
1627 vpush {d8-d15}
1628 vld1.32 {d11[0]}, [DUMMY]
1629 vdup.8 d8, d11[0]
1630 vdup.8 d9, d11[1]
1631 vdup.8 d10, d11[2]
1632 vdup.8 d11, d11[3]
1633 .endm
1634
1635 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1636 vpop {d8-d15}
1637 .endm
1638
1639 generate_composite_function \
1640 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1641 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1642 8, /* number of pixels, processed in a single block */ \
1643 5, /* prefetch distance */ \
1644 pixman_composite_over_n_8888_8888_ca_init, \
1645 pixman_composite_over_n_8888_8888_ca_cleanup, \
1646 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1647 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1648 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1649
1650 /******************************************************************************/
1651
1652 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1653 /*
1654 * 'combine_mask_ca' replacement
1655 *
1656 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1657 * mask in {d24, d25, d26} [B, G, R]
1658 * output: updated src in {d0, d1, d2 } [B, G, R]
1659 * updated mask in {d24, d25, d26} [B, G, R]
1660 */
1661 vmull.u8 q0, d24, d8
1662 vmull.u8 q1, d25, d9
1663 vmull.u8 q6, d26, d10
1664 vmull.u8 q9, d11, d25
1665 vmull.u8 q12, d11, d24
1666 vmull.u8 q13, d11, d26
1667 vrshr.u16 q8, q0, #8
1668 vrshr.u16 q10, q1, #8
1669 vrshr.u16 q11, q6, #8
1670 vraddhn.u16 d0, q0, q8
1671 vraddhn.u16 d1, q1, q10
1672 vraddhn.u16 d2, q6, q11
1673 vrshr.u16 q11, q12, #8
1674 vrshr.u16 q8, q9, #8
1675 vrshr.u16 q6, q13, #8
1676 vraddhn.u16 d24, q12, q11
1677 vraddhn.u16 d25, q9, q8
1678 /*
1679 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1680 * and put data into d16 - blue, d17 - green, d18 - red
1681 */
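/*
 * In effect each component is widened by replicating its top bits:
 * blue8 = (b5 << 3) | (b5 >> 2), green8 = (g6 << 2) | (g6 >> 4),
 * red8 = (r5 << 3) | (r5 >> 2). The vsli/vsri shift-and-insert
 * instructions below do this without any extra masking.
 */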
1682 vshrn.u16 d17, q2, #3
1683 vshrn.u16 d18, q2, #8
1684 vraddhn.u16 d26, q13, q6
1685 vsli.u16 q2, q2, #5
1686 vsri.u8 d18, d18, #5
1687 vsri.u8 d17, d17, #6
1688 /*
1689 * 'combine_over_ca' replacement
1690 *
1691 * output: updated dest in d16 - blue, d17 - green, d18 - red
1692 */
1693 vmvn.8 q12, q12
1694 vshrn.u16 d16, q2, #2
1695 vmvn.8 d26, d26
1696 vmull.u8 q6, d16, d24
1697 vmull.u8 q7, d17, d25
1698 vmull.u8 q11, d18, d26
1699 .endm
1700
1701 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1702 /* ... continue 'combine_over_ca' replacement */
1703 vrshr.u16 q10, q6, #8
1704 vrshr.u16 q14, q7, #8
1705 vrshr.u16 q15, q11, #8
1706 vraddhn.u16 d16, q10, q6
1707 vraddhn.u16 d17, q14, q7
1708 vraddhn.u16 d18, q15, q11
1709 vqadd.u8 q8, q0, q8
1710 vqadd.u8 d18, d2, d18
1711 /*
1712 * convert the results in d16, d17, d18 to r5g6b5 and store
1713 * them into {d28, d29}
1714 */
1715 vshll.u8 q14, d18, #8
1716 vshll.u8 q10, d17, #8
1717 vshll.u8 q15, d16, #8
1718 vsri.u16 q14, q10, #5
1719 vsri.u16 q14, q15, #11
1720 .endm
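/*
 * The vshll/vsri sequence above packs the 8-bit components back into
 * r5g6b5: vshll places red << 8 in q14, then vsri inserts the top 6
 * green bits at bit 5 and the top 5 blue bits at bit 0, leaving the top
 * 5 red bits in place.
 */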
1721
1722 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1723 fetch_mask_pixblock
1724 vrshr.u16 q10, q6, #8
1725 vrshr.u16 q14, q7, #8
1726 vld1.16 {d4, d5}, [DST_R, :128]!
1727 vrshr.u16 q15, q11, #8
1728 vraddhn.u16 d16, q10, q6
1729 vraddhn.u16 d17, q14, q7
1730 vraddhn.u16 d22, q15, q11
1731 /* process_pixblock_head */
1732 /*
1733 * 'combine_mask_ca' replacement
1734 *
1735 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1736 * mask in {d24, d25, d26} [B, G, R]
1737 * output: updated src in {d0, d1, d2 } [B, G, R]
1738 * updated mask in {d24, d25, d26} [B, G, R]
1739 */
1740 vmull.u8 q6, d26, d10
1741 vqadd.u8 q8, q0, q8
1742 vmull.u8 q0, d24, d8
1743 vqadd.u8 d22, d2, d22
1744 vmull.u8 q1, d25, d9
1745 /*
1746 * convert the result in d16, d17, d22 to r5g6b5 and store
1747 * it into {d28, d29}
1748 */
1749 vshll.u8 q14, d22, #8
1750 vshll.u8 q10, d17, #8
1751 vshll.u8 q15, d16, #8
1752 vmull.u8 q9, d11, d25
1753 vsri.u16 q14, q10, #5
1754 vmull.u8 q12, d11, d24
1755 vmull.u8 q13, d11, d26
1756 vsri.u16 q14, q15, #11
1757 cache_preload 8, 8
1758 vrshr.u16 q8, q0, #8
1759 vrshr.u16 q10, q1, #8
1760 vrshr.u16 q11, q6, #8
1761 vraddhn.u16 d0, q0, q8
1762 vraddhn.u16 d1, q1, q10
1763 vraddhn.u16 d2, q6, q11
1764 vrshr.u16 q11, q12, #8
1765 vrshr.u16 q8, q9, #8
1766 vrshr.u16 q6, q13, #8
1767 vraddhn.u16 d24, q12, q11
1768 vraddhn.u16 d25, q9, q8
1769 /*
1770 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1771 * 8-bit format and put data into d16 - blue, d17 - green,
1772 * d18 - red
1773 */
1774 vshrn.u16 d17, q2, #3
1775 vshrn.u16 d18, q2, #8
1776 vraddhn.u16 d26, q13, q6
1777 vsli.u16 q2, q2, #5
1778 vsri.u8 d17, d17, #6
1779 vsri.u8 d18, d18, #5
1780 /*
1781 * 'combine_over_ca' replacement
1782 *
1783 * output: updated dest in d16 - blue, d17 - green, d18 - red
1784 */
1785 vmvn.8 q12, q12
1786 vshrn.u16 d16, q2, #2
1787 vmvn.8 d26, d26
1788 vmull.u8 q7, d17, d25
1789 vmull.u8 q6, d16, d24
1790 vmull.u8 q11, d18, d26
1791 vst1.16 {d28, d29}, [DST_W, :128]!
1792 .endm
1793
1794 .macro pixman_composite_over_n_8888_0565_ca_init
1795 add DUMMY, sp, #ARGS_STACK_OFFSET
1796 .vsave {d8-d15}
1797 vpush {d8-d15}
1798 vld1.32 {d11[0]}, [DUMMY]
1799 vdup.8 d8, d11[0]
1800 vdup.8 d9, d11[1]
1801 vdup.8 d10, d11[2]
1802 vdup.8 d11, d11[3]
1803 .endm
1804
1805 .macro pixman_composite_over_n_8888_0565_ca_cleanup
1806 vpop {d8-d15}
1807 .endm
1808
1809 generate_composite_function \
1810 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1811 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1812 8, /* number of pixels, processed in a single block */ \
1813 5, /* prefetch distance */ \
1814 pixman_composite_over_n_8888_0565_ca_init, \
1815 pixman_composite_over_n_8888_0565_ca_cleanup, \
1816 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1817 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1818 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1819
1820 /******************************************************************************/
1821
1822 .macro pixman_composite_in_n_8_process_pixblock_head
1823 /* expecting source data in {d0, d1, d2, d3} */
1824 /* and destination data in {d4, d5, d6, d7} */
1825 vmull.u8 q8, d4, d3
1826 vmull.u8 q9, d5, d3
1827 vmull.u8 q10, d6, d3
1828 vmull.u8 q11, d7, d3
1829 .endm
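/*
 * This is the IN operator with a solid source: every destination byte is
 * scaled by the source alpha, i.e. dest = dest * srca / 255, where d3
 * holds the broadcast source alpha set up in pixman_composite_in_n_8_init.
 */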
1830
1831 .macro pixman_composite_in_n_8_process_pixblock_tail
1832 vrshr.u16 q14, q8, #8
1833 vrshr.u16 q15, q9, #8
1834 vrshr.u16 q12, q10, #8
1835 vrshr.u16 q13, q11, #8
1836 vraddhn.u16 d28, q8, q14
1837 vraddhn.u16 d29, q9, q15
1838 vraddhn.u16 d30, q10, q12
1839 vraddhn.u16 d31, q11, q13
1840 .endm
1841
1842 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1843 pixman_composite_in_n_8_process_pixblock_tail
1844 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1845 cache_preload 32, 32
1846 pixman_composite_in_n_8_process_pixblock_head
1847 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1848 .endm
1849
1850 .macro pixman_composite_in_n_8_init
1851 add DUMMY, sp, #ARGS_STACK_OFFSET
1852 vld1.32 {d3[0]}, [DUMMY]
1853 vdup.8 d3, d3[3]
1854 .endm
1855
1856 .macro pixman_composite_in_n_8_cleanup
1857 .endm
1858
1859 generate_composite_function \
1860 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1861 FLAG_DST_READWRITE, \
1862 32, /* number of pixels, processed in a single block */ \
1863 5, /* prefetch distance */ \
1864 pixman_composite_in_n_8_init, \
1865 pixman_composite_in_n_8_cleanup, \
1866 pixman_composite_in_n_8_process_pixblock_head, \
1867 pixman_composite_in_n_8_process_pixblock_tail, \
1868 pixman_composite_in_n_8_process_pixblock_tail_head, \
1869 28, /* dst_w_basereg */ \
1870 4, /* dst_r_basereg */ \
1871 0, /* src_basereg */ \
1872 24 /* mask_basereg */
1873
1874 .macro pixman_composite_add_n_8_8_process_pixblock_head
1875 /* expecting source data in {d8, d9, d10, d11} */
1876 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1877 /* and destination data in {d4, d5, d6, d7} */
1878 /* mask is in d24, d25, d26, d27 */
1879 vmull.u8 q0, d24, d11
1880 vmull.u8 q1, d25, d11
1881 vmull.u8 q6, d26, d11
1882 vmull.u8 q7, d27, d11
1883 vrshr.u16 q10, q0, #8
1884 vrshr.u16 q11, q1, #8
1885 vrshr.u16 q12, q6, #8
1886 vrshr.u16 q13, q7, #8
1887 vraddhn.u16 d0, q0, q10
1888 vraddhn.u16 d1, q1, q11
1889 vraddhn.u16 d2, q6, q12
1890 vraddhn.u16 d3, q7, q13
1891 vqadd.u8 q14, q0, q2
1892 vqadd.u8 q15, q1, q3
1893 .endm
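/*
 * Roughly: dest = saturate(dest + mask * srca / 255). q0/q1 receive the
 * mask scaled by the solid source alpha (d11), and vqadd.u8 performs the
 * saturating ADD against the destination held in q2/q3.
 */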
1894
1895 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1896 .endm
1897
1898 /* TODO: expand macros and do better instruction scheduling */
1899 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1900 pixman_composite_add_n_8_8_process_pixblock_tail
1901 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1902 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1903 fetch_mask_pixblock
1904 cache_preload 32, 32
1905 pixman_composite_add_n_8_8_process_pixblock_head
1906 .endm
1907
1908 .macro pixman_composite_add_n_8_8_init
1909 add DUMMY, sp, #ARGS_STACK_OFFSET
1910 .vsave {d8-d15}
1911 vpush {d8-d15}
1912 vld1.32 {d11[0]}, [DUMMY]
1913 vdup.8 d11, d11[3]
1914 .endm
1915
1916 .macro pixman_composite_add_n_8_8_cleanup
1917 vpop {d8-d15}
1918 .endm
1919
1920 generate_composite_function \
1921 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1922 FLAG_DST_READWRITE, \
1923 32, /* number of pixels, processed in a single block */ \
1924 5, /* prefetch distance */ \
1925 pixman_composite_add_n_8_8_init, \
1926 pixman_composite_add_n_8_8_cleanup, \
1927 pixman_composite_add_n_8_8_process_pixblock_head, \
1928 pixman_composite_add_n_8_8_process_pixblock_tail, \
1929 pixman_composite_add_n_8_8_process_pixblock_tail_head
1930
1931 /******************************************************************************/
1932
1933 .macro pixman_composite_add_8_8_8_process_pixblock_head
1934 /* expecting source data in {d0, d1, d2, d3} */
1935 /* destination data in {d4, d5, d6, d7} */
1936 /* mask in {d24, d25, d26, d27} */
1937 vmull.u8 q8, d24, d0
1938 vmull.u8 q9, d25, d1
1939 vmull.u8 q10, d26, d2
1940 vmull.u8 q11, d27, d3
1941 vrshr.u16 q0, q8, #8
1942 vrshr.u16 q1, q9, #8
1943 vrshr.u16 q12, q10, #8
1944 vrshr.u16 q13, q11, #8
1945 vraddhn.u16 d0, q0, q8
1946 vraddhn.u16 d1, q1, q9
1947 vraddhn.u16 d2, q12, q10
1948 vraddhn.u16 d3, q13, q11
1949 vqadd.u8 q14, q0, q2
1950 vqadd.u8 q15, q1, q3
1951 .endm
1952
1953 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1954 .endm
1955
1956 /* TODO: expand macros and do better instruction scheduling */
1957 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1958 pixman_composite_add_8_8_8_process_pixblock_tail
1959 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1960 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1961 fetch_mask_pixblock
1962 fetch_src_pixblock
1963 cache_preload 32, 32
1964 pixman_composite_add_8_8_8_process_pixblock_head
1965 .endm
1966
1967 .macro pixman_composite_add_8_8_8_init
1968 .endm
1969
1970 .macro pixman_composite_add_8_8_8_cleanup
1971 .endm
1972
1973 generate_composite_function \
1974 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1975 FLAG_DST_READWRITE, \
1976 32, /* number of pixels, processed in a single block */ \
1977 5, /* prefetch distance */ \
1978 pixman_composite_add_8_8_8_init, \
1979 pixman_composite_add_8_8_8_cleanup, \
1980 pixman_composite_add_8_8_8_process_pixblock_head, \
1981 pixman_composite_add_8_8_8_process_pixblock_tail, \
1982 pixman_composite_add_8_8_8_process_pixblock_tail_head
1983
1984 /******************************************************************************/
1985
1986 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1987 /* expecting source data in {d0, d1, d2, d3} */
1988 /* destination data in {d4, d5, d6, d7} */
1989 /* mask in {d24, d25, d26, d27} */
1990 vmull.u8 q8, d27, d0
1991 vmull.u8 q9, d27, d1
1992 vmull.u8 q10, d27, d2
1993 vmull.u8 q11, d27, d3
1994 /* 1 cycle bubble */
1995 vrsra.u16 q8, q8, #8
1996 vrsra.u16 q9, q9, #8
1997 vrsra.u16 q10, q10, #8
1998 vrsra.u16 q11, q11, #8
1999 .endm
2000
2001 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
2002 /* 2 cycle bubble */
2003 vrshrn.u16 d28, q8, #8
2004 vrshrn.u16 d29, q9, #8
2005 vrshrn.u16 d30, q10, #8
2006 vrshrn.u16 d31, q11, #8
2007 vqadd.u8 q14, q2, q14
2008 /* 1 cycle bubble */
2009 vqadd.u8 q15, q3, q15
2010 .endm
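/*
 * The division by 255 is split differently here: vrsra.u16 in the head
 * accumulates x + ((x + 128) >> 8), and the vrshrn.u16 in the tail adds
 * the final rounding and takes the high byte, giving the same exact
 * result as the vrshr/vraddhn pairs used elsewhere. The "bubble"
 * comments mark NEON result-latency stalls on in-order cores like the
 * Cortex-A8.
 */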
2011
2012 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2013 fetch_src_pixblock
2014 vrshrn.u16 d28, q8, #8
2015 fetch_mask_pixblock
2016 vrshrn.u16 d29, q9, #8
2017 vmull.u8 q8, d27, d0
2018 vrshrn.u16 d30, q10, #8
2019 vmull.u8 q9, d27, d1
2020 vrshrn.u16 d31, q11, #8
2021 vmull.u8 q10, d27, d2
2022 vqadd.u8 q14, q2, q14
2023 vmull.u8 q11, d27, d3
2024 vqadd.u8 q15, q3, q15
2025 vrsra.u16 q8, q8, #8
2026 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2027 vrsra.u16 q9, q9, #8
2028 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2029 vrsra.u16 q10, q10, #8
2030
2031 cache_preload 8, 8
2032
2033 vrsra.u16 q11, q11, #8
2034 .endm
2035
2036 generate_composite_function \
2037 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2038 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2039 8, /* number of pixels, processed in a single block */ \
2040 10, /* prefetch distance */ \
2041 default_init, \
2042 default_cleanup, \
2043 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2044 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2045 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2046
2047 generate_composite_function_single_scanline \
2048 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2049 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2050 8, /* number of pixels, processed in a single block */ \
2051 default_init, \
2052 default_cleanup, \
2053 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2054 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2055 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2056
2057 /******************************************************************************/
2058
2059 generate_composite_function \
2060 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
2061 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2062 8, /* number of pixels, processed in a single block */ \
2063 5, /* prefetch distance */ \
2064 default_init, \
2065 default_cleanup, \
2066 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2067 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2068 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2069 28, /* dst_w_basereg */ \
2070 4, /* dst_r_basereg */ \
2071 0, /* src_basereg */ \
2072 27 /* mask_basereg */
2073
2074 /******************************************************************************/
2075
2076 .macro pixman_composite_add_n_8_8888_init
2077 add DUMMY, sp, #ARGS_STACK_OFFSET
2078 vld1.32 {d3[0]}, [DUMMY]
2079 vdup.8 d0, d3[0]
2080 vdup.8 d1, d3[1]
2081 vdup.8 d2, d3[2]
2082 vdup.8 d3, d3[3]
2083 .endm
2084
2085 .macro pixman_composite_add_n_8_8888_cleanup
2086 .endm
2087
2088 generate_composite_function \
2089 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
2090 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2091 8, /* number of pixels, processed in a single block */ \
2092 5, /* prefetch distance */ \
2093 pixman_composite_add_n_8_8888_init, \
2094 pixman_composite_add_n_8_8888_cleanup, \
2095 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2096 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2097 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2098 28, /* dst_w_basereg */ \
2099 4, /* dst_r_basereg */ \
2100 0, /* src_basereg */ \
2101 27 /* mask_basereg */
2102
2103 /******************************************************************************/
2104
2105 .macro pixman_composite_add_8888_n_8888_init
2106 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2107 vld1.32 {d27[0]}, [DUMMY]
2108 vdup.8 d27, d27[3]
2109 .endm
2110
2111 .macro pixman_composite_add_8888_n_8888_cleanup
2112 .endm
2113
2114 generate_composite_function \
2115 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
2116 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2117 8, /* number of pixels, processed in a single block */ \
2118 5, /* prefetch distance */ \
2119 pixman_composite_add_8888_n_8888_init, \
2120 pixman_composite_add_8888_n_8888_cleanup, \
2121 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2122 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2123 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2124 28, /* dst_w_basereg */ \
2125 4, /* dst_r_basereg */ \
2126 0, /* src_basereg */ \
2127 27 /* mask_basereg */
2128
2129 /******************************************************************************/
2130
2131 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2132 /* expecting source data in {d0, d1, d2, d3} */
2133 /* destination data in {d4, d5, d6, d7} */
2134 /* solid mask is in d15 */
2135
2136 /* 'in' */
2137 vmull.u8 q8, d15, d3
2138 vmull.u8 q6, d15, d2
2139 vmull.u8 q5, d15, d1
2140 vmull.u8 q4, d15, d0
2141 vrshr.u16 q13, q8, #8
2142 vrshr.u16 q12, q6, #8
2143 vrshr.u16 q11, q5, #8
2144 vrshr.u16 q10, q4, #8
2145 vraddhn.u16 d3, q8, q13
2146 vraddhn.u16 d2, q6, q12
2147 vraddhn.u16 d1, q5, q11
2148 vraddhn.u16 d0, q4, q10
2149 vmvn.8 d24, d3 /* get inverted alpha */
2150 /* now do alpha blending */
2151 vmull.u8 q8, d24, d4
2152 vmull.u8 q9, d24, d5
2153 vmull.u8 q10, d24, d6
2154 vmull.u8 q11, d24, d7
2155 .endm
2156
2157 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2158 vrshr.u16 q14, q8, #8
2159 vrshr.u16 q15, q9, #8
2160 vrshr.u16 q12, q10, #8
2161 vrshr.u16 q13, q11, #8
2162 vraddhn.u16 d28, q14, q8
2163 vraddhn.u16 d29, q15, q9
2164 vraddhn.u16 d30, q12, q10
2165 vraddhn.u16 d31, q13, q11
2166 .endm
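/*
 * Taken together, head and tail compute the OUT_REVERSE operator:
 * dest = dest * (255 - in(src, mask).alpha) / 255, where the 'in' step
 * first scales the source by the solid mask alpha in d15.
 */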
2167
2168 /* TODO: expand macros and do better instruction scheduling */
2169 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2170 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2171 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2172 fetch_src_pixblock
2173 cache_preload 8, 8
2174 fetch_mask_pixblock
2175 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2176 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2177 .endm
2178
2179 generate_composite_function_single_scanline \
2180 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2181 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2182 8, /* number of pixels, processed in a single block */ \
2183 default_init_need_all_regs, \
2184 default_cleanup_need_all_regs, \
2185 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2186 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2187 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
2188 28, /* dst_w_basereg */ \
2189 4, /* dst_r_basereg */ \
2190 0, /* src_basereg */ \
2191 12 /* mask_basereg */
2192
2193 /******************************************************************************/
2194
2195 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
2196 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2197 .endm
2198
2199 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2200 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2201 vqadd.u8 q14, q0, q14
2202 vqadd.u8 q15, q1, q15
2203 .endm
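/*
 * OVER is built on top of OUT_REVERSE: after the destination has been
 * scaled by the inverted masked source alpha, vqadd.u8 adds the masked
 * source from q0/q1, i.e. dest = src' + dest * (255 - src'.alpha) / 255.
 */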
2204
2205 /* TODO: expand macros and do better instruction scheduling */
2206 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2207 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2208 pixman_composite_over_8888_n_8888_process_pixblock_tail
2209 fetch_src_pixblock
2210 cache_preload 8, 8
2211 pixman_composite_over_8888_n_8888_process_pixblock_head
2212 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2213 .endm
2214
2215 .macro pixman_composite_over_8888_n_8888_init
2216 add DUMMY, sp, #48
2217 .vsave {d8-d15}
2218 vpush {d8-d15}
2219 vld1.32 {d15[0]}, [DUMMY]
2220 vdup.8 d15, d15[3]
2221 .endm
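/*
 * The literal #48 above appears to be ARGS_STACK_OFFSET + 8 written out,
 * i.e. the stack slot of the solid mask argument (compare
 * pixman_composite_over_0565_n_0565_init below); only the mask's alpha
 * byte is kept, broadcast across d15.
 */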
2222
2223 .macro pixman_composite_over_8888_n_8888_cleanup
2224 vpop {d8-d15}
2225 .endm
2226
2227 generate_composite_function \
2228 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2229 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2230 8, /* number of pixels, processed in a single block */ \
2231 5, /* prefetch distance */ \
2232 pixman_composite_over_8888_n_8888_init, \
2233 pixman_composite_over_8888_n_8888_cleanup, \
2234 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2235 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2236 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2237
2238 /******************************************************************************/
2239
2240 /* TODO: expand macros and do better instruction scheduling */
2241 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2242 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2243 pixman_composite_over_8888_n_8888_process_pixblock_tail
2244 fetch_src_pixblock
2245 cache_preload 8, 8
2246 fetch_mask_pixblock
2247 pixman_composite_over_8888_n_8888_process_pixblock_head
2248 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2249 .endm
2250
2251 generate_composite_function \
2252 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2253 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2254 8, /* number of pixels, processed in a single block */ \
2255 5, /* prefetch distance */ \
2256 default_init_need_all_regs, \
2257 default_cleanup_need_all_regs, \
2258 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2259 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2260 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2261 28, /* dst_w_basereg */ \
2262 4, /* dst_r_basereg */ \
2263 0, /* src_basereg */ \
2264 12 /* mask_basereg */
2265
2266 generate_composite_function_single_scanline \
2267 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2268 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2269 8, /* number of pixels, processed in a single block */ \
2270 default_init_need_all_regs, \
2271 default_cleanup_need_all_regs, \
2272 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2273 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2274 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2275 28, /* dst_w_basereg */ \
2276 4, /* dst_r_basereg */ \
2277 0, /* src_basereg */ \
2278 12 /* mask_basereg */
2279
2280 /******************************************************************************/
2281
2282 /* TODO: expand macros and do better instruction scheduling */
2283 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2284 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2285 pixman_composite_over_8888_n_8888_process_pixblock_tail
2286 fetch_src_pixblock
2287 cache_preload 8, 8
2288 fetch_mask_pixblock
2289 pixman_composite_over_8888_n_8888_process_pixblock_head
2290 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2291 .endm
2292
2293 generate_composite_function \
2294 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2295 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2296 8, /* number of pixels, processed in a single block */ \
2297 5, /* prefetch distance */ \
2298 default_init_need_all_regs, \
2299 default_cleanup_need_all_regs, \
2300 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2301 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2302 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2303 28, /* dst_w_basereg */ \
2304 4, /* dst_r_basereg */ \
2305 0, /* src_basereg */ \
2306 15 /* mask_basereg */
2307
2308 /******************************************************************************/
2309
2310 .macro pixman_composite_src_0888_0888_process_pixblock_head
2311 .endm
2312
2313 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2314 .endm
2315
2316 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2317 vst3.8 {d0, d1, d2}, [DST_W]!
2318 fetch_src_pixblock
2319 cache_preload 8, 8
2320 .endm
2321
2322 generate_composite_function \
2323 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2324 FLAG_DST_WRITEONLY, \
2325 8, /* number of pixels, processed in a single block */ \
2326 10, /* prefetch distance */ \
2327 default_init, \
2328 default_cleanup, \
2329 pixman_composite_src_0888_0888_process_pixblock_head, \
2330 pixman_composite_src_0888_0888_process_pixblock_tail, \
2331 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2332 0, /* dst_w_basereg */ \
2333 0, /* dst_r_basereg */ \
2334 0, /* src_basereg */ \
2335 0 /* mask_basereg */
2336
2337 /******************************************************************************/
2338
2339 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2340 vswp d0, d2
2341 .endm
2342
2343 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2344 .endm
2345
2346 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2347 vst4.8 {d0, d1, d2, d3}, [DST_W]!
2348 fetch_src_pixblock
2349 vswp d0, d2
2350 cache_preload 8, 8
2351 .endm
2352
2353 .macro pixman_composite_src_0888_8888_rev_init
2354 veor d3, d3, d3
2355 .endm
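/* d3 is zeroed once here, so the vst4.8 in the main loop writes zero as
 * the (unused) fourth byte of every output pixel, while vswp d0, d2
 * reverses the R/B channel order. */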
2356
2357 generate_composite_function \
2358 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2359 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2360 8, /* number of pixels, processed in a single block */ \
2361 10, /* prefetch distance */ \
2362 pixman_composite_src_0888_8888_rev_init, \
2363 default_cleanup, \
2364 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2365 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2366 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2367 0, /* dst_w_basereg */ \
2368 0, /* dst_r_basereg */ \
2369 0, /* src_basereg */ \
2370 0 /* mask_basereg */
2371
2372 /******************************************************************************/
2373
2374 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2375 vshll.u8 q8, d1, #8
2376 vshll.u8 q9, d2, #8
2377 .endm
2378
2379 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2380 vshll.u8 q14, d0, #8
2381 vsri.u16 q14, q8, #5
2382 vsri.u16 q14, q9, #11
2383 .endm
2384
2385 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2386 vshll.u8 q14, d0, #8
2387 fetch_src_pixblock
2388 vsri.u16 q14, q8, #5
2389 vsri.u16 q14, q9, #11
2390 vshll.u8 q8, d1, #8
2391 vst1.16 {d28, d29}, [DST_W, :128]!
2392 vshll.u8 q9, d2, #8
2393 .endm
2394
2395 generate_composite_function \
2396 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2397 FLAG_DST_WRITEONLY, \
2398 8, /* number of pixels, processed in a single block */ \
2399 10, /* prefetch distance */ \
2400 default_init, \
2401 default_cleanup, \
2402 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2403 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2404 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2405 28, /* dst_w_basereg */ \
2406 0, /* dst_r_basereg */ \
2407 0, /* src_basereg */ \
2408 0 /* mask_basereg */
2409
2410 /******************************************************************************/
2411
2412 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2413 vmull.u8 q8, d3, d0
2414 vmull.u8 q9, d3, d1
2415 vmull.u8 q10, d3, d2
2416 .endm
2417
2418 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2419 vrshr.u16 q11, q8, #8
2420 vswp d3, d31
2421 vrshr.u16 q12, q9, #8
2422 vrshr.u16 q13, q10, #8
2423 vraddhn.u16 d30, q11, q8
2424 vraddhn.u16 d29, q12, q9
2425 vraddhn.u16 d28, q13, q10
2426 .endm
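/*
 * The pixbuf conversion premultiplies the three color channels by the
 * alpha channel d3 (with the usual rounded division by 255), and
 * vswp d3, d31 moves alpha into the output register group so that the
 * vst4.8 can store {d28-d31} directly.
 */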
2427
2428 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2429 vrshr.u16 q11, q8, #8
2430 vswp d3, d31
2431 vrshr.u16 q12, q9, #8
2432 vrshr.u16 q13, q10, #8
2433 fetch_src_pixblock
2434 vraddhn.u16 d30, q11, q8
2435 PF add PF_X, PF_X, #8
2436 PF tst PF_CTL, #0xF
2437 PF addne PF_X, PF_X, #8
2438 PF subne PF_CTL, PF_CTL, #1
2439 vraddhn.u16 d29, q12, q9
2440 vraddhn.u16 d28, q13, q10
2441 vmull.u8 q8, d3, d0
2442 vmull.u8 q9, d3, d1
2443 vmull.u8 q10, d3, d2
2444 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2445 PF cmp PF_X, ORIG_W
2446 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2447 PF subge PF_X, PF_X, ORIG_W
2448 PF subges PF_CTL, PF_CTL, #0x10
2449 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2450 .endm
2451
2452 generate_composite_function \
2453 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2454 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2455 8, /* number of pixels, processed in a single block */ \
2456 10, /* prefetch distance */ \
2457 default_init, \
2458 default_cleanup, \
2459 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2460 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2461 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2462 28, /* dst_w_basereg */ \
2463 0, /* dst_r_basereg */ \
2464 0, /* src_basereg */ \
2465 0 /* mask_basereg */
2466
2467 /******************************************************************************/
2468
2469 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2470 vmull.u8 q8, d3, d0
2471 vmull.u8 q9, d3, d1
2472 vmull.u8 q10, d3, d2
2473 .endm
2474
2475 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2476 vrshr.u16 q11, q8, #8
2477 vswp d3, d31
2478 vrshr.u16 q12, q9, #8
2479 vrshr.u16 q13, q10, #8
2480 vraddhn.u16 d28, q11, q8
2481 vraddhn.u16 d29, q12, q9
2482 vraddhn.u16 d30, q13, q10
2483 .endm
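/*
 * Identical to the pixbuf variant above, except that the red and blue
 * results land in swapped output registers (d28/d30), yielding the
 * reversed channel order.
 */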
2484
2485 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2486 vrshr.u16 q11, q8, #8
2487 vswp d3, d31
2488 vrshr.u16 q12, q9, #8
2489 vrshr.u16 q13, q10, #8
2490 fetch_src_pixblock
2491 vraddhn.u16 d28, q11, q8
2492 PF add PF_X, PF_X, #8
2493 PF tst PF_CTL, #0xF
2494 PF addne PF_X, PF_X, #8
2495 PF subne PF_CTL, PF_CTL, #1
2496 vraddhn.u16 d29, q12, q9
2497 vraddhn.u16 d30, q13, q10
2498 vmull.u8 q8, d3, d0
2499 vmull.u8 q9, d3, d1
2500 vmull.u8 q10, d3, d2
2501 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2502 PF cmp PF_X, ORIG_W
2503 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2504 PF subge PF_X, PF_X, ORIG_W
2505 PF subges PF_CTL, PF_CTL, #0x10
2506 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2507 .endm
2508
2509 generate_composite_function \
2510 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2511 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2512 8, /* number of pixels, processed in a single block */ \
2513 10, /* prefetch distance */ \
2514 default_init, \
2515 default_cleanup, \
2516 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2517 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2518 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2519 28, /* dst_w_basereg */ \
2520 0, /* dst_r_basereg */ \
2521 0, /* src_basereg */ \
2522 0 /* mask_basereg */
2523
2524 /******************************************************************************/
2525
2526 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2527 /* mask is in d15 */
2528 convert_0565_to_x888 q4, d2, d1, d0
2529 convert_0565_to_x888 q5, d6, d5, d4
2530 /* source pixel data is in {d0, d1, d2, XX} */
2531 /* destination pixel data is in {d4, d5, d6, XX} */
2532 vmvn.8 d7, d15
2533 vmull.u8 q6, d15, d2
2534 vmull.u8 q5, d15, d1
2535 vmull.u8 q4, d15, d0
2536 vmull.u8 q8, d7, d4
2537 vmull.u8 q9, d7, d5
2538 vmull.u8 q13, d7, d6
2539 vrshr.u16 q12, q6, #8
2540 vrshr.u16 q11, q5, #8
2541 vrshr.u16 q10, q4, #8
2542 vraddhn.u16 d2, q6, q12
2543 vraddhn.u16 d1, q5, q11
2544 vraddhn.u16 d0, q4, q10
2545 .endm
2546
2547 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2548 vrshr.u16 q14, q8, #8
2549 vrshr.u16 q15, q9, #8
2550 vrshr.u16 q12, q13, #8
2551 vraddhn.u16 d28, q14, q8
2552 vraddhn.u16 d29, q15, q9
2553 vraddhn.u16 d30, q12, q13
2554 vqadd.u8 q0, q0, q14
2555 vqadd.u8 q1, q1, q15
2556 /* 32bpp result is in {d0, d1, d2, XX} */
2557 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2558 .endm
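/*
 * r5g6b5 has no alpha and too little precision to blend in place, so
 * both source and destination are first expanded to planar 8-bit
 * components (convert_0565_to_x888), blended with the usual OVER
 * arithmetic, and repacked to r5g6b5 in the tail (convert_8888_to_0565).
 */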
2559
2560 /* TODO: expand macros and do better instruction scheduling */
2561 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2562 fetch_mask_pixblock
2563 pixman_composite_over_0565_8_0565_process_pixblock_tail
2564 fetch_src_pixblock
2565 vld1.16 {d10, d11}, [DST_R, :128]!
2566 cache_preload 8, 8
2567 pixman_composite_over_0565_8_0565_process_pixblock_head
2568 vst1.16 {d28, d29}, [DST_W, :128]!
2569 .endm
2570
2571 generate_composite_function \
2572 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2573 FLAG_DST_READWRITE, \
2574 8, /* number of pixels, processed in a single block */ \
2575 5, /* prefetch distance */ \
2576 default_init_need_all_regs, \
2577 default_cleanup_need_all_regs, \
2578 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2579 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2580 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2581 28, /* dst_w_basereg */ \
2582 10, /* dst_r_basereg */ \
2583 8, /* src_basereg */ \
2584 15 /* mask_basereg */
2585
2586 /******************************************************************************/
2587
2588 .macro pixman_composite_over_0565_n_0565_init
2589 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2590 .vsave {d8-d15}
2591 vpush {d8-d15}
2592 vld1.32 {d15[0]}, [DUMMY]
2593 vdup.8 d15, d15[3]
2594 .endm
2595
2596 .macro pixman_composite_over_0565_n_0565_cleanup
2597 vpop {d8-d15}
2598 .endm
2599
2600 generate_composite_function \
2601 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2602 FLAG_DST_READWRITE, \
2603 8, /* number of pixels, processed in a single block */ \
2604 5, /* prefetch distance */ \
2605 pixman_composite_over_0565_n_0565_init, \
2606 pixman_composite_over_0565_n_0565_cleanup, \
2607 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2608 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2609 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2610 28, /* dst_w_basereg */ \
2611 10, /* dst_r_basereg */ \
2612 8, /* src_basereg */ \
2613 15 /* mask_basereg */
2614
2615 /******************************************************************************/
2616
2617 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2618 /* mask is in d15 */
2619 convert_0565_to_x888 q4, d2, d1, d0
2620 convert_0565_to_x888 q5, d6, d5, d4
2621 /* source pixel data is in {d0, d1, d2, XX} */
2622 /* destination pixel data is in {d4, d5, d6, XX} */
2623 vmull.u8 q6, d15, d2
2624 vmull.u8 q5, d15, d1
2625 vmull.u8 q4, d15, d0
2626 vrshr.u16 q12, q6, #8
2627 vrshr.u16 q11, q5, #8
2628 vrshr.u16 q10, q4, #8
2629 vraddhn.u16 d2, q6, q12
2630 vraddhn.u16 d1, q5, q11
2631 vraddhn.u16 d0, q4, q10
2632 .endm
2633
2634 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2635 vqadd.u8 q0, q0, q2
2636 vqadd.u8 q1, q1, q3
2637 /* 32bpp result is in {d0, d1, d2, XX} */
2638 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2639 .endm
2640
2641 /* TODO: expand macros and do better instruction scheduling */
2642 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2643 fetch_mask_pixblock
2644 pixman_composite_add_0565_8_0565_process_pixblock_tail
2645 fetch_src_pixblock
2646 vld1.16 {d10, d11}, [DST_R, :128]!
2647 cache_preload 8, 8
2648 pixman_composite_add_0565_8_0565_process_pixblock_head
2649 vst1.16 {d28, d29}, [DST_W, :128]!
2650 .endm
2651
2652 generate_composite_function \
2653 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2654 FLAG_DST_READWRITE, \
2655 8, /* number of pixels, processed in a single block */ \
2656 5, /* prefetch distance */ \
2657 default_init_need_all_regs, \
2658 default_cleanup_need_all_regs, \
2659 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2660 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2661 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2662 28, /* dst_w_basereg */ \
2663 10, /* dst_r_basereg */ \
2664 8, /* src_basereg */ \
2665 15 /* mask_basereg */
2666
2667 /******************************************************************************/
2668
2669 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2670 /* mask is in d15 */
2671 convert_0565_to_x888 q5, d6, d5, d4
2672 /* destination pixel data is in {d4, d5, d6, xx} */
2673 vmvn.8 d24, d15 /* get inverted alpha */
2674 /* now do alpha blending */
2675 vmull.u8 q8, d24, d4
2676 vmull.u8 q9, d24, d5
2677 vmull.u8 q10, d24, d6
2678 .endm
2679
2680 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2681 vrshr.u16 q14, q8, #8
2682 vrshr.u16 q15, q9, #8
2683 vrshr.u16 q12, q10, #8
2684 vraddhn.u16 d0, q14, q8
2685 vraddhn.u16 d1, q15, q9
2686 vraddhn.u16 d2, q12, q10
2687 /* 32bpp result is in {d0, d1, d2, XX} */
2688 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2689 .endm
2690
2691 /* TODO: expand macros and do better instruction scheduling */
2692 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2693 fetch_src_pixblock
2694 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2695 vld1.16 {d10, d11}, [DST_R, :128]!
2696 cache_preload 8, 8
2697 pixman_composite_out_reverse_8_0565_process_pixblock_head
2698 vst1.16 {d28, d29}, [DST_W, :128]!
2699 .endm
2700
2701 generate_composite_function \
2702 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2703 FLAG_DST_READWRITE, \
2704 8, /* number of pixels, processed in a single block */ \
2705 5, /* prefetch distance */ \
2706 default_init_need_all_regs, \
2707 default_cleanup_need_all_regs, \
2708 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2709 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2710 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2711 28, /* dst_w_basereg */ \
2712 10, /* dst_r_basereg */ \
2713 15, /* src_basereg */ \
2714 0 /* mask_basereg */
2715
2716 /******************************************************************************/
2717
2718 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2719 /* src is in d0 */
2720 /* destination pixel data is in {d4, d5, d6, d7} */
2721 vmvn.8 d1, d0 /* get inverted alpha */
2722 /* now do alpha blending */
2723 vmull.u8 q8, d1, d4
2724 vmull.u8 q9, d1, d5
2725 vmull.u8 q10, d1, d6
2726 vmull.u8 q11, d1, d7
2727 .endm
2728
2729 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2730 vrshr.u16 q14, q8, #8
2731 vrshr.u16 q15, q9, #8
2732 vrshr.u16 q12, q10, #8
2733 vrshr.u16 q13, q11, #8
2734 vraddhn.u16 d28, q14, q8
2735 vraddhn.u16 d29, q15, q9
2736 vraddhn.u16 d30, q12, q10
2737 vraddhn.u16 d31, q13, q11
2738 /* 32bpp result is in {d28, d29, d30, d31} */
2739 .endm
2740
2741 /* TODO: expand macros and do better instruction scheduling */
2742 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2743 fetch_src_pixblock
2744 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2745 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2746 cache_preload 8, 8
2747 pixman_composite_out_reverse_8_8888_process_pixblock_head
2748 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2749 .endm
2750
2751 generate_composite_function \
2752 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2753 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2754 8, /* number of pixels, processed in a single block */ \
2755 5, /* prefetch distance */ \
2756 default_init, \
2757 default_cleanup, \
2758 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2759 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2760 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2761 28, /* dst_w_basereg */ \
2762 4, /* dst_r_basereg */ \
2763 0, /* src_basereg */ \
2764 0 /* mask_basereg */
2765
2766 /******************************************************************************/
2767
2768 generate_composite_function_nearest_scanline \
2769 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2770 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2771 8, /* number of pixels, processed in a single block */ \
2772 default_init, \
2773 default_cleanup, \
2774 pixman_composite_over_8888_8888_process_pixblock_head, \
2775 pixman_composite_over_8888_8888_process_pixblock_tail, \
2776 pixman_composite_over_8888_8888_process_pixblock_tail_head
2777
2778 generate_composite_function_nearest_scanline \
2779 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2780 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2781 8, /* number of pixels, processed in a single block */ \
2782 default_init, \
2783 default_cleanup, \
2784 pixman_composite_over_8888_0565_process_pixblock_head, \
2785 pixman_composite_over_8888_0565_process_pixblock_tail, \
2786 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2787 28, /* dst_w_basereg */ \
2788 4, /* dst_r_basereg */ \
2789 0, /* src_basereg */ \
2790 24 /* mask_basereg */
2791
2792 generate_composite_function_nearest_scanline \
2793 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2794 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2795 8, /* number of pixels, processed in a single block */ \
2796 default_init, \
2797 default_cleanup, \
2798 pixman_composite_src_8888_0565_process_pixblock_head, \
2799 pixman_composite_src_8888_0565_process_pixblock_tail, \
2800 pixman_composite_src_8888_0565_process_pixblock_tail_head
2801
2802 generate_composite_function_nearest_scanline \
2803 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2804 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2805 8, /* number of pixels, processed in a single block */ \
2806 default_init, \
2807 default_cleanup, \
2808 pixman_composite_src_0565_8888_process_pixblock_head, \
2809 pixman_composite_src_0565_8888_process_pixblock_tail, \
2810 pixman_composite_src_0565_8888_process_pixblock_tail_head
2811
2812 generate_composite_function_nearest_scanline \
2813 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2814 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2815 8, /* number of pixels, processed in a single block */ \
2816 default_init_need_all_regs, \
2817 default_cleanup_need_all_regs, \
2818 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2819 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2820 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2821 28, /* dst_w_basereg */ \
2822 4, /* dst_r_basereg */ \
2823 8, /* src_basereg */ \
2824 24 /* mask_basereg */
2825
2826 generate_composite_function_nearest_scanline \
2827 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2828 FLAG_DST_READWRITE, \
2829 8, /* number of pixels, processed in a single block */ \
2830 default_init_need_all_regs, \
2831 default_cleanup_need_all_regs, \
2832 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2833 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2834 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2835 28, /* dst_w_basereg */ \
2836 10, /* dst_r_basereg */ \
2837 8, /* src_basereg */ \
2838 15 /* mask_basereg */
2839
2840 /******************************************************************************/
2841
2842 /* Supplementary macro for setting function attributes */
2843 .macro pixman_asm_function fname
2844 .func fname
2845 .global fname
2846 #ifdef __ELF__
2847 .hidden fname
2848 .type fname, %function
2849 #endif
2850 fname:
2851 .endm
2852
2853 /*
2854 * Bilinear scaling support code that provides pixel fetching, color format
2855 * conversion, and interpolation as separate macros, which serve as the
2856 * basic building blocks for constructing bilinear scanline functions.
2857 */
2858
2859 .macro bilinear_load_8888 reg1, reg2, tmp
2860 mov TMP1, X, asr #16
2861 add X, X, UX
2862 add TMP1, TOP, TMP1, asl #2
2863 vld1.32 {reg1}, [TMP1], STRIDE
2864 vld1.32 {reg2}, [TMP1]
2865 .endm
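/*
 * X is a 16.16 fixed-point source coordinate: 'asr #16' extracts the
 * integer pixel index and 'asl #2' scales it to a byte offset (4 bytes
 * per 8888 pixel). The two loads fetch the horizontally adjacent texel
 * pair first from the top scanline and then from the bottom one (STRIDE
 * is set to BOTTOM - TOP by the main template below).
 */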
2866
2867 .macro bilinear_load_0565 reg1, reg2, tmp
2868 mov TMP1, X, asr #16
2869 add X, X, UX
2870 add TMP1, TOP, TMP1, asl #1
2871 vld1.32 {reg2[0]}, [TMP1], STRIDE
2872 vld1.32 {reg2[1]}, [TMP1]
2873 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
2874 .endm
2875
2876 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2877 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2878
2879 bilinear_load_8888 reg1, reg2, tmp1
2880 vmull.u8 acc1, reg1, d28
2881 vmlal.u8 acc1, reg2, d29
2882 bilinear_load_8888 reg3, reg4, tmp2
2883 vmull.u8 acc2, reg3, d28
2884 vmlal.u8 acc2, reg4, d29
2885 .endm
2886
2887 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2888 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2889 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2890
2891 bilinear_load_and_vertical_interpolate_two_8888 \
2892 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2893 bilinear_load_and_vertical_interpolate_two_8888 \
2894 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2895 .endm
2896
2897 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2898 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2899
2900 mov TMP1, X, asr #16
2901 add X, X, UX
2902 add TMP1, TOP, TMP1, asl #1
2903 mov TMP2, X, asr #16
2904 add X, X, UX
2905 add TMP2, TOP, TMP2, asl #1
2906 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2907 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2908 vld1.32 {acc2lo[1]}, [TMP1]
2909 vld1.32 {acc2hi[1]}, [TMP2]
2910 convert_0565_to_x888 acc2, reg3, reg2, reg1
2911 vzip.u8 reg1, reg3
2912 vzip.u8 reg2, reg4
2913 vzip.u8 reg3, reg4
2914 vzip.u8 reg1, reg2
2915 vmull.u8 acc1, reg1, d28
2916 vmlal.u8 acc1, reg2, d29
2917 vmull.u8 acc2, reg3, d28
2918 vmlal.u8 acc2, reg4, d29
2919 .endm
2920
2921 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2922 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2923 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2924
2925 mov TMP1, X, asr #16
2926 add X, X, UX
2927 add TMP1, TOP, TMP1, asl #1
2928 mov TMP2, X, asr #16
2929 add X, X, UX
2930 add TMP2, TOP, TMP2, asl #1
2931 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2932 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2933 vld1.32 {xacc2lo[1]}, [TMP1]
2934 vld1.32 {xacc2hi[1]}, [TMP2]
2935 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2936 mov TMP1, X, asr #16
2937 add X, X, UX
2938 add TMP1, TOP, TMP1, asl #1
2939 mov TMP2, X, asr #16
2940 add X, X, UX
2941 add TMP2, TOP, TMP2, asl #1
2942 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2943 vzip.u8 xreg1, xreg3
2944 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2945 vzip.u8 xreg2, xreg4
2946 vld1.32 {yacc2lo[1]}, [TMP1]
2947 vzip.u8 xreg3, xreg4
2948 vld1.32 {yacc2hi[1]}, [TMP2]
2949 vzip.u8 xreg1, xreg2
2950 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2951 vmull.u8 xacc1, xreg1, d28
2952 vzip.u8 yreg1, yreg3
2953 vmlal.u8 xacc1, xreg2, d29
2954 vzip.u8 yreg2, yreg4
2955 vmull.u8 xacc2, xreg3, d28
2956 vzip.u8 yreg3, yreg4
2957 vmlal.u8 xacc2, xreg4, d29
2958 vzip.u8 yreg1, yreg2
2959 vmull.u8 yacc1, yreg1, d28
2960 vmlal.u8 yacc1, yreg2, d29
2961 vmull.u8 yacc2, yreg3, d28
2962 vmlal.u8 yacc2, yreg4, d29
2963 .endm
2964
2965 .macro bilinear_store_8888 numpix, tmp1, tmp2
2966 .if numpix == 4
2967 vst1.32 {d0, d1}, [OUT, :128]!
2968 .elseif numpix == 2
2969 vst1.32 {d0}, [OUT, :64]!
2970 .elseif numpix == 1
2971 vst1.32 {d0[0]}, [OUT, :32]!
2972 .else
2973 .error bilinear_store_8888 numpix is unsupported
2974 .endif
2975 .endm
2976
2977 .macro bilinear_store_0565 numpix, tmp1, tmp2
2978 vuzp.u8 d0, d1
2979 vuzp.u8 d2, d3
2980 vuzp.u8 d1, d3
2981 vuzp.u8 d0, d2
2982 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2983 .if numpix == 4
2984 vst1.16 {d2}, [OUT, :64]!
2985 .elseif numpix == 2
2986 vst1.32 {d2[0]}, [OUT, :32]!
2987 .elseif numpix == 1
2988 vst1.16 {d2[0]}, [OUT, :16]!
2989 .else
2990 .error bilinear_store_0565 numpix is unsupported
2991 .endif
2992 .endm
2993
2994 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2995 bilinear_load_&src_fmt d0, d1, d2
2996 vmull.u8 q1, d0, d28
2997 vmlal.u8 q1, d1, d29
2998 /* 5 cycles bubble */
2999 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3000 vmlsl.u16 q0, d2, d30
3001 vmlal.u16 q0, d3, d30
3002 /* 5 cycles bubble */
3003 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3004 /* 3 cycles bubble */
3005 vmovn.u16 d0, q0
3006 /* 1 cycle bubble */
3007 bilinear_store_&dst_fmt 1, q2, q3
3008 .endm
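/*
 * Bilinear interpolation runs in two passes: vmull/vmlal with the
 * vertical weights d28 (wt) and d29 (wb) blends the two scanlines, then
 * vshll/vmlsl/vmlal with the horizontal weight in d30 computes
 * left * ((1 << BILINEAR_INTERPOLATION_BITS) - wx) + right * wx.
 * Both weight sets carry BILINEAR_INTERPOLATION_BITS of precision, hence
 * the final narrowing shift by twice that amount. The "bubble" comments
 * mark result-latency stalls that cannot be hidden when a single pixel
 * is processed alone.
 */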
3009
3010 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
3011 bilinear_load_and_vertical_interpolate_two_&src_fmt \
3012 q1, q11, d0, d1, d20, d21, d22, d23
3013 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3014 vmlsl.u16 q0, d2, d30
3015 vmlal.u16 q0, d3, d30
3016 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3017 vmlsl.u16 q10, d22, d31
3018 vmlal.u16 q10, d23, d31
3019 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3020 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3021 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3022 vadd.u16 q12, q12, q13
3023 vmovn.u16 d0, q0
3024 bilinear_store_&dst_fmt 2, q2, q3
3025 .endm
3026
3027 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3028 bilinear_load_and_vertical_interpolate_four_&src_fmt \
3029 q1, q11, d0, d1, d20, d21, d22, d23 \
3030 q3, q9, d4, d5, d16, d17, d18, d19
3031 pld [TMP1, PF_OFFS]
3032 sub TMP1, TMP1, STRIDE
3033 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3034 vmlsl.u16 q0, d2, d30
3035 vmlal.u16 q0, d3, d30
3036 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3037 vmlsl.u16 q10, d22, d31
3038 vmlal.u16 q10, d23, d31
3039 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3040 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
3041 vmlsl.u16 q2, d6, d30
3042 vmlal.u16 q2, d7, d30
3043 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
3044 pld [TMP2, PF_OFFS]
3045 vmlsl.u16 q8, d18, d31
3046 vmlal.u16 q8, d19, d31
3047 vadd.u16 q12, q12, q13
3048 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3049 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3050 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3051 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
3052 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3053 vmovn.u16 d0, q0
3054 vmovn.u16 d1, q2
3055 vadd.u16 q12, q12, q13
3056 bilinear_store_&dst_fmt 4, q2, q3
3057 .endm
3058
3059 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3060 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3061 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3062 .else
3063 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3064 .endif
3065 .endm
3066
3067 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3068 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3069 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
3070 .endif
3071 .endm
3072
3073 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3074 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3075 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3076 .else
3077 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3078 .endif
3079 .endm
3080
3081 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3083 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
3084 .else
3085 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3086 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3087 .endif
3088 .endm
3089
3090 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3091 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3092 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3093 .else
3094 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3095 .endif
3096 .endm
3097
3098 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3099 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3100 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3101 .else
3102 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3103 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3104 .endif
3105 .endm
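/*
 * The .ifdef dispatch above lets individual format combinations provide
 * hand-scheduled four/eight pixel fast paths (see the
 * have_bilinear_interpolate_* .set definitions further down); everything
 * else falls back to the generic building blocks.
 */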
3106
3107 .set BILINEAR_FLAG_UNROLL_4, 0
3108 .set BILINEAR_FLAG_UNROLL_8, 1
3109 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
3110
3111 /*
3112 * Main template macro for generating NEON optimized bilinear scanline
3113 * functions.
3114 *
3115 * The bilinear scanline scaler macro template uses the following arguments:
3116 * fname - name of the function to generate
3117 * src_fmt - source color format (8888 or 0565)
3118 * dst_fmt - destination color format (8888 or 0565)
3119 * src_bpp_shift, dst_bpp_shift - (1 << bpp_shift) is the pixel size in bytes
3120 * prefetch_distance - prefetch in the source image by that many pixels ahead
3121 * flags - combination of the BILINEAR_FLAG_* values defined above
3122 */
3123
3124 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3125 src_bpp_shift, dst_bpp_shift, \
3126 prefetch_distance, flags
3127
3128 pixman_asm_function fname
3129 OUT .req r0
3130 TOP .req r1
3131 BOTTOM .req r2
3132 WT .req r3
3133 WB .req r4
3134 X .req r5
3135 UX .req r6
3136 WIDTH .req ip
3137 TMP1 .req r3
3138 TMP2 .req r4
3139 PF_OFFS .req r7
3140 TMP3 .req r8
3141 TMP4 .req r9
3142 STRIDE .req r2
3143
3144 .fnstart
3145 mov ip, sp
3146 .save {r4, r5, r6, r7, r8, r9}
3147 push {r4, r5, r6, r7, r8, r9}
3148 mov PF_OFFS, #prefetch_distance
3149 ldmia ip, {WB, X, UX, WIDTH}
3150 mul PF_OFFS, PF_OFFS, UX
3151
3152 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3153 .vsave {d8-d15}
3154 vpush {d8-d15}
3155 .endif
3156
3157 sub STRIDE, BOTTOM, TOP
3158 .unreq BOTTOM
3159
3160 cmp WIDTH, #0
3161 ble 3f
3162
3163 vdup.u16 q12, X
3164 vdup.u16 q13, UX
3165 vdup.u8 d28, WT
3166 vdup.u8 d29, WB
3167 vadd.u16 d25, d25, d26
3168
3169 /* ensure good destination alignment */
3170 cmp WIDTH, #1
3171 blt 0f
3172 tst OUT, #(1 << dst_bpp_shift)
3173 beq 0f
3174 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3175 vadd.u16 q12, q12, q13
3176 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3177 sub WIDTH, WIDTH, #1
3178 0:
3179 vadd.u16 q13, q13, q13
3180 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3181 vadd.u16 q12, q12, q13
3182
3183 cmp WIDTH, #2
3184 blt 0f
3185 tst OUT, #(1 << (dst_bpp_shift + 1))
3186 beq 0f
3187 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3188 sub WIDTH, WIDTH, #2
3189 0:
3190 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3191 /*********** 8 pixels per iteration *****************/
3192 cmp WIDTH, #4
3193 blt 0f
3194 tst OUT, #(1 << (dst_bpp_shift + 2))
3195 beq 0f
3196 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3197 sub WIDTH, WIDTH, #4
3198 0:
3199 subs WIDTH, WIDTH, #8
3200 blt 1f
3201 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3202 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3203 subs WIDTH, WIDTH, #8
3204 blt 5f
3205 0:
3206 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3207 subs WIDTH, WIDTH, #8
3208 bge 0b
3209 5:
3210 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3211 1:
3212 tst WIDTH, #4
3213 beq 2f
3214 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3215 2:
3216 .else
3217 /*********** 4 pixels per iteration *****************/
3218 subs WIDTH, WIDTH, #4
3219 blt 1f
3220 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3221 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3222 subs WIDTH, WIDTH, #4
3223 blt 5f
3224 0:
3225 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3226 subs WIDTH, WIDTH, #4
3227 bge 0b
3228 5:
3229 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3230 1:
3231 /****************************************************/
3232 .endif
3233 /* handle the remaining trailing pixels */
3234 tst WIDTH, #2
3235 beq 2f
3236 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3237 2:
3238 tst WIDTH, #1
3239 beq 3f
3240 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3241 3:
3242 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3243 vpop {d8-d15}
3244 .endif
3245 pop {r4, r5, r6, r7, r8, r9}
3246 bx lr
3247 .fnend
3248
3249 .unreq OUT
3250 .unreq TOP
3251 .unreq WT
3252 .unreq WB
3253 .unreq X
3254 .unreq UX
3255 .unreq WIDTH
3256 .unreq TMP1
3257 .unreq TMP2
3258 .unreq PF_OFFS
3259 .unreq TMP3
3260 .unreq TMP4
3261 .unreq STRIDE
3262 .endfunc
3263
3264 .endm
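/*
 * A generated function first peels off 1, 2 (and, when unrolling by 8,
 * 4) leading pixels until OUT is sufficiently aligned for the vector
 * stores, runs the unrolled pipelined main loop, and finally handles the
 * remaining trailing pixels with the smaller building blocks.
 */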
3265
3266 /*****************************************************************************/
3267
3268 .set have_bilinear_interpolate_four_pixels_8888_8888, 1

.macro bilinear_interpolate_four_pixels_8888_8888_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
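
/*
 * The "head" above fetches the two-pixel top/bottom pairs for four
 * output pixels, performs the vertical interpolation with the u8
 * weights in d28/d29 (results in q8-q11) and begins the horizontal
 * pass; the matching "tail" completes that pass and stores the result.
 */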

.macro bilinear_interpolate_four_pixels_8888_8888_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm
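
/*
 * The vshll/vmlsl/vmlal triples above implement the horizontal pass
 * without materializing the left-hand weight. Per channel, with l/r the
 * vertically blended left/right columns and wr the fractional x weight
 * (illustrative names):
 *
 *   uint32_t acc = (uint32_t) l << BILINEAR_INTERPOLATION_BITS; // vshll.u16
 *   acc -= l * wr;                                              // vmlsl.u16
 *   acc += r * wr;                                              // vmlal.u16
 *   // acc == l * ((1 << BILINEAR_INTERPOLATION_BITS) - wr) + r * wr
 *   uint8_t out = acc >> (2 * BILINEAR_INTERPOLATION_BITS);     // vshrn + vmovn
 */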

.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vst1.32   {d6, d7}, [OUT, :128]!
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
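
/*
 * "tail_head" interleaves the final math and the store of one group of
 * four pixels with the address generation and loads for the next group,
 * hiding NEON result latency. The main loop produced by
 * generate_bilinear_scanline_func therefore has this shape (C sketch of
 * the control flow only):
 *
 *   if ((width -= 4) >= 0)
 *   {
 *       head ();                  // start the first group
 *       while ((width -= 4) >= 0)
 *           tail_head ();         // finish group N, start group N + 1
 *       tail ();                  // finish the last group
 *   }
 */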

/*****************************************************************************/

.set have_bilinear_interpolate_eight_pixels_8888_0565, 1

.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm
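
/*
 * The vuzp sequence above deinterleaves the eight a8r8g8b8 results held
 * in d8-d11 into planar per-channel form, and vshll.u8 + vsri.u16 then
 * pack each pixel down to r5g6b5. Per pixel this is equivalent to
 * (a sketch):
 *
 *   uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
 *   {
 *       return ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
 *   }
 */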

.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8   d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.u8   d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8   d9, d11
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8   d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32   {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm

/*****************************************************************************/

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4
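
/*
 * Each invocation above instantiates one scanline function. The numeric
 * arguments are the source and destination bpp shifts (log2 of bytes per
 * pixel: 2 for 8888, 1 for 0565) and the prefetch distance in pixels;
 * the flags pick the unrolling factor and whether d8-d15 are used (and
 * therefore saved/restored). On the C side the functions are declared
 * roughly as follows (a sketch inferred from the register usage; the
 * authoritative prototypes live in pixman's ARM binding headers):
 *
 *   void
 *   pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
 *       uint32_t       *dst,     // OUT:    destination scanline
 *       const uint32_t *top,     // TOP:    upper source scanline
 *       const uint32_t *bottom,  // BOTTOM: lower source scanline
 *       int             wt,      // vertical weight of the top row
 *       int             wb,      // vertical weight of the bottom row
 *       pixman_fixed_t  x,       // 16.16 fixed-point source x
 *       pixman_fixed_t  ux,      // 16.16 fixed-point x increment
 *       int             width);  // number of pixels to produce
 */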
