gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which expose some new or interesting features) are extensively
 * commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 * - pixman_composite_over_8888_0565_asm_neon
 * - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to act
 * as a NOP to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously, so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of functions can't support advanced prefetch and fall
 * back to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs an OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need
 * to deal with the data in NEON registers.
 *
 * The recommended NEON register allocation in general is the following:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data, and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5 - contain loaded destination pixels (they are needed)
 * d28, d29 - place for storing the result (destination pixels)
 */
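
/*
 * Before diving into the NEON code, it may help to see the per-pixel
 * arithmetic spelled out in scalar C. The sketch below is only
 * illustrative (the helper names are invented for this comment and are
 * not actual pixman functions); the authoritative C reference is
 * fast_composite_over_8888_0565 in pixman-fast-path.c:
 *
 *     #include <stdint.h>
 *
 *     // accurate x * y / 255 approximation (pixman's MUL_UN8 rounding)
 *     static inline uint8_t mul_un8 (uint8_t x, uint8_t y)
 *     {
 *         uint16_t t = (uint16_t)x * y + 0x80;
 *         return (uint8_t)((t + (t >> 8)) >> 8);
 *     }
 *
 *     static inline uint8_t add_sat_un8 (uint16_t x)
 *     {
 *         return x > 255 ? 255 : (uint8_t)x;
 *     }
 *
 *     static uint16_t
 *     over_8888_0565_pixel (uint32_t src, uint16_t dst)
 *     {
 *         uint8_t ia = 255 - (src >> 24);  // inverted source alpha
 *         // expand r5g6b5 to 8 bits per channel (replicate top bits)
 *         uint8_t r = ((dst >> 8) & 0xf8) | (dst >> 13);
 *         uint8_t g = ((dst >> 3) & 0xfc) | ((dst >> 9) & 0x03);
 *         uint8_t b = ((dst << 3) & 0xf8) | ((dst >> 2) & 0x07);
 *         // OVER: result = src + dst * (1 - src_alpha), saturating
 *         r = add_sat_un8 (((src >> 16) & 0xff) + mul_un8 (r, ia));
 *         g = add_sat_un8 (((src >> 8) & 0xff) + mul_un8 (g, ia));
 *         b = add_sat_un8 ((src & 0xff) + mul_un8 (b, ia));
 *         // repack to r5g6b5
 *         return ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
 *     }
 *
 * The NEON code below computes exactly this for 8 pixels at a time; the
 * division by 255 shows up as the vrshr/vraddhn instruction pairs.
 */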

/*
 * Step one. We need to have some code to do some arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work
 * can be split into two parts in any arbitrary way without affecting
 * correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by doing R, G, B, A color
 * components deinterleaving for 32bpp pixel formats (and this feature is
 * used in the 'pixman_composite_over_8888_0565_asm_neon' function). So
 * instead of having 8 packed pixels in the {d0, d1, d2, d3} registers, we
 * actually use the d0 register for the blue channel (a vector of eight
 * 8-bit values), the d1 register for green, d2 for red and d3 for alpha.
 * This simple conversion can also be done with a few NEON instructions:
 *
 * Packed to planar conversion:
 * vuzp.8 d0, d1
 * vuzp.8 d2, d3
 * vuzp.8 d1, d3
 * vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 * vzip.8 d0, d2
 * vzip.8 d1, d3
 * vzip.8 d2, d3
 * vzip.8 d0, d1
 *
 * But pixels can also be loaded directly in planar format using the VLD4.8
 * NEON instruction. It is 1 cycle slower than VLD1.32, so this is not
 * always desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16 d6, q2, #8
    vshrn.u16 d7, q2, #3
    vsli.u16 q2, q2, #5
    vsri.u8 d6, d6, #5
    vmvn.8 d3, d3 /* invert source alpha */
    vsri.u8 d7, d7, #6
    vshrn.u16 d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8 q10, d3, d6
    vmull.u8 q11, d3, d7
    vmull.u8 q12, d3, d30
    vrshr.u16 q13, q10, #8
    vrshr.u16 q3, q11, #8
    vrshr.u16 q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8 d16, d2, d20
    vqadd.u8 q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8 q14, d16, #8
    vshll.u8 q8, d19, #8
    vshll.u8 q9, d18, #8
    vsri.u16 q14, q8, #5
    vsri.u16 q14, q9, #11
.endm

/*
 * OK, now we have almost everything that we need. Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *
 * head
 * while (...) {
 *     tail
 *     head
 * }
 * tail
 *
 * It may look a bit weird, but this setup allows us to hide instruction
 * latencies better and also utilize dual-issue capability more
 * efficiently (make pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *     pixman_composite_over_8888_0565_process_pixblock_tail
 *     vst1.16 {d28, d29}, [DST_W, :128]!
 *     vld1.16 {d4, d5}, [DST_R, :128]!
 *     vld4.32 {d0, d1, d2, d3}, [SRC]!
 *     pixman_composite_over_8888_0565_process_pixblock_head
 *     cache_preload 8, 8
 *
 * Now it also has some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the next one with just arithmetic.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in bulk. Additionally, the destination buffer is already
 * 16-byte aligned here (which is good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are aliases for the ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used for
 * prefetching data into the CPU L2 cache and improves performance when
 * dealing with images which are far larger than the cache. It takes one
 * argument (actually two, but they need to be the same here) - the number
 * of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can provide
 * some details about this macro. Moreover, if good performance is needed
 * the code from this macro needs to be copied into the '*_tail_head' macro
 * and mixed with the rest of the code for optimal instruction scheduling.
 * We are actually doing that below.
 *
 * Now, after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from the '*_head', '*_tail' or
 * 'cache_preload' macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8 d16, d2, d20
    vld1.16 {d4, d5}, [DST_R, :128]!
        vqadd.u8 q9, q0, q11
    vshrn.u16 d6, q2, #8
    fetch_src_pixblock
    vshrn.u16 d7, q2, #3
    vsli.u16 q2, q2, #5
        vshll.u8 q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8 q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8 d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8 d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8 d7, d7, #6
    vshrn.u16 d30, q2, #2
    vmull.u8 q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8 q11, d3, d7
    vmull.u8 q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16 q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8 q9, d18, #8
    vrshr.u16 q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16 q3, q11, #8
    vrshr.u16 q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16 q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16 {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16 {d28, d29}, [DST_W, :128]!
    vld1.16 {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We are specifying the name of the
 * function which we want to get, the number of bits per pixel for the
 * source, mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 * FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                           and written; for a write-only buffer we would
 *                           use the FLAG_DST_WRITEONLY flag instead
 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                           and separate color channels for the 32bpp format.
 * The next things are:
 * - the number of pixels processed per iteration (8 in this case, because
 *   that's the maximum that can fit into four 64-bit NEON registers).
 * - the prefetch distance, measured in pixel blocks. In this case it is
 *   5 blocks of 8 pixels each, i.e. 40 pixels, or up to 160 bytes. The
 *   optimal prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros: here these are 'default_init' and
 * 'default_cleanup', which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we implemented above.
 *
 * The last part is the NEON register allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16 d6, q2, #8
    vshrn.u16 d7, q2, #3
    vsli.u16 q2, q2, #5
    vsri.u8 d6, d6, #5
    vsri.u8 d7, d7, #6
    vshrn.u16 d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8 q10, d3, d6
    vmull.u8 q11, d3, d7
    vmull.u8 q12, d3, d30
    vrshr.u16 q13, q10, #8
    vrshr.u16 q3, q11, #8
    vrshr.u16 q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8 d16, d2, d20
    vqadd.u8 q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8 q14, d16, #8
    vshll.u8 q8, d19, #8
    vshll.u8 q9, d18, #8
    vsri.u16 q14, q8, #5
    vsri.u16 q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16 {d4, d5}, [DST_R, :128]!
    vst1.16 {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

.macro pixman_composite_over_n_0565_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d3[0]}, [DUMMY]
    vdup.8 d0, d3[0]
    vdup.8 d1, d3[1]
    vdup.8 d2, d3[2]
    vdup.8 d3, d3[3]
    vmvn.8 d3, d3 /* invert source alpha */
.endm
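
/*
 * A note on the init macro above: the solid source color arrives on the
 * stack as a single a8r8g8b8 word. One 32-bit load plus four vdup.8
 * instructions splat its bytes into per-channel vectors, and the alpha
 * is inverted once up front because the per-pixel math only ever needs
 * (255 - alpha). In scalar C terms (illustrative only, assuming a
 * little-endian target where the low byte holds blue):
 *
 *     uint32_t color = ...;                // a8r8g8b8 solid source
 *     uint8_t  b  = color & 0xff;          // splatted into d0
 *     uint8_t  g  = (color >> 8) & 0xff;   // splatted into d1
 *     uint8_t  r  = (color >> 16) & 0xff;  // splatted into d2
 *     uint8_t  ia = 255 - (color >> 24);   // splatted into d3
 */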

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8 q8, d1, #8
    vshll.u8 q14, d2, #8
    vshll.u8 q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16 q14, q8, #5
    vsri.u16 q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    vsri.u16 q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u16 q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8 q8, d1, #8
    vst1.16 {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8 q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8 q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16 d30, q0, #8
    vshrn.u16 d29, q0, #3
    vsli.u16 q0, q0, #5
    vmov.u8 d31, #255
    vsri.u8 d30, d30, #5
    vsri.u8 d29, d29, #6
    vshrn.u16 d28, q0, #2
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8 q14, q0, q2
    vqadd.u8 q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm
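
/*
 * The ADD operator is a plain per-byte saturating add, which is why the
 * head macro above is just two vqadd.u8 instructions and the tail macro
 * is empty. Per byte, in illustrative C:
 *
 *     uint16_t t = (uint16_t)s + d;
 *     result = t > 255 ? 255 : (uint8_t)t;
 */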

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8 q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8 q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8 q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8 q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8 d24, d3 /* get inverted alpha */
    /* do alpha blending */
    vmull.u8 q8, d24, d4
    vmull.u8 q9, d24, d5
    vmull.u8 q10, d24, d6
    vmull.u8 q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16 q14, q8, #8
    vrshr.u16 q15, q9, #8
    vrshr.u16 q12, q10, #8
    vrshr.u16 q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16 q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16 q15, q9, #8
    vrshr.u16 q12, q10, #8
    vrshr.u16 q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8 d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8 q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8 q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8 q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8 q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8 q14, q0, q14
    vqadd.u8 q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16 q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16 q15, q9, #8
    vrshr.u16 q12, q10, #8
    vrshr.u16 q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8 q14, q0, q14
    vqadd.u8 q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8 d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8 q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8 q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8 q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8 q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8 q8, d24, d4
    vmull.u8 q9, d24, d5
    vmull.u8 q10, d24, d6
    vmull.u8 q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16 q14, q8, #8
    vrshr.u16 q15, q9, #8
    vrshr.u16 q2, q10, #8
    vrshr.u16 q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8 q14, q0, q14
    vqadd.u8 q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    vrshr.u16 q14, q8, #8
    vrshr.u16 q15, q9, #8
    vrshr.u16 q2, q10, #8
    vrshr.u16 q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
    vqadd.u8 q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vqadd.u8 q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8 q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8 q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8 q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8 q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d3[0]}, [DUMMY]
    vdup.8 d0, d3[0]
    vdup.8 d1, d3[1]
    vdup.8 d2, d3[2]
    vdup.8 d3, d3[3]
    vmvn.8 d24, d3 /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    vrshr.u16 q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vrshr.u16 q15, q9, #8
    vrshr.u16 q12, q10, #8
    vrshr.u16 q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8 q14, q0, q14
    vqadd.u8 q15, q1, q15
    vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8 d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8 q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8 q9, d22, d5
    vmull.u8 q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8 q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d7[0]}, [DUMMY]
    vdup.8 d4, d7[0]
    vdup.8 d5, d7[1]
    vdup.8 d6, d7[2]
    vdup.8 d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8 q0, d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8 q1, d24, d9
    vmull.u8 q6, d24, d10
    vmull.u8 q7, d24, d11
    vshrn.u16 d6, q2, #8    /* convert DST_R data to 32-bpp (part1) */
    vshrn.u16 d7, q2, #3
    vsli.u16 q2, q2, #5
    vrshr.u16 q8, q0, #8    /* IN for SRC pixels (part2) */
    vrshr.u16 q9, q1, #8
    vrshr.u16 q10, q6, #8
    vrshr.u16 q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8 d6, d6, #5      /* convert DST_R data to 32-bpp (part2) */
    vsri.u8 d7, d7, #6
    vmvn.8 d3, d3
    vshrn.u16 d30, q2, #2
    vmull.u8 q8, d3, d6     /* now do alpha blending */
    vmull.u8 q9, d3, d7
    vmull.u8 q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16 q13, q8, #8
    vrshr.u16 q11, q9, #8
    vrshr.u16 q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8 d16, d2, d16
    /* 1 cycle bubble */
    vqadd.u8 q9, q0, q13
    vshll.u8 q14, d16, #8   /* convert to 16bpp */
    vshll.u8 q8, d19, #8
    vshll.u8 q9, d18, #8
    vsri.u16 q14, q8, #5
    /* 1 cycle bubble */
    vsri.u16 q14, q9, #11
.endm
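
/*
 * With a mask involved, the head/tail pair above computes
 * (src IN mask) OVER dst: every source channel (including alpha) is
 * first multiplied by the 8-bit mask value, and the result is then
 * composited exactly like in pixman_composite_over_8888_0565_asm_neon.
 * Per channel, in illustrative pseudo-C (using the mul_un8 helper
 * sketched earlier in this file; sat_add is a saturating byte add):
 *
 *     s' = mul_un8 (s, m);                      // IN:   q0/q1/q6/q7
 *     a' = mul_un8 (a, m);
 *     d' = sat_add (s', mul_un8 (d, 255 - a')); // OVER: q8/q9/q10
 */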

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16 {d4, d5}, [DST_R, :128]!
    vshrn.u16 d6, q2, #8
    fetch_mask_pixblock
    vshrn.u16 d7, q2, #3
    fetch_src_pixblock
    vmull.u8 q6, d24, d10
    vrshr.u16 q13, q8, #8
    vrshr.u16 q11, q9, #8
    vrshr.u16 q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8 d16, d2, d16
    vmull.u8 q1, d24, d9
    vqadd.u8 q9, q0, q13
    vshll.u8 q14, d16, #8
    vmull.u8 q0, d24, d8
    vshll.u8 q8, d19, #8
    vshll.u8 q9, d18, #8
    vsri.u16 q14, q8, #5
    vmull.u8 q7, d24, d11
    vsri.u16 q14, q9, #11

    cache_preload 8, 8

    vsli.u16 q2, q2, #5
    vrshr.u16 q8, q0, #8
    vrshr.u16 q9, q1, #8
    vrshr.u16 q10, q6, #8
    vrshr.u16 q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8 d6, d6, #5
    vsri.u8 d7, d7, #6
    vmvn.8 d3, d3
    vshrn.u16 d30, q2, #2
    vst1.16 {d28, d29}, [DST_W, :128]!
    vmull.u8 q8, d3, d6
    vmull.u8 q9, d3, d7
    vmull.u8 q10, d3, d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs special initialization: the solid source pixel
 * data is fetched from the stack at the ARGS_STACK_OFFSET offset, split
 * into color components and replicated into the d8-d11 registers.
 * Additionally, this function needs all the NEON registers, so it has
 * to save the d8-d15 registers, which are callee saved according to the
 * ABI. These registers are restored in the 'cleanup' macro. All the
 * other NEON registers are caller saved, so they can be clobbered
 * freely without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave {d8-d15}
    vpush {d8-d15}
    vld1.32 {d11[0]}, [DUMMY]
    vdup.8 d8, d11[0]
    vdup.8 d9, d11[1]
    vdup.8 d10, d11[2]
    vdup.8 d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    .vsave {d8-d15}
    vpush {d8-d15}
    vld1.32 {d24[0]}, [DUMMY]
    vdup.8 d24, d24[3]
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d0[0]}, [DUMMY]
    vsli.u64 d0, d0, #8
    vsli.u64 d0, d0, #16
    vsli.u64 d0, d0, #32
    vorr d1, d0, d0
    vorr q1, q0, q0
.endm
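
/*
 * The three vsli.u64 (shift left and insert) instructions above
 * replicate the low byte of the loaded word across all 64 bits of d0,
 * doubling the filled width at each step; the vorr copies then spread
 * the result to d1-d3 so that a whole 32-pixel block can be stored per
 * iteration. Scalar C model of the replication (illustrative only,
 * assuming the 8-bit source value sits in the low byte):
 *
 *     uint64_t v = color & 0xff;
 *     v |= v << 8;     // vsli.u64 d0, d0, #8  -> 2 bytes filled
 *     v |= v << 16;    // vsli.u64 d0, d0, #16 -> 4 bytes filled
 *     v |= v << 32;    // vsli.u64 d0, d0, #32 -> 8 bytes filled
 */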

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d0[0]}, [DUMMY]
    vsli.u64 d0, d0, #16
    vsli.u64 d0, d0, #32
    vorr d1, d0, d0
    vorr q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32 {d0[0]}, [DUMMY]
    vsli.u64 d0, d0, #32
    vorr d1, d0, d0
    vorr q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr q0, q0, q2
    vorr q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr q0, q0, q2
    vorr q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    vmov.u8 q2, #0xFF
    vshl.u32 q2, q2, #24
.endm
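
/*
 * The init macro above materializes the constant 0xff000000 in each
 * 32-bit lane of q2 (set every byte to 0xff, then shift each lane left
 * by 24). ORing that constant into an x8r8g8b8 pixel forces the
 * undefined 'x' byte to fully opaque alpha; per pixel this is simply
 * (illustrative C):
 *
 *     uint32_t dst = src | 0xff000000;
 */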
michael@0 1207
michael@0 1208 generate_composite_function \
michael@0 1209 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
michael@0 1210 FLAG_DST_WRITEONLY, \
michael@0 1211 8, /* number of pixels, processed in a single block */ \
michael@0 1212 10, /* prefetch distance */ \
michael@0 1213 pixman_composite_src_x888_8888_init, \
michael@0 1214 default_cleanup, \
michael@0 1215 pixman_composite_src_x888_8888_process_pixblock_head, \
michael@0 1216 pixman_composite_src_x888_8888_process_pixblock_tail, \
michael@0 1217 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
michael@0 1218 0, /* dst_w_basereg */ \
michael@0 1219 0, /* dst_r_basereg */ \
michael@0 1220 0, /* src_basereg */ \
michael@0 1221 0 /* mask_basereg */
michael@0 1222
michael@0 1223 /******************************************************************************/
michael@0 1224
michael@0 1225 .macro pixman_composite_src_n_8_8888_process_pixblock_head
michael@0 1226 /* expecting solid source in {d0, d1, d2, d3} */
michael@0 1227 /* mask is in d24 (d25, d26, d27 are unused) */
michael@0 1228
michael@0 1229 /* in */
michael@0 1230 vmull.u8 q8, d24, d0
michael@0 1231 vmull.u8 q9, d24, d1
michael@0 1232 vmull.u8 q10, d24, d2
michael@0 1233 vmull.u8 q11, d24, d3
michael@0 1234 vrsra.u16 q8, q8, #8
michael@0 1235 vrsra.u16 q9, q9, #8
michael@0 1236 vrsra.u16 q10, q10, #8
michael@0 1237 vrsra.u16 q11, q11, #8
michael@0 1238 .endm
michael@0 1239
michael@0 1240 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
michael@0 1241 vrshrn.u16 d28, q8, #8
michael@0 1242 vrshrn.u16 d29, q9, #8
michael@0 1243 vrshrn.u16 d30, q10, #8
michael@0 1244 vrshrn.u16 d31, q11, #8
michael@0 1245 .endm
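
/*
 * The head/tail pair above scales each channel of the solid source by the
 * 8-bit mask, using the usual NEON approximation of a rounded division by
 * 255. A scalar C sketch of one channel (mul_div_255 is just a name used in
 * this note, not a pixman function):
 *
 *     static inline uint8_t mul_div_255 (uint8_t m, uint8_t s)
 *     {
 *         uint16_t t = (uint16_t)m * s;   /- vmull.u8          -/
 *         t += (t + 128) >> 8;            /- vrsra.u16 ..., #8 -/
 *         return (t + 128) >> 8;          /- vrshrn.u16 ..., #8 -/
 *     }
 */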
michael@0 1246
michael@0 1247 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
michael@0 1248 fetch_mask_pixblock
michael@0 1249 PF add PF_X, PF_X, #8
michael@0 1250 vrshrn.u16 d28, q8, #8
michael@0 1251 PF tst PF_CTL, #0x0F
michael@0 1252 vrshrn.u16 d29, q9, #8
michael@0 1253 PF addne PF_X, PF_X, #8
michael@0 1254 vrshrn.u16 d30, q10, #8
michael@0 1255 PF subne PF_CTL, PF_CTL, #1
michael@0 1256 vrshrn.u16 d31, q11, #8
michael@0 1257 PF cmp PF_X, ORIG_W
michael@0 1258 vmull.u8 q8, d24, d0
michael@0 1259 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
michael@0 1260 vmull.u8 q9, d24, d1
michael@0 1261 PF subge PF_X, PF_X, ORIG_W
michael@0 1262 vmull.u8 q10, d24, d2
michael@0 1263 PF subges PF_CTL, PF_CTL, #0x10
michael@0 1264 vmull.u8 q11, d24, d3
michael@0 1265 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
michael@0 1266 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1267 vrsra.u16 q8, q8, #8
michael@0 1268 vrsra.u16 q9, q9, #8
michael@0 1269 vrsra.u16 q10, q10, #8
michael@0 1270 vrsra.u16 q11, q11, #8
michael@0 1271 .endm
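
/*
 * The PF-prefixed instructions interleaved above drive the software
 * prefetcher for the mask scanline. Roughly, in C-like pseudocode (a sketch
 * of the intent only; the real code keeps this state in PF_X/PF_CTL and
 * uses conditional execution instead of branches):
 *
 *     PF_X += 8;                       /- advance by one pixel block    -/
 *     if (PF_CTL & 0x0f) {             /- advance budget left this row? -/
 *         PF_X += 8;
 *         PF_CTL -= 1;
 *     }
 *     pld (PF_MASK + (PF_X << mask_bpp_shift));
 *     if (PF_X >= ORIG_W) {            /- wrapped past scanline end     -/
 *         PF_X -= ORIG_W;
 *         PF_CTL -= 0x10;
 *         if (PF_CTL >= 0)             /- rows still pending            -/
 *             PF_MASK += MASK_STRIDE << mask_bpp_shift;
 *     }
 */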
michael@0 1272
michael@0 1273 .macro pixman_composite_src_n_8_8888_init
michael@0 1274 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1275 vld1.32 {d3[0]}, [DUMMY]
michael@0 1276 vdup.8 d0, d3[0]
michael@0 1277 vdup.8 d1, d3[1]
michael@0 1278 vdup.8 d2, d3[2]
michael@0 1279 vdup.8 d3, d3[3]
michael@0 1280 .endm
michael@0 1281
michael@0 1282 .macro pixman_composite_src_n_8_8888_cleanup
michael@0 1283 .endm
michael@0 1284
michael@0 1285 generate_composite_function \
michael@0 1286 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
michael@0 1287 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 1288 8, /* number of pixels, processed in a single block */ \
michael@0 1289 5, /* prefetch distance */ \
michael@0 1290 pixman_composite_src_n_8_8888_init, \
michael@0 1291 pixman_composite_src_n_8_8888_cleanup, \
michael@0 1292 pixman_composite_src_n_8_8888_process_pixblock_head, \
michael@0 1293 pixman_composite_src_n_8_8888_process_pixblock_tail, \
michael@0 1294 pixman_composite_src_n_8_8888_process_pixblock_tail_head
michael@0 1295 
michael@0 1296 /******************************************************************************/
michael@0 1297
michael@0 1298 .macro pixman_composite_src_n_8_8_process_pixblock_head
michael@0 1299 vmull.u8 q0, d24, d16
michael@0 1300 vmull.u8 q1, d25, d16
michael@0 1301 vmull.u8 q2, d26, d16
michael@0 1302 vmull.u8 q3, d27, d16
michael@0 1303 vrsra.u16 q0, q0, #8
michael@0 1304 vrsra.u16 q1, q1, #8
michael@0 1305 vrsra.u16 q2, q2, #8
michael@0 1306 vrsra.u16 q3, q3, #8
michael@0 1307 .endm
michael@0 1308
michael@0 1309 .macro pixman_composite_src_n_8_8_process_pixblock_tail
michael@0 1310 vrshrn.u16 d28, q0, #8
michael@0 1311 vrshrn.u16 d29, q1, #8
michael@0 1312 vrshrn.u16 d30, q2, #8
michael@0 1313 vrshrn.u16 d31, q3, #8
michael@0 1314 .endm
michael@0 1315
michael@0 1316 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
michael@0 1317 fetch_mask_pixblock
michael@0 1318 PF add PF_X, PF_X, #8
michael@0 1319 vrshrn.u16 d28, q0, #8
michael@0 1320 PF tst PF_CTL, #0x0F
michael@0 1321 vrshrn.u16 d29, q1, #8
michael@0 1322 PF addne PF_X, PF_X, #8
michael@0 1323 vrshrn.u16 d30, q2, #8
michael@0 1324 PF subne PF_CTL, PF_CTL, #1
michael@0 1325 vrshrn.u16 d31, q3, #8
michael@0 1326 PF cmp PF_X, ORIG_W
michael@0 1327 vmull.u8 q0, d24, d16
michael@0 1328 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
michael@0 1329 vmull.u8 q1, d25, d16
michael@0 1330 PF subge PF_X, PF_X, ORIG_W
michael@0 1331 vmull.u8 q2, d26, d16
michael@0 1332 PF subges PF_CTL, PF_CTL, #0x10
michael@0 1333 vmull.u8 q3, d27, d16
michael@0 1334 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
michael@0 1335 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1336 vrsra.u16 q0, q0, #8
michael@0 1337 vrsra.u16 q1, q1, #8
michael@0 1338 vrsra.u16 q2, q2, #8
michael@0 1339 vrsra.u16 q3, q3, #8
michael@0 1340 .endm
michael@0 1341
michael@0 1342 .macro pixman_composite_src_n_8_8_init
michael@0 1343 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1344 vld1.32 {d16[0]}, [DUMMY]
michael@0 1345 vdup.8 d16, d16[3]
michael@0 1346 .endm
michael@0 1347
michael@0 1348 .macro pixman_composite_src_n_8_8_cleanup
michael@0 1349 .endm
michael@0 1350
michael@0 1351 generate_composite_function \
michael@0 1352 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
michael@0 1353 FLAG_DST_WRITEONLY, \
michael@0 1354 32, /* number of pixels, processed in a single block */ \
michael@0 1355 5, /* prefetch distance */ \
michael@0 1356 pixman_composite_src_n_8_8_init, \
michael@0 1357 pixman_composite_src_n_8_8_cleanup, \
michael@0 1358 pixman_composite_src_n_8_8_process_pixblock_head, \
michael@0 1359 pixman_composite_src_n_8_8_process_pixblock_tail, \
michael@0 1360 pixman_composite_src_n_8_8_process_pixblock_tail_head
michael@0 1361
michael@0 1362 /******************************************************************************/
michael@0 1363
michael@0 1364 .macro pixman_composite_over_n_8_8888_process_pixblock_head
michael@0 1365 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
michael@0 1366 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
michael@0 1367 /* and destination data in {d4, d5, d6, d7} */
michael@0 1368 /* mask is in d24 (d25, d26, d27 are unused) */
michael@0 1369
michael@0 1370 /* in */
michael@0 1371 vmull.u8 q6, d24, d8
michael@0 1372 vmull.u8 q7, d24, d9
michael@0 1373 vmull.u8 q8, d24, d10
michael@0 1374 vmull.u8 q9, d24, d11
michael@0 1375 vrshr.u16 q10, q6, #8
michael@0 1376 vrshr.u16 q11, q7, #8
michael@0 1377 vrshr.u16 q12, q8, #8
michael@0 1378 vrshr.u16 q13, q9, #8
michael@0 1379 vraddhn.u16 d0, q6, q10
michael@0 1380 vraddhn.u16 d1, q7, q11
michael@0 1381 vraddhn.u16 d2, q8, q12
michael@0 1382 vraddhn.u16 d3, q9, q13
michael@0 1383 vmvn.8 d25, d3 /* get inverted alpha */
michael@0 1384 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
michael@0 1385 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
michael@0 1386 /* now do alpha blending */
michael@0 1387 vmull.u8 q8, d25, d4
michael@0 1388 vmull.u8 q9, d25, d5
michael@0 1389 vmull.u8 q10, d25, d6
michael@0 1390 vmull.u8 q11, d25, d7
michael@0 1391 .endm
michael@0 1392
michael@0 1393 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
michael@0 1394 vrshr.u16 q14, q8, #8
michael@0 1395 vrshr.u16 q15, q9, #8
michael@0 1396 vrshr.u16 q6, q10, #8
michael@0 1397 vrshr.u16 q7, q11, #8
michael@0 1398 vraddhn.u16 d28, q14, q8
michael@0 1399 vraddhn.u16 d29, q15, q9
michael@0 1400 vraddhn.u16 d30, q6, q10
michael@0 1401 vraddhn.u16 d31, q7, q11
michael@0 1402 vqadd.u8 q14, q0, q14
michael@0 1403 vqadd.u8 q15, q1, q15
michael@0 1404 .endm
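
/*
 * Head plus tail above amount to the standard OVER operator with a solid
 * source and an a8 mask. Per channel, as scalar C (a sketch; div_255 stands
 * for the rounded /255 approximation noted earlier, satadd_u8 for vqadd.u8):
 *
 *     s  = div_255 (src * m);                /- 'in'              -/
 *     ia = 255 - div_255 (src_alpha * m);    /- vmvn.8 d25, d3    -/
 *     d  = satadd_u8 (s, div_255 (d * ia));  /- blend             -/
 */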
michael@0 1405
michael@0 1406 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
michael@0 1407 vrshr.u16 q14, q8, #8
michael@0 1408 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1409 vrshr.u16 q15, q9, #8
michael@0 1410 fetch_mask_pixblock
michael@0 1411 vrshr.u16 q6, q10, #8
michael@0 1412 PF add PF_X, PF_X, #8
michael@0 1413 vrshr.u16 q7, q11, #8
michael@0 1414 PF tst PF_CTL, #0x0F
michael@0 1415 vraddhn.u16 d28, q14, q8
michael@0 1416 PF addne PF_X, PF_X, #8
michael@0 1417 vraddhn.u16 d29, q15, q9
michael@0 1418 PF subne PF_CTL, PF_CTL, #1
michael@0 1419 vraddhn.u16 d30, q6, q10
michael@0 1420 PF cmp PF_X, ORIG_W
michael@0 1421 vraddhn.u16 d31, q7, q11
michael@0 1422 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
michael@0 1423 vmull.u8 q6, d24, d8
michael@0 1424 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
michael@0 1425 vmull.u8 q7, d24, d9
michael@0 1426 PF subge PF_X, PF_X, ORIG_W
michael@0 1427 vmull.u8 q8, d24, d10
michael@0 1428 PF subges PF_CTL, PF_CTL, #0x10
michael@0 1429 vmull.u8 q9, d24, d11
michael@0 1430 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
michael@0 1431 vqadd.u8 q14, q0, q14
michael@0 1432 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
michael@0 1433 vqadd.u8 q15, q1, q15
michael@0 1434 vrshr.u16 q10, q6, #8
michael@0 1435 vrshr.u16 q11, q7, #8
michael@0 1436 vrshr.u16 q12, q8, #8
michael@0 1437 vrshr.u16 q13, q9, #8
michael@0 1438 vraddhn.u16 d0, q6, q10
michael@0 1439 vraddhn.u16 d1, q7, q11
michael@0 1440 vraddhn.u16 d2, q8, q12
michael@0 1441 vraddhn.u16 d3, q9, q13
michael@0 1442 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1443 vmvn.8 d25, d3
michael@0 1444 vmull.u8 q8, d25, d4
michael@0 1445 vmull.u8 q9, d25, d5
michael@0 1446 vmull.u8 q10, d25, d6
michael@0 1447 vmull.u8 q11, d25, d7
michael@0 1448 .endm
michael@0 1449
michael@0 1450 .macro pixman_composite_over_n_8_8888_init
michael@0 1451 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1452 .vsave {d8-d15}
michael@0 1453 vpush {d8-d15}
michael@0 1454 vld1.32 {d11[0]}, [DUMMY]
michael@0 1455 vdup.8 d8, d11[0]
michael@0 1456 vdup.8 d9, d11[1]
michael@0 1457 vdup.8 d10, d11[2]
michael@0 1458 vdup.8 d11, d11[3]
michael@0 1459 .endm
michael@0 1460
michael@0 1461 .macro pixman_composite_over_n_8_8888_cleanup
michael@0 1462 vpop {d8-d15}
michael@0 1463 .endm
michael@0 1464
michael@0 1465 generate_composite_function \
michael@0 1466 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
michael@0 1467 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 1468 8, /* number of pixels, processed in a single block */ \
michael@0 1469 5, /* prefetch distance */ \
michael@0 1470 pixman_composite_over_n_8_8888_init, \
michael@0 1471 pixman_composite_over_n_8_8888_cleanup, \
michael@0 1472 pixman_composite_over_n_8_8888_process_pixblock_head, \
michael@0 1473 pixman_composite_over_n_8_8888_process_pixblock_tail, \
michael@0 1474 pixman_composite_over_n_8_8888_process_pixblock_tail_head
michael@0 1475
michael@0 1476 /******************************************************************************/
michael@0 1477
michael@0 1478 .macro pixman_composite_over_n_8_8_process_pixblock_head
michael@0 1479 vmull.u8 q0, d24, d8
michael@0 1480 vmull.u8 q1, d25, d8
michael@0 1481 vmull.u8 q6, d26, d8
michael@0 1482 vmull.u8 q7, d27, d8
michael@0 1483 vrshr.u16 q10, q0, #8
michael@0 1484 vrshr.u16 q11, q1, #8
michael@0 1485 vrshr.u16 q12, q6, #8
michael@0 1486 vrshr.u16 q13, q7, #8
michael@0 1487 vraddhn.u16 d0, q0, q10
michael@0 1488 vraddhn.u16 d1, q1, q11
michael@0 1489 vraddhn.u16 d2, q6, q12
michael@0 1490 vraddhn.u16 d3, q7, q13
michael@0 1491 vmvn.8 q12, q0
michael@0 1492 vmvn.8 q13, q1
michael@0 1493 vmull.u8 q8, d24, d4
michael@0 1494 vmull.u8 q9, d25, d5
michael@0 1495 vmull.u8 q10, d26, d6
michael@0 1496 vmull.u8 q11, d27, d7
michael@0 1497 .endm
michael@0 1498
michael@0 1499 .macro pixman_composite_over_n_8_8_process_pixblock_tail
michael@0 1500 vrshr.u16 q14, q8, #8
michael@0 1501 vrshr.u16 q15, q9, #8
michael@0 1502 vrshr.u16 q12, q10, #8
michael@0 1503 vrshr.u16 q13, q11, #8
michael@0 1504 vraddhn.u16 d28, q14, q8
michael@0 1505 vraddhn.u16 d29, q15, q9
michael@0 1506 vraddhn.u16 d30, q12, q10
michael@0 1507 vraddhn.u16 d31, q13, q11
michael@0 1508 vqadd.u8 q14, q0, q14
michael@0 1509 vqadd.u8 q15, q1, q15
michael@0 1510 .endm
michael@0 1511
michael@0 1512 /* TODO: expand macros and do better instruction scheduling */
michael@0 1513 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
michael@0 1514 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1515 pixman_composite_over_n_8_8_process_pixblock_tail
michael@0 1516 fetch_mask_pixblock
michael@0 1517 cache_preload 32, 32
michael@0 1518 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1519 pixman_composite_over_n_8_8_process_pixblock_head
michael@0 1520 .endm
michael@0 1521
michael@0 1522 .macro pixman_composite_over_n_8_8_init
michael@0 1523 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1524 .vsave {d8-d15}
michael@0 1525 vpush {d8-d15}
michael@0 1526 vld1.32 {d8[0]}, [DUMMY]
michael@0 1527 vdup.8 d8, d8[3]
michael@0 1528 .endm
michael@0 1529
michael@0 1530 .macro pixman_composite_over_n_8_8_cleanup
michael@0 1531 vpop {d8-d15}
michael@0 1532 .endm
michael@0 1533
michael@0 1534 generate_composite_function \
michael@0 1535 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
michael@0 1536 FLAG_DST_READWRITE, \
michael@0 1537 32, /* number of pixels, processed in a single block */ \
michael@0 1538 5, /* prefetch distance */ \
michael@0 1539 pixman_composite_over_n_8_8_init, \
michael@0 1540 pixman_composite_over_n_8_8_cleanup, \
michael@0 1541 pixman_composite_over_n_8_8_process_pixblock_head, \
michael@0 1542 pixman_composite_over_n_8_8_process_pixblock_tail, \
michael@0 1543 pixman_composite_over_n_8_8_process_pixblock_tail_head
michael@0 1544
michael@0 1545 /******************************************************************************/
michael@0 1546
michael@0 1547 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
michael@0 1548 /*
michael@0 1549 * 'combine_mask_ca' replacement
michael@0 1550 *
michael@0 1551 * input: solid src (n) in {d8, d9, d10, d11}
michael@0 1552 * dest in {d4, d5, d6, d7 }
michael@0 1553 * mask in {d24, d25, d26, d27}
michael@0 1554 * output: updated src in {d0, d1, d2, d3 }
michael@0 1555 * updated mask in {d24, d25, d26, d3 }
michael@0 1556 */
michael@0 1557 vmull.u8 q0, d24, d8
michael@0 1558 vmull.u8 q1, d25, d9
michael@0 1559 vmull.u8 q6, d26, d10
michael@0 1560 vmull.u8 q7, d27, d11
michael@0 1561 vmull.u8 q9, d11, d25
michael@0 1562 vmull.u8 q12, d11, d24
michael@0 1563 vmull.u8 q13, d11, d26
michael@0 1564 vrshr.u16 q8, q0, #8
michael@0 1565 vrshr.u16 q10, q1, #8
michael@0 1566 vrshr.u16 q11, q6, #8
michael@0 1567 vraddhn.u16 d0, q0, q8
michael@0 1568 vraddhn.u16 d1, q1, q10
michael@0 1569 vraddhn.u16 d2, q6, q11
michael@0 1570 vrshr.u16 q11, q12, #8
michael@0 1571 vrshr.u16 q8, q9, #8
michael@0 1572 vrshr.u16 q6, q13, #8
michael@0 1573 vrshr.u16 q10, q7, #8
michael@0 1574 vraddhn.u16 d24, q12, q11
michael@0 1575 vraddhn.u16 d25, q9, q8
michael@0 1576 vraddhn.u16 d26, q13, q6
michael@0 1577 vraddhn.u16 d3, q7, q10
michael@0 1578 /*
michael@0 1579 * 'combine_over_ca' replacement
michael@0 1580 *
michael@0 1581 * output: updated dest in {d28, d29, d30, d31}
michael@0 1582 */
michael@0 1583 vmvn.8 q12, q12
michael@0 1584 vmvn.8 d26, d26
michael@0 1585 vmull.u8 q8, d24, d4
michael@0 1586 vmull.u8 q9, d25, d5
michael@0 1587 vmvn.8 d27, d3
michael@0 1588 vmull.u8 q10, d26, d6
michael@0 1589 vmull.u8 q11, d27, d7
michael@0 1590 .endm
michael@0 1591
michael@0 1592 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
michael@0 1593 /* ... continue 'combine_over_ca' replacement */
michael@0 1594 vrshr.u16 q14, q8, #8
michael@0 1595 vrshr.u16 q15, q9, #8
michael@0 1596 vrshr.u16 q6, q10, #8
michael@0 1597 vrshr.u16 q7, q11, #8
michael@0 1598 vraddhn.u16 d28, q14, q8
michael@0 1599 vraddhn.u16 d29, q15, q9
michael@0 1600 vraddhn.u16 d30, q6, q10
michael@0 1601 vraddhn.u16 d31, q7, q11
michael@0 1602 vqadd.u8 q14, q0, q14
michael@0 1603 vqadd.u8 q15, q1, q15
michael@0 1604 .endm
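
/*
 * In the component alpha variant above, each mask channel scales the
 * matching source channel, and the effective per-channel alpha is
 * mask * src_alpha. Scalar sketch for one channel c (div_255/satadd_u8 as
 * in the earlier notes):
 *
 *     s_c = div_255 (src_c * m_c);                        /- combine_mask_ca -/
 *     m_c = div_255 (m_c * src_alpha);
 *     d_c = satadd_u8 (s_c, div_255 (d_c * (255 - m_c))); /- combine_over_ca -/
 */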
michael@0 1605
michael@0 1606 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
michael@0 1607 vrshr.u16 q14, q8, #8
michael@0 1608 vrshr.u16 q15, q9, #8
michael@0 1609 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1610 vrshr.u16 q6, q10, #8
michael@0 1611 vrshr.u16 q7, q11, #8
michael@0 1612 vraddhn.u16 d28, q14, q8
michael@0 1613 vraddhn.u16 d29, q15, q9
michael@0 1614 vraddhn.u16 d30, q6, q10
michael@0 1615 vraddhn.u16 d31, q7, q11
michael@0 1616 fetch_mask_pixblock
michael@0 1617 vqadd.u8 q14, q0, q14
michael@0 1618 vqadd.u8 q15, q1, q15
michael@0 1619 cache_preload 8, 8
michael@0 1620 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
michael@0 1621 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1622 .endm
michael@0 1623
michael@0 1624 .macro pixman_composite_over_n_8888_8888_ca_init
michael@0 1625 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1626 .vsave {d8-d15}
michael@0 1627 vpush {d8-d15}
michael@0 1628 vld1.32 {d11[0]}, [DUMMY]
michael@0 1629 vdup.8 d8, d11[0]
michael@0 1630 vdup.8 d9, d11[1]
michael@0 1631 vdup.8 d10, d11[2]
michael@0 1632 vdup.8 d11, d11[3]
michael@0 1633 .endm
michael@0 1634
michael@0 1635 .macro pixman_composite_over_n_8888_8888_ca_cleanup
michael@0 1636 vpop {d8-d15}
michael@0 1637 .endm
michael@0 1638
michael@0 1639 generate_composite_function \
michael@0 1640 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
michael@0 1641 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 1642 8, /* number of pixels, processed in a single block */ \
michael@0 1643 5, /* prefetch distance */ \
michael@0 1644 pixman_composite_over_n_8888_8888_ca_init, \
michael@0 1645 pixman_composite_over_n_8888_8888_ca_cleanup, \
michael@0 1646 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
michael@0 1647 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
michael@0 1648 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
michael@0 1649
michael@0 1650 /******************************************************************************/
michael@0 1651
michael@0 1652 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
michael@0 1653 /*
michael@0 1654 * 'combine_mask_ca' replacement
michael@0 1655 *
michael@0 1656 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
michael@0 1657 * mask in {d24, d25, d26} [B, G, R]
michael@0 1658 * output: updated src in {d0, d1, d2 } [B, G, R]
michael@0 1659 * updated mask in {d24, d25, d26} [B, G, R]
michael@0 1660 */
michael@0 1661 vmull.u8 q0, d24, d8
michael@0 1662 vmull.u8 q1, d25, d9
michael@0 1663 vmull.u8 q6, d26, d10
michael@0 1664 vmull.u8 q9, d11, d25
michael@0 1665 vmull.u8 q12, d11, d24
michael@0 1666 vmull.u8 q13, d11, d26
michael@0 1667 vrshr.u16 q8, q0, #8
michael@0 1668 vrshr.u16 q10, q1, #8
michael@0 1669 vrshr.u16 q11, q6, #8
michael@0 1670 vraddhn.u16 d0, q0, q8
michael@0 1671 vraddhn.u16 d1, q1, q10
michael@0 1672 vraddhn.u16 d2, q6, q11
michael@0 1673 vrshr.u16 q11, q12, #8
michael@0 1674 vrshr.u16 q8, q9, #8
michael@0 1675 vrshr.u16 q6, q13, #8
michael@0 1676 vraddhn.u16 d24, q12, q11
michael@0 1677 vraddhn.u16 d25, q9, q8
michael@0 1678 /*
michael@0 1679 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
michael@0 1680 * and put data into d16 - blue, d17 - green, d18 - red
michael@0 1681 */
michael@0 1682 vshrn.u16 d17, q2, #3
michael@0 1683 vshrn.u16 d18, q2, #8
michael@0 1684 vraddhn.u16 d26, q13, q6
michael@0 1685 vsli.u16 q2, q2, #5
michael@0 1686 vsri.u8 d18, d18, #5
michael@0 1687 vsri.u8 d17, d17, #6
michael@0 1688 /*
michael@0 1689 * 'combine_over_ca' replacement
michael@0 1690 *
michael@0 1691 * output: updated dest in d16 - blue, d17 - green, d18 - red
michael@0 1692 */
michael@0 1693 vmvn.8 q12, q12
michael@0 1694 vshrn.u16 d16, q2, #2
michael@0 1695 vmvn.8 d26, d26
michael@0 1696 vmull.u8 q6, d16, d24
michael@0 1697 vmull.u8 q7, d17, d25
michael@0 1698 vmull.u8 q11, d18, d26
michael@0 1699 .endm
michael@0 1700
michael@0 1701 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
michael@0 1702 /* ... continue 'combine_over_ca' replacement */
michael@0 1703 vrshr.u16 q10, q6, #8
michael@0 1704 vrshr.u16 q14, q7, #8
michael@0 1705 vrshr.u16 q15, q11, #8
michael@0 1706 vraddhn.u16 d16, q10, q6
michael@0 1707 vraddhn.u16 d17, q14, q7
michael@0 1708 vraddhn.u16 d18, q15, q11
michael@0 1709 vqadd.u8 q8, q0, q8
michael@0 1710 vqadd.u8 d18, d2, d18
michael@0 1711 /*
michael@0 1712 * convert the results in d16, d17, d18 to r5g6b5 and store
michael@0 1713 * them into {d28, d29}
michael@0 1714 */
michael@0 1715 vshll.u8 q14, d18, #8
michael@0 1716 vshll.u8 q10, d17, #8
michael@0 1717 vshll.u8 q15, d16, #8
michael@0 1718 vsri.u16 q14, q10, #5
michael@0 1719 vsri.u16 q14, q15, #11
michael@0 1720 .endm
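
/*
 * The r5g6b5 <-> 8-bit conversions above rely on shift-with-insert
 * (vsli/vsri) so the low bits are refilled by replicating the high bits
 * rather than with zeroes. For one pixel p, the bit manipulation is roughly:
 *
 *     r8 = (p >> 8) & 0xf8;  r8 |= r8 >> 5;   /- red,   bits 15..11 -/
 *     g8 = (p >> 3) & 0xfc;  g8 |= g8 >> 6;   /- green, bits 10..5  -/
 *     b8 = (p << 3) & 0xf8;  b8 |= b8 >> 5;   /- blue,  bits 4..0   -/
 *     p  = ((r8 & 0xf8) << 8) | ((g8 & 0xfc) << 3) | (b8 >> 3);
 */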
michael@0 1721
michael@0 1722 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
michael@0 1723 fetch_mask_pixblock
michael@0 1724 vrshr.u16 q10, q6, #8
michael@0 1725 vrshr.u16 q14, q7, #8
michael@0 1726 vld1.16 {d4, d5}, [DST_R, :128]!
michael@0 1727 vrshr.u16 q15, q11, #8
michael@0 1728 vraddhn.u16 d16, q10, q6
michael@0 1729 vraddhn.u16 d17, q14, q7
michael@0 1730 vraddhn.u16 d22, q15, q11
michael@0 1731 /* process_pixblock_head */
michael@0 1732 /*
michael@0 1733 * 'combine_mask_ca' replacement
michael@0 1734 *
michael@0 1735 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
michael@0 1736 * mask in {d24, d25, d26} [B, G, R]
michael@0 1737 * output: updated src in {d0, d1, d2 } [B, G, R]
michael@0 1738 * updated mask in {d24, d25, d26} [B, G, R]
michael@0 1739 */
michael@0 1740 vmull.u8 q6, d26, d10
michael@0 1741 vqadd.u8 q8, q0, q8
michael@0 1742 vmull.u8 q0, d24, d8
michael@0 1743 vqadd.u8 d22, d2, d22
michael@0 1744 vmull.u8 q1, d25, d9
michael@0 1745 /*
michael@0 1746 * convert the result in d16, d17, d22 to r5g6b5 and store
michael@0 1747 * it into {d28, d29}
michael@0 1748 */
michael@0 1749 vshll.u8 q14, d22, #8
michael@0 1750 vshll.u8 q10, d17, #8
michael@0 1751 vshll.u8 q15, d16, #8
michael@0 1752 vmull.u8 q9, d11, d25
michael@0 1753 vsri.u16 q14, q10, #5
michael@0 1754 vmull.u8 q12, d11, d24
michael@0 1755 vmull.u8 q13, d11, d26
michael@0 1756 vsri.u16 q14, q15, #11
michael@0 1757 cache_preload 8, 8
michael@0 1758 vrshr.u16 q8, q0, #8
michael@0 1759 vrshr.u16 q10, q1, #8
michael@0 1760 vrshr.u16 q11, q6, #8
michael@0 1761 vraddhn.u16 d0, q0, q8
michael@0 1762 vraddhn.u16 d1, q1, q10
michael@0 1763 vraddhn.u16 d2, q6, q11
michael@0 1764 vrshr.u16 q11, q12, #8
michael@0 1765 vrshr.u16 q8, q9, #8
michael@0 1766 vrshr.u16 q6, q13, #8
michael@0 1767 vraddhn.u16 d24, q12, q11
michael@0 1768 vraddhn.u16 d25, q9, q8
michael@0 1769 /*
michael@0 1770 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
michael@0 1771 * 8-bit format and put data into d16 - blue, d17 - green,
michael@0 1772 * d18 - red
michael@0 1773 */
michael@0 1774 vshrn.u16 d17, q2, #3
michael@0 1775 vshrn.u16 d18, q2, #8
michael@0 1776 vraddhn.u16 d26, q13, q6
michael@0 1777 vsli.u16 q2, q2, #5
michael@0 1778 vsri.u8 d17, d17, #6
michael@0 1779 vsri.u8 d18, d18, #5
michael@0 1780 /*
michael@0 1781 * 'combine_over_ca' replacement
michael@0 1782 *
michael@0 1783 * output: updated dest in d16 - blue, d17 - green, d18 - red
michael@0 1784 */
michael@0 1785 vmvn.8 q12, q12
michael@0 1786 vshrn.u16 d16, q2, #2
michael@0 1787 vmvn.8 d26, d26
michael@0 1788 vmull.u8 q7, d17, d25
michael@0 1789 vmull.u8 q6, d16, d24
michael@0 1790 vmull.u8 q11, d18, d26
michael@0 1791 vst1.16 {d28, d29}, [DST_W, :128]!
michael@0 1792 .endm
michael@0 1793
michael@0 1794 .macro pixman_composite_over_n_8888_0565_ca_init
michael@0 1795 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1796 .vsave {d8-d15}
michael@0 1797 vpush {d8-d15}
michael@0 1798 vld1.32 {d11[0]}, [DUMMY]
michael@0 1799 vdup.8 d8, d11[0]
michael@0 1800 vdup.8 d9, d11[1]
michael@0 1801 vdup.8 d10, d11[2]
michael@0 1802 vdup.8 d11, d11[3]
michael@0 1803 .endm
michael@0 1804
michael@0 1805 .macro pixman_composite_over_n_8888_0565_ca_cleanup
michael@0 1806 vpop {d8-d15}
michael@0 1807 .endm
michael@0 1808
michael@0 1809 generate_composite_function \
michael@0 1810 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
michael@0 1811 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 1812 8, /* number of pixels, processed in a single block */ \
michael@0 1813 5, /* prefetch distance */ \
michael@0 1814 pixman_composite_over_n_8888_0565_ca_init, \
michael@0 1815 pixman_composite_over_n_8888_0565_ca_cleanup, \
michael@0 1816 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
michael@0 1817 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
michael@0 1818 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
michael@0 1819
michael@0 1820 /******************************************************************************/
michael@0 1821
michael@0 1822 .macro pixman_composite_in_n_8_process_pixblock_head
michael@0 1823 /* expecting source data in {d0, d1, d2, d3} */
michael@0 1824 /* and destination data in {d4, d5, d6, d7} */
michael@0 1825 vmull.u8 q8, d4, d3
michael@0 1826 vmull.u8 q9, d5, d3
michael@0 1827 vmull.u8 q10, d6, d3
michael@0 1828 vmull.u8 q11, d7, d3
michael@0 1829 .endm
michael@0 1830
michael@0 1831 .macro pixman_composite_in_n_8_process_pixblock_tail
michael@0 1832 vrshr.u16 q14, q8, #8
michael@0 1833 vrshr.u16 q15, q9, #8
michael@0 1834 vrshr.u16 q12, q10, #8
michael@0 1835 vrshr.u16 q13, q11, #8
michael@0 1836 vraddhn.u16 d28, q8, q14
michael@0 1837 vraddhn.u16 d29, q9, q15
michael@0 1838 vraddhn.u16 d30, q10, q12
michael@0 1839 vraddhn.u16 d31, q11, q13
michael@0 1840 .endm
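
/*
 * The IN operator with a solid source degenerates to scaling every a8
 * destination byte by the constant source alpha; 32 pixels go through per
 * block. Scalar sketch (div_255 as in the earlier notes):
 *
 *     dst = div_255 (dst * src_alpha);
 */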
michael@0 1841
michael@0 1842 .macro pixman_composite_in_n_8_process_pixblock_tail_head
michael@0 1843 pixman_composite_in_n_8_process_pixblock_tail
michael@0 1844 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1845 cache_preload 32, 32
michael@0 1846 pixman_composite_in_n_8_process_pixblock_head
michael@0 1847 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1848 .endm
michael@0 1849
michael@0 1850 .macro pixman_composite_in_n_8_init
michael@0 1851 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1852 vld1.32 {d3[0]}, [DUMMY]
michael@0 1853 vdup.8 d3, d3[3]
michael@0 1854 .endm
michael@0 1855
michael@0 1856 .macro pixman_composite_in_n_8_cleanup
michael@0 1857 .endm
michael@0 1858
michael@0 1859 generate_composite_function \
michael@0 1860 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
michael@0 1861 FLAG_DST_READWRITE, \
michael@0 1862 32, /* number of pixels, processed in a single block */ \
michael@0 1863 5, /* prefetch distance */ \
michael@0 1864 pixman_composite_in_n_8_init, \
michael@0 1865 pixman_composite_in_n_8_cleanup, \
michael@0 1866 pixman_composite_in_n_8_process_pixblock_head, \
michael@0 1867 pixman_composite_in_n_8_process_pixblock_tail, \
michael@0 1868 pixman_composite_in_n_8_process_pixblock_tail_head, \
michael@0 1869 28, /* dst_w_basereg */ \
michael@0 1870 4, /* dst_r_basereg */ \
michael@0 1871 0, /* src_basereg */ \
michael@0 1872 24 /* mask_basereg */
michael@0 1873 
/******************************************************************************/

michael@0 1874 .macro pixman_composite_add_n_8_8_process_pixblock_head
michael@0 1875 /* expecting source data in {d8, d9, d10, d11} */
michael@0 1876 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
michael@0 1877 /* and destination data in {d4, d5, d6, d7} */
michael@0 1878 /* mask is in d24, d25, d26, d27 */
michael@0 1879 vmull.u8 q0, d24, d11
michael@0 1880 vmull.u8 q1, d25, d11
michael@0 1881 vmull.u8 q6, d26, d11
michael@0 1882 vmull.u8 q7, d27, d11
michael@0 1883 vrshr.u16 q10, q0, #8
michael@0 1884 vrshr.u16 q11, q1, #8
michael@0 1885 vrshr.u16 q12, q6, #8
michael@0 1886 vrshr.u16 q13, q7, #8
michael@0 1887 vraddhn.u16 d0, q0, q10
michael@0 1888 vraddhn.u16 d1, q1, q11
michael@0 1889 vraddhn.u16 d2, q6, q12
michael@0 1890 vraddhn.u16 d3, q7, q13
michael@0 1891 vqadd.u8 q14, q0, q2
michael@0 1892 vqadd.u8 q15, q1, q3
michael@0 1893 .endm
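
/*
 * add_n_8_8 first scales the a8 mask by the constant source alpha and then
 * adds it to the destination with saturation. Scalar sketch per pixel
 * (div_255/satadd_u8 as in the earlier notes):
 *
 *     dst = satadd_u8 (dst, div_255 (mask * src_alpha));
 */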
michael@0 1894
michael@0 1895 .macro pixman_composite_add_n_8_8_process_pixblock_tail
michael@0 1896 .endm
michael@0 1897
michael@0 1898 /* TODO: expand macros and do better instruction scheduling */
michael@0 1899 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
michael@0 1900 pixman_composite_add_n_8_8_process_pixblock_tail
michael@0 1901 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1902 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1903 fetch_mask_pixblock
michael@0 1904 cache_preload 32, 32
michael@0 1905 pixman_composite_add_n_8_8_process_pixblock_head
michael@0 1906 .endm
michael@0 1907
michael@0 1908 .macro pixman_composite_add_n_8_8_init
michael@0 1909 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 1910 .vsave {d8-d15}
michael@0 1911 vpush {d8-d15}
michael@0 1912 vld1.32 {d11[0]}, [DUMMY]
michael@0 1913 vdup.8 d11, d11[3]
michael@0 1914 .endm
michael@0 1915
michael@0 1916 .macro pixman_composite_add_n_8_8_cleanup
michael@0 1917 vpop {d8-d15}
michael@0 1918 .endm
michael@0 1919
michael@0 1920 generate_composite_function \
michael@0 1921 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
michael@0 1922 FLAG_DST_READWRITE, \
michael@0 1923 32, /* number of pixels, processed in a single block */ \
michael@0 1924 5, /* prefetch distance */ \
michael@0 1925 pixman_composite_add_n_8_8_init, \
michael@0 1926 pixman_composite_add_n_8_8_cleanup, \
michael@0 1927 pixman_composite_add_n_8_8_process_pixblock_head, \
michael@0 1928 pixman_composite_add_n_8_8_process_pixblock_tail, \
michael@0 1929 pixman_composite_add_n_8_8_process_pixblock_tail_head
michael@0 1930
michael@0 1931 /******************************************************************************/
michael@0 1932
michael@0 1933 .macro pixman_composite_add_8_8_8_process_pixblock_head
michael@0 1934 /* expecting source data in {d0, d1, d2, d3} */
michael@0 1935 /* destination data in {d4, d5, d6, d7} */
michael@0 1936 /* mask in {d24, d25, d26, d27} */
michael@0 1937 vmull.u8 q8, d24, d0
michael@0 1938 vmull.u8 q9, d25, d1
michael@0 1939 vmull.u8 q10, d26, d2
michael@0 1940 vmull.u8 q11, d27, d3
michael@0 1941 vrshr.u16 q0, q8, #8
michael@0 1942 vrshr.u16 q1, q9, #8
michael@0 1943 vrshr.u16 q12, q10, #8
michael@0 1944 vrshr.u16 q13, q11, #8
michael@0 1945 vraddhn.u16 d0, q0, q8
michael@0 1946 vraddhn.u16 d1, q1, q9
michael@0 1947 vraddhn.u16 d2, q12, q10
michael@0 1948 vraddhn.u16 d3, q13, q11
michael@0 1949 vqadd.u8 q14, q0, q2
michael@0 1950 vqadd.u8 q15, q1, q3
michael@0 1951 .endm
michael@0 1952
michael@0 1953 .macro pixman_composite_add_8_8_8_process_pixblock_tail
michael@0 1954 .endm
michael@0 1955
michael@0 1956 /* TODO: expand macros and do better instruction scheduling */
michael@0 1957 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
michael@0 1958 pixman_composite_add_8_8_8_process_pixblock_tail
michael@0 1959 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 1960 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 1961 fetch_mask_pixblock
michael@0 1962 fetch_src_pixblock
michael@0 1963 cache_preload 32, 32
michael@0 1964 pixman_composite_add_8_8_8_process_pixblock_head
michael@0 1965 .endm
michael@0 1966
michael@0 1967 .macro pixman_composite_add_8_8_8_init
michael@0 1968 .endm
michael@0 1969
michael@0 1970 .macro pixman_composite_add_8_8_8_cleanup
michael@0 1971 .endm
michael@0 1972
michael@0 1973 generate_composite_function \
michael@0 1974 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
michael@0 1975 FLAG_DST_READWRITE, \
michael@0 1976 32, /* number of pixels, processed in a single block */ \
michael@0 1977 5, /* prefetch distance */ \
michael@0 1978 pixman_composite_add_8_8_8_init, \
michael@0 1979 pixman_composite_add_8_8_8_cleanup, \
michael@0 1980 pixman_composite_add_8_8_8_process_pixblock_head, \
michael@0 1981 pixman_composite_add_8_8_8_process_pixblock_tail, \
michael@0 1982 pixman_composite_add_8_8_8_process_pixblock_tail_head
michael@0 1983
michael@0 1984 /******************************************************************************/
michael@0 1985
michael@0 1986 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
michael@0 1987 /* expecting source data in {d0, d1, d2, d3} */
michael@0 1988 /* destination data in {d4, d5, d6, d7} */
michael@0 1989 /* mask in {d24, d25, d26, d27} */
michael@0 1990 vmull.u8 q8, d27, d0
michael@0 1991 vmull.u8 q9, d27, d1
michael@0 1992 vmull.u8 q10, d27, d2
michael@0 1993 vmull.u8 q11, d27, d3
michael@0 1994 /* 1 cycle bubble */
michael@0 1995 vrsra.u16 q8, q8, #8
michael@0 1996 vrsra.u16 q9, q9, #8
michael@0 1997 vrsra.u16 q10, q10, #8
michael@0 1998 vrsra.u16 q11, q11, #8
michael@0 1999 .endm
michael@0 2000
michael@0 2001 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
michael@0 2002 /* 2 cycle bubble */
michael@0 2003 vrshrn.u16 d28, q8, #8
michael@0 2004 vrshrn.u16 d29, q9, #8
michael@0 2005 vrshrn.u16 d30, q10, #8
michael@0 2006 vrshrn.u16 d31, q11, #8
michael@0 2007 vqadd.u8 q14, q2, q14
michael@0 2008 /* 1 cycle bubble */
michael@0 2009 vqadd.u8 q15, q3, q15
michael@0 2010 .endm
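
/*
 * Note that only the alpha byte of the mask (d27) participates here: every
 * source channel is scaled by mask_alpha, then added to the destination
 * with per-byte saturation. Scalar sketch for one channel c:
 *
 *     dst_c = satadd_u8 (dst_c, div_255 (src_c * mask_alpha));
 */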
michael@0 2011
michael@0 2012 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
michael@0 2013 fetch_src_pixblock
michael@0 2014 vrshrn.u16 d28, q8, #8
michael@0 2015 fetch_mask_pixblock
michael@0 2016 vrshrn.u16 d29, q9, #8
michael@0 2017 vmull.u8 q8, d27, d0
michael@0 2018 vrshrn.u16 d30, q10, #8
michael@0 2019 vmull.u8 q9, d27, d1
michael@0 2020 vrshrn.u16 d31, q11, #8
michael@0 2021 vmull.u8 q10, d27, d2
michael@0 2022 vqadd.u8 q14, q2, q14
michael@0 2023 vmull.u8 q11, d27, d3
michael@0 2024 vqadd.u8 q15, q3, q15
michael@0 2025 vrsra.u16 q8, q8, #8
michael@0 2026 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2027 vrsra.u16 q9, q9, #8
michael@0 2028 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2029 vrsra.u16 q10, q10, #8
michael@0 2030
michael@0 2031 cache_preload 8, 8
michael@0 2032
michael@0 2033 vrsra.u16 q11, q11, #8
michael@0 2034 .endm
michael@0 2035
michael@0 2036 generate_composite_function \
michael@0 2037 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
michael@0 2038 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2039 8, /* number of pixels, processed in a single block */ \
michael@0 2040 10, /* prefetch distance */ \
michael@0 2041 default_init, \
michael@0 2042 default_cleanup, \
michael@0 2043 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
michael@0 2044 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
michael@0 2045 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
michael@0 2046
michael@0 2047 generate_composite_function_single_scanline \
michael@0 2048 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
michael@0 2049 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2050 8, /* number of pixels, processed in a single block */ \
michael@0 2051 default_init, \
michael@0 2052 default_cleanup, \
michael@0 2053 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
michael@0 2054 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
michael@0 2055 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
michael@0 2056
michael@0 2057 /******************************************************************************/
michael@0 2058
michael@0 2059 generate_composite_function \
michael@0 2060 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
michael@0 2061 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2062 8, /* number of pixels, processed in a single block */ \
michael@0 2063 5, /* prefetch distance */ \
michael@0 2064 default_init, \
michael@0 2065 default_cleanup, \
michael@0 2066 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
michael@0 2067 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
michael@0 2068 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2069 28, /* dst_w_basereg */ \
michael@0 2070 4, /* dst_r_basereg */ \
michael@0 2071 0, /* src_basereg */ \
michael@0 2072 27 /* mask_basereg */
michael@0 2073
michael@0 2074 /******************************************************************************/
michael@0 2075
michael@0 2076 .macro pixman_composite_add_n_8_8888_init
michael@0 2077 add DUMMY, sp, #ARGS_STACK_OFFSET
michael@0 2078 vld1.32 {d3[0]}, [DUMMY]
michael@0 2079 vdup.8 d0, d3[0]
michael@0 2080 vdup.8 d1, d3[1]
michael@0 2081 vdup.8 d2, d3[2]
michael@0 2082 vdup.8 d3, d3[3]
michael@0 2083 .endm
michael@0 2084
michael@0 2085 .macro pixman_composite_add_n_8_8888_cleanup
michael@0 2086 .endm
michael@0 2087
michael@0 2088 generate_composite_function \
michael@0 2089 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
michael@0 2090 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2091 8, /* number of pixels, processed in a single block */ \
michael@0 2092 5, /* prefetch distance */ \
michael@0 2093 pixman_composite_add_n_8_8888_init, \
michael@0 2094 pixman_composite_add_n_8_8888_cleanup, \
michael@0 2095 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
michael@0 2096 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
michael@0 2097 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2098 28, /* dst_w_basereg */ \
michael@0 2099 4, /* dst_r_basereg */ \
michael@0 2100 0, /* src_basereg */ \
michael@0 2101 27 /* mask_basereg */
michael@0 2102
michael@0 2103 /******************************************************************************/
michael@0 2104
michael@0 2105 .macro pixman_composite_add_8888_n_8888_init
michael@0 2106 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
michael@0 2107 vld1.32 {d27[0]}, [DUMMY]
michael@0 2108 vdup.8 d27, d27[3]
michael@0 2109 .endm
michael@0 2110
michael@0 2111 .macro pixman_composite_add_8888_n_8888_cleanup
michael@0 2112 .endm
michael@0 2113
michael@0 2114 generate_composite_function \
michael@0 2115 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
michael@0 2116 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2117 8, /* number of pixels, processed in a single block */ \
michael@0 2118 5, /* prefetch distance */ \
michael@0 2119 pixman_composite_add_8888_n_8888_init, \
michael@0 2120 pixman_composite_add_8888_n_8888_cleanup, \
michael@0 2121 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
michael@0 2122 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
michael@0 2123 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2124 28, /* dst_w_basereg */ \
michael@0 2125 4, /* dst_r_basereg */ \
michael@0 2126 0, /* src_basereg */ \
michael@0 2127 27 /* mask_basereg */
michael@0 2128
michael@0 2129 /******************************************************************************/
michael@0 2130
michael@0 2131 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
michael@0 2132 /* expecting source data in {d0, d1, d2, d3} */
michael@0 2133 /* destination data in {d4, d5, d6, d7} */
michael@0 2134 /* solid mask is in d15 */
michael@0 2135
michael@0 2136 /* 'in' */
michael@0 2137 vmull.u8 q8, d15, d3
michael@0 2138 vmull.u8 q6, d15, d2
michael@0 2139 vmull.u8 q5, d15, d1
michael@0 2140 vmull.u8 q4, d15, d0
michael@0 2141 vrshr.u16 q13, q8, #8
michael@0 2142 vrshr.u16 q12, q6, #8
michael@0 2143 vrshr.u16 q11, q5, #8
michael@0 2144 vrshr.u16 q10, q4, #8
michael@0 2145 vraddhn.u16 d3, q8, q13
michael@0 2146 vraddhn.u16 d2, q6, q12
michael@0 2147 vraddhn.u16 d1, q5, q11
michael@0 2148 vraddhn.u16 d0, q4, q10
michael@0 2149 vmvn.8 d24, d3 /* get inverted alpha */
michael@0 2150 /* now do alpha blending */
michael@0 2151 vmull.u8 q8, d24, d4
michael@0 2152 vmull.u8 q9, d24, d5
michael@0 2153 vmull.u8 q10, d24, d6
michael@0 2154 vmull.u8 q11, d24, d7
michael@0 2155 .endm
michael@0 2156
michael@0 2157 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
michael@0 2158 vrshr.u16 q14, q8, #8
michael@0 2159 vrshr.u16 q15, q9, #8
michael@0 2160 vrshr.u16 q12, q10, #8
michael@0 2161 vrshr.u16 q13, q11, #8
michael@0 2162 vraddhn.u16 d28, q14, q8
michael@0 2163 vraddhn.u16 d29, q15, q9
michael@0 2164 vraddhn.u16 d30, q12, q10
michael@0 2165 vraddhn.u16 d31, q13, q11
michael@0 2166 .endm
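
/*
 * OUT_REVERSE leaves the destination scaled by the inverted, masked source
 * alpha. Scalar sketch for one channel c:
 *
 *     dst_c = div_255 (dst_c * (255 - div_255 (src_alpha * mask_alpha)));
 *
 * The over_8888_n_8888 macros below reuse this head/tail pair and only add
 * the 'in'-ed source on top (vqadd.u8) to complete the OVER operator.
 */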
michael@0 2167
michael@0 2168 /* TODO: expand macros and do better instruction scheduling */
michael@0 2169 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
michael@0 2170 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2171 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
michael@0 2172 fetch_src_pixblock
michael@0 2173 cache_preload 8, 8
michael@0 2174 fetch_mask_pixblock
michael@0 2175 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
michael@0 2176 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2177 .endm
michael@0 2178
michael@0 2179 generate_composite_function_single_scanline \
michael@0 2180 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
michael@0 2181 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2182 8, /* number of pixels, processed in a single block */ \
michael@0 2183 default_init_need_all_regs, \
michael@0 2184 default_cleanup_need_all_regs, \
michael@0 2185 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
michael@0 2186 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
michael@0 2187 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2188 28, /* dst_w_basereg */ \
michael@0 2189 4, /* dst_r_basereg */ \
michael@0 2190 0, /* src_basereg */ \
michael@0 2191 12 /* mask_basereg */
michael@0 2192
michael@0 2193 /******************************************************************************/
michael@0 2194
michael@0 2195 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
michael@0 2196 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
michael@0 2197 .endm
michael@0 2198
michael@0 2199 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
michael@0 2200 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
michael@0 2201 vqadd.u8 q14, q0, q14
michael@0 2202 vqadd.u8 q15, q1, q15
michael@0 2203 .endm
michael@0 2204
michael@0 2205 /* TODO: expand macros and do better instruction scheduling */
michael@0 2206 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
michael@0 2207 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2208 pixman_composite_over_8888_n_8888_process_pixblock_tail
michael@0 2209 fetch_src_pixblock
michael@0 2210 cache_preload 8, 8
michael@0 2211 pixman_composite_over_8888_n_8888_process_pixblock_head
michael@0 2212 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2213 .endm
michael@0 2214
michael@0 2215 .macro pixman_composite_over_8888_n_8888_init
michael@0 2216 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
michael@0 2217 .vsave {d8-d15}
michael@0 2218 vpush {d8-d15}
michael@0 2219 vld1.32 {d15[0]}, [DUMMY]
michael@0 2220 vdup.8 d15, d15[3]
michael@0 2221 .endm
michael@0 2222
michael@0 2223 .macro pixman_composite_over_8888_n_8888_cleanup
michael@0 2224 vpop {d8-d15}
michael@0 2225 .endm
michael@0 2226
michael@0 2227 generate_composite_function \
michael@0 2228 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
michael@0 2229 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2230 8, /* number of pixels, processed in a single block */ \
michael@0 2231 5, /* prefetch distance */ \
michael@0 2232 pixman_composite_over_8888_n_8888_init, \
michael@0 2233 pixman_composite_over_8888_n_8888_cleanup, \
michael@0 2234 pixman_composite_over_8888_n_8888_process_pixblock_head, \
michael@0 2235 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
michael@0 2236 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
michael@0 2237
michael@0 2238 /******************************************************************************/
michael@0 2239
michael@0 2240 /* TODO: expand macros and do better instruction scheduling */
michael@0 2241 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
michael@0 2242 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2243 pixman_composite_over_8888_n_8888_process_pixblock_tail
michael@0 2244 fetch_src_pixblock
michael@0 2245 cache_preload 8, 8
michael@0 2246 fetch_mask_pixblock
michael@0 2247 pixman_composite_over_8888_n_8888_process_pixblock_head
michael@0 2248 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2249 .endm
michael@0 2250
michael@0 2251 generate_composite_function \
michael@0 2252 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
michael@0 2253 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2254 8, /* number of pixels, processed in a single block */ \
michael@0 2255 5, /* prefetch distance */ \
michael@0 2256 default_init_need_all_regs, \
michael@0 2257 default_cleanup_need_all_regs, \
michael@0 2258 pixman_composite_over_8888_n_8888_process_pixblock_head, \
michael@0 2259 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
michael@0 2260 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2261 28, /* dst_w_basereg */ \
michael@0 2262 4, /* dst_r_basereg */ \
michael@0 2263 0, /* src_basereg */ \
michael@0 2264 12 /* mask_basereg */
michael@0 2265
michael@0 2266 generate_composite_function_single_scanline \
michael@0 2267 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
michael@0 2268 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2269 8, /* number of pixels, processed in a single block */ \
michael@0 2270 default_init_need_all_regs, \
michael@0 2271 default_cleanup_need_all_regs, \
michael@0 2272 pixman_composite_over_8888_n_8888_process_pixblock_head, \
michael@0 2273 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
michael@0 2274 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
michael@0 2275 28, /* dst_w_basereg */ \
michael@0 2276 4, /* dst_r_basereg */ \
michael@0 2277 0, /* src_basereg */ \
michael@0 2278 12 /* mask_basereg */
michael@0 2279
michael@0 2280 /******************************************************************************/
michael@0 2281
michael@0 2282 /* TODO: expand macros and do better instruction scheduling */
michael@0 2283 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
michael@0 2284 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2285 pixman_composite_over_8888_n_8888_process_pixblock_tail
michael@0 2286 fetch_src_pixblock
michael@0 2287 cache_preload 8, 8
michael@0 2288 fetch_mask_pixblock
michael@0 2289 pixman_composite_over_8888_n_8888_process_pixblock_head
michael@0 2290 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2291 .endm
michael@0 2292
michael@0 2293 generate_composite_function \
michael@0 2294 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
michael@0 2295 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2296 8, /* number of pixels, processed in a single block */ \
michael@0 2297 5, /* prefetch distance */ \
michael@0 2298 default_init_need_all_regs, \
michael@0 2299 default_cleanup_need_all_regs, \
michael@0 2300 pixman_composite_over_8888_n_8888_process_pixblock_head, \
michael@0 2301 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
michael@0 2302 pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
michael@0 2303 28, /* dst_w_basereg */ \
michael@0 2304 4, /* dst_r_basereg */ \
michael@0 2305 0, /* src_basereg */ \
michael@0 2306 15 /* mask_basereg */
michael@0 2307
michael@0 2308 /******************************************************************************/
michael@0 2309
michael@0 2310 .macro pixman_composite_src_0888_0888_process_pixblock_head
michael@0 2311 .endm
michael@0 2312
michael@0 2313 .macro pixman_composite_src_0888_0888_process_pixblock_tail
michael@0 2314 .endm
michael@0 2315
michael@0 2316 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
michael@0 2317 vst3.8 {d0, d1, d2}, [DST_W]!
michael@0 2318 fetch_src_pixblock
michael@0 2319 cache_preload 8, 8
michael@0 2320 .endm
michael@0 2321
michael@0 2322 generate_composite_function \
michael@0 2323 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
michael@0 2324 FLAG_DST_WRITEONLY, \
michael@0 2325 8, /* number of pixels, processed in a single block */ \
michael@0 2326 10, /* prefetch distance */ \
michael@0 2327 default_init, \
michael@0 2328 default_cleanup, \
michael@0 2329 pixman_composite_src_0888_0888_process_pixblock_head, \
michael@0 2330 pixman_composite_src_0888_0888_process_pixblock_tail, \
michael@0 2331 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
michael@0 2332 0, /* dst_w_basereg */ \
michael@0 2333 0, /* dst_r_basereg */ \
michael@0 2334 0, /* src_basereg */ \
michael@0 2335 0 /* mask_basereg */
michael@0 2336
michael@0 2337 /******************************************************************************/
michael@0 2338
michael@0 2339 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
michael@0 2340 vswp d0, d2
michael@0 2341 .endm
michael@0 2342
michael@0 2343 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
michael@0 2344 .endm
michael@0 2345
michael@0 2346 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
michael@0 2347 vst4.8 {d0, d1, d2, d3}, [DST_W]!
michael@0 2348 fetch_src_pixblock
michael@0 2349 vswp d0, d2
michael@0 2350 cache_preload 8, 8
michael@0 2351 .endm
michael@0 2352
michael@0 2353 .macro pixman_composite_src_0888_8888_rev_init
michael@0 2354 veor d3, d3, d3
michael@0 2355 .endm
michael@0 2356
michael@0 2357 generate_composite_function \
michael@0 2358 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
michael@0 2359 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2360 8, /* number of pixels, processed in a single block */ \
michael@0 2361 10, /* prefetch distance */ \
michael@0 2362 pixman_composite_src_0888_8888_rev_init, \
michael@0 2363 default_cleanup, \
michael@0 2364 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
michael@0 2365 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
michael@0 2366 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
michael@0 2367 0, /* dst_w_basereg */ \
michael@0 2368 0, /* dst_r_basereg */ \
michael@0 2369 0, /* src_basereg */ \
michael@0 2370 0 /* mask_basereg */
michael@0 2371
michael@0 2372 /******************************************************************************/
michael@0 2373
michael@0 2374 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
michael@0 2375 vshll.u8 q8, d1, #8
michael@0 2376 vshll.u8 q9, d2, #8
michael@0 2377 .endm
michael@0 2378
michael@0 2379 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
michael@0 2380 vshll.u8 q14, d0, #8
michael@0 2381 vsri.u16 q14, q8, #5
michael@0 2382 vsri.u16 q14, q9, #11
michael@0 2383 .endm
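
/*
 * The vshll/vsri steps above pack the three 8-bit channels directly into
 * r5g6b5 positions; with the 'rev' channel order, d0/d1/d2 land in bits
 * 15..11, 10..5 and 4..0 respectively. Scalar sketch for one pixel:
 *
 *     p = ((c0 & 0xf8) << 8) | ((c1 & 0xfc) << 3) | (c2 >> 3);
 */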
michael@0 2384
michael@0 2385 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
michael@0 2386 vshll.u8 q14, d0, #8
michael@0 2387 fetch_src_pixblock
michael@0 2388 vsri.u16 q14, q8, #5
michael@0 2389 vsri.u16 q14, q9, #11
michael@0 2390 vshll.u8 q8, d1, #8
michael@0 2391 vst1.16 {d28, d29}, [DST_W, :128]!
michael@0 2392 vshll.u8 q9, d2, #8
michael@0 2393 .endm
michael@0 2394
michael@0 2395 generate_composite_function \
michael@0 2396 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
michael@0 2397 FLAG_DST_WRITEONLY, \
michael@0 2398 8, /* number of pixels, processed in a single block */ \
michael@0 2399 10, /* prefetch distance */ \
michael@0 2400 default_init, \
michael@0 2401 default_cleanup, \
michael@0 2402 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
michael@0 2403 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
michael@0 2404 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
michael@0 2405 28, /* dst_w_basereg */ \
michael@0 2406 0, /* dst_r_basereg */ \
michael@0 2407 0, /* src_basereg */ \
michael@0 2408 0 /* mask_basereg */
michael@0 2409
michael@0 2410 /******************************************************************************/
michael@0 2411
michael@0 2412 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
michael@0 2413 vmull.u8 q8, d3, d0
michael@0 2414 vmull.u8 q9, d3, d1
michael@0 2415 vmull.u8 q10, d3, d2
michael@0 2416 .endm
michael@0 2417
michael@0 2418 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
michael@0 2419 vrshr.u16 q11, q8, #8
michael@0 2420 vswp d3, d31
michael@0 2421 vrshr.u16 q12, q9, #8
michael@0 2422 vrshr.u16 q13, q10, #8
michael@0 2423 vraddhn.u16 d30, q11, q8
michael@0 2424 vraddhn.u16 d29, q12, q9
michael@0 2425 vraddhn.u16 d28, q13, q10
michael@0 2426 .endm
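
/*
 * src_pixbuf_8888 premultiplies the color channels by alpha and exchanges
 * channels 0 and 2 on the way out (the vswp d3, d31 keeps alpha in place
 * within the swapped register set). Per pixel, roughly:
 *
 *     out[0] = div_255 (in[2] * in[3]);
 *     out[1] = div_255 (in[1] * in[3]);
 *     out[2] = div_255 (in[0] * in[3]);
 *     out[3] = in[3];                    /- alpha -/
 *
 * The rpixbuf variant below is identical except that the channel order is
 * preserved.
 */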
michael@0 2427
michael@0 2428 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
michael@0 2429 vrshr.u16 q11, q8, #8
michael@0 2430 vswp d3, d31
michael@0 2431 vrshr.u16 q12, q9, #8
michael@0 2432 vrshr.u16 q13, q10, #8
michael@0 2433 fetch_src_pixblock
michael@0 2434 vraddhn.u16 d30, q11, q8
michael@0 2435 PF add PF_X, PF_X, #8
michael@0 2436 PF tst PF_CTL, #0xF
michael@0 2437 PF addne PF_X, PF_X, #8
michael@0 2438 PF subne PF_CTL, PF_CTL, #1
michael@0 2439 vraddhn.u16 d29, q12, q9
michael@0 2440 vraddhn.u16 d28, q13, q10
michael@0 2441 vmull.u8 q8, d3, d0
michael@0 2442 vmull.u8 q9, d3, d1
michael@0 2443 vmull.u8 q10, d3, d2
michael@0 2444 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2445 PF cmp PF_X, ORIG_W
michael@0 2446 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
michael@0 2447 PF subge PF_X, PF_X, ORIG_W
michael@0 2448 PF subges PF_CTL, PF_CTL, #0x10
michael@0 2449 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
michael@0 2450 .endm
michael@0 2451
michael@0 2452 generate_composite_function \
michael@0 2453 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
michael@0 2454 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2455 8, /* number of pixels, processed in a single block */ \
michael@0 2456 10, /* prefetch distance */ \
michael@0 2457 default_init, \
michael@0 2458 default_cleanup, \
michael@0 2459 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
michael@0 2460 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
michael@0 2461 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
michael@0 2462 28, /* dst_w_basereg */ \
michael@0 2463 0, /* dst_r_basereg */ \
michael@0 2464 0, /* src_basereg */ \
michael@0 2465 0 /* mask_basereg */
michael@0 2466
michael@0 2467 /******************************************************************************/
michael@0 2468
michael@0 2469 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
michael@0 2470 vmull.u8 q8, d3, d0
michael@0 2471 vmull.u8 q9, d3, d1
michael@0 2472 vmull.u8 q10, d3, d2
michael@0 2473 .endm
michael@0 2474
michael@0 2475 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
michael@0 2476 vrshr.u16 q11, q8, #8
michael@0 2477 vswp d3, d31
michael@0 2478 vrshr.u16 q12, q9, #8
michael@0 2479 vrshr.u16 q13, q10, #8
michael@0 2480 vraddhn.u16 d28, q11, q8
michael@0 2481 vraddhn.u16 d29, q12, q9
michael@0 2482 vraddhn.u16 d30, q13, q10
michael@0 2483 .endm
michael@0 2484
michael@0 2485 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
michael@0 2486 vrshr.u16 q11, q8, #8
michael@0 2487 vswp d3, d31
michael@0 2488 vrshr.u16 q12, q9, #8
michael@0 2489 vrshr.u16 q13, q10, #8
michael@0 2490 fetch_src_pixblock
michael@0 2491 vraddhn.u16 d28, q11, q8
michael@0 2492 PF add PF_X, PF_X, #8
michael@0 2493 PF tst PF_CTL, #0xF
michael@0 2494 PF addne PF_X, PF_X, #8
michael@0 2495 PF subne PF_CTL, PF_CTL, #1
michael@0 2496 vraddhn.u16 d29, q12, q9
michael@0 2497 vraddhn.u16 d30, q13, q10
michael@0 2498 vmull.u8 q8, d3, d0
michael@0 2499 vmull.u8 q9, d3, d1
michael@0 2500 vmull.u8 q10, d3, d2
michael@0 2501 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2502 PF cmp PF_X, ORIG_W
michael@0 2503 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
michael@0 2504 PF subge PF_X, PF_X, ORIG_W
michael@0 2505 PF subges PF_CTL, PF_CTL, #0x10
michael@0 2506 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
michael@0 2507 .endm
michael@0 2508
michael@0 2509 generate_composite_function \
michael@0 2510 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
michael@0 2511 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2512 8, /* number of pixels, processed in a single block */ \
michael@0 2513 10, /* prefetch distance */ \
michael@0 2514 default_init, \
michael@0 2515 default_cleanup, \
michael@0 2516 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
michael@0 2517 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
michael@0 2518 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
michael@0 2519 28, /* dst_w_basereg */ \
michael@0 2520 0, /* dst_r_basereg */ \
michael@0 2521 0, /* src_basereg */ \
michael@0 2522 0 /* mask_basereg */
michael@0 2523
michael@0 2524 /******************************************************************************/
michael@0 2525
michael@0 2526 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
michael@0 2527 /* mask is in d15 */
michael@0 2528 convert_0565_to_x888 q4, d2, d1, d0
michael@0 2529 convert_0565_to_x888 q5, d6, d5, d4
michael@0 2530 /* source pixel data is in {d0, d1, d2, XX} */
michael@0 2531 /* destination pixel data is in {d4, d5, d6, XX} */
michael@0 2532 vmvn.8 d7, d15
michael@0 2533 vmull.u8 q6, d15, d2
michael@0 2534 vmull.u8 q5, d15, d1
michael@0 2535 vmull.u8 q4, d15, d0
michael@0 2536 vmull.u8 q8, d7, d4
michael@0 2537 vmull.u8 q9, d7, d5
michael@0 2538 vmull.u8 q13, d7, d6
michael@0 2539 vrshr.u16 q12, q6, #8
michael@0 2540 vrshr.u16 q11, q5, #8
michael@0 2541 vrshr.u16 q10, q4, #8
michael@0 2542 vraddhn.u16 d2, q6, q12
michael@0 2543 vraddhn.u16 d1, q5, q11
michael@0 2544 vraddhn.u16 d0, q4, q10
michael@0 2545 .endm
michael@0 2546
michael@0 2547 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
michael@0 2548 vrshr.u16 q14, q8, #8
michael@0 2549 vrshr.u16 q15, q9, #8
michael@0 2550 vrshr.u16 q12, q13, #8
michael@0 2551 vraddhn.u16 d28, q14, q8
michael@0 2552 vraddhn.u16 d29, q15, q9
michael@0 2553 vraddhn.u16 d30, q12, q13
michael@0 2554 vqadd.u8 q0, q0, q14
michael@0 2555 vqadd.u8 q1, q1, q15
michael@0 2556 /* 32bpp result is in {d0, d1, d2, XX} */
michael@0 2557 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
michael@0 2558 .endm
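/*
 * Editorial note: taken together, the head and tail macros above compute
 * OVER of an opaque r5g6b5 source modulated by an a8 mask. A hedged C
 * sketch of one pixel, reusing mul_div255() from the earlier note
 * (expand/pack helpers are ours):
 *
 *   #include <stdint.h>
 *
 *   static inline uint8_t expand5(uint8_t x) { return (x << 3) | (x >> 2); }
 *   static inline uint8_t expand6(uint8_t x) { return (x << 2) | (x >> 4); }
 *   static inline uint8_t satadd(uint8_t a, uint8_t b) // vqadd.u8
 *   { uint16_t t = (uint16_t)a + b; return t > 255 ? 255 : (uint8_t)t; }
 *
 *   static uint16_t over_0565_8_0565(uint16_t s, uint8_t m, uint16_t d)
 *   {
 *       uint8_t c[3], sc[3] = { expand5(s >> 11), expand6((s >> 5) & 0x3f),
 *                               expand5(s & 0x1f) };
 *       uint8_t dc[3] = { expand5(d >> 11), expand6((d >> 5) & 0x3f),
 *                         expand5(d & 0x1f) };
 *       for (int i = 0; i < 3; i++)   // vmull/vrshr/vraddhn, then vqadd
 *           c[i] = satadd(mul_div255(sc[i], m), mul_div255(dc[i], 255 - m));
 *       return (uint16_t)(((c[0] & 0xf8) << 8) |  // convert_8888_to_0565
 *                         ((c[1] & 0xfc) << 3) | (c[2] >> 3));
 *   }
 *
 * convert_0565_to_x888 widens the 5/6-bit fields by replicating their high
 * bits, which is what expand5()/expand6() model here.
 */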
michael@0 2559
michael@0 2560 /* TODO: expand macros and do better instruction scheduling */
michael@0 2561 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
michael@0 2562 fetch_mask_pixblock
michael@0 2563 pixman_composite_over_0565_8_0565_process_pixblock_tail
michael@0 2564 fetch_src_pixblock
michael@0 2565 vld1.16 {d10, d11}, [DST_R, :128]!
michael@0 2566 cache_preload 8, 8
michael@0 2567 pixman_composite_over_0565_8_0565_process_pixblock_head
michael@0 2568 vst1.16 {d28, d29}, [DST_W, :128]!
michael@0 2569 .endm
michael@0 2570
michael@0 2571 generate_composite_function \
michael@0 2572 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
michael@0 2573 FLAG_DST_READWRITE, \
michael@0 2574 8, /* number of pixels processed in a single block */ \
michael@0 2575 5, /* prefetch distance */ \
michael@0 2576 default_init_need_all_regs, \
michael@0 2577 default_cleanup_need_all_regs, \
michael@0 2578 pixman_composite_over_0565_8_0565_process_pixblock_head, \
michael@0 2579 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
michael@0 2580 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
michael@0 2581 28, /* dst_w_basereg */ \
michael@0 2582 10, /* dst_r_basereg */ \
michael@0 2583 8, /* src_basereg */ \
michael@0 2584 15 /* mask_basereg */
michael@0 2585
michael@0 2586 /******************************************************************************/
michael@0 2587
michael@0 2588 .macro pixman_composite_over_0565_n_0565_init
michael@0 2589 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
michael@0 2590 .vsave {d8-d15}
michael@0 2591 vpush {d8-d15}
michael@0 2592 vld1.32 {d15[0]}, [DUMMY]
michael@0 2593 vdup.8 d15, d15[3]
michael@0 2594 .endm
michael@0 2595
michael@0 2596 .macro pixman_composite_over_0565_n_0565_cleanup
michael@0 2597 vpop {d8-d15}
michael@0 2598 .endm
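/*
 * Editorial note: the init macro reads the 32-bit solid mask color from
 * the stacked arguments and broadcasts its alpha byte (byte lane 3 of the
 * loaded word, i.e. the top byte of a little-endian a8r8g8b8 value)
 * across d15, so the masked 0565 pipeline above runs unchanged with a
 * constant mask. A hedged C sketch (function name is ours):
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void splat_solid_alpha(uint32_t solid_argb, uint8_t mask[8])
 *   {
 *       uint8_t a = (uint8_t)(solid_argb >> 24); // vdup.8 d15, d15[3]
 *       memset(mask, a, 8);                      // one mask byte per pixel
 *   }
 */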
michael@0 2599
michael@0 2600 generate_composite_function \
michael@0 2601 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
michael@0 2602 FLAG_DST_READWRITE, \
michael@0 2603 8, /* number of pixels processed in a single block */ \
michael@0 2604 5, /* prefetch distance */ \
michael@0 2605 pixman_composite_over_0565_n_0565_init, \
michael@0 2606 pixman_composite_over_0565_n_0565_cleanup, \
michael@0 2607 pixman_composite_over_0565_8_0565_process_pixblock_head, \
michael@0 2608 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
michael@0 2609 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
michael@0 2610 28, /* dst_w_basereg */ \
michael@0 2611 10, /* dst_r_basereg */ \
michael@0 2612 8, /* src_basereg */ \
michael@0 2613 15 /* mask_basereg */
michael@0 2614
michael@0 2615 /******************************************************************************/
michael@0 2616
michael@0 2617 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
michael@0 2618 /* mask is in d15 */
michael@0 2619 convert_0565_to_x888 q4, d2, d1, d0
michael@0 2620 convert_0565_to_x888 q5, d6, d5, d4
michael@0 2621 /* source pixel data is in {d0, d1, d2, XX} */
michael@0 2622 /* destination pixel data is in {d4, d5, d6, XX} */
michael@0 2623 vmull.u8 q6, d15, d2
michael@0 2624 vmull.u8 q5, d15, d1
michael@0 2625 vmull.u8 q4, d15, d0
michael@0 2626 vrshr.u16 q12, q6, #8
michael@0 2627 vrshr.u16 q11, q5, #8
michael@0 2628 vrshr.u16 q10, q4, #8
michael@0 2629 vraddhn.u16 d2, q6, q12
michael@0 2630 vraddhn.u16 d1, q5, q11
michael@0 2631 vraddhn.u16 d0, q4, q10
michael@0 2632 .endm
michael@0 2633
michael@0 2634 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
michael@0 2635 vqadd.u8 q0, q0, q2
michael@0 2636 vqadd.u8 q1, q1, q3
michael@0 2637 /* 32bpp result is in {d0, d1, d2, XX} */
michael@0 2638 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
michael@0 2639 .endm
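/*
 * Editorial note: for ADD the per-channel math is simpler than OVER: the
 * masked source is saturate-added to the unmodified destination. A hedged
 * C sketch per channel, with mul_div255() and satadd() as defined in the
 * earlier notes:
 *
 *   static inline uint8_t add_0565_8_0565_channel(uint8_t s, uint8_t m,
 *                                                 uint8_t d)
 *   {
 *       return satadd(mul_div255(s, m), d); // vqadd.u8 in the tail
 *   }
 */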
michael@0 2640
michael@0 2641 /* TODO: expand macros and do better instruction scheduling */
michael@0 2642 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
michael@0 2643 fetch_mask_pixblock
michael@0 2644 pixman_composite_add_0565_8_0565_process_pixblock_tail
michael@0 2645 fetch_src_pixblock
michael@0 2646 vld1.16 {d10, d11}, [DST_R, :128]!
michael@0 2647 cache_preload 8, 8
michael@0 2648 pixman_composite_add_0565_8_0565_process_pixblock_head
michael@0 2649 vst1.16 {d28, d29}, [DST_W, :128]!
michael@0 2650 .endm
michael@0 2651
michael@0 2652 generate_composite_function \
michael@0 2653 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
michael@0 2654 FLAG_DST_READWRITE, \
michael@0 2655 8, /* number of pixels processed in a single block */ \
michael@0 2656 5, /* prefetch distance */ \
michael@0 2657 default_init_need_all_regs, \
michael@0 2658 default_cleanup_need_all_regs, \
michael@0 2659 pixman_composite_add_0565_8_0565_process_pixblock_head, \
michael@0 2660 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
michael@0 2661 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
michael@0 2662 28, /* dst_w_basereg */ \
michael@0 2663 10, /* dst_r_basereg */ \
michael@0 2664 8, /* src_basereg */ \
michael@0 2665 15 /* mask_basereg */
michael@0 2666
michael@0 2667 /******************************************************************************/
michael@0 2668
michael@0 2669 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
michael@0 2670 /* mask is in d15 */
michael@0 2671 convert_0565_to_x888 q5, d6, d5, d4
michael@0 2672 /* destination pixel data is in {d4, d5, d6, XX} */
michael@0 2673 vmvn.8 d24, d15 /* get inverted alpha */
michael@0 2674 /* now do alpha blending */
michael@0 2675 vmull.u8 q8, d24, d4
michael@0 2676 vmull.u8 q9, d24, d5
michael@0 2677 vmull.u8 q10, d24, d6
michael@0 2678 .endm
michael@0 2679
michael@0 2680 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
michael@0 2681 vrshr.u16 q14, q8, #8
michael@0 2682 vrshr.u16 q15, q9, #8
michael@0 2683 vrshr.u16 q12, q10, #8
michael@0 2684 vraddhn.u16 d0, q14, q8
michael@0 2685 vraddhn.u16 d1, q15, q9
michael@0 2686 vraddhn.u16 d2, q12, q10
michael@0 2687 /* 32bpp result is in {d0, d1, d2, XX} */
michael@0 2688 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
michael@0 2689 .endm
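/*
 * Editorial note: OUT_REVERSE keeps the destination in proportion to the
 * inverted source alpha, i.e. per channel d' = d * (255 - a) / 255. A
 * hedged C sketch, with mul_div255() as defined earlier:
 *
 *   static inline uint8_t out_reverse_channel(uint8_t a, uint8_t d)
 *   {
 *       return mul_div255(d, (uint8_t)(255 - a)); // vmvn.8 + vmull.u8
 *   }
 */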
michael@0 2690
michael@0 2691 /* TODO: expand macros and do better instruction scheduling */
michael@0 2692 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
michael@0 2693 fetch_src_pixblock
michael@0 2694 pixman_composite_out_reverse_8_0565_process_pixblock_tail
michael@0 2695 vld1.16 {d10, d11}, [DST_R, :128]!
michael@0 2696 cache_preload 8, 8
michael@0 2697 pixman_composite_out_reverse_8_0565_process_pixblock_head
michael@0 2698 vst1.16 {d28, d29}, [DST_W, :128]!
michael@0 2699 .endm
michael@0 2700
michael@0 2701 generate_composite_function \
michael@0 2702 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
michael@0 2703 FLAG_DST_READWRITE, \
michael@0 2704 8, /* number of pixels processed in a single block */ \
michael@0 2705 5, /* prefetch distance */ \
michael@0 2706 default_init_need_all_regs, \
michael@0 2707 default_cleanup_need_all_regs, \
michael@0 2708 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
michael@0 2709 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
michael@0 2710 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
michael@0 2711 28, /* dst_w_basereg */ \
michael@0 2712 10, /* dst_r_basereg */ \
michael@0 2713 15, /* src_basereg */ \
michael@0 2714 0 /* mask_basereg */
michael@0 2715
michael@0 2716 /******************************************************************************/
michael@0 2717
michael@0 2718 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
michael@0 2719 /* src is in d0 */
michael@0 2720 /* destination pixel data is in {d4, d5, d6, d7} */
michael@0 2721 vmvn.8 d1, d0 /* get inverted alpha */
michael@0 2722 /* now do alpha blending */
michael@0 2723 vmull.u8 q8, d1, d4
michael@0 2724 vmull.u8 q9, d1, d5
michael@0 2725 vmull.u8 q10, d1, d6
michael@0 2726 vmull.u8 q11, d1, d7
michael@0 2727 .endm
michael@0 2728
michael@0 2729 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
michael@0 2730 vrshr.u16 q14, q8, #8
michael@0 2731 vrshr.u16 q15, q9, #8
michael@0 2732 vrshr.u16 q12, q10, #8
michael@0 2733 vrshr.u16 q13, q11, #8
michael@0 2734 vraddhn.u16 d28, q14, q8
michael@0 2735 vraddhn.u16 d29, q15, q9
michael@0 2736 vraddhn.u16 d30, q12, q10
michael@0 2737 vraddhn.u16 d31, q13, q11
michael@0 2738 /* 32bpp result is in {d28, d29, d30, d31} */
michael@0 2739 .endm
michael@0 2740
michael@0 2741 /* TODO: expand macros and do better instruction scheduling */
michael@0 2742 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
michael@0 2743 fetch_src_pixblock
michael@0 2744 pixman_composite_out_reverse_8_8888_process_pixblock_tail
michael@0 2745 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
michael@0 2746 cache_preload 8, 8
michael@0 2747 pixman_composite_out_reverse_8_8888_process_pixblock_head
michael@0 2748 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
michael@0 2749 .endm
michael@0 2750
michael@0 2751 generate_composite_function \
michael@0 2752 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
michael@0 2753 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2754 8, /* number of pixels processed in a single block */ \
michael@0 2755 5, /* prefetch distance */ \
michael@0 2756 default_init, \
michael@0 2757 default_cleanup, \
michael@0 2758 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
michael@0 2759 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
michael@0 2760 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
michael@0 2761 28, /* dst_w_basereg */ \
michael@0 2762 4, /* dst_r_basereg */ \
michael@0 2763 0, /* src_basereg */ \
michael@0 2764 0 /* mask_basereg */
michael@0 2765
michael@0 2766 /******************************************************************************/
michael@0 2767
michael@0 2768 generate_composite_function_nearest_scanline \
michael@0 2769 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
michael@0 2770 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2771 8, /* number of pixels processed in a single block */ \
michael@0 2772 default_init, \
michael@0 2773 default_cleanup, \
michael@0 2774 pixman_composite_over_8888_8888_process_pixblock_head, \
michael@0 2775 pixman_composite_over_8888_8888_process_pixblock_tail, \
michael@0 2776 pixman_composite_over_8888_8888_process_pixblock_tail_head
michael@0 2777
michael@0 2778 generate_composite_function_nearest_scanline \
michael@0 2779 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
michael@0 2780 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2781 8, /* number of pixels processed in a single block */ \
michael@0 2782 default_init, \
michael@0 2783 default_cleanup, \
michael@0 2784 pixman_composite_over_8888_0565_process_pixblock_head, \
michael@0 2785 pixman_composite_over_8888_0565_process_pixblock_tail, \
michael@0 2786 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
michael@0 2787 28, /* dst_w_basereg */ \
michael@0 2788 4, /* dst_r_basereg */ \
michael@0 2789 0, /* src_basereg */ \
michael@0 2790 24 /* mask_basereg */
michael@0 2791
michael@0 2792 generate_composite_function_nearest_scanline \
michael@0 2793 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
michael@0 2794 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2795 8, /* number of pixels processed in a single block */ \
michael@0 2796 default_init, \
michael@0 2797 default_cleanup, \
michael@0 2798 pixman_composite_src_8888_0565_process_pixblock_head, \
michael@0 2799 pixman_composite_src_8888_0565_process_pixblock_tail, \
michael@0 2800 pixman_composite_src_8888_0565_process_pixblock_tail_head
michael@0 2801
michael@0 2802 generate_composite_function_nearest_scanline \
michael@0 2803 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
michael@0 2804 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2805 8, /* number of pixels processed in a single block */ \
michael@0 2806 default_init, \
michael@0 2807 default_cleanup, \
michael@0 2808 pixman_composite_src_0565_8888_process_pixblock_head, \
michael@0 2809 pixman_composite_src_0565_8888_process_pixblock_tail, \
michael@0 2810 pixman_composite_src_0565_8888_process_pixblock_tail_head
michael@0 2811
michael@0 2812 generate_composite_function_nearest_scanline \
michael@0 2813 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
michael@0 2814 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
michael@0 2815 8, /* number of pixels processed in a single block */ \
michael@0 2816 default_init_need_all_regs, \
michael@0 2817 default_cleanup_need_all_regs, \
michael@0 2818 pixman_composite_over_8888_8_0565_process_pixblock_head, \
michael@0 2819 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
michael@0 2820 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
michael@0 2821 28, /* dst_w_basereg */ \
michael@0 2822 4, /* dst_r_basereg */ \
michael@0 2823 8, /* src_basereg */ \
michael@0 2824 24 /* mask_basereg */
michael@0 2825
michael@0 2826 generate_composite_function_nearest_scanline \
michael@0 2827 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
michael@0 2828 FLAG_DST_READWRITE, \
michael@0 2829 8, /* number of pixels processed in a single block */ \
michael@0 2830 default_init_need_all_regs, \
michael@0 2831 default_cleanup_need_all_regs, \
michael@0 2832 pixman_composite_over_0565_8_0565_process_pixblock_head, \
michael@0 2833 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
michael@0 2834 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
michael@0 2835 28, /* dst_w_basereg */ \
michael@0 2836 10, /* dst_r_basereg */ \
michael@0 2837 8, /* src_basereg */ \
michael@0 2838 15 /* mask_basereg */
michael@0 2839
michael@0 2840 /******************************************************************************/
michael@0 2841
michael@0 2842 /* Supplementary macro for setting function attributes */
michael@0 2843 .macro pixman_asm_function fname
michael@0 2844 .func fname
michael@0 2845 .global fname
michael@0 2846 #ifdef __ELF__
michael@0 2847 .hidden fname
michael@0 2848 .type fname, %function
michael@0 2849 #endif
michael@0 2850 fname:
michael@0 2851 .endm
michael@0 2852
michael@0 2853 /*
michael@0 2854 * Bilinear scaling support code, providing pixel fetching, color format
michael@0 2855 * conversion, and interpolation as separate macros that can be used as the
michael@0 2856 * basic building blocks for constructing bilinear scanline functions.
michael@0 2857 */
michael@0 2858
michael@0 2859 .macro bilinear_load_8888 reg1, reg2, tmp
michael@0 2860 mov TMP1, X, asr #16
michael@0 2861 add X, X, UX
michael@0 2862 add TMP1, TOP, TMP1, asl #2
michael@0 2863 vld1.32 {reg1}, [TMP1], STRIDE
michael@0 2864 vld1.32 {reg2}, [TMP1]
michael@0 2865 .endm
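/*
 * Editorial note: X is a 16.16 fixed-point source coordinate and UX the
 * per-output-pixel increment; each load fetches the 2x2 pixel
 * neighbourhood (two adjacent pixels from the top scanline, then the same
 * two from the bottom scanline one STRIDE away). A hedged C sketch
 * (names are ours):
 *
 *   #include <stdint.h>
 *
 *   static void bilinear_load_8888_c(const uint32_t *top,
 *                                    const uint32_t *bottom,
 *                                    uint32_t *x, uint32_t ux,
 *                                    uint32_t out[4])
 *   {
 *       uint32_t xi = *x >> 16;   // mov TMP1, X, asr #16
 *       *x += ux;                 // add X, X, UX
 *       out[0] = top[xi];         // vld1.32 {reg1}, [TMP1], STRIDE
 *       out[1] = top[xi + 1];
 *       out[2] = bottom[xi];      // vld1.32 {reg2}, [TMP1]
 *       out[3] = bottom[xi + 1];
 *   }
 */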
michael@0 2866
michael@0 2867 .macro bilinear_load_0565 reg1, reg2, tmp
michael@0 2868 mov TMP1, X, asr #16
michael@0 2869 add X, X, UX
michael@0 2870 add TMP1, TOP, TMP1, asl #1
michael@0 2871 vld1.32 {reg2[0]}, [TMP1], STRIDE
michael@0 2872 vld1.32 {reg2[1]}, [TMP1]
michael@0 2873 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
michael@0 2874 .endm
michael@0 2875
michael@0 2876 .macro bilinear_load_and_vertical_interpolate_two_8888 \
michael@0 2877 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
michael@0 2878
michael@0 2879 bilinear_load_8888 reg1, reg2, tmp1
michael@0 2880 vmull.u8 acc1, reg1, d28
michael@0 2881 vmlal.u8 acc1, reg2, d29
michael@0 2882 bilinear_load_8888 reg3, reg4, tmp2
michael@0 2883 vmull.u8 acc2, reg3, d28
michael@0 2884 vmlal.u8 acc2, reg4, d29
michael@0 2885 .endm
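/*
 * Editorial note: this is the vertical (first) interpolation pass only:
 * the top and bottom neighbours are blended with the 8-bit line weights
 * held in d28/d29, leaving a 16-bit intermediate per channel. In C terms,
 * assuming wt + wb == 1 << BILINEAR_INTERPOLATION_BITS:
 *
 *   static inline uint16_t vertical_blend(uint8_t top, uint8_t bottom,
 *                                         uint8_t wt, uint8_t wb)
 *   {
 *       return (uint16_t)(top * wt + bottom * wb); // vmull.u8 + vmlal.u8
 *   }
 */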
michael@0 2886
michael@0 2887 .macro bilinear_load_and_vertical_interpolate_four_8888 \
michael@0 2888 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
michael@0 2889 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
michael@0 2890
michael@0 2891 bilinear_load_and_vertical_interpolate_two_8888 \
michael@0 2892 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
michael@0 2893 bilinear_load_and_vertical_interpolate_two_8888 \
michael@0 2894 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
michael@0 2895 .endm
michael@0 2896
michael@0 2897 .macro bilinear_load_and_vertical_interpolate_two_0565 \
michael@0 2898 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
michael@0 2899
michael@0 2900 mov TMP1, X, asr #16
michael@0 2901 add X, X, UX
michael@0 2902 add TMP1, TOP, TMP1, asl #1
michael@0 2903 mov TMP2, X, asr #16
michael@0 2904 add X, X, UX
michael@0 2905 add TMP2, TOP, TMP2, asl #1
michael@0 2906 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
michael@0 2907 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
michael@0 2908 vld1.32 {acc2lo[1]}, [TMP1]
michael@0 2909 vld1.32 {acc2hi[1]}, [TMP2]
michael@0 2910 convert_0565_to_x888 acc2, reg3, reg2, reg1
michael@0 2911 vzip.u8 reg1, reg3
michael@0 2912 vzip.u8 reg2, reg4
michael@0 2913 vzip.u8 reg3, reg4
michael@0 2914 vzip.u8 reg1, reg2
michael@0 2915 vmull.u8 acc1, reg1, d28
michael@0 2916 vmlal.u8 acc1, reg2, d29
michael@0 2917 vmull.u8 acc2, reg3, d28
michael@0 2918 vmlal.u8 acc2, reg4, d29
michael@0 2919 .endm
michael@0 2920
michael@0 2921 .macro bilinear_load_and_vertical_interpolate_four_0565 \
michael@0 2922 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
michael@0 2923 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
michael@0 2924
michael@0 2925 mov TMP1, X, asr #16
michael@0 2926 add X, X, UX
michael@0 2927 add TMP1, TOP, TMP1, asl #1
michael@0 2928 mov TMP2, X, asr #16
michael@0 2929 add X, X, UX
michael@0 2930 add TMP2, TOP, TMP2, asl #1
michael@0 2931 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
michael@0 2932 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
michael@0 2933 vld1.32 {xacc2lo[1]}, [TMP1]
michael@0 2934 vld1.32 {xacc2hi[1]}, [TMP2]
michael@0 2935 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
michael@0 2936 mov TMP1, X, asr #16
michael@0 2937 add X, X, UX
michael@0 2938 add TMP1, TOP, TMP1, asl #1
michael@0 2939 mov TMP2, X, asr #16
michael@0 2940 add X, X, UX
michael@0 2941 add TMP2, TOP, TMP2, asl #1
michael@0 2942 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
michael@0 2943 vzip.u8 xreg1, xreg3
michael@0 2944 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
michael@0 2945 vzip.u8 xreg2, xreg4
michael@0 2946 vld1.32 {yacc2lo[1]}, [TMP1]
michael@0 2947 vzip.u8 xreg3, xreg4
michael@0 2948 vld1.32 {yacc2hi[1]}, [TMP2]
michael@0 2949 vzip.u8 xreg1, xreg2
michael@0 2950 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
michael@0 2951 vmull.u8 xacc1, xreg1, d28
michael@0 2952 vzip.u8 yreg1, yreg3
michael@0 2953 vmlal.u8 xacc1, xreg2, d29
michael@0 2954 vzip.u8 yreg2, yreg4
michael@0 2955 vmull.u8 xacc2, xreg3, d28
michael@0 2956 vzip.u8 yreg3, yreg4
michael@0 2957 vmlal.u8 xacc2, xreg4, d29
michael@0 2958 vzip.u8 yreg1, yreg2
michael@0 2959 vmull.u8 yacc1, yreg1, d28
michael@0 2960 vmlal.u8 yacc1, yreg2, d29
michael@0 2961 vmull.u8 yacc2, yreg3, d28
michael@0 2962 vmlal.u8 yacc2, yreg4, d29
michael@0 2963 .endm
michael@0 2964
michael@0 2965 .macro bilinear_store_8888 numpix, tmp1, tmp2
michael@0 2966 .if numpix == 4
michael@0 2967 vst1.32 {d0, d1}, [OUT, :128]!
michael@0 2968 .elseif numpix == 2
michael@0 2969 vst1.32 {d0}, [OUT, :64]!
michael@0 2970 .elseif numpix == 1
michael@0 2971 vst1.32 {d0[0]}, [OUT, :32]!
michael@0 2972 .else
michael@0 2973 .error bilinear_store_8888 numpix is unsupported
michael@0 2974 .endif
michael@0 2975 .endm
michael@0 2976
michael@0 2977 .macro bilinear_store_0565 numpix, tmp1, tmp2
michael@0 2978 vuzp.u8 d0, d1
michael@0 2979 vuzp.u8 d2, d3
michael@0 2980 vuzp.u8 d1, d3
michael@0 2981 vuzp.u8 d0, d2
michael@0 2982 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
michael@0 2983 .if numpix == 4
michael@0 2984 vst1.16 {d2}, [OUT, :64]!
michael@0 2985 .elseif numpix == 2
michael@0 2986 vst1.32 {d2[0]}, [OUT, :32]!
michael@0 2987 .elseif numpix == 1
michael@0 2988 vst1.16 {d2[0]}, [OUT, :16]!
michael@0 2989 .else
michael@0 2990 .error bilinear_store_0565 numpix is unsupported
michael@0 2991 .endif
michael@0 2992 .endm
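/*
 * Editorial note: the vuzp sequence separates the four interleaved 8888
 * pixels back into planar r/g/b/a, and convert_8888_to_0565 then packs by
 * truncation. For a single pixel the packing is, in C:
 *
 *   static inline uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
 *   {
 *       return (uint16_t)(((r & 0xf8) << 8) | // vshll.u8 #8 + vsri.u16 #11
 *                         ((g & 0xfc) << 3) | // vsri.u16 #5
 *                         (b >> 3));
 *   }
 */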
michael@0 2993
michael@0 2994 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
michael@0 2995 bilinear_load_&src_fmt d0, d1, d2
michael@0 2996 vmull.u8 q1, d0, d28
michael@0 2997 vmlal.u8 q1, d1, d29
michael@0 2998 /* 5-cycle bubble */
michael@0 2999 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
michael@0 3000 vmlsl.u16 q0, d2, d30
michael@0 3001 vmlal.u16 q0, d3, d30
michael@0 3002 /* 5-cycle bubble */
michael@0 3003 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3004 /* 3-cycle bubble */
michael@0 3005 vmovn.u16 d0, q0
michael@0 3006 /* 1-cycle bubble */
michael@0 3007 bilinear_store_&dst_fmt 1, q2, q3
michael@0 3008 .endm
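/*
 * Editorial note: both interpolation passes combined. The vertical pass
 * uses the 8-bit weights wt/wb (d28/d29) and the horizontal pass uses
 * distx (d30), the fractional x scaled to BILINEAR_INTERPOLATION_BITS
 * (7 in current pixman unless configured otherwise). Assuming wt + wb
 * and the horizontal weights each sum to 1 << BITS, the products carry a
 * 2 * BITS scale factor, hence the final shift. A hedged C sketch for one
 * channel (uint8_t etc. from <stdint.h>):
 *
 *   enum { BITS = 7 }; // BILINEAR_INTERPOLATION_BITS
 *
 *   static uint8_t bilinear_channel(uint8_t tl, uint8_t tr,
 *                                   uint8_t bl, uint8_t br,
 *                                   uint32_t wt, uint32_t wb,
 *                                   uint32_t distx)
 *   {
 *       uint32_t l = tl * wt + bl * wb;  // vmull.u8/vmlal.u8 (vertical)
 *       uint32_t r = tr * wt + br * wb;
 *       uint32_t v = l * ((1u << BITS) - distx) + r * distx;
 *                                        // vshll.u16 + vmlsl.u16 + vmlal.u16
 *       return (uint8_t)(v >> (2 * BITS)); // vshrn.u32, then vmovn.u16
 *   }
 */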
michael@0 3009
michael@0 3010 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
michael@0 3011 bilinear_load_and_vertical_interpolate_two_&src_fmt \
michael@0 3012 q1, q11, d0, d1, d20, d21, d22, d23
michael@0 3013 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
michael@0 3014 vmlsl.u16 q0, d2, d30
michael@0 3015 vmlal.u16 q0, d3, d30
michael@0 3016 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3017 vmlsl.u16 q10, d22, d31
michael@0 3018 vmlal.u16 q10, d23, d31
michael@0 3019 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3020 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3021 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3022 vadd.u16 q12, q12, q13
michael@0 3023 vmovn.u16 d0, q0
michael@0 3024 bilinear_store_&dst_fmt 2, q2, q3
michael@0 3025 .endm
michael@0 3026
michael@0 3027 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
michael@0 3028 bilinear_load_and_vertical_interpolate_four_&src_fmt \
michael@0 3029 q1, q11, d0, d1, d20, d21, d22, d23 \
michael@0 3030 q3, q9, d4, d5, d16, d17, d18, d19
michael@0 3031 pld [TMP1, PF_OFFS]
michael@0 3032 sub TMP1, TMP1, STRIDE
michael@0 3033 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
michael@0 3034 vmlsl.u16 q0, d2, d30
michael@0 3035 vmlal.u16 q0, d3, d30
michael@0 3036 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3037 vmlsl.u16 q10, d22, d31
michael@0 3038 vmlal.u16 q10, d23, d31
michael@0 3039 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3040 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
michael@0 3041 vmlsl.u16 q2, d6, d30
michael@0 3042 vmlal.u16 q2, d7, d30
michael@0 3043 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3044 pld [TMP2, PF_OFFS]
michael@0 3045 vmlsl.u16 q8, d18, d31
michael@0 3046 vmlal.u16 q8, d19, d31
michael@0 3047 vadd.u16 q12, q12, q13
michael@0 3048 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3049 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3050 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3051 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3052 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3053 vmovn.u16 d0, q0
michael@0 3054 vmovn.u16 d1, q2
michael@0 3055 vadd.u16 q12, q12, q13
michael@0 3056 bilinear_store_&dst_fmt 4, q2, q3
michael@0 3057 .endm
michael@0 3058
michael@0 3059 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
michael@0 3060 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
michael@0 3061 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
michael@0 3062 .else
michael@0 3063 bilinear_interpolate_four_pixels src_fmt, dst_fmt
michael@0 3064 .endif
michael@0 3065 .endm
michael@0 3066
michael@0 3067 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
michael@0 3068 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
michael@0 3069 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
michael@0 3070 .endif
michael@0 3071 .endm
michael@0 3072
michael@0 3073 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
michael@0 3074 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
michael@0 3075 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
michael@0 3076 .else
michael@0 3077 bilinear_interpolate_four_pixels src_fmt, dst_fmt
michael@0 3078 .endif
michael@0 3079 .endm
michael@0 3080
michael@0 3081 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
michael@0 3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
michael@0 3083 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
michael@0 3084 .else
michael@0 3085 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
michael@0 3086 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
michael@0 3087 .endif
michael@0 3088 .endm
michael@0 3089
michael@0 3090 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
michael@0 3091 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
michael@0 3092 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
michael@0 3093 .else
michael@0 3094 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
michael@0 3095 .endif
michael@0 3096 .endm
michael@0 3097
michael@0 3098 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
michael@0 3099 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
michael@0 3100 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
michael@0 3101 .else
michael@0 3102 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
michael@0 3103 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
michael@0 3104 .endif
michael@0 3105 .endm
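/*
 * Editorial note: the dispatch macros above let a format pair opt in to a
 * hand-scheduled implementation by defining
 * have_bilinear_interpolate_{four,eight}_pixels_<src>_<dst>; otherwise the
 * generic four-pixel code is expanded instead (twice per iteration in the
 * eight-pixel case). The same selection in C-preprocessor terms (all
 * macro names below are ours):
 *
 *   #include <stdio.h>
 *
 *   #define have_fast_8888_8888 1
 *
 *   #ifdef have_fast_8888_8888
 *   #  define four_pixels() puts("specialized 8888 -> 8888 block")
 *   #else
 *   #  define four_pixels() puts("generic block")
 *   #endif
 *
 *   int main(void) { four_pixels(); return 0; }
 */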
michael@0 3106
michael@0 3107 .set BILINEAR_FLAG_UNROLL_4, 0
michael@0 3108 .set BILINEAR_FLAG_UNROLL_8, 1
michael@0 3109 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
michael@0 3110
michael@0 3111 /*
michael@0 3112 * Main template macro for generating NEON optimized bilinear scanline
michael@0 3113 * functions.
michael@0 3114 *
michael@0 3115 * The bilinear scanline scaler macro template uses the following arguments:
michael@0 3116 * fname - name of the function to generate
michael@0 3117 * src_fmt - source color format (8888 or 0565)
michael@0 3118 * dst_fmt - destination color format (8888 or 0565)
michael@0 3119 * src_bpp_shift, dst_bpp_shift - (1 << bpp_shift) is the size of a source/destination pixel in bytes
michael@0 3120 * prefetch_distance - prefetch in the source image by that many pixels ahead
michael@0 3121 * flags - any combination of the BILINEAR_FLAG_* values defined above
michael@0 3122 */
michael@0 3123
michael@0 3124 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
michael@0 3125 src_bpp_shift, dst_bpp_shift, \
michael@0 3126 prefetch_distance, flags
michael@0 3127
michael@0 3128 pixman_asm_function fname
michael@0 3129 OUT .req r0
michael@0 3130 TOP .req r1
michael@0 3131 BOTTOM .req r2
michael@0 3132 WT .req r3
michael@0 3133 WB .req r4
michael@0 3134 X .req r5
michael@0 3135 UX .req r6
michael@0 3136 WIDTH .req ip
michael@0 3137 TMP1 .req r3
michael@0 3138 TMP2 .req r4
michael@0 3139 PF_OFFS .req r7
michael@0 3140 TMP3 .req r8
michael@0 3141 TMP4 .req r9
michael@0 3142 STRIDE .req r2
michael@0 3143
michael@0 3144 .fnstart
michael@0 3145 mov ip, sp
michael@0 3146 .save {r4, r5, r6, r7, r8, r9}
michael@0 3147 push {r4, r5, r6, r7, r8, r9}
michael@0 3148 mov PF_OFFS, #prefetch_distance
michael@0 3149 ldmia ip, {WB, X, UX, WIDTH}
michael@0 3150 mul PF_OFFS, PF_OFFS, UX
michael@0 3151
michael@0 3152 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
michael@0 3153 .vsave {d8-d15}
michael@0 3154 vpush {d8-d15}
michael@0 3155 .endif
michael@0 3156
michael@0 3157 sub STRIDE, BOTTOM, TOP
michael@0 3158 .unreq BOTTOM
michael@0 3159
michael@0 3160 cmp WIDTH, #0
michael@0 3161 ble 3f
michael@0 3162
michael@0 3163 vdup.u16 q12, X
michael@0 3164 vdup.u16 q13, UX
michael@0 3165 vdup.u8 d28, WT
michael@0 3166 vdup.u8 d29, WB
michael@0 3167 vadd.u16 d25, d25, d26
michael@0 3168
michael@0 3169 /* ensure good destination alignment */
michael@0 3170 cmp WIDTH, #1
michael@0 3171 blt 0f
michael@0 3172 tst OUT, #(1 << dst_bpp_shift)
michael@0 3173 beq 0f
michael@0 3174 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3175 vadd.u16 q12, q12, q13
michael@0 3176 bilinear_interpolate_last_pixel src_fmt, dst_fmt
michael@0 3177 sub WIDTH, WIDTH, #1
michael@0 3178 0:
michael@0 3179 vadd.u16 q13, q13, q13
michael@0 3180 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3181 vadd.u16 q12, q12, q13
michael@0 3182
michael@0 3183 cmp WIDTH, #2
michael@0 3184 blt 0f
michael@0 3185 tst OUT, #(1 << (dst_bpp_shift + 1))
michael@0 3186 beq 0f
michael@0 3187 bilinear_interpolate_two_pixels src_fmt, dst_fmt
michael@0 3188 sub WIDTH, WIDTH, #2
michael@0 3189 0:
michael@0 3190 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
michael@0 3191 /*********** 8 pixels per iteration *****************/
michael@0 3192 cmp WIDTH, #4
michael@0 3193 blt 0f
michael@0 3194 tst OUT, #(1 << (dst_bpp_shift + 2))
michael@0 3195 beq 0f
michael@0 3196 bilinear_interpolate_four_pixels src_fmt, dst_fmt
michael@0 3197 sub WIDTH, WIDTH, #4
michael@0 3198 0:
michael@0 3199 subs WIDTH, WIDTH, #8
michael@0 3200 blt 1f
michael@0 3201 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
michael@0 3202 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
michael@0 3203 subs WIDTH, WIDTH, #8
michael@0 3204 blt 5f
michael@0 3205 0:
michael@0 3206 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
michael@0 3207 subs WIDTH, WIDTH, #8
michael@0 3208 bge 0b
michael@0 3209 5:
michael@0 3210 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
michael@0 3211 1:
michael@0 3212 tst WIDTH, #4
michael@0 3213 beq 2f
michael@0 3214 bilinear_interpolate_four_pixels src_fmt, dst_fmt
michael@0 3215 2:
michael@0 3216 .else
michael@0 3217 /*********** 4 pixels per iteration *****************/
michael@0 3218 subs WIDTH, WIDTH, #4
michael@0 3219 blt 1f
michael@0 3220 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
michael@0 3221 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
michael@0 3222 subs WIDTH, WIDTH, #4
michael@0 3223 blt 5f
michael@0 3224 0:
michael@0 3225 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
michael@0 3226 subs WIDTH, WIDTH, #4
michael@0 3227 bge 0b
michael@0 3228 5:
michael@0 3229 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
michael@0 3230 1:
michael@0 3231 /****************************************************/
michael@0 3232 .endif
michael@0 3233 /* handle the remaining trailing pixels */
michael@0 3234 tst WIDTH, #2
michael@0 3235 beq 2f
michael@0 3236 bilinear_interpolate_two_pixels src_fmt, dst_fmt
michael@0 3237 2:
michael@0 3238 tst WIDTH, #1
michael@0 3239 beq 3f
michael@0 3240 bilinear_interpolate_last_pixel src_fmt, dst_fmt
michael@0 3241 3:
michael@0 3242 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
michael@0 3243 vpop {d8-d15}
michael@0 3244 .endif
michael@0 3245 pop {r4, r5, r6, r7, r8, r9}
michael@0 3246 bx lr
michael@0 3247 .fnend
michael@0 3248
michael@0 3249 .unreq OUT
michael@0 3250 .unreq TOP
michael@0 3251 .unreq WT
michael@0 3252 .unreq WB
michael@0 3253 .unreq X
michael@0 3254 .unreq UX
michael@0 3255 .unreq WIDTH
michael@0 3256 .unreq TMP1
michael@0 3257 .unreq TMP2
michael@0 3258 .unreq PF_OFFS
michael@0 3259 .unreq TMP3
michael@0 3260 .unreq TMP4
michael@0 3261 .unreq STRIDE
michael@0 3262 .endfunc
michael@0 3263
michael@0 3264 .endm
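/*
 * Editorial note: the control flow that the template expands to, reduced
 * to its shape. Leading pixels are peeled off one and two (and, with
 * BILINEAR_FLAG_UNROLL_8, four) at a time until OUT is block-aligned, the
 * main loop is software-pipelined (head, repeated tail_head, tail), and
 * the remainder is finished in decreasing powers of two. A runnable C
 * sketch under those assumptions (all names are ours):
 *
 *   #include <stdio.h>
 *
 *   static void one_pixel(void)   { puts("1"); }
 *   static void two_pixels(void)  { puts("2"); }
 *   static void four_pixels(void) { puts("4"); }
 *
 *   // 'align' stands in for the low bits of OUT, in destination pixels
 *   static void bilinear_scanline_shape(int width, unsigned align)
 *   {
 *       if (width >= 1 && (align & 1)) { one_pixel();  width--; align++; }
 *       if (width >= 2 && (align & 2)) { two_pixels(); width -= 2; }
 *       for (; width >= 4; width -= 4) // head ... tail_head ... tail
 *           four_pixels();
 *       if (width & 2) two_pixels();
 *       if (width & 1) one_pixel();
 *   }
 */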
michael@0 3265
michael@0 3266 /*****************************************************************************/
michael@0 3267
michael@0 3268 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
michael@0 3269
michael@0 3270 .macro bilinear_interpolate_four_pixels_8888_8888_head
michael@0 3271 mov TMP1, X, asr #16
michael@0 3272 add X, X, UX
michael@0 3273 add TMP1, TOP, TMP1, asl #2
michael@0 3274 mov TMP2, X, asr #16
michael@0 3275 add X, X, UX
michael@0 3276 add TMP2, TOP, TMP2, asl #2
michael@0 3277
michael@0 3278 vld1.32 {d22}, [TMP1], STRIDE
michael@0 3279 vld1.32 {d23}, [TMP1]
michael@0 3280 mov TMP3, X, asr #16
michael@0 3281 add X, X, UX
michael@0 3282 add TMP3, TOP, TMP3, asl #2
michael@0 3283 vmull.u8 q8, d22, d28
michael@0 3284 vmlal.u8 q8, d23, d29
michael@0 3285
michael@0 3286 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3287 vld1.32 {d23}, [TMP2]
michael@0 3288 mov TMP4, X, asr #16
michael@0 3289 add X, X, UX
michael@0 3290 add TMP4, TOP, TMP4, asl #2
michael@0 3291 vmull.u8 q9, d22, d28
michael@0 3292 vmlal.u8 q9, d23, d29
michael@0 3293
michael@0 3294 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3295 vld1.32 {d23}, [TMP3]
michael@0 3296 vmull.u8 q10, d22, d28
michael@0 3297 vmlal.u8 q10, d23, d29
michael@0 3298
michael@0 3299 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3300 vmlsl.u16 q0, d16, d30
michael@0 3301 vmlal.u16 q0, d17, d30
michael@0 3302
michael@0 3303 pld [TMP4, PF_OFFS]
michael@0 3304 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3305 vld1.32 {d17}, [TMP4]
michael@0 3306 pld [TMP4, PF_OFFS]
michael@0 3307 vmull.u8 q11, d16, d28
michael@0 3308 vmlal.u8 q11, d17, d29
michael@0 3309
michael@0 3310 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3311 vmlsl.u16 q1, d18, d31
michael@0 3312 .endm
michael@0 3313
michael@0 3314 .macro bilinear_interpolate_four_pixels_8888_8888_tail
michael@0 3315 vmlal.u16 q1, d19, d31
michael@0 3316 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3317 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3318 vmlsl.u16 q2, d20, d30
michael@0 3319 vmlal.u16 q2, d21, d30
michael@0 3320 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3321 vmlsl.u16 q3, d22, d31
michael@0 3322 vmlal.u16 q3, d23, d31
michael@0 3323 vadd.u16 q12, q12, q13
michael@0 3324 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3325 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3326 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3327 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3328 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3329 vmovn.u16 d6, q0
michael@0 3330 vmovn.u16 d7, q2
michael@0 3331 vadd.u16 q12, q12, q13
michael@0 3332 vst1.32 {d6, d7}, [OUT, :128]!
michael@0 3333 .endm
michael@0 3334
michael@0 3335 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
michael@0 3336 mov TMP1, X, asr #16
michael@0 3337 add X, X, UX
michael@0 3338 add TMP1, TOP, TMP1, asl #2
michael@0 3339 mov TMP2, X, asr #16
michael@0 3340 add X, X, UX
michael@0 3341 add TMP2, TOP, TMP2, asl #2
michael@0 3342 vmlal.u16 q1, d19, d31
michael@0 3343 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3344 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3345 vmlsl.u16 q2, d20, d30
michael@0 3346 vmlal.u16 q2, d21, d30
michael@0 3347 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3348 vld1.32 {d20}, [TMP1], STRIDE
michael@0 3349 vmlsl.u16 q3, d22, d31
michael@0 3350 vmlal.u16 q3, d23, d31
michael@0 3351 vld1.32 {d21}, [TMP1]
michael@0 3352 vmull.u8 q8, d20, d28
michael@0 3353 vmlal.u8 q8, d21, d29
michael@0 3354 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3355 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3356 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3357 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3358 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3359 vadd.u16 q12, q12, q13
michael@0 3360 vld1.32 {d23}, [TMP2]
michael@0 3361 vmull.u8 q9, d22, d28
michael@0 3362 mov TMP3, X, asr #16
michael@0 3363 add X, X, UX
michael@0 3364 add TMP3, TOP, TMP3, asl #2
michael@0 3365 mov TMP4, X, asr #16
michael@0 3366 add X, X, UX
michael@0 3367 add TMP4, TOP, TMP4, asl #2
michael@0 3368 vmlal.u8 q9, d23, d29
michael@0 3369 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3370 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3371 vld1.32 {d23}, [TMP3]
michael@0 3372 vmull.u8 q10, d22, d28
michael@0 3373 vmlal.u8 q10, d23, d29
michael@0 3374 vmovn.u16 d6, q0
michael@0 3375 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3376 vmovn.u16 d7, q2
michael@0 3377 vmlsl.u16 q0, d16, d30
michael@0 3378 vmlal.u16 q0, d17, d30
michael@0 3379 pld [TMP4, PF_OFFS]
michael@0 3380 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3381 vadd.u16 q12, q12, q13
michael@0 3382 vld1.32 {d17}, [TMP4]
michael@0 3383 pld [TMP4, PF_OFFS]
michael@0 3384 vmull.u8 q11, d16, d28
michael@0 3385 vmlal.u8 q11, d17, d29
michael@0 3386 vst1.32 {d6, d7}, [OUT, :128]!
michael@0 3387 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3388 vmlsl.u16 q1, d18, d31
michael@0 3389 .endm
michael@0 3390
michael@0 3391 /*****************************************************************************/
michael@0 3392
michael@0 3393 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
michael@0 3394
michael@0 3395 .macro bilinear_interpolate_eight_pixels_8888_0565_head
michael@0 3396 mov TMP1, X, asr #16
michael@0 3397 add X, X, UX
michael@0 3398 add TMP1, TOP, TMP1, asl #2
michael@0 3399 mov TMP2, X, asr #16
michael@0 3400 add X, X, UX
michael@0 3401 add TMP2, TOP, TMP2, asl #2
michael@0 3402 vld1.32 {d20}, [TMP1], STRIDE
michael@0 3403 vld1.32 {d21}, [TMP1]
michael@0 3404 vmull.u8 q8, d20, d28
michael@0 3405 vmlal.u8 q8, d21, d29
michael@0 3406 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3407 vld1.32 {d23}, [TMP2]
michael@0 3408 vmull.u8 q9, d22, d28
michael@0 3409 mov TMP3, X, asr #16
michael@0 3410 add X, X, UX
michael@0 3411 add TMP3, TOP, TMP3, asl #2
michael@0 3412 mov TMP4, X, asr #16
michael@0 3413 add X, X, UX
michael@0 3414 add TMP4, TOP, TMP4, asl #2
michael@0 3415 vmlal.u8 q9, d23, d29
michael@0 3416 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3417 vld1.32 {d23}, [TMP3]
michael@0 3418 vmull.u8 q10, d22, d28
michael@0 3419 vmlal.u8 q10, d23, d29
michael@0 3420 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3421 vmlsl.u16 q0, d16, d30
michael@0 3422 vmlal.u16 q0, d17, d30
michael@0 3423 pld [TMP4, PF_OFFS]
michael@0 3424 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3425 vld1.32 {d17}, [TMP4]
michael@0 3426 pld [TMP4, PF_OFFS]
michael@0 3427 vmull.u8 q11, d16, d28
michael@0 3428 vmlal.u8 q11, d17, d29
michael@0 3429 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3430 vmlsl.u16 q1, d18, d31
michael@0 3431
michael@0 3432 mov TMP1, X, asr #16
michael@0 3433 add X, X, UX
michael@0 3434 add TMP1, TOP, TMP1, asl #2
michael@0 3435 mov TMP2, X, asr #16
michael@0 3436 add X, X, UX
michael@0 3437 add TMP2, TOP, TMP2, asl #2
michael@0 3438 vmlal.u16 q1, d19, d31
michael@0 3439 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3440 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3441 vmlsl.u16 q2, d20, d30
michael@0 3442 vmlal.u16 q2, d21, d30
michael@0 3443 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3444 vld1.32 {d20}, [TMP1], STRIDE
michael@0 3445 vmlsl.u16 q3, d22, d31
michael@0 3446 vmlal.u16 q3, d23, d31
michael@0 3447 vld1.32 {d21}, [TMP1]
michael@0 3448 vmull.u8 q8, d20, d28
michael@0 3449 vmlal.u8 q8, d21, d29
michael@0 3450 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3451 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3452 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3453 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3454 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3455 vadd.u16 q12, q12, q13
michael@0 3456 vld1.32 {d23}, [TMP2]
michael@0 3457 vmull.u8 q9, d22, d28
michael@0 3458 mov TMP3, X, asr #16
michael@0 3459 add X, X, UX
michael@0 3460 add TMP3, TOP, TMP3, asl #2
michael@0 3461 mov TMP4, X, asr #16
michael@0 3462 add X, X, UX
michael@0 3463 add TMP4, TOP, TMP4, asl #2
michael@0 3464 vmlal.u8 q9, d23, d29
michael@0 3465 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3466 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3467 vld1.32 {d23}, [TMP3]
michael@0 3468 vmull.u8 q10, d22, d28
michael@0 3469 vmlal.u8 q10, d23, d29
michael@0 3470 vmovn.u16 d8, q0
michael@0 3471 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3472 vmovn.u16 d9, q2
michael@0 3473 vmlsl.u16 q0, d16, d30
michael@0 3474 vmlal.u16 q0, d17, d30
michael@0 3475 pld [TMP4, PF_OFFS]
michael@0 3476 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3477 vadd.u16 q12, q12, q13
michael@0 3478 vld1.32 {d17}, [TMP4]
michael@0 3479 pld [TMP4, PF_OFFS]
michael@0 3480 vmull.u8 q11, d16, d28
michael@0 3481 vmlal.u8 q11, d17, d29
michael@0 3482 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3483 vmlsl.u16 q1, d18, d31
michael@0 3484 .endm
michael@0 3485
michael@0 3486 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
michael@0 3487 vmlal.u16 q1, d19, d31
michael@0 3488 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3489 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3490 vmlsl.u16 q2, d20, d30
michael@0 3491 vmlal.u16 q2, d21, d30
michael@0 3492 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3493 vmlsl.u16 q3, d22, d31
michael@0 3494 vmlal.u16 q3, d23, d31
michael@0 3495 vadd.u16 q12, q12, q13
michael@0 3496 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3497 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3498 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3499 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3500 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3501 vmovn.u16 d10, q0
michael@0 3502 vmovn.u16 d11, q2
michael@0 3503 vadd.u16 q12, q12, q13
michael@0 3504
michael@0 3505 vuzp.u8 d8, d9
michael@0 3506 vuzp.u8 d10, d11
michael@0 3507 vuzp.u8 d9, d11
michael@0 3508 vuzp.u8 d8, d10
michael@0 3509 vshll.u8 q6, d9, #8
michael@0 3510 vshll.u8 q5, d10, #8
michael@0 3511 vshll.u8 q7, d8, #8
michael@0 3512 vsri.u16 q5, q6, #5
michael@0 3513 vsri.u16 q5, q7, #11
michael@0 3514 vst1.32 {d10, d11}, [OUT, :128]!
michael@0 3515 .endm
michael@0 3516
michael@0 3517 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
michael@0 3518 mov TMP1, X, asr #16
michael@0 3519 add X, X, UX
michael@0 3520 add TMP1, TOP, TMP1, asl #2
michael@0 3521 mov TMP2, X, asr #16
michael@0 3522 add X, X, UX
michael@0 3523 add TMP2, TOP, TMP2, asl #2
michael@0 3524 vmlal.u16 q1, d19, d31
michael@0 3525 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3526 vuzp.u8 d8, d9
michael@0 3527 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3528 vmlsl.u16 q2, d20, d30
michael@0 3529 vmlal.u16 q2, d21, d30
michael@0 3530 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3531 vld1.32 {d20}, [TMP1], STRIDE
michael@0 3532 vmlsl.u16 q3, d22, d31
michael@0 3533 vmlal.u16 q3, d23, d31
michael@0 3534 vld1.32 {d21}, [TMP1]
michael@0 3535 vmull.u8 q8, d20, d28
michael@0 3536 vmlal.u8 q8, d21, d29
michael@0 3537 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3538 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3539 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3540 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3541 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3542 vadd.u16 q12, q12, q13
michael@0 3543 vld1.32 {d23}, [TMP2]
michael@0 3544 vmull.u8 q9, d22, d28
michael@0 3545 mov TMP3, X, asr #16
michael@0 3546 add X, X, UX
michael@0 3547 add TMP3, TOP, TMP3, asl #2
michael@0 3548 mov TMP4, X, asr #16
michael@0 3549 add X, X, UX
michael@0 3550 add TMP4, TOP, TMP4, asl #2
michael@0 3551 vmlal.u8 q9, d23, d29
michael@0 3552 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3553 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3554 vld1.32 {d23}, [TMP3]
michael@0 3555 vmull.u8 q10, d22, d28
michael@0 3556 vmlal.u8 q10, d23, d29
michael@0 3557 vmovn.u16 d10, q0
michael@0 3558 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3559 vmovn.u16 d11, q2
michael@0 3560 vmlsl.u16 q0, d16, d30
michael@0 3561 vmlal.u16 q0, d17, d30
michael@0 3562 pld [TMP4, PF_OFFS]
michael@0 3563 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3564 vadd.u16 q12, q12, q13
michael@0 3565 vld1.32 {d17}, [TMP4]
michael@0 3566 pld [TMP4, PF_OFFS]
michael@0 3567 vmull.u8 q11, d16, d28
michael@0 3568 vmlal.u8 q11, d17, d29
michael@0 3569 vuzp.u8 d10, d11
michael@0 3570 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3571 vmlsl.u16 q1, d18, d31
michael@0 3572
michael@0 3573 mov TMP1, X, asr #16
michael@0 3574 add X, X, UX
michael@0 3575 add TMP1, TOP, TMP1, asl #2
michael@0 3576 mov TMP2, X, asr #16
michael@0 3577 add X, X, UX
michael@0 3578 add TMP2, TOP, TMP2, asl #2
michael@0 3579 vmlal.u16 q1, d19, d31
michael@0 3580 vuzp.u8 d9, d11
michael@0 3581 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3582 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
michael@0 3583 vuzp.u8 d8, d10
michael@0 3584 vmlsl.u16 q2, d20, d30
michael@0 3585 vmlal.u16 q2, d21, d30
michael@0 3586 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
michael@0 3587 vld1.32 {d20}, [TMP1], STRIDE
michael@0 3588 vmlsl.u16 q3, d22, d31
michael@0 3589 vmlal.u16 q3, d23, d31
michael@0 3590 vld1.32 {d21}, [TMP1]
michael@0 3591 vmull.u8 q8, d20, d28
michael@0 3592 vmlal.u8 q8, d21, d29
michael@0 3593 vshll.u8 q6, d9, #8
michael@0 3594 vshll.u8 q5, d10, #8
michael@0 3595 vshll.u8 q7, d8, #8
michael@0 3596 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3597 vsri.u16 q5, q6, #5
michael@0 3598 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3599 vsri.u16 q5, q7, #11
michael@0 3600 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3601 vld1.32 {d22}, [TMP2], STRIDE
michael@0 3602 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
michael@0 3603 vadd.u16 q12, q12, q13
michael@0 3604 vld1.32 {d23}, [TMP2]
michael@0 3605 vmull.u8 q9, d22, d28
michael@0 3606 mov TMP3, X, asr #16
michael@0 3607 add X, X, UX
michael@0 3608 add TMP3, TOP, TMP3, asl #2
michael@0 3609 mov TMP4, X, asr #16
michael@0 3610 add X, X, UX
michael@0 3611 add TMP4, TOP, TMP4, asl #2
michael@0 3612 vmlal.u8 q9, d23, d29
michael@0 3613 vld1.32 {d22}, [TMP3], STRIDE
michael@0 3614 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
michael@0 3615 vld1.32 {d23}, [TMP3]
michael@0 3616 vmull.u8 q10, d22, d28
michael@0 3617 vmlal.u8 q10, d23, d29
michael@0 3618 vmovn.u16 d8, q0
michael@0 3619 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
michael@0 3620 vmovn.u16 d9, q2
michael@0 3621 vmlsl.u16 q0, d16, d30
michael@0 3622 vmlal.u16 q0, d17, d30
michael@0 3623 pld [TMP4, PF_OFFS]
michael@0 3624 vld1.32 {d16}, [TMP4], STRIDE
michael@0 3625 vadd.u16 q12, q12, q13
michael@0 3626 vld1.32 {d17}, [TMP4]
michael@0 3627 pld [TMP4, PF_OFFS]
michael@0 3628 vmull.u8 q11, d16, d28
michael@0 3629 vmlal.u8 q11, d17, d29
michael@0 3630 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
michael@0 3631 vst1.32 {d10, d11}, [OUT, :128]!
michael@0 3632 vmlsl.u16 q1, d18, d31
michael@0 3633 .endm
michael@0 3634 /*****************************************************************************/
michael@0 3635
michael@0 3636 generate_bilinear_scanline_func \
michael@0 3637 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
michael@0 3638 2, 2, 28, BILINEAR_FLAG_UNROLL_4
michael@0 3639
michael@0 3640 generate_bilinear_scanline_func \
michael@0 3641 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
michael@0 3642 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
michael@0 3643
michael@0 3644 generate_bilinear_scanline_func \
michael@0 3645 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
michael@0 3646 1, 2, 28, BILINEAR_FLAG_UNROLL_4
michael@0 3647
michael@0 3648 generate_bilinear_scanline_func \
michael@0 3649 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
michael@0 3650 1, 1, 28, BILINEAR_FLAG_UNROLL_4
