Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which expose some new or interesting features) are extensively
 * commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 * - pixman_composite_over_8888_0565_asm_neon
 * - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * the performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set the default prefetch type. There is a choice between the following
 * options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP, to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores
 * where advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously, so that the extra
 * ARM instructions do not add (many) extra cycles, but improve prefetch
 * efficiency)
 *
 * Note: some types of functions can't support advanced prefetch and fall
 * back to the simple one (those which handle 24bpp pixels).
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs the OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including the quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need to
 * deal with the data in NEON registers.
 *
 * The recommended general NEON register allocation is the following:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data, and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5 - contain loaded destination pixels (they are needed)
 * d28, d29 - place for storing the result (destination pixels)
 */

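/*
 * Before diving into the NEON details, it may help to see a rough scalar
 * C sketch of what this operation does for a single pixel (the actual C
 * reference is fast_composite_over_8888_0565 in pixman-fast-path.c; the
 * function below is only an illustration, with a made-up name, and uses
 * exact rounding division by 255 where the NEON code uses the usual
 * pixman approximation):
 *
 * static uint16_t
 * over_8888_0565_pixel (uint32_t s, uint16_t d)
 * {
 *     uint32_t ia = 255 - (s >> 24);           // inverted source alpha
 *     // expand r5g6b5 destination to 8 bits per channel
 *     uint32_t dr = (d >> 8) & 0xf8;  dr |= dr >> 5;
 *     uint32_t dg = (d >> 3) & 0xfc;  dg |= dg >> 6;
 *     uint32_t db = (d << 3) & 0xf8;  db |= db >> 5;
 *     // OVER for premultiplied pixels: dst = src + (1 - src.a) * dst
 *     uint32_t r = ((s >> 16) & 0xff) + (ia * dr + 127) / 255;
 *     uint32_t g = ((s >>  8) & 0xff) + (ia * dg + 127) / 255;
 *     uint32_t b = ( s        & 0xff) + (ia * db + 127) / 255;
 *     // repack to r5g6b5
 *     return ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
 * }
 */
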
/*
 * Step one. We need to have some code to do some arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by doing R, G, B, A color component
 * deinterleaving for 32bpp pixel formats (and this feature is used in the
 * 'pixman_composite_over_8888_0565_asm_neon' function). It means that
 * instead of having 8 packed pixels in the {d0, d1, d2, d3} registers, we
 * actually use the d0 register for the blue channel (a vector of eight
 * 8-bit values), the d1 register for green, d2 for red and d3 for alpha.
 * This simple conversion can also be done with a few NEON instructions:
 *
 * Packed to planar conversion:
 *  vuzp.8 d0, d1
 *  vuzp.8 d2, d3
 *  vuzp.8 d1, d3
 *  vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *  vzip.8 d0, d2
 *  vzip.8 d1, d3
 *  vzip.8 d2, d3
 *  vzip.8 d0, d1
 *
 * But pixels can also be loaded directly in planar format using the VLD4.8
 * NEON instruction. It is 1 cycle slower than VLD1.32, so this is not
 * always desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

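/*
 * A note on the arithmetic above: each vmull.u8 + vrshr.u16 + vraddhn.u16
 * triple computes an 8-bit x * a / 255 with correct rounding, using the
 * standard pixman trick. In C it is equivalent to:
 *
 *     uint32_t t = x * a + 0x80;
 *     uint8_t  r = (t + (t >> 8)) >> 8;
 *
 * vmull.u8 produces the 16-bit products (t - 0x80), vrshr.u16 #8 produces
 * the rounded t >> 8 terms, and vraddhn.u16 adds the two and narrows with
 * rounding, which contributes the final 0x80 before the last shift.
 */
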
.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

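/*
 * The vshll.u8 / vsri.u16 sequence in the tail above is the planar 8-bit
 * to r5g6b5 repacking step. Per pixel it is equivalent to this C
 * expression:
 *
 *     uint16_t out = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
 *
 * vshll.u8 #8 moves each 8-bit channel into the top byte of a 16-bit
 * lane, and the two shift-right-and-insert operations then slide the
 * green and blue channels into their 6-bit and 5-bit fields while
 * preserving the bits already placed above them.
 */
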
/*
 * OK, now we have almost everything that we need. Using the above two
 * macros, the work can be done correctly. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *  head
 *  while (...) {
 *      tail
 *      head
 *  }
 *  tail
 *
 * It may look a bit weird, but this setup allows the instruction
 * latencies to be hidden better and also utilizes the dual-issue
 * capability more efficiently (making pairs of load-store and ALU
 * instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *  pixman_composite_over_8888_0565_process_pixblock_tail
 *  vst1.16 {d28, d29}, [DST_W, :128]!
 *  vld1.16 {d4, d5}, [DST_R, :128]!
 *  vld4.32 {d0, d1, d2, d3}, [SRC]!
 *  pixman_composite_over_8888_0565_process_pixblock_head
 *  cache_preload 8, 8
 *
 * Now it also contains some VLD/VST instructions. We simply can't move
 * from processing one block of pixels to the next one with just
 * arithmetic. The previously processed data needs to be written to memory
 * and new data needs to be fetched. Fortunately, this main loop does not
 * deal with partial leading/trailing pixels and can load/store a full
 * block of pixels in bulk. Additionally, the destination buffer is
 * already 16-byte aligned here (which is good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are aliases for the ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used for
 * prefetching data into the CPU L2 cache and improves performance when
 * dealing with images which are far larger than the cache. It takes one
 * argument (actually two, but they need to be the same here) - the number
 * of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can provide
 * some details about this macro. Moreover, if good performance is needed,
 * the code from this macro needs to be copied into the '*_tail_head'
 * macro and mixed with the rest of the code for optimal instruction
 * scheduling. We are actually doing that below.
 *
 * Now, after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from the '*_head', '*_tail' or
 * 'cache_preload' macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We are specifying the name of the
 * function which we want to get, the number of bits per pixel for the
 * source, mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both
 *                             read and written; for a write-only buffer we
 *                             would use the FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for the 32bpp
 *                             format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - the prefetch distance, measured in pixel blocks. In this case it is
 *    5 times 8 pixels. That would be 40 pixels, or up to 160 bytes. The
 *    optimal prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros: here these are 'default_init' and
 * 'default_cleanup', which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head',
 * which we implemented above.
 *
 * The last part is the NEON register allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

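/*
 * For reference, the generated function is called from the C side with a
 * signature along these lines (a sketch based on how pixman-arm-common.h
 * binds these asm functions, not a literal quote of that header):
 *
 *     void pixman_composite_over_8888_0565_asm_neon (int32_t   w,
 *                                                    int32_t   h,
 *                                                    uint16_t *dst,
 *                                                    int32_t   dst_stride,
 *                                                    uint32_t *src,
 *                                                    int32_t   src_stride);
 */
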
/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

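/*
 * The 'init' macro below runs once before any pixels are processed. It
 * fetches the solid a8r8g8b8 source color from the stack and splats its
 * channels across NEON registers, so that afterwards (on little-endian
 * ARM, sketching the register contents in C-like notation):
 *
 *     d0 = { b, b, b, b, b, b, b, b }          // blue
 *     d1 = { g, g, g, g, g, g, g, g }          // green
 *     d2 = { r, r, r, r, r, r, r, r }          // red
 *     d3 = { ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a }  // inverted alpha
 *
 * which is exactly the layout the head/tail macros above expect for the
 * source operand, so they can be reused unchanged for the solid case.
 */
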
.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

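/*
 * Converting r5g6b5 up to 8 bits per channel is done by shifting each
 * field to the top of a byte and replicating its high bits into the low
 * ones, so that 0x1f expands to 0xff rather than 0xf8. A scalar C
 * equivalent of the head macro below would be roughly:
 *
 *     uint32_t r8 = (d >> 8) & 0xf8;  r8 |= r8 >> 5;
 *     uint32_t g8 = (d >> 3) & 0xfc;  g8 |= g8 >> 6;
 *     uint32_t b8 = (d << 3) & 0xf8;  b8 |= b8 >> 5;
 *     uint32_t a8 = 0xff;
 */
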
.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

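/*
 * The ADD operator is just a per-channel saturating addition, in C:
 *
 *     dst = (src + dst > 255) ? 255 : src + dst;
 *
 * A single vqadd.u8 does exactly this for 16 bytes at a time (on a q
 * register), which is why the whole pixel block is handled by the two
 * instructions below and the tail macro is empty.
 */
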
.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

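/*
 * The OUT_REVERSE operator keeps the part of the destination which is not
 * covered by the source: dst = dst * (1 - src.alpha). Per channel in C,
 * with rounding division:
 *
 *     d_c = d_c * (255 - s_a) / 255;
 *
 * The head macro computes the 16-bit products and the tail performs the
 * rounding division by 255 (the same vrshr/vraddhn trick described
 * earlier in this file).
 */
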
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

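/*
 * OVER is just OUT_REVERSE followed by a saturating add of the source
 * (clamp_255 below is shorthand for saturation to 255):
 *
 *     d_c = clamp_255 (s_c + d_c * (255 - s_a) / 255);
 *
 * That is why the head macro below reuses the OUT_REVERSE head verbatim
 * and the tail only appends two vqadd.u8 instructions.
 */
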
.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

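/*
 * For a solid ('n') source, OVER gets cheaper: the source channels and
 * the inverted source alpha are loop invariants, so they are computed
 * once in the 'init' macro below (the inverted alpha lives in d24) and
 * the per-block work reduces to, per channel:
 *
 *     d_c = clamp_255 (s_c + d_c * inv_a / 255);   // inv_a = 255 - s_a
 */
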
.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
        vrshr.u16   q2, q10, #8
        vrshr.u16   q3, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q2, q10
        vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vqadd.u8    q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vqadd.u8    q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

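/*
 * This is the masked variant: the source is first multiplied by the
 * 8-bit mask (the IN operation, applied to all four channels including
 * alpha) and the result is then composited OVER an r5g6b5 destination.
 * Sketched per pixel in C:
 *
 *     s_c = s_c * m / 255;
 *     s_a = s_a * m / 255;
 *     d_c = s_c + d_c * (255 - s_a) / 255;
 *     // then repack d to r5g6b5 as usual
 *
 * Note the unusual base registers in the function generation below: the
 * source lives in d8-d11, which is why the '*_need_all_regs' init/cleanup
 * variants are used to save and restore the callee-saved d8-d15.
 */
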
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0, d24, d8     /* IN for SRC pixels (part1) */
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vshrn.u16   d6, q2, #8      /* convert DST_R data to 32-bpp (part1) */
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8      /* IN for SRC pixels (part2) */
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5      /* convert DST_R data to 32-bpp (part2) */
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vmull.u8    q8, d3, d6      /* now do alpha blending */
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    /* 1 cycle bubble */
    vqadd.u8    q9, q0, q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6, q2, #8
    fetch_mask_pixblock
    vshrn.u16   d7, q2, #3
    fetch_src_pixblock
    vmull.u8    q6, d24, d10
        vrshr.u16   q13, q8, #8
        vrshr.u16   q11, q9, #8
        vrshr.u16   q15, q10, #8
        vraddhn.u16 d16, q8, q13
        vraddhn.u16 d27, q9, q11
        vraddhn.u16 d26, q10, q15
        vqadd.u8    d16, d2, d16
    vmull.u8    q1, d24, d9
        vqadd.u8    q9, q0, q13
        vshll.u8    q14, d16, #8
    vmull.u8    q0, d24, d8
        vshll.u8    q8, d19, #8
        vshll.u8    q9, d18, #8
        vsri.u16    q14, q8, #5
    vmull.u8    q7, d24, d11
        vsri.u16    q14, q9, #11

    cache_preload 8, 8

    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8, d3, d6
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization for the solid source.
 * The solid source pixel data is fetched from the stack at the
 * ARGS_STACK_OFFSET offset, split into color components and replicated
 * into the d8-d11 registers. Additionally, this function needs all the
 * NEON registers, so it has to save the d8-d15 registers, which are
 * callee saved according to the ABI. These registers are restored in the
 * 'cleanup' macro. All the other NEON registers are caller saved, so they
 * can be clobbered freely without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    .vsave      {d8-d15}
    vpush       {d8-d15}
    vld1.32     {d24[0]}, [DUMMY]
    vdup.8      d24, d24[3]
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

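/*
 * The 'init' macro below uses a neat shift-left-and-insert trick to
 * replicate a single byte across a whole 64-bit register. Assuming the
 * 8-bit solid value arrives zero-extended in the low byte of the loaded
 * word (and ignoring the untouched upper half of d0, which the final
 * step overwrites), the register contents evolve like this:
 *
 *     vld1.32  {d0[0]}, [DUMMY]   ->  x 0 0 0 . . . .
 *     vsli.u64 d0, d0, #8         ->  x x 0 0 . . . .
 *     vsli.u64 d0, d0, #16        ->  x x x x . . . .
 *     vsli.u64 d0, d0, #32        ->  x x x x x x x x
 *
 * after which the value is copied into d1 and q1, so a full 32-byte
 * block can be stored per iteration. The 0565 and 8888 solid fills
 * further below use the same trick with fewer steps.
 */
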
.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */
michael@0 | 1154 | 0 /* mask_basereg */ |
michael@0 | 1155 | |
michael@0 | 1156 | /******************************************************************************/ |
michael@0 | 1157 | |
michael@0 | 1158 | .macro pixman_composite_src_8888_8888_process_pixblock_head |
michael@0 | 1159 | .endm |
michael@0 | 1160 | |
michael@0 | 1161 | .macro pixman_composite_src_8888_8888_process_pixblock_tail |
michael@0 | 1162 | .endm |
michael@0 | 1163 | |
michael@0 | 1164 | .macro pixman_composite_src_8888_8888_process_pixblock_tail_head |
michael@0 | 1165 | vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! |
michael@0 | 1166 | fetch_src_pixblock |
michael@0 | 1167 | cache_preload 8, 8 |
michael@0 | 1168 | .endm |
michael@0 | 1169 | |
michael@0 | 1170 | generate_composite_function \ |
michael@0 | 1171 | pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 1172 | FLAG_DST_WRITEONLY, \ |
michael@0 | 1173 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1174 | 10, /* prefetch distance */ \ |
michael@0 | 1175 | default_init, \ |
michael@0 | 1176 | default_cleanup, \ |
michael@0 | 1177 | pixman_composite_src_8888_8888_process_pixblock_head, \ |
michael@0 | 1178 | pixman_composite_src_8888_8888_process_pixblock_tail, \ |
michael@0 | 1179 | pixman_composite_src_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 1180 | 0, /* dst_w_basereg */ \ |
michael@0 | 1181 | 0, /* dst_r_basereg */ \ |
michael@0 | 1182 | 0, /* src_basereg */ \ |
michael@0 | 1183 | 0 /* mask_basereg */ |
michael@0 | 1184 | |
michael@0 | 1185 | /******************************************************************************/ |
michael@0 | 1186 | |
michael@0 | 1187 | .macro pixman_composite_src_x888_8888_process_pixblock_head |
michael@0 | 1188 | vorr q0, q0, q2 |
michael@0 | 1189 | vorr q1, q1, q2 |
michael@0 | 1190 | .endm |
michael@0 | 1191 | |
michael@0 | 1192 | .macro pixman_composite_src_x888_8888_process_pixblock_tail |
michael@0 | 1193 | .endm |
michael@0 | 1194 | |
michael@0 | 1195 | .macro pixman_composite_src_x888_8888_process_pixblock_tail_head |
michael@0 | 1196 | vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! |
michael@0 | 1197 | fetch_src_pixblock |
michael@0 | 1198 | vorr q0, q0, q2 |
michael@0 | 1199 | vorr q1, q1, q2 |
michael@0 | 1200 | cache_preload 8, 8 |
michael@0 | 1201 | .endm |
michael@0 | 1202 | |
michael@0 | 1203 | .macro pixman_composite_src_x888_8888_init |
michael@0 | 1204 | vmov.u8 q2, #0xFF |
michael@0 | 1205 | vshl.u32 q2, q2, #24 |
michael@0 | 1206 | .endm |
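/*
 * x888 -> 8888 only has to force the alpha byte on: init preloads q2
 * with 0xff000000 in every pixel, and the head ORs it into each fetched
 * block.
 */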
michael@0 | 1207 | |
michael@0 | 1208 | generate_composite_function \ |
michael@0 | 1209 | pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 1210 | FLAG_DST_WRITEONLY, \ |
michael@0 | 1211 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1212 | 10, /* prefetch distance */ \ |
michael@0 | 1213 | pixman_composite_src_x888_8888_init, \ |
michael@0 | 1214 | default_cleanup, \ |
michael@0 | 1215 | pixman_composite_src_x888_8888_process_pixblock_head, \ |
michael@0 | 1216 | pixman_composite_src_x888_8888_process_pixblock_tail, \ |
michael@0 | 1217 | pixman_composite_src_x888_8888_process_pixblock_tail_head, \ |
michael@0 | 1218 | 0, /* dst_w_basereg */ \ |
michael@0 | 1219 | 0, /* dst_r_basereg */ \ |
michael@0 | 1220 | 0, /* src_basereg */ \ |
michael@0 | 1221 | 0 /* mask_basereg */ |
michael@0 | 1222 | |
michael@0 | 1223 | /******************************************************************************/ |
michael@0 | 1224 | |
michael@0 | 1225 | .macro pixman_composite_src_n_8_8888_process_pixblock_head |
michael@0 | 1226 | /* expecting solid source in {d0, d1, d2, d3} */ |
michael@0 | 1227 | /* mask is in d24 (d25, d26, d27 are unused) */ |
michael@0 | 1228 | |
michael@0 | 1229 | /* in */ |
michael@0 | 1230 | vmull.u8 q8, d24, d0 |
michael@0 | 1231 | vmull.u8 q9, d24, d1 |
michael@0 | 1232 | vmull.u8 q10, d24, d2 |
michael@0 | 1233 | vmull.u8 q11, d24, d3 |
michael@0 | 1234 | vrsra.u16 q8, q8, #8 |
michael@0 | 1235 | vrsra.u16 q9, q9, #8 |
michael@0 | 1236 | vrsra.u16 q10, q10, #8 |
michael@0 | 1237 | vrsra.u16 q11, q11, #8 |
michael@0 | 1238 | .endm |
michael@0 | 1239 | |
michael@0 | 1240 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail |
michael@0 | 1241 | vrshrn.u16 d28, q8, #8 |
michael@0 | 1242 | vrshrn.u16 d29, q9, #8 |
michael@0 | 1243 | vrshrn.u16 d30, q10, #8 |
michael@0 | 1244 | vrshrn.u16 d31, q11, #8 |
michael@0 | 1245 | .endm |
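/*
 * The vmull/vrsra/vrshrn triple above is the bit-exact rounding division
 * by 255 used throughout this file.  A C model of one channel (the
 * function name is illustrative only):
 *
 *     static inline uint8_t mul_div_255 (uint8_t m, uint8_t s)
 *     {
 *         uint16_t t = (uint16_t)m * s;   // vmull.u8
 *         t += (t + 128) >> 8;            // vrsra.u16 ..., #8
 *         return (t + 128) >> 8;          // vrshrn.u16 ..., #8
 *     }
 *
 * i.e. (m*s + 128 + ((m*s + 128) >> 8)) >> 8, which equals
 * round(m*s / 255) for all 8-bit inputs.
 */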
michael@0 | 1246 | |
michael@0 | 1247 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head |
michael@0 | 1248 | fetch_mask_pixblock |
michael@0 | 1249 | PF add PF_X, PF_X, #8 |
michael@0 | 1250 | vrshrn.u16 d28, q8, #8 |
michael@0 | 1251 | PF tst PF_CTL, #0x0F |
michael@0 | 1252 | vrshrn.u16 d29, q9, #8 |
michael@0 | 1253 | PF addne PF_X, PF_X, #8 |
michael@0 | 1254 | vrshrn.u16 d30, q10, #8 |
michael@0 | 1255 | PF subne PF_CTL, PF_CTL, #1 |
michael@0 | 1256 | vrshrn.u16 d31, q11, #8 |
michael@0 | 1257 | PF cmp PF_X, ORIG_W |
michael@0 | 1258 | vmull.u8 q8, d24, d0 |
michael@0 | 1259 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] |
michael@0 | 1260 | vmull.u8 q9, d24, d1 |
michael@0 | 1261 | PF subge PF_X, PF_X, ORIG_W |
michael@0 | 1262 | vmull.u8 q10, d24, d2 |
michael@0 | 1263 | PF subges PF_CTL, PF_CTL, #0x10 |
michael@0 | 1264 | vmull.u8 q11, d24, d3 |
michael@0 | 1265 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! |
michael@0 | 1266 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1267 | vrsra.u16 q8, q8, #8 |
michael@0 | 1268 | vrsra.u16 q9, q9, #8 |
michael@0 | 1269 | vrsra.u16 q10, q10, #8 |
michael@0 | 1270 | vrsra.u16 q11, q11, #8 |
michael@0 | 1271 | .endm |
michael@0 | 1272 | |
michael@0 | 1273 | .macro pixman_composite_src_n_8_8888_init |
michael@0 | 1274 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1275 | vld1.32 {d3[0]}, [DUMMY] |
michael@0 | 1276 | vdup.8 d0, d3[0] |
michael@0 | 1277 | vdup.8 d1, d3[1] |
michael@0 | 1278 | vdup.8 d2, d3[2] |
michael@0 | 1279 | vdup.8 d3, d3[3] |
michael@0 | 1280 | .endm |
michael@0 | 1281 | |
michael@0 | 1282 | .macro pixman_composite_src_n_8_8888_cleanup |
michael@0 | 1283 | .endm |
michael@0 | 1284 | |
michael@0 | 1285 | generate_composite_function \ |
michael@0 | 1286 | pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ |
michael@0 | 1287 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 1288 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1289 | 5, /* prefetch distance */ \ |
michael@0 | 1290 | pixman_composite_src_n_8_8888_init, \ |
michael@0 | 1291 | pixman_composite_src_n_8_8888_cleanup, \ |
michael@0 | 1292 | pixman_composite_src_n_8_8888_process_pixblock_head, \ |
michael@0 | 1293 | pixman_composite_src_n_8_8888_process_pixblock_tail, \ |
michael@0 | 1294 | pixman_composite_src_n_8_8888_process_pixblock_tail_head |
michael@0 | 1295 | |
michael@0 | 1296 | /******************************************************************************/ |
michael@0 | 1297 | |
michael@0 | 1298 | .macro pixman_composite_src_n_8_8_process_pixblock_head |
michael@0 | 1299 | vmull.u8 q0, d24, d16 |
michael@0 | 1300 | vmull.u8 q1, d25, d16 |
michael@0 | 1301 | vmull.u8 q2, d26, d16 |
michael@0 | 1302 | vmull.u8 q3, d27, d16 |
michael@0 | 1303 | vrsra.u16 q0, q0, #8 |
michael@0 | 1304 | vrsra.u16 q1, q1, #8 |
michael@0 | 1305 | vrsra.u16 q2, q2, #8 |
michael@0 | 1306 | vrsra.u16 q3, q3, #8 |
michael@0 | 1307 | .endm |
michael@0 | 1308 | |
michael@0 | 1309 | .macro pixman_composite_src_n_8_8_process_pixblock_tail |
michael@0 | 1310 | vrshrn.u16 d28, q0, #8 |
michael@0 | 1311 | vrshrn.u16 d29, q1, #8 |
michael@0 | 1312 | vrshrn.u16 d30, q2, #8 |
michael@0 | 1313 | vrshrn.u16 d31, q3, #8 |
michael@0 | 1314 | .endm |
michael@0 | 1315 | |
michael@0 | 1316 | .macro pixman_composite_src_n_8_8_process_pixblock_tail_head |
michael@0 | 1317 | fetch_mask_pixblock |
michael@0 | 1318 | PF add PF_X, PF_X, #8 |
michael@0 | 1319 | vrshrn.u16 d28, q0, #8 |
michael@0 | 1320 | PF tst PF_CTL, #0x0F |
michael@0 | 1321 | vrshrn.u16 d29, q1, #8 |
michael@0 | 1322 | PF addne PF_X, PF_X, #8 |
michael@0 | 1323 | vrshrn.u16 d30, q2, #8 |
michael@0 | 1324 | PF subne PF_CTL, PF_CTL, #1 |
michael@0 | 1325 | vrshrn.u16 d31, q3, #8 |
michael@0 | 1326 | PF cmp PF_X, ORIG_W |
michael@0 | 1327 | vmull.u8 q0, d24, d16 |
michael@0 | 1328 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] |
michael@0 | 1329 | vmull.u8 q1, d25, d16 |
michael@0 | 1330 | PF subge PF_X, PF_X, ORIG_W |
michael@0 | 1331 | vmull.u8 q2, d26, d16 |
michael@0 | 1332 | PF subges PF_CTL, PF_CTL, #0x10 |
michael@0 | 1333 | vmull.u8 q3, d27, d16 |
michael@0 | 1334 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! |
michael@0 | 1335 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1336 | vrsra.u16 q0, q0, #8 |
michael@0 | 1337 | vrsra.u16 q1, q1, #8 |
michael@0 | 1338 | vrsra.u16 q2, q2, #8 |
michael@0 | 1339 | vrsra.u16 q3, q3, #8 |
michael@0 | 1340 | .endm |
michael@0 | 1341 | |
michael@0 | 1342 | .macro pixman_composite_src_n_8_8_init |
michael@0 | 1343 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1344 | vld1.32 {d16[0]}, [DUMMY] |
michael@0 | 1345 | vdup.8 d16, d16[3] |
michael@0 | 1346 | .endm |
michael@0 | 1347 | |
michael@0 | 1348 | .macro pixman_composite_src_n_8_8_cleanup |
michael@0 | 1349 | .endm |
michael@0 | 1350 | |
michael@0 | 1351 | generate_composite_function \ |
michael@0 | 1352 | pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ |
michael@0 | 1353 | FLAG_DST_WRITEONLY, \ |
michael@0 | 1354 | 32, /* number of pixels, processed in a single block */ \ |
michael@0 | 1355 | 5, /* prefetch distance */ \ |
michael@0 | 1356 | pixman_composite_src_n_8_8_init, \ |
michael@0 | 1357 | pixman_composite_src_n_8_8_cleanup, \ |
michael@0 | 1358 | pixman_composite_src_n_8_8_process_pixblock_head, \ |
michael@0 | 1359 | pixman_composite_src_n_8_8_process_pixblock_tail, \ |
michael@0 | 1360 | pixman_composite_src_n_8_8_process_pixblock_tail_head |
michael@0 | 1361 | |
michael@0 | 1362 | /******************************************************************************/ |
michael@0 | 1363 | |
michael@0 | 1364 | .macro pixman_composite_over_n_8_8888_process_pixblock_head |
michael@0 | 1365 | /* expecting deinterleaved source data in {d8, d9, d10, d11} */ |
michael@0 | 1366 | /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ |
michael@0 | 1367 | /* and destination data in {d4, d5, d6, d7} */ |
michael@0 | 1368 | /* mask is in d24 (d25, d26, d27 are unused) */ |
michael@0 | 1369 | |
michael@0 | 1370 | /* in */ |
michael@0 | 1371 | vmull.u8 q6, d24, d8 |
michael@0 | 1372 | vmull.u8 q7, d24, d9 |
michael@0 | 1373 | vmull.u8 q8, d24, d10 |
michael@0 | 1374 | vmull.u8 q9, d24, d11 |
michael@0 | 1375 | vrshr.u16 q10, q6, #8 |
michael@0 | 1376 | vrshr.u16 q11, q7, #8 |
michael@0 | 1377 | vrshr.u16 q12, q8, #8 |
michael@0 | 1378 | vrshr.u16 q13, q9, #8 |
michael@0 | 1379 | vraddhn.u16 d0, q6, q10 |
michael@0 | 1380 | vraddhn.u16 d1, q7, q11 |
michael@0 | 1381 | vraddhn.u16 d2, q8, q12 |
michael@0 | 1382 | vraddhn.u16 d3, q9, q13 |
michael@0 | 1383 | vmvn.8 d25, d3 /* get inverted alpha */ |
michael@0 | 1384 | /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ |
michael@0 | 1385 | /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ |
michael@0 | 1386 | /* now do alpha blending */ |
michael@0 | 1387 | vmull.u8 q8, d25, d4 |
michael@0 | 1388 | vmull.u8 q9, d25, d5 |
michael@0 | 1389 | vmull.u8 q10, d25, d6 |
michael@0 | 1390 | vmull.u8 q11, d25, d7 |
michael@0 | 1391 | .endm |
michael@0 | 1392 | |
michael@0 | 1393 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail |
michael@0 | 1394 | vrshr.u16 q14, q8, #8 |
michael@0 | 1395 | vrshr.u16 q15, q9, #8 |
michael@0 | 1396 | vrshr.u16 q6, q10, #8 |
michael@0 | 1397 | vrshr.u16 q7, q11, #8 |
michael@0 | 1398 | vraddhn.u16 d28, q14, q8 |
michael@0 | 1399 | vraddhn.u16 d29, q15, q9 |
michael@0 | 1400 | vraddhn.u16 d30, q6, q10 |
michael@0 | 1401 | vraddhn.u16 d31, q7, q11 |
michael@0 | 1402 | vqadd.u8 q14, q0, q14 |
michael@0 | 1403 | vqadd.u8 q15, q1, q15 |
michael@0 | 1404 | .endm |
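/*
 * Head + tail together implement OVER with an a8 mask applied to a
 * solid source.  Per channel, with all divisions using the rounded /255
 * from above:
 *
 *     s' = s * m / 255                                ('in', head)
 *     d' = saturate(s' + d * (255 - s'.alpha) / 255)  ('over', tail)
 *
 * The vrshr.u16/vraddhn.u16 pair is just another spelling of the same
 * rounding division: (t + ((t + 128) >> 8) + 128) >> 8.
 */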
michael@0 | 1405 | |
michael@0 | 1406 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head |
michael@0 | 1407 | vrshr.u16 q14, q8, #8 |
michael@0 | 1408 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1409 | vrshr.u16 q15, q9, #8 |
michael@0 | 1410 | fetch_mask_pixblock |
michael@0 | 1411 | vrshr.u16 q6, q10, #8 |
michael@0 | 1412 | PF add PF_X, PF_X, #8 |
michael@0 | 1413 | vrshr.u16 q7, q11, #8 |
michael@0 | 1414 | PF tst PF_CTL, #0x0F |
michael@0 | 1415 | vraddhn.u16 d28, q14, q8 |
michael@0 | 1416 | PF addne PF_X, PF_X, #8 |
michael@0 | 1417 | vraddhn.u16 d29, q15, q9 |
michael@0 | 1418 | PF subne PF_CTL, PF_CTL, #1 |
michael@0 | 1419 | vraddhn.u16 d30, q6, q10 |
michael@0 | 1420 | PF cmp PF_X, ORIG_W |
michael@0 | 1421 | vraddhn.u16 d31, q7, q11 |
michael@0 | 1422 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] |
michael@0 | 1423 | vmull.u8 q6, d24, d8 |
michael@0 | 1424 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] |
michael@0 | 1425 | vmull.u8 q7, d24, d9 |
michael@0 | 1426 | PF subge PF_X, PF_X, ORIG_W |
michael@0 | 1427 | vmull.u8 q8, d24, d10 |
michael@0 | 1428 | PF subges PF_CTL, PF_CTL, #0x10 |
michael@0 | 1429 | vmull.u8 q9, d24, d11 |
michael@0 | 1430 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! |
michael@0 | 1431 | vqadd.u8 q14, q0, q14 |
michael@0 | 1432 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! |
michael@0 | 1433 | vqadd.u8 q15, q1, q15 |
michael@0 | 1434 | vrshr.u16 q10, q6, #8 |
michael@0 | 1435 | vrshr.u16 q11, q7, #8 |
michael@0 | 1436 | vrshr.u16 q12, q8, #8 |
michael@0 | 1437 | vrshr.u16 q13, q9, #8 |
michael@0 | 1438 | vraddhn.u16 d0, q6, q10 |
michael@0 | 1439 | vraddhn.u16 d1, q7, q11 |
michael@0 | 1440 | vraddhn.u16 d2, q8, q12 |
michael@0 | 1441 | vraddhn.u16 d3, q9, q13 |
michael@0 | 1442 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1443 | vmvn.8 d25, d3 |
michael@0 | 1444 | vmull.u8 q8, d25, d4 |
michael@0 | 1445 | vmull.u8 q9, d25, d5 |
michael@0 | 1446 | vmull.u8 q10, d25, d6 |
michael@0 | 1447 | vmull.u8 q11, d25, d7 |
michael@0 | 1448 | .endm |
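/*
 * Note how the PF (prefetch bookkeeping) instructions are threaded one
 * at a time between the NEON ops: they are ordinary ARM instructions,
 * and on the in-order cores this code appears tuned for (Cortex-A8
 * class) they can issue in parallel with the NEON pipeline, so keeping
 * the prefetcher running is nearly free.
 */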
michael@0 | 1449 | |
michael@0 | 1450 | .macro pixman_composite_over_n_8_8888_init |
michael@0 | 1451 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1452 | .vsave {d8-d15} |
michael@0 | 1453 | vpush {d8-d15} |
michael@0 | 1454 | vld1.32 {d11[0]}, [DUMMY] |
michael@0 | 1455 | vdup.8 d8, d11[0] |
michael@0 | 1456 | vdup.8 d9, d11[1] |
michael@0 | 1457 | vdup.8 d10, d11[2] |
michael@0 | 1458 | vdup.8 d11, d11[3] |
michael@0 | 1459 | .endm |
michael@0 | 1460 | |
michael@0 | 1461 | .macro pixman_composite_over_n_8_8888_cleanup |
michael@0 | 1462 | vpop {d8-d15} |
michael@0 | 1463 | .endm |
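/*
 * d8-d15 (q4-q7) are callee-saved under the AAPCS, so any init macro
 * that parks constants there must vpush/vpop them; the .vsave directive
 * emits the matching ARM EHABI unwind annotation for the vpush so that
 * stack unwinders can restore the registers.
 */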
michael@0 | 1464 | |
michael@0 | 1465 | generate_composite_function \ |
michael@0 | 1466 | pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ |
michael@0 | 1467 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 1468 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1469 | 5, /* prefetch distance */ \ |
michael@0 | 1470 | pixman_composite_over_n_8_8888_init, \ |
michael@0 | 1471 | pixman_composite_over_n_8_8888_cleanup, \ |
michael@0 | 1472 | pixman_composite_over_n_8_8888_process_pixblock_head, \ |
michael@0 | 1473 | pixman_composite_over_n_8_8888_process_pixblock_tail, \ |
michael@0 | 1474 | pixman_composite_over_n_8_8888_process_pixblock_tail_head |
michael@0 | 1475 | |
michael@0 | 1476 | /******************************************************************************/ |
michael@0 | 1477 | |
michael@0 | 1478 | .macro pixman_composite_over_n_8_8_process_pixblock_head |
michael@0 | 1479 | vmull.u8 q0, d24, d8 |
michael@0 | 1480 | vmull.u8 q1, d25, d8 |
michael@0 | 1481 | vmull.u8 q6, d26, d8 |
michael@0 | 1482 | vmull.u8 q7, d27, d8 |
michael@0 | 1483 | vrshr.u16 q10, q0, #8 |
michael@0 | 1484 | vrshr.u16 q11, q1, #8 |
michael@0 | 1485 | vrshr.u16 q12, q6, #8 |
michael@0 | 1486 | vrshr.u16 q13, q7, #8 |
michael@0 | 1487 | vraddhn.u16 d0, q0, q10 |
michael@0 | 1488 | vraddhn.u16 d1, q1, q11 |
michael@0 | 1489 | vraddhn.u16 d2, q6, q12 |
michael@0 | 1490 | vraddhn.u16 d3, q7, q13 |
michael@0 | 1491 | vmvn.8 q12, q0 |
michael@0 | 1492 | vmvn.8 q13, q1 |
michael@0 | 1493 | vmull.u8 q8, d24, d4 |
michael@0 | 1494 | vmull.u8 q9, d25, d5 |
michael@0 | 1495 | vmull.u8 q10, d26, d6 |
michael@0 | 1496 | vmull.u8 q11, d27, d7 |
michael@0 | 1497 | .endm |
michael@0 | 1498 | |
michael@0 | 1499 | .macro pixman_composite_over_n_8_8_process_pixblock_tail |
michael@0 | 1500 | vrshr.u16 q14, q8, #8 |
michael@0 | 1501 | vrshr.u16 q15, q9, #8 |
michael@0 | 1502 | vrshr.u16 q12, q10, #8 |
michael@0 | 1503 | vrshr.u16 q13, q11, #8 |
michael@0 | 1504 | vraddhn.u16 d28, q14, q8 |
michael@0 | 1505 | vraddhn.u16 d29, q15, q9 |
michael@0 | 1506 | vraddhn.u16 d30, q12, q10 |
michael@0 | 1507 | vraddhn.u16 d31, q13, q11 |
michael@0 | 1508 | vqadd.u8 q14, q0, q14 |
michael@0 | 1509 | vqadd.u8 q15, q1, q15 |
michael@0 | 1510 | .endm |
michael@0 | 1511 | |
michael@0 | 1512 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 1513 | .macro pixman_composite_over_n_8_8_process_pixblock_tail_head |
michael@0 | 1514 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1515 | pixman_composite_over_n_8_8_process_pixblock_tail |
michael@0 | 1516 | fetch_mask_pixblock |
michael@0 | 1517 | cache_preload 32, 32 |
michael@0 | 1518 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1519 | pixman_composite_over_n_8_8_process_pixblock_head |
michael@0 | 1520 | .endm |
michael@0 | 1521 | |
michael@0 | 1522 | .macro pixman_composite_over_n_8_8_init |
michael@0 | 1523 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1524 | .vsave {d8-d15} |
michael@0 | 1525 | vpush {d8-d15} |
michael@0 | 1526 | vld1.32 {d8[0]}, [DUMMY] |
michael@0 | 1527 | vdup.8 d8, d8[3] |
michael@0 | 1528 | .endm |
michael@0 | 1529 | |
michael@0 | 1530 | .macro pixman_composite_over_n_8_8_cleanup |
michael@0 | 1531 | vpop {d8-d15} |
michael@0 | 1532 | .endm |
michael@0 | 1533 | |
michael@0 | 1534 | generate_composite_function \ |
michael@0 | 1535 | pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ |
michael@0 | 1536 | FLAG_DST_READWRITE, \ |
michael@0 | 1537 | 32, /* number of pixels, processed in a single block */ \ |
michael@0 | 1538 | 5, /* prefetch distance */ \ |
michael@0 | 1539 | pixman_composite_over_n_8_8_init, \ |
michael@0 | 1540 | pixman_composite_over_n_8_8_cleanup, \ |
michael@0 | 1541 | pixman_composite_over_n_8_8_process_pixblock_head, \ |
michael@0 | 1542 | pixman_composite_over_n_8_8_process_pixblock_tail, \ |
michael@0 | 1543 | pixman_composite_over_n_8_8_process_pixblock_tail_head |
michael@0 | 1544 | |
michael@0 | 1545 | /******************************************************************************/ |
michael@0 | 1546 | |
michael@0 | 1547 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head |
michael@0 | 1548 | /* |
michael@0 | 1549 | * 'combine_mask_ca' replacement |
michael@0 | 1550 | * |
michael@0 | 1551 | * input: solid src (n) in {d8, d9, d10, d11} |
michael@0 | 1552 | * dest in {d4, d5, d6, d7 } |
michael@0 | 1553 | * mask in {d24, d25, d26, d27} |
michael@0 | 1554 | * output: updated src in {d0, d1, d2, d3 } |
michael@0 | 1555 | * updated mask in {d24, d25, d26, d3 } |
michael@0 | 1556 | */ |
michael@0 | 1557 | vmull.u8 q0, d24, d8 |
michael@0 | 1558 | vmull.u8 q1, d25, d9 |
michael@0 | 1559 | vmull.u8 q6, d26, d10 |
michael@0 | 1560 | vmull.u8 q7, d27, d11 |
michael@0 | 1561 | vmull.u8 q9, d11, d25 |
michael@0 | 1562 | vmull.u8 q12, d11, d24 |
michael@0 | 1563 | vmull.u8 q13, d11, d26 |
michael@0 | 1564 | vrshr.u16 q8, q0, #8 |
michael@0 | 1565 | vrshr.u16 q10, q1, #8 |
michael@0 | 1566 | vrshr.u16 q11, q6, #8 |
michael@0 | 1567 | vraddhn.u16 d0, q0, q8 |
michael@0 | 1568 | vraddhn.u16 d1, q1, q10 |
michael@0 | 1569 | vraddhn.u16 d2, q6, q11 |
michael@0 | 1570 | vrshr.u16 q11, q12, #8 |
michael@0 | 1571 | vrshr.u16 q8, q9, #8 |
michael@0 | 1572 | vrshr.u16 q6, q13, #8 |
michael@0 | 1573 | vrshr.u16 q10, q7, #8 |
michael@0 | 1574 | vraddhn.u16 d24, q12, q11 |
michael@0 | 1575 | vraddhn.u16 d25, q9, q8 |
michael@0 | 1576 | vraddhn.u16 d26, q13, q6 |
michael@0 | 1577 | vraddhn.u16 d3, q7, q10 |
michael@0 | 1578 | /* |
michael@0 | 1579 | * 'combine_over_ca' replacement |
michael@0 | 1580 | * |
michael@0 | 1581 | * output: updated dest in {d28, d29, d30, d31} |
michael@0 | 1582 | */ |
michael@0 | 1583 | vmvn.8 q12, q12 |
michael@0 | 1584 | vmvn.8 d26, d26 |
michael@0 | 1585 | vmull.u8 q8, d24, d4 |
michael@0 | 1586 | vmull.u8 q9, d25, d5 |
michael@0 | 1587 | vmvn.8 d27, d3 |
michael@0 | 1588 | vmull.u8 q10, d26, d6 |
michael@0 | 1589 | vmull.u8 q11, d27, d7 |
michael@0 | 1590 | .endm |
michael@0 | 1591 | |
michael@0 | 1592 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail |
michael@0 | 1593 | /* ... continue 'combine_over_ca' replacement */ |
michael@0 | 1594 | vrshr.u16 q14, q8, #8 |
michael@0 | 1595 | vrshr.u16 q15, q9, #8 |
michael@0 | 1596 | vrshr.u16 q6, q10, #8 |
michael@0 | 1597 | vrshr.u16 q7, q11, #8 |
michael@0 | 1598 | vraddhn.u16 d28, q14, q8 |
michael@0 | 1599 | vraddhn.u16 d29, q15, q9 |
michael@0 | 1600 | vraddhn.u16 d30, q6, q10 |
michael@0 | 1601 | vraddhn.u16 d31, q7, q11 |
michael@0 | 1602 | vqadd.u8 q14, q0, q14 |
michael@0 | 1603 | vqadd.u8 q15, q1, q15 |
michael@0 | 1604 | .endm |
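/*
 * With component alpha the mask supplies one factor per channel, so for
 * each channel c in {B, G, R, A} the head computes
 *
 *     s'.c = s.c * m.c / 255         ('combine_mask_ca')
 *     m'.c = s.alpha * m.c / 255
 *
 * and the 'combine_over_ca' part weights the destination by the
 * inverted per-channel mask instead of a single inverted alpha:
 *
 *     d'.c = saturate(s'.c + d.c * (255 - m'.c) / 255)
 */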
michael@0 | 1605 | |
michael@0 | 1606 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head |
michael@0 | 1607 | vrshr.u16 q14, q8, #8 |
michael@0 | 1608 | vrshr.u16 q15, q9, #8 |
michael@0 | 1609 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1610 | vrshr.u16 q6, q10, #8 |
michael@0 | 1611 | vrshr.u16 q7, q11, #8 |
michael@0 | 1612 | vraddhn.u16 d28, q14, q8 |
michael@0 | 1613 | vraddhn.u16 d29, q15, q9 |
michael@0 | 1614 | vraddhn.u16 d30, q6, q10 |
michael@0 | 1615 | vraddhn.u16 d31, q7, q11 |
michael@0 | 1616 | fetch_mask_pixblock |
michael@0 | 1617 | vqadd.u8 q14, q0, q14 |
michael@0 | 1618 | vqadd.u8 q15, q1, q15 |
michael@0 | 1619 | cache_preload 8, 8 |
michael@0 | 1620 | pixman_composite_over_n_8888_8888_ca_process_pixblock_head |
michael@0 | 1621 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1622 | .endm |
michael@0 | 1623 | |
michael@0 | 1624 | .macro pixman_composite_over_n_8888_8888_ca_init |
michael@0 | 1625 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1626 | .vsave {d8-d15} |
michael@0 | 1627 | vpush {d8-d15} |
michael@0 | 1628 | vld1.32 {d11[0]}, [DUMMY] |
michael@0 | 1629 | vdup.8 d8, d11[0] |
michael@0 | 1630 | vdup.8 d9, d11[1] |
michael@0 | 1631 | vdup.8 d10, d11[2] |
michael@0 | 1632 | vdup.8 d11, d11[3] |
michael@0 | 1633 | .endm |
michael@0 | 1634 | |
michael@0 | 1635 | .macro pixman_composite_over_n_8888_8888_ca_cleanup |
michael@0 | 1636 | vpop {d8-d15} |
michael@0 | 1637 | .endm |
michael@0 | 1638 | |
michael@0 | 1639 | generate_composite_function \ |
michael@0 | 1640 | pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ |
michael@0 | 1641 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 1642 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1643 | 5, /* prefetch distance */ \ |
michael@0 | 1644 | pixman_composite_over_n_8888_8888_ca_init, \ |
michael@0 | 1645 | pixman_composite_over_n_8888_8888_ca_cleanup, \ |
michael@0 | 1646 | pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ |
michael@0 | 1647 | pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ |
michael@0 | 1648 | pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head |
michael@0 | 1649 | |
michael@0 | 1650 | /******************************************************************************/ |
michael@0 | 1651 | |
michael@0 | 1652 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head |
michael@0 | 1653 | /* |
michael@0 | 1654 | * 'combine_mask_ca' replacement |
michael@0 | 1655 | * |
michael@0 | 1656 | * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] |
michael@0 | 1657 | * mask in {d24, d25, d26} [B, G, R] |
michael@0 | 1658 | * output: updated src in {d0, d1, d2 } [B, G, R] |
michael@0 | 1659 | * updated mask in {d24, d25, d26} [B, G, R] |
michael@0 | 1660 | */ |
michael@0 | 1661 | vmull.u8 q0, d24, d8 |
michael@0 | 1662 | vmull.u8 q1, d25, d9 |
michael@0 | 1663 | vmull.u8 q6, d26, d10 |
michael@0 | 1664 | vmull.u8 q9, d11, d25 |
michael@0 | 1665 | vmull.u8 q12, d11, d24 |
michael@0 | 1666 | vmull.u8 q13, d11, d26 |
michael@0 | 1667 | vrshr.u16 q8, q0, #8 |
michael@0 | 1668 | vrshr.u16 q10, q1, #8 |
michael@0 | 1669 | vrshr.u16 q11, q6, #8 |
michael@0 | 1670 | vraddhn.u16 d0, q0, q8 |
michael@0 | 1671 | vraddhn.u16 d1, q1, q10 |
michael@0 | 1672 | vraddhn.u16 d2, q6, q11 |
michael@0 | 1673 | vrshr.u16 q11, q12, #8 |
michael@0 | 1674 | vrshr.u16 q8, q9, #8 |
michael@0 | 1675 | vrshr.u16 q6, q13, #8 |
michael@0 | 1676 | vraddhn.u16 d24, q12, q11 |
michael@0 | 1677 | vraddhn.u16 d25, q9, q8 |
michael@0 | 1678 | /* |
michael@0 | 1679 | * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format |
michael@0 | 1680 | * and put data into d16 - blue, d17 - green, d18 - red |
michael@0 | 1681 | */ |
michael@0 | 1682 | vshrn.u16 d17, q2, #3 |
michael@0 | 1683 | vshrn.u16 d18, q2, #8 |
michael@0 | 1684 | vraddhn.u16 d26, q13, q6 |
michael@0 | 1685 | vsli.u16 q2, q2, #5 |
michael@0 | 1686 | vsri.u8 d18, d18, #5 |
michael@0 | 1687 | vsri.u8 d17, d17, #6 |
michael@0 | 1688 | /* |
michael@0 | 1689 | * 'combine_over_ca' replacement |
michael@0 | 1690 | * |
michael@0 | 1691 | * output: updated dest in d16 - blue, d17 - green, d18 - red |
michael@0 | 1692 | */ |
michael@0 | 1693 | vmvn.8 q12, q12 |
michael@0 | 1694 | vshrn.u16 d16, q2, #2 |
michael@0 | 1695 | vmvn.8 d26, d26 |
michael@0 | 1696 | vmull.u8 q6, d16, d24 |
michael@0 | 1697 | vmull.u8 q7, d17, d25 |
michael@0 | 1698 | vmull.u8 q11, d18, d26 |
michael@0 | 1699 | .endm |
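/*
 * The r5g6b5 -> planar 8-bit expansion above is done purely with
 * shift-and-insert tricks.  With a pixel rrrrrggggggbbbbb in each
 * 16-bit lane of q2:
 *
 *     vshrn #3 then vsri #6   -> d17 = (g6 << 2) | (g6 >> 4)   (green)
 *     vshrn #8 then vsri #5   -> d18 = (r5 << 3) | (r5 >> 2)   (red)
 *     vsli #5 then vshrn #2   -> d16 = (b5 << 3) | (b5 >> 2)   (blue)
 *
 * Replicating the top bits into the low ones is the cheap way to expand
 * a 5/6-bit channel to the full 0..255 range (0x1f maps to 0xff, 0x00
 * stays 0x00).
 */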
michael@0 | 1700 | |
michael@0 | 1701 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail |
michael@0 | 1702 | /* ... continue 'combine_over_ca' replacement */ |
michael@0 | 1703 | vrshr.u16 q10, q6, #8 |
michael@0 | 1704 | vrshr.u16 q14, q7, #8 |
michael@0 | 1705 | vrshr.u16 q15, q11, #8 |
michael@0 | 1706 | vraddhn.u16 d16, q10, q6 |
michael@0 | 1707 | vraddhn.u16 d17, q14, q7 |
michael@0 | 1708 | vraddhn.u16 d18, q15, q11 |
michael@0 | 1709 | vqadd.u8 q8, q0, q8 |
michael@0 | 1710 | vqadd.u8 d18, d2, d18 |
michael@0 | 1711 | /* |
michael@0 | 1712 | * convert the results in d16, d17, d18 to r5g6b5 and store |
michael@0 | 1713 | * them into {d28, d29} |
michael@0 | 1714 | */ |
michael@0 | 1715 | vshll.u8 q14, d18, #8 |
michael@0 | 1716 | vshll.u8 q10, d17, #8 |
michael@0 | 1717 | vshll.u8 q15, d16, #8 |
michael@0 | 1718 | vsri.u16 q14, q10, #5 |
michael@0 | 1719 | vsri.u16 q14, q15, #11 |
michael@0 | 1720 | .endm |
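/*
 * Packing back is the mirror image: vshll #8 places each 8-bit channel
 * in the top byte of a 16-bit lane, then the two vsri.u16 inserts drop
 * green into bits 10..5 and blue into bits 4..0, leaving red in bits
 * 15..11:
 *
 *     r5g6b5 = ((r8 & 0xf8) << 8) | ((g8 & 0xfc) << 3) | (b8 >> 3)
 */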
michael@0 | 1721 | |
michael@0 | 1722 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head |
michael@0 | 1723 | fetch_mask_pixblock |
michael@0 | 1724 | vrshr.u16 q10, q6, #8 |
michael@0 | 1725 | vrshr.u16 q14, q7, #8 |
michael@0 | 1726 | vld1.16 {d4, d5}, [DST_R, :128]! |
michael@0 | 1727 | vrshr.u16 q15, q11, #8 |
michael@0 | 1728 | vraddhn.u16 d16, q10, q6 |
michael@0 | 1729 | vraddhn.u16 d17, q14, q7 |
michael@0 | 1730 | vraddhn.u16 d22, q15, q11 |
michael@0 | 1731 | /* process_pixblock_head */ |
michael@0 | 1732 | /* |
michael@0 | 1733 | * 'combine_mask_ca' replacement |
michael@0 | 1734 | * |
michael@0 | 1735 | * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] |
michael@0 | 1736 | * mask in {d24, d25, d26} [B, G, R] |
michael@0 | 1737 | * output: updated src in {d0, d1, d2 } [B, G, R] |
michael@0 | 1738 | * updated mask in {d24, d25, d26} [B, G, R] |
michael@0 | 1739 | */ |
michael@0 | 1740 | vmull.u8 q6, d26, d10 |
michael@0 | 1741 | vqadd.u8 q8, q0, q8 |
michael@0 | 1742 | vmull.u8 q0, d24, d8 |
michael@0 | 1743 | vqadd.u8 d22, d2, d22 |
michael@0 | 1744 | vmull.u8 q1, d25, d9 |
michael@0 | 1745 | /* |
michael@0 | 1746 | * convert the result in d16, d17, d22 to r5g6b5 and store |
michael@0 | 1747 | * it into {d28, d29} |
michael@0 | 1748 | */ |
michael@0 | 1749 | vshll.u8 q14, d22, #8 |
michael@0 | 1750 | vshll.u8 q10, d17, #8 |
michael@0 | 1751 | vshll.u8 q15, d16, #8 |
michael@0 | 1752 | vmull.u8 q9, d11, d25 |
michael@0 | 1753 | vsri.u16 q14, q10, #5 |
michael@0 | 1754 | vmull.u8 q12, d11, d24 |
michael@0 | 1755 | vmull.u8 q13, d11, d26 |
michael@0 | 1756 | vsri.u16 q14, q15, #11 |
michael@0 | 1757 | cache_preload 8, 8 |
michael@0 | 1758 | vrshr.u16 q8, q0, #8 |
michael@0 | 1759 | vrshr.u16 q10, q1, #8 |
michael@0 | 1760 | vrshr.u16 q11, q6, #8 |
michael@0 | 1761 | vraddhn.u16 d0, q0, q8 |
michael@0 | 1762 | vraddhn.u16 d1, q1, q10 |
michael@0 | 1763 | vraddhn.u16 d2, q6, q11 |
michael@0 | 1764 | vrshr.u16 q11, q12, #8 |
michael@0 | 1765 | vrshr.u16 q8, q9, #8 |
michael@0 | 1766 | vrshr.u16 q6, q13, #8 |
michael@0 | 1767 | vraddhn.u16 d24, q12, q11 |
michael@0 | 1768 | vraddhn.u16 d25, q9, q8 |
michael@0 | 1769 | /* |
michael@0 | 1770 | * convert 8 r5g6b5 pixel data from {d4, d5} to planar |
michael@0 | 1771 | * 8-bit format and put data into d16 - blue, d17 - green, |
michael@0 | 1772 | * d18 - red |
michael@0 | 1773 | */ |
michael@0 | 1774 | vshrn.u16 d17, q2, #3 |
michael@0 | 1775 | vshrn.u16 d18, q2, #8 |
michael@0 | 1776 | vraddhn.u16 d26, q13, q6 |
michael@0 | 1777 | vsli.u16 q2, q2, #5 |
michael@0 | 1778 | vsri.u8 d17, d17, #6 |
michael@0 | 1779 | vsri.u8 d18, d18, #5 |
michael@0 | 1780 | /* |
michael@0 | 1781 | * 'combine_over_ca' replacement |
michael@0 | 1782 | * |
michael@0 | 1783 | * output: updated dest in d16 - blue, d17 - green, d18 - red |
michael@0 | 1784 | */ |
michael@0 | 1785 | vmvn.8 q12, q12 |
michael@0 | 1786 | vshrn.u16 d16, q2, #2 |
michael@0 | 1787 | vmvn.8 d26, d26 |
michael@0 | 1788 | vmull.u8 q7, d17, d25 |
michael@0 | 1789 | vmull.u8 q6, d16, d24 |
michael@0 | 1790 | vmull.u8 q11, d18, d26 |
michael@0 | 1791 | vst1.16 {d28, d29}, [DST_W, :128]! |
michael@0 | 1792 | .endm |
michael@0 | 1793 | |
michael@0 | 1794 | .macro pixman_composite_over_n_8888_0565_ca_init |
michael@0 | 1795 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1796 | .vsave {d8-d15} |
michael@0 | 1797 | vpush {d8-d15} |
michael@0 | 1798 | vld1.32 {d11[0]}, [DUMMY] |
michael@0 | 1799 | vdup.8 d8, d11[0] |
michael@0 | 1800 | vdup.8 d9, d11[1] |
michael@0 | 1801 | vdup.8 d10, d11[2] |
michael@0 | 1802 | vdup.8 d11, d11[3] |
michael@0 | 1803 | .endm |
michael@0 | 1804 | |
michael@0 | 1805 | .macro pixman_composite_over_n_8888_0565_ca_cleanup |
michael@0 | 1806 | vpop {d8-d15} |
michael@0 | 1807 | .endm |
michael@0 | 1808 | |
michael@0 | 1809 | generate_composite_function \ |
michael@0 | 1810 | pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ |
michael@0 | 1811 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 1812 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 1813 | 5, /* prefetch distance */ \ |
michael@0 | 1814 | pixman_composite_over_n_8888_0565_ca_init, \ |
michael@0 | 1815 | pixman_composite_over_n_8888_0565_ca_cleanup, \ |
michael@0 | 1816 | pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \ |
michael@0 | 1817 | pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \ |
michael@0 | 1818 | pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head |
michael@0 | 1819 | |
michael@0 | 1820 | /******************************************************************************/ |
michael@0 | 1821 | |
michael@0 | 1822 | .macro pixman_composite_in_n_8_process_pixblock_head |
michael@0 | 1823 | /* expecting solid source alpha replicated in d3 (d0-d2 are unused) */ |
michael@0 | 1824 | /* and destination data in {d4, d5, d6, d7} */ |
michael@0 | 1825 | vmull.u8 q8, d4, d3 |
michael@0 | 1826 | vmull.u8 q9, d5, d3 |
michael@0 | 1827 | vmull.u8 q10, d6, d3 |
michael@0 | 1828 | vmull.u8 q11, d7, d3 |
michael@0 | 1829 | .endm |
michael@0 | 1830 | |
michael@0 | 1831 | .macro pixman_composite_in_n_8_process_pixblock_tail |
michael@0 | 1832 | vrshr.u16 q14, q8, #8 |
michael@0 | 1833 | vrshr.u16 q15, q9, #8 |
michael@0 | 1834 | vrshr.u16 q12, q10, #8 |
michael@0 | 1835 | vrshr.u16 q13, q11, #8 |
michael@0 | 1836 | vraddhn.u16 d28, q8, q14 |
michael@0 | 1837 | vraddhn.u16 d29, q9, q15 |
michael@0 | 1838 | vraddhn.u16 d30, q10, q12 |
michael@0 | 1839 | vraddhn.u16 d31, q11, q13 |
michael@0 | 1840 | .endm |
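/*
 * This is the IN operator with the solid source reduced to its alpha:
 * every byte of the a8 destination becomes d * n.alpha / 255, using the
 * same rounding division by 255 as the other fast paths.
 */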
michael@0 | 1841 | |
michael@0 | 1842 | .macro pixman_composite_in_n_8_process_pixblock_tail_head |
michael@0 | 1843 | pixman_composite_in_n_8_process_pixblock_tail |
michael@0 | 1844 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1845 | cache_preload 32, 32 |
michael@0 | 1846 | pixman_composite_in_n_8_process_pixblock_head |
michael@0 | 1847 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1848 | .endm |
michael@0 | 1849 | |
michael@0 | 1850 | .macro pixman_composite_in_n_8_init |
michael@0 | 1851 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1852 | vld1.32 {d3[0]}, [DUMMY] |
michael@0 | 1853 | vdup.8 d3, d3[3] |
michael@0 | 1854 | .endm |
michael@0 | 1855 | |
michael@0 | 1856 | .macro pixman_composite_in_n_8_cleanup |
michael@0 | 1857 | .endm |
michael@0 | 1858 | |
michael@0 | 1859 | generate_composite_function \ |
michael@0 | 1860 | pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ |
michael@0 | 1861 | FLAG_DST_READWRITE, \ |
michael@0 | 1862 | 32, /* number of pixels, processed in a single block */ \ |
michael@0 | 1863 | 5, /* prefetch distance */ \ |
michael@0 | 1864 | pixman_composite_in_n_8_init, \ |
michael@0 | 1865 | pixman_composite_in_n_8_cleanup, \ |
michael@0 | 1866 | pixman_composite_in_n_8_process_pixblock_head, \ |
michael@0 | 1867 | pixman_composite_in_n_8_process_pixblock_tail, \ |
michael@0 | 1868 | pixman_composite_in_n_8_process_pixblock_tail_head, \ |
michael@0 | 1869 | 28, /* dst_w_basereg */ \ |
michael@0 | 1870 | 4, /* dst_r_basereg */ \ |
michael@0 | 1871 | 0, /* src_basereg */ \ |
michael@0 | 1872 | 24 /* mask_basereg */ |
michael@0 | 1873 | 
/******************************************************************************/

michael@0 | 1874 | .macro pixman_composite_add_n_8_8_process_pixblock_head |
michael@0 | 1875 | /* expecting solid source alpha, replicated into all bytes of d11 */ |
michael@0 | 1876 | /* (d8, d9, d10 are not used) */ |
michael@0 | 1877 | /* and destination data in {d4, d5, d6, d7} */ |
michael@0 | 1878 | /* mask is in d24, d25, d26, d27 */ |
michael@0 | 1879 | vmull.u8 q0, d24, d11 |
michael@0 | 1880 | vmull.u8 q1, d25, d11 |
michael@0 | 1881 | vmull.u8 q6, d26, d11 |
michael@0 | 1882 | vmull.u8 q7, d27, d11 |
michael@0 | 1883 | vrshr.u16 q10, q0, #8 |
michael@0 | 1884 | vrshr.u16 q11, q1, #8 |
michael@0 | 1885 | vrshr.u16 q12, q6, #8 |
michael@0 | 1886 | vrshr.u16 q13, q7, #8 |
michael@0 | 1887 | vraddhn.u16 d0, q0, q10 |
michael@0 | 1888 | vraddhn.u16 d1, q1, q11 |
michael@0 | 1889 | vraddhn.u16 d2, q6, q12 |
michael@0 | 1890 | vraddhn.u16 d3, q7, q13 |
michael@0 | 1891 | vqadd.u8 q14, q0, q2 |
michael@0 | 1892 | vqadd.u8 q15, q1, q3 |
michael@0 | 1893 | .endm |
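/*
 * add_n_8_8 scales the a8 mask by the solid source alpha and adds it,
 * saturating, to the a8 destination:
 *
 *     d' = saturate(d + m * n.alpha / 255)
 */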
michael@0 | 1894 | |
michael@0 | 1895 | .macro pixman_composite_add_n_8_8_process_pixblock_tail |
michael@0 | 1896 | .endm |
michael@0 | 1897 | |
michael@0 | 1898 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 1899 | .macro pixman_composite_add_n_8_8_process_pixblock_tail_head |
michael@0 | 1900 | pixman_composite_add_n_8_8_process_pixblock_tail |
michael@0 | 1901 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1902 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1903 | fetch_mask_pixblock |
michael@0 | 1904 | cache_preload 32, 32 |
michael@0 | 1905 | pixman_composite_add_n_8_8_process_pixblock_head |
michael@0 | 1906 | .endm |
michael@0 | 1907 | |
michael@0 | 1908 | .macro pixman_composite_add_n_8_8_init |
michael@0 | 1909 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 1910 | .vsave {d8-d15} |
michael@0 | 1911 | vpush {d8-d15} |
michael@0 | 1912 | vld1.32 {d11[0]}, [DUMMY] |
michael@0 | 1913 | vdup.8 d11, d11[3] |
michael@0 | 1914 | .endm |
michael@0 | 1915 | |
michael@0 | 1916 | .macro pixman_composite_add_n_8_8_cleanup |
michael@0 | 1917 | vpop {d8-d15} |
michael@0 | 1918 | .endm |
michael@0 | 1919 | |
michael@0 | 1920 | generate_composite_function \ |
michael@0 | 1921 | pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ |
michael@0 | 1922 | FLAG_DST_READWRITE, \ |
michael@0 | 1923 | 32, /* number of pixels, processed in a single block */ \ |
michael@0 | 1924 | 5, /* prefetch distance */ \ |
michael@0 | 1925 | pixman_composite_add_n_8_8_init, \ |
michael@0 | 1926 | pixman_composite_add_n_8_8_cleanup, \ |
michael@0 | 1927 | pixman_composite_add_n_8_8_process_pixblock_head, \ |
michael@0 | 1928 | pixman_composite_add_n_8_8_process_pixblock_tail, \ |
michael@0 | 1929 | pixman_composite_add_n_8_8_process_pixblock_tail_head |
michael@0 | 1930 | |
michael@0 | 1931 | /******************************************************************************/ |
michael@0 | 1932 | |
michael@0 | 1933 | .macro pixman_composite_add_8_8_8_process_pixblock_head |
michael@0 | 1934 | /* expecting source data in {d0, d1, d2, d3} */ |
michael@0 | 1935 | /* destination data in {d4, d5, d6, d7} */ |
michael@0 | 1936 | /* mask in {d24, d25, d26, d27} */ |
michael@0 | 1937 | vmull.u8 q8, d24, d0 |
michael@0 | 1938 | vmull.u8 q9, d25, d1 |
michael@0 | 1939 | vmull.u8 q10, d26, d2 |
michael@0 | 1940 | vmull.u8 q11, d27, d3 |
michael@0 | 1941 | vrshr.u16 q0, q8, #8 |
michael@0 | 1942 | vrshr.u16 q1, q9, #8 |
michael@0 | 1943 | vrshr.u16 q12, q10, #8 |
michael@0 | 1944 | vrshr.u16 q13, q11, #8 |
michael@0 | 1945 | vraddhn.u16 d0, q0, q8 |
michael@0 | 1946 | vraddhn.u16 d1, q1, q9 |
michael@0 | 1947 | vraddhn.u16 d2, q12, q10 |
michael@0 | 1948 | vraddhn.u16 d3, q13, q11 |
michael@0 | 1949 | vqadd.u8 q14, q0, q2 |
michael@0 | 1950 | vqadd.u8 q15, q1, q3 |
michael@0 | 1951 | .endm |
michael@0 | 1952 | |
michael@0 | 1953 | .macro pixman_composite_add_8_8_8_process_pixblock_tail |
michael@0 | 1954 | .endm |
michael@0 | 1955 | |
michael@0 | 1956 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 1957 | .macro pixman_composite_add_8_8_8_process_pixblock_tail_head |
michael@0 | 1958 | pixman_composite_add_8_8_8_process_pixblock_tail |
michael@0 | 1959 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 1960 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 1961 | fetch_mask_pixblock |
michael@0 | 1962 | fetch_src_pixblock |
michael@0 | 1963 | cache_preload 32, 32 |
michael@0 | 1964 | pixman_composite_add_8_8_8_process_pixblock_head |
michael@0 | 1965 | .endm |
michael@0 | 1966 | |
michael@0 | 1967 | .macro pixman_composite_add_8_8_8_init |
michael@0 | 1968 | .endm |
michael@0 | 1969 | |
michael@0 | 1970 | .macro pixman_composite_add_8_8_8_cleanup |
michael@0 | 1971 | .endm |
michael@0 | 1972 | |
michael@0 | 1973 | generate_composite_function \ |
michael@0 | 1974 | pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ |
michael@0 | 1975 | FLAG_DST_READWRITE, \ |
michael@0 | 1976 | 32, /* number of pixels, processed in a single block */ \ |
michael@0 | 1977 | 5, /* prefetch distance */ \ |
michael@0 | 1978 | pixman_composite_add_8_8_8_init, \ |
michael@0 | 1979 | pixman_composite_add_8_8_8_cleanup, \ |
michael@0 | 1980 | pixman_composite_add_8_8_8_process_pixblock_head, \ |
michael@0 | 1981 | pixman_composite_add_8_8_8_process_pixblock_tail, \ |
michael@0 | 1982 | pixman_composite_add_8_8_8_process_pixblock_tail_head |
michael@0 | 1983 | |
michael@0 | 1984 | /******************************************************************************/ |
michael@0 | 1985 | |
michael@0 | 1986 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_head |
michael@0 | 1987 | /* expecting source data in {d0, d1, d2, d3} */ |
michael@0 | 1988 | /* destination data in {d4, d5, d6, d7} */ |
michael@0 | 1989 | /* mask in {d24, d25, d26, d27} */ |
michael@0 | 1990 | vmull.u8 q8, d27, d0 |
michael@0 | 1991 | vmull.u8 q9, d27, d1 |
michael@0 | 1992 | vmull.u8 q10, d27, d2 |
michael@0 | 1993 | vmull.u8 q11, d27, d3 |
michael@0 | 1994 | /* 1 cycle bubble */ |
michael@0 | 1995 | vrsra.u16 q8, q8, #8 |
michael@0 | 1996 | vrsra.u16 q9, q9, #8 |
michael@0 | 1997 | vrsra.u16 q10, q10, #8 |
michael@0 | 1998 | vrsra.u16 q11, q11, #8 |
michael@0 | 1999 | .endm |
michael@0 | 2000 | |
michael@0 | 2001 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail |
michael@0 | 2002 | /* 2 cycle bubble */ |
michael@0 | 2003 | vrshrn.u16 d28, q8, #8 |
michael@0 | 2004 | vrshrn.u16 d29, q9, #8 |
michael@0 | 2005 | vrshrn.u16 d30, q10, #8 |
michael@0 | 2006 | vrshrn.u16 d31, q11, #8 |
michael@0 | 2007 | vqadd.u8 q14, q2, q14 |
michael@0 | 2008 | /* 1 cycle bubble */ |
michael@0 | 2009 | vqadd.u8 q15, q3, q15 |
michael@0 | 2010 | .endm |
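/*
 * ADD with an a8r8g8b8 mask: only the mask alpha in d27 participates,
 * giving d' = saturate(d + s * m.alpha / 255) per channel.  The 'bubble'
 * comments mark result-latency stalls between dependent NEON
 * instructions (presumably measured on Cortex-A8 class cores); the
 * tail_head variant below exists to fill those slots with the loads,
 * stores and prefetch work for the next pixel block.
 */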
michael@0 | 2011 | |
michael@0 | 2012 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head |
michael@0 | 2013 | fetch_src_pixblock |
michael@0 | 2014 | vrshrn.u16 d28, q8, #8 |
michael@0 | 2015 | fetch_mask_pixblock |
michael@0 | 2016 | vrshrn.u16 d29, q9, #8 |
michael@0 | 2017 | vmull.u8 q8, d27, d0 |
michael@0 | 2018 | vrshrn.u16 d30, q10, #8 |
michael@0 | 2019 | vmull.u8 q9, d27, d1 |
michael@0 | 2020 | vrshrn.u16 d31, q11, #8 |
michael@0 | 2021 | vmull.u8 q10, d27, d2 |
michael@0 | 2022 | vqadd.u8 q14, q2, q14 |
michael@0 | 2023 | vmull.u8 q11, d27, d3 |
michael@0 | 2024 | vqadd.u8 q15, q3, q15 |
michael@0 | 2025 | vrsra.u16 q8, q8, #8 |
michael@0 | 2026 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2027 | vrsra.u16 q9, q9, #8 |
michael@0 | 2028 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2029 | vrsra.u16 q10, q10, #8 |
michael@0 | 2030 | |
michael@0 | 2031 | cache_preload 8, 8 |
michael@0 | 2032 | |
michael@0 | 2033 | vrsra.u16 q11, q11, #8 |
michael@0 | 2034 | .endm |
michael@0 | 2035 | |
michael@0 | 2036 | generate_composite_function \ |
michael@0 | 2037 | pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ |
michael@0 | 2038 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2039 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2040 | 10, /* prefetch distance */ \ |
michael@0 | 2041 | default_init, \ |
michael@0 | 2042 | default_cleanup, \ |
michael@0 | 2043 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
michael@0 | 2044 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
michael@0 | 2045 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head |
michael@0 | 2046 | |
michael@0 | 2047 | generate_composite_function_single_scanline \ |
michael@0 | 2048 | pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ |
michael@0 | 2049 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2050 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2051 | default_init, \ |
michael@0 | 2052 | default_cleanup, \ |
michael@0 | 2053 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
michael@0 | 2054 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
michael@0 | 2055 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head |
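/*
 * The *_single_scanline variants take no prefetch distance parameter:
 * they process one scanline at a time on behalf of the general
 * compositing path, where cross-scanline prefetch would buy nothing.
 */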
michael@0 | 2056 | |
michael@0 | 2057 | /******************************************************************************/ |
michael@0 | 2058 | |
michael@0 | 2059 | generate_composite_function \ |
michael@0 | 2060 | pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ |
michael@0 | 2061 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2062 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2063 | 5, /* prefetch distance */ \ |
michael@0 | 2064 | default_init, \ |
michael@0 | 2065 | default_cleanup, \ |
michael@0 | 2066 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
michael@0 | 2067 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
michael@0 | 2068 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2069 | 28, /* dst_w_basereg */ \ |
michael@0 | 2070 | 4, /* dst_r_basereg */ \ |
michael@0 | 2071 | 0, /* src_basereg */ \ |
michael@0 | 2072 | 27 /* mask_basereg */ |
michael@0 | 2073 | |
michael@0 | 2074 | /******************************************************************************/ |
michael@0 | 2075 | |
michael@0 | 2076 | .macro pixman_composite_add_n_8_8888_init |
michael@0 | 2077 | add DUMMY, sp, #ARGS_STACK_OFFSET |
michael@0 | 2078 | vld1.32 {d3[0]}, [DUMMY] |
michael@0 | 2079 | vdup.8 d0, d3[0] |
michael@0 | 2080 | vdup.8 d1, d3[1] |
michael@0 | 2081 | vdup.8 d2, d3[2] |
michael@0 | 2082 | vdup.8 d3, d3[3] |
michael@0 | 2083 | .endm |
michael@0 | 2084 | |
michael@0 | 2085 | .macro pixman_composite_add_n_8_8888_cleanup |
michael@0 | 2086 | .endm |
michael@0 | 2087 | |
michael@0 | 2088 | generate_composite_function \ |
michael@0 | 2089 | pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ |
michael@0 | 2090 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2091 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2092 | 5, /* prefetch distance */ \ |
michael@0 | 2093 | pixman_composite_add_n_8_8888_init, \ |
michael@0 | 2094 | pixman_composite_add_n_8_8888_cleanup, \ |
michael@0 | 2095 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
michael@0 | 2096 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
michael@0 | 2097 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2098 | 28, /* dst_w_basereg */ \ |
michael@0 | 2099 | 4, /* dst_r_basereg */ \ |
michael@0 | 2100 | 0, /* src_basereg */ \ |
michael@0 | 2101 | 27 /* mask_basereg */ |
michael@0 | 2102 | |
michael@0 | 2103 | /******************************************************************************/ |
michael@0 | 2104 | |
michael@0 | 2105 | .macro pixman_composite_add_8888_n_8888_init |
michael@0 | 2106 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) |
michael@0 | 2107 | vld1.32 {d27[0]}, [DUMMY] |
michael@0 | 2108 | vdup.8 d27, d27[3] |
michael@0 | 2109 | .endm |
michael@0 | 2110 | |
michael@0 | 2111 | .macro pixman_composite_add_8888_n_8888_cleanup |
michael@0 | 2112 | .endm |
michael@0 | 2113 | |
michael@0 | 2114 | generate_composite_function \ |
michael@0 | 2115 | pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 2116 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2117 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2118 | 5, /* prefetch distance */ \ |
michael@0 | 2119 | pixman_composite_add_8888_n_8888_init, \ |
michael@0 | 2120 | pixman_composite_add_8888_n_8888_cleanup, \ |
michael@0 | 2121 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
michael@0 | 2122 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
michael@0 | 2123 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2124 | 28, /* dst_w_basereg */ \ |
michael@0 | 2125 | 4, /* dst_r_basereg */ \ |
michael@0 | 2126 | 0, /* src_basereg */ \ |
michael@0 | 2127 | 27 /* mask_basereg */ |
michael@0 | 2128 | |
michael@0 | 2129 | /******************************************************************************/ |
michael@0 | 2130 | |
michael@0 | 2131 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
michael@0 | 2132 | /* expecting source data in {d0, d1, d2, d3} */ |
michael@0 | 2133 | /* destination data in {d4, d5, d6, d7} */ |
michael@0 | 2134 | /* solid mask is in d15 */ |
michael@0 | 2135 | |
michael@0 | 2136 | /* 'in' */ |
michael@0 | 2137 | vmull.u8 q8, d15, d3 |
michael@0 | 2138 | vmull.u8 q6, d15, d2 |
michael@0 | 2139 | vmull.u8 q5, d15, d1 |
michael@0 | 2140 | vmull.u8 q4, d15, d0 |
michael@0 | 2141 | vrshr.u16 q13, q8, #8 |
michael@0 | 2142 | vrshr.u16 q12, q6, #8 |
michael@0 | 2143 | vrshr.u16 q11, q5, #8 |
michael@0 | 2144 | vrshr.u16 q10, q4, #8 |
michael@0 | 2145 | vraddhn.u16 d3, q8, q13 |
michael@0 | 2146 | vraddhn.u16 d2, q6, q12 |
michael@0 | 2147 | vraddhn.u16 d1, q5, q11 |
michael@0 | 2148 | vraddhn.u16 d0, q4, q10 |
michael@0 | 2149 | vmvn.8 d24, d3 /* get inverted alpha */ |
michael@0 | 2150 | /* now do alpha blending */ |
michael@0 | 2151 | vmull.u8 q8, d24, d4 |
michael@0 | 2152 | vmull.u8 q9, d24, d5 |
michael@0 | 2153 | vmull.u8 q10, d24, d6 |
michael@0 | 2154 | vmull.u8 q11, d24, d7 |
michael@0 | 2155 | .endm |
michael@0 | 2156 | |
michael@0 | 2157 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
michael@0 | 2158 | vrshr.u16 q14, q8, #8 |
michael@0 | 2159 | vrshr.u16 q15, q9, #8 |
michael@0 | 2160 | vrshr.u16 q12, q10, #8 |
michael@0 | 2161 | vrshr.u16 q13, q11, #8 |
michael@0 | 2162 | vraddhn.u16 d28, q14, q8 |
michael@0 | 2163 | vraddhn.u16 d29, q15, q9 |
michael@0 | 2164 | vraddhn.u16 d30, q12, q10 |
michael@0 | 2165 | vraddhn.u16 d31, q13, q11 |
michael@0 | 2166 | .endm |
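/*
 * OUT_REVERSE: the source is first scaled by the mask alpha in d15 (a
 * solid value here, or the alpha of a fetched mask block in the
 * scanline variant below, which is why it uses mask_basereg 12), then
 * the destination is weighted by the inverted source alpha:
 *
 *     d' = d * (255 - s'.alpha) / 255
 *
 * Nothing is added back, hence no vqadd in the tail.  The OVER variants
 * below reuse this head/tail and simply add the source in.
 */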
michael@0 | 2167 | |
michael@0 | 2168 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2169 | .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head |
michael@0 | 2170 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2171 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
michael@0 | 2172 | fetch_src_pixblock |
michael@0 | 2173 | cache_preload 8, 8 |
michael@0 | 2174 | fetch_mask_pixblock |
michael@0 | 2175 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
michael@0 | 2176 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2177 | .endm |
michael@0 | 2178 | |
michael@0 | 2179 | generate_composite_function_single_scanline \ |
michael@0 | 2180 | pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ |
michael@0 | 2181 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2182 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2183 | default_init_need_all_regs, \ |
michael@0 | 2184 | default_cleanup_need_all_regs, \ |
michael@0 | 2185 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ |
michael@0 | 2186 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ |
michael@0 | 2187 | pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2188 | 28, /* dst_w_basereg */ \ |
michael@0 | 2189 | 4, /* dst_r_basereg */ \ |
michael@0 | 2190 | 0, /* src_basereg */ \ |
michael@0 | 2191 | 12 /* mask_basereg */ |
michael@0 | 2192 | |
michael@0 | 2193 | /******************************************************************************/ |
michael@0 | 2194 | |
michael@0 | 2195 | .macro pixman_composite_over_8888_n_8888_process_pixblock_head |
michael@0 | 2196 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
michael@0 | 2197 | .endm |
michael@0 | 2198 | |
michael@0 | 2199 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail |
michael@0 | 2200 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
michael@0 | 2201 | vqadd.u8 q14, q0, q14 |
michael@0 | 2202 | vqadd.u8 q15, q1, q15 |
michael@0 | 2203 | .endm |
michael@0 | 2204 | |
michael@0 | 2205 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2206 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head |
michael@0 | 2207 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2208 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
michael@0 | 2209 | fetch_src_pixblock |
michael@0 | 2210 | cache_preload 8, 8 |
michael@0 | 2211 | pixman_composite_over_8888_n_8888_process_pixblock_head |
michael@0 | 2212 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2213 | .endm |
michael@0 | 2214 | |
michael@0 | 2215 | .macro pixman_composite_over_8888_n_8888_init |
michael@0 | 2216 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) |
michael@0 | 2217 | .vsave {d8-d15} |
michael@0 | 2218 | vpush {d8-d15} |
michael@0 | 2219 | vld1.32 {d15[0]}, [DUMMY] |
michael@0 | 2220 | vdup.8 d15, d15[3] |
michael@0 | 2221 | .endm |
michael@0 | 2222 | |
michael@0 | 2223 | .macro pixman_composite_over_8888_n_8888_cleanup |
michael@0 | 2224 | vpop {d8-d15} |
michael@0 | 2225 | .endm |
michael@0 | 2226 | |
michael@0 | 2227 | generate_composite_function \ |
michael@0 | 2228 | pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 2229 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2230 | 8, /* number of pixels, processed in a single block */ \ |
michael@0 | 2231 | 5, /* prefetch distance */ \ |
michael@0 | 2232 | pixman_composite_over_8888_n_8888_init, \ |
michael@0 | 2233 | pixman_composite_over_8888_n_8888_cleanup, \ |
michael@0 | 2234 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ |
michael@0 | 2235 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ |
michael@0 | 2236 | pixman_composite_over_8888_n_8888_process_pixblock_tail_head |
michael@0 | 2237 | |
michael@0 | 2238 | /******************************************************************************/ |
michael@0 | 2239 | |
michael@0 | 2240 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2241 | .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head |
michael@0 | 2242 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2243 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
michael@0 | 2244 | fetch_src_pixblock |
michael@0 | 2245 | cache_preload 8, 8 |
michael@0 | 2246 | fetch_mask_pixblock |
michael@0 | 2247 | pixman_composite_over_8888_n_8888_process_pixblock_head |
michael@0 | 2248 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2249 | .endm |
michael@0 | 2250 | |
michael@0 | 2251 | generate_composite_function \ |
michael@0 | 2252 | pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ |
michael@0 | 2253 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2254 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2255 | 5, /* prefetch distance */ \ |
michael@0 | 2256 | default_init_need_all_regs, \ |
michael@0 | 2257 | default_cleanup_need_all_regs, \ |
michael@0 | 2258 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ |
michael@0 | 2259 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ |
michael@0 | 2260 | pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2261 | 28, /* dst_w_basereg */ \ |
michael@0 | 2262 | 4, /* dst_r_basereg */ \ |
michael@0 | 2263 | 0, /* src_basereg */ \ |
michael@0 | 2264 | 12 /* mask_basereg */ |
michael@0 | 2265 | |
michael@0 | 2266 | generate_composite_function_single_scanline \ |
michael@0 | 2267 | pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ |
michael@0 | 2268 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2269 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2270 | default_init_need_all_regs, \ |
michael@0 | 2271 | default_cleanup_need_all_regs, \ |
michael@0 | 2272 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ |
michael@0 | 2273 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ |
michael@0 | 2274 | pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ |
michael@0 | 2275 | 28, /* dst_w_basereg */ \ |
michael@0 | 2276 | 4, /* dst_r_basereg */ \ |
michael@0 | 2277 | 0, /* src_basereg */ \ |
michael@0 | 2278 | 12 /* mask_basereg */ |
michael@0 | 2279 | |
michael@0 | 2280 | /******************************************************************************/ |
michael@0 | 2281 | |
michael@0 | 2282 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2283 | .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head |
michael@0 | 2284 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2285 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
michael@0 | 2286 | fetch_src_pixblock |
michael@0 | 2287 | cache_preload 8, 8 |
michael@0 | 2288 | fetch_mask_pixblock |
michael@0 | 2289 | pixman_composite_over_8888_n_8888_process_pixblock_head |
michael@0 | 2290 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2291 | .endm |
michael@0 | 2292 | |
michael@0 | 2293 | generate_composite_function \ |
michael@0 | 2294 | pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ |
michael@0 | 2295 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2296 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2297 | 5, /* prefetch distance */ \ |
michael@0 | 2298 | default_init_need_all_regs, \ |
michael@0 | 2299 | default_cleanup_need_all_regs, \ |
michael@0 | 2300 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ |
michael@0 | 2301 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ |
michael@0 | 2302 | pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \ |
michael@0 | 2303 | 28, /* dst_w_basereg */ \ |
michael@0 | 2304 | 4, /* dst_r_basereg */ \ |
michael@0 | 2305 | 0, /* src_basereg */ \ |
michael@0 | 2306 | 15 /* mask_basereg */ |
michael@0 | 2307 | |
michael@0 | 2308 | /******************************************************************************/ |
michael@0 | 2309 | |
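michael@0 |  | /* |
michael@0 |  | * Plain 0888 -> 0888 copy: no per-pixel math is needed, so the head and |
michael@0 |  | * tail macros are intentionally empty and tail_head simply streams the |
michael@0 |  | * three 8-bit channel planes through d0-d2. |
michael@0 |  | */ |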
michael@0 | 2310 | .macro pixman_composite_src_0888_0888_process_pixblock_head |
michael@0 | 2311 | .endm |
michael@0 | 2312 | |
michael@0 | 2313 | .macro pixman_composite_src_0888_0888_process_pixblock_tail |
michael@0 | 2314 | .endm |
michael@0 | 2315 | |
michael@0 | 2316 | .macro pixman_composite_src_0888_0888_process_pixblock_tail_head |
michael@0 | 2317 | vst3.8 {d0, d1, d2}, [DST_W]! |
michael@0 | 2318 | fetch_src_pixblock |
michael@0 | 2319 | cache_preload 8, 8 |
michael@0 | 2320 | .endm |
michael@0 | 2321 | |
michael@0 | 2322 | generate_composite_function \ |
michael@0 | 2323 | pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ |
michael@0 | 2324 | FLAG_DST_WRITEONLY, \ |
michael@0 | 2325 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2326 | 10, /* prefetch distance */ \ |
michael@0 | 2327 | default_init, \ |
michael@0 | 2328 | default_cleanup, \ |
michael@0 | 2329 | pixman_composite_src_0888_0888_process_pixblock_head, \ |
michael@0 | 2330 | pixman_composite_src_0888_0888_process_pixblock_tail, \ |
michael@0 | 2331 | pixman_composite_src_0888_0888_process_pixblock_tail_head, \ |
michael@0 | 2332 | 0, /* dst_w_basereg */ \ |
michael@0 | 2333 | 0, /* dst_r_basereg */ \ |
michael@0 | 2334 | 0, /* src_basereg */ \ |
michael@0 | 2335 | 0 /* mask_basereg */ |
michael@0 | 2336 | |
michael@0 | 2337 | /******************************************************************************/ |
michael@0 | 2338 | |
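michael@0 |  | /* |
michael@0 |  | * 0888 -> 8888 "rev" conversion: the source is fetched as three channel |
michael@0 |  | * planes in d0-d2, "vswp d0, d2" exchanges the first and third channels |
michael@0 |  | * (the byte-order reversal), and the init macro clears d3 once so that a |
michael@0 |  | * zero fourth channel is interleaved into every stored pixel. |
michael@0 |  | */ |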
michael@0 | 2339 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_head |
michael@0 | 2340 | vswp d0, d2 |
michael@0 | 2341 | .endm |
michael@0 | 2342 | |
michael@0 | 2343 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail |
michael@0 | 2344 | .endm |
michael@0 | 2345 | |
michael@0 | 2346 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head |
michael@0 | 2347 | vst4.8 {d0, d1, d2, d3}, [DST_W]! |
michael@0 | 2348 | fetch_src_pixblock |
michael@0 | 2349 | vswp d0, d2 |
michael@0 | 2350 | cache_preload 8, 8 |
michael@0 | 2351 | .endm |
michael@0 | 2352 | |
michael@0 | 2353 | .macro pixman_composite_src_0888_8888_rev_init |
michael@0 | 2354 | veor d3, d3, d3 |
michael@0 | 2355 | .endm |
michael@0 | 2356 | |
michael@0 | 2357 | generate_composite_function \ |
michael@0 | 2358 | pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ |
michael@0 | 2359 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2360 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2361 | 10, /* prefetch distance */ \ |
michael@0 | 2362 | pixman_composite_src_0888_8888_rev_init, \ |
michael@0 | 2363 | default_cleanup, \ |
michael@0 | 2364 | pixman_composite_src_0888_8888_rev_process_pixblock_head, \ |
michael@0 | 2365 | pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ |
michael@0 | 2366 | pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ |
michael@0 | 2367 | 0, /* dst_w_basereg */ \ |
michael@0 | 2368 | 0, /* dst_r_basereg */ \ |
michael@0 | 2369 | 0, /* src_basereg */ \ |
michael@0 | 2370 | 0 /* mask_basereg */ |
michael@0 | 2371 | |
michael@0 | 2372 | /******************************************************************************/ |
michael@0 | 2373 | |
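michael@0 |  | /* |
michael@0 |  | * 0888 -> 0565 "rev" packing with shift inserts: each 8-bit channel is |
michael@0 |  | * widened with "vshll #8", then "vsri #5" and "vsri #11" merge the three |
michael@0 |  | * planes so that d0 lands in bits 15:11, d1 in bits 10:5 and d2 in bits |
michael@0 |  | * 4:0, i.e. 8-8-8 truncated to 5-6-5. |
michael@0 |  | */ |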
michael@0 | 2374 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_head |
michael@0 | 2375 | vshll.u8 q8, d1, #8 |
michael@0 | 2376 | vshll.u8 q9, d2, #8 |
michael@0 | 2377 | .endm |
michael@0 | 2378 | |
michael@0 | 2379 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail |
michael@0 | 2380 | vshll.u8 q14, d0, #8 |
michael@0 | 2381 | vsri.u16 q14, q8, #5 |
michael@0 | 2382 | vsri.u16 q14, q9, #11 |
michael@0 | 2383 | .endm |
michael@0 | 2384 | |
michael@0 | 2385 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head |
michael@0 | 2386 | vshll.u8 q14, d0, #8 |
michael@0 | 2387 | fetch_src_pixblock |
michael@0 | 2388 | vsri.u16 q14, q8, #5 |
michael@0 | 2389 | vsri.u16 q14, q9, #11 |
michael@0 | 2390 | vshll.u8 q8, d1, #8 |
michael@0 | 2391 | vst1.16 {d28, d29}, [DST_W, :128]! |
michael@0 | 2392 | vshll.u8 q9, d2, #8 |
michael@0 | 2393 | .endm |
michael@0 | 2394 | |
michael@0 | 2395 | generate_composite_function \ |
michael@0 | 2396 | pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ |
michael@0 | 2397 | FLAG_DST_WRITEONLY, \ |
michael@0 | 2398 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2399 | 10, /* prefetch distance */ \ |
michael@0 | 2400 | default_init, \ |
michael@0 | 2401 | default_cleanup, \ |
michael@0 | 2402 | pixman_composite_src_0888_0565_rev_process_pixblock_head, \ |
michael@0 | 2403 | pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ |
michael@0 | 2404 | pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ |
michael@0 | 2405 | 28, /* dst_w_basereg */ \ |
michael@0 | 2406 | 0, /* dst_r_basereg */ \ |
michael@0 | 2407 | 0, /* src_basereg */ \ |
michael@0 | 2408 | 0 /* mask_basereg */ |
michael@0 | 2409 | |
michael@0 | 2410 | /******************************************************************************/ |
michael@0 | 2411 | |
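michael@0 |  | /* |
michael@0 |  | * The pixbuf source is non-premultiplied: the head multiplies each color |
michael@0 |  | * plane d0-d2 by the alpha plane d3 (roughly c = c * a / 255, with |
michael@0 |  | * vrshr/vraddhn rounding), the tail's "vswp d3, d31" moves the unmodified |
michael@0 |  | * alpha into the store registers, and the reversed d30/d29/d28 result |
michael@0 |  | * order swaps the first and third channels. The rpixbuf variant further |
michael@0 |  | * down is identical except that it keeps the channel order. |
michael@0 |  | */ |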
michael@0 | 2412 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_head |
michael@0 | 2413 | vmull.u8 q8, d3, d0 |
michael@0 | 2414 | vmull.u8 q9, d3, d1 |
michael@0 | 2415 | vmull.u8 q10, d3, d2 |
michael@0 | 2416 | .endm |
michael@0 | 2417 | |
michael@0 | 2418 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail |
michael@0 | 2419 | vrshr.u16 q11, q8, #8 |
michael@0 | 2420 | vswp d3, d31 |
michael@0 | 2421 | vrshr.u16 q12, q9, #8 |
michael@0 | 2422 | vrshr.u16 q13, q10, #8 |
michael@0 | 2423 | vraddhn.u16 d30, q11, q8 |
michael@0 | 2424 | vraddhn.u16 d29, q12, q9 |
michael@0 | 2425 | vraddhn.u16 d28, q13, q10 |
michael@0 | 2426 | .endm |
michael@0 | 2427 | |
michael@0 | 2428 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head |
michael@0 | 2429 | vrshr.u16 q11, q8, #8 |
michael@0 | 2430 | vswp d3, d31 |
michael@0 | 2431 | vrshr.u16 q12, q9, #8 |
michael@0 | 2432 | vrshr.u16 q13, q10, #8 |
michael@0 | 2433 | fetch_src_pixblock |
michael@0 | 2434 | vraddhn.u16 d30, q11, q8 |
michael@0 | 2435 | PF add PF_X, PF_X, #8 |
michael@0 | 2436 | PF tst PF_CTL, #0xF |
michael@0 | 2437 | PF addne PF_X, PF_X, #8 |
michael@0 | 2438 | PF subne PF_CTL, PF_CTL, #1 |
michael@0 | 2439 | vraddhn.u16 d29, q12, q9 |
michael@0 | 2440 | vraddhn.u16 d28, q13, q10 |
michael@0 | 2441 | vmull.u8 q8, d3, d0 |
michael@0 | 2442 | vmull.u8 q9, d3, d1 |
michael@0 | 2443 | vmull.u8 q10, d3, d2 |
michael@0 | 2444 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2445 | PF cmp PF_X, ORIG_W |
michael@0 | 2446 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] |
michael@0 | 2447 | PF subge PF_X, PF_X, ORIG_W |
michael@0 | 2448 | PF subges PF_CTL, PF_CTL, #0x10 |
michael@0 | 2449 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! |
michael@0 | 2450 | .endm |
michael@0 | 2451 | |
michael@0 | 2452 | generate_composite_function \ |
michael@0 | 2453 | pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 2454 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2455 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2456 | 10, /* prefetch distance */ \ |
michael@0 | 2457 | default_init, \ |
michael@0 | 2458 | default_cleanup, \ |
michael@0 | 2459 | pixman_composite_src_pixbuf_8888_process_pixblock_head, \ |
michael@0 | 2460 | pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ |
michael@0 | 2461 | pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ |
michael@0 | 2462 | 28, /* dst_w_basereg */ \ |
michael@0 | 2463 | 0, /* dst_r_basereg */ \ |
michael@0 | 2464 | 0, /* src_basereg */ \ |
michael@0 | 2465 | 0 /* mask_basereg */ |
michael@0 | 2466 | |
michael@0 | 2467 | /******************************************************************************/ |
michael@0 | 2468 | |
michael@0 | 2469 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head |
michael@0 | 2470 | vmull.u8 q8, d3, d0 |
michael@0 | 2471 | vmull.u8 q9, d3, d1 |
michael@0 | 2472 | vmull.u8 q10, d3, d2 |
michael@0 | 2473 | .endm |
michael@0 | 2474 | |
michael@0 | 2475 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail |
michael@0 | 2476 | vrshr.u16 q11, q8, #8 |
michael@0 | 2477 | vswp d3, d31 |
michael@0 | 2478 | vrshr.u16 q12, q9, #8 |
michael@0 | 2479 | vrshr.u16 q13, q10, #8 |
michael@0 | 2480 | vraddhn.u16 d28, q11, q8 |
michael@0 | 2481 | vraddhn.u16 d29, q12, q9 |
michael@0 | 2482 | vraddhn.u16 d30, q13, q10 |
michael@0 | 2483 | .endm |
michael@0 | 2484 | |
michael@0 | 2485 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head |
michael@0 | 2486 | vrshr.u16 q11, q8, #8 |
michael@0 | 2487 | vswp d3, d31 |
michael@0 | 2488 | vrshr.u16 q12, q9, #8 |
michael@0 | 2489 | vrshr.u16 q13, q10, #8 |
michael@0 | 2490 | fetch_src_pixblock |
michael@0 | 2491 | vraddhn.u16 d28, q11, q8 |
michael@0 | 2492 | PF add PF_X, PF_X, #8 |
michael@0 | 2493 | PF tst PF_CTL, #0xF |
michael@0 | 2494 | PF addne PF_X, PF_X, #8 |
michael@0 | 2495 | PF subne PF_CTL, PF_CTL, #1 |
michael@0 | 2496 | vraddhn.u16 d29, q12, q9 |
michael@0 | 2497 | vraddhn.u16 d30, q13, q10 |
michael@0 | 2498 | vmull.u8 q8, d3, d0 |
michael@0 | 2499 | vmull.u8 q9, d3, d1 |
michael@0 | 2500 | vmull.u8 q10, d3, d2 |
michael@0 | 2501 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2502 | PF cmp PF_X, ORIG_W |
michael@0 | 2503 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] |
michael@0 | 2504 | PF subge PF_X, PF_X, ORIG_W |
michael@0 | 2505 | PF subges PF_CTL, PF_CTL, #0x10 |
michael@0 | 2506 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! |
michael@0 | 2507 | .endm |
michael@0 | 2508 | |
michael@0 | 2509 | generate_composite_function \ |
michael@0 | 2510 | pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ |
michael@0 | 2511 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2512 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2513 | 10, /* prefetch distance */ \ |
michael@0 | 2514 | default_init, \ |
michael@0 | 2515 | default_cleanup, \ |
michael@0 | 2516 | pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ |
michael@0 | 2517 | pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ |
michael@0 | 2518 | pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ |
michael@0 | 2519 | 28, /* dst_w_basereg */ \ |
michael@0 | 2520 | 0, /* dst_r_basereg */ \ |
michael@0 | 2521 | 0, /* src_basereg */ \ |
michael@0 | 2522 | 0 /* mask_basereg */ |
michael@0 | 2523 | |
michael@0 | 2524 | /******************************************************************************/ |
michael@0 | 2525 | |
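michael@0 |  | /* |
michael@0 |  | * OVER of an r5g6b5 source with an a8 mask over an r5g6b5 destination. |
michael@0 |  | * Since a 0565 source has no alpha, OVER reduces per channel to roughly |
michael@0 |  | * |
michael@0 |  | *   result = src * m / 255 + dst * (255 - m) / 255 |
michael@0 |  | * |
michael@0 |  | * computed on x888-expanded pixblocks and packed back to 0565 at the end. |
michael@0 |  | */ |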
michael@0 | 2526 | .macro pixman_composite_over_0565_8_0565_process_pixblock_head |
michael@0 | 2527 | /* mask is in d15 */ |
michael@0 | 2528 | convert_0565_to_x888 q4, d2, d1, d0 |
michael@0 | 2529 | convert_0565_to_x888 q5, d6, d5, d4 |
michael@0 | 2530 | /* source pixel data is in {d0, d1, d2, XX} */ |
michael@0 | 2531 | /* destination pixel data is in {d4, d5, d6, XX} */ |
michael@0 | 2532 | vmvn.8 d7, d15 |
michael@0 | 2533 | vmull.u8 q6, d15, d2 |
michael@0 | 2534 | vmull.u8 q5, d15, d1 |
michael@0 | 2535 | vmull.u8 q4, d15, d0 |
michael@0 | 2536 | vmull.u8 q8, d7, d4 |
michael@0 | 2537 | vmull.u8 q9, d7, d5 |
michael@0 | 2538 | vmull.u8 q13, d7, d6 |
michael@0 | 2539 | vrshr.u16 q12, q6, #8 |
michael@0 | 2540 | vrshr.u16 q11, q5, #8 |
michael@0 | 2541 | vrshr.u16 q10, q4, #8 |
michael@0 | 2542 | vraddhn.u16 d2, q6, q12 |
michael@0 | 2543 | vraddhn.u16 d1, q5, q11 |
michael@0 | 2544 | vraddhn.u16 d0, q4, q10 |
michael@0 | 2545 | .endm |
michael@0 | 2546 | |
michael@0 | 2547 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail |
michael@0 | 2548 | vrshr.u16 q14, q8, #8 |
michael@0 | 2549 | vrshr.u16 q15, q9, #8 |
michael@0 | 2550 | vrshr.u16 q12, q13, #8 |
michael@0 | 2551 | vraddhn.u16 d28, q14, q8 |
michael@0 | 2552 | vraddhn.u16 d29, q15, q9 |
michael@0 | 2553 | vraddhn.u16 d30, q12, q13 |
michael@0 | 2554 | vqadd.u8 q0, q0, q14 |
michael@0 | 2555 | vqadd.u8 q1, q1, q15 |
michael@0 | 2556 | /* 32bpp result is in {d0, d1, d2, XX} */ |
michael@0 | 2557 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 |
michael@0 | 2558 | .endm |
michael@0 | 2559 | |
michael@0 | 2560 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2561 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head |
michael@0 | 2562 | fetch_mask_pixblock |
michael@0 | 2563 | pixman_composite_over_0565_8_0565_process_pixblock_tail |
michael@0 | 2564 | fetch_src_pixblock |
michael@0 | 2565 | vld1.16 {d10, d11}, [DST_R, :128]! |
michael@0 | 2566 | cache_preload 8, 8 |
michael@0 | 2567 | pixman_composite_over_0565_8_0565_process_pixblock_head |
michael@0 | 2568 | vst1.16 {d28, d29}, [DST_W, :128]! |
michael@0 | 2569 | .endm |
michael@0 | 2570 | |
michael@0 | 2571 | generate_composite_function \ |
michael@0 | 2572 | pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ |
michael@0 | 2573 | FLAG_DST_READWRITE, \ |
michael@0 | 2574 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2575 | 5, /* prefetch distance */ \ |
michael@0 | 2576 | default_init_need_all_regs, \ |
michael@0 | 2577 | default_cleanup_need_all_regs, \ |
michael@0 | 2578 | pixman_composite_over_0565_8_0565_process_pixblock_head, \ |
michael@0 | 2579 | pixman_composite_over_0565_8_0565_process_pixblock_tail, \ |
michael@0 | 2580 | pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2581 | 28, /* dst_w_basereg */ \ |
michael@0 | 2582 | 10, /* dst_r_basereg */ \ |
michael@0 | 2583 | 8, /* src_basereg */ \ |
michael@0 | 2584 | 15 /* mask_basereg */ |
michael@0 | 2585 | |
michael@0 | 2586 | /******************************************************************************/ |
michael@0 | 2587 | |
michael@0 | 2588 | .macro pixman_composite_over_0565_n_0565_init |
michael@0 | 2589 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) |
michael@0 | 2590 | .vsave {d8-d15} |
michael@0 | 2591 | vpush {d8-d15} |
michael@0 | 2592 | vld1.32 {d15[0]}, [DUMMY] |
michael@0 | 2593 | vdup.8 d15, d15[3] |
michael@0 | 2594 | .endm |
michael@0 | 2595 | |
michael@0 | 2596 | .macro pixman_composite_over_0565_n_0565_cleanup |
michael@0 | 2597 | vpop {d8-d15} |
michael@0 | 2598 | .endm |
michael@0 | 2599 | |
michael@0 | 2600 | generate_composite_function \ |
michael@0 | 2601 | pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ |
michael@0 | 2602 | FLAG_DST_READWRITE, \ |
michael@0 | 2603 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2604 | 5, /* prefetch distance */ \ |
michael@0 | 2605 | pixman_composite_over_0565_n_0565_init, \ |
michael@0 | 2606 | pixman_composite_over_0565_n_0565_cleanup, \ |
michael@0 | 2607 | pixman_composite_over_0565_8_0565_process_pixblock_head, \ |
michael@0 | 2608 | pixman_composite_over_0565_8_0565_process_pixblock_tail, \ |
michael@0 | 2609 | pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2610 | 28, /* dst_w_basereg */ \ |
michael@0 | 2611 | 10, /* dst_r_basereg */ \ |
michael@0 | 2612 | 8, /* src_basereg */ \ |
michael@0 | 2613 | 15 /* mask_basereg */ |
michael@0 | 2614 | |
michael@0 | 2615 | /******************************************************************************/ |
michael@0 | 2616 | |
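michael@0 |  | /* |
michael@0 |  | * ADD of an r5g6b5 source through an a8 mask: both pixblocks are expanded |
michael@0 |  | * to x888, the source is scaled by the mask (src * m / 255 with rounding), |
michael@0 |  | * saturate-added to the destination with vqadd, and packed back to 0565. |
michael@0 |  | */ |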
michael@0 | 2617 | .macro pixman_composite_add_0565_8_0565_process_pixblock_head |
michael@0 | 2618 | /* mask is in d15 */ |
michael@0 | 2619 | convert_0565_to_x888 q4, d2, d1, d0 |
michael@0 | 2620 | convert_0565_to_x888 q5, d6, d5, d4 |
michael@0 | 2621 | /* source pixel data is in {d0, d1, d2, XX} */ |
michael@0 | 2622 | /* destination pixel data is in {d4, d5, d6, XX} */ |
michael@0 | 2623 | vmull.u8 q6, d15, d2 |
michael@0 | 2624 | vmull.u8 q5, d15, d1 |
michael@0 | 2625 | vmull.u8 q4, d15, d0 |
michael@0 | 2626 | vrshr.u16 q12, q6, #8 |
michael@0 | 2627 | vrshr.u16 q11, q5, #8 |
michael@0 | 2628 | vrshr.u16 q10, q4, #8 |
michael@0 | 2629 | vraddhn.u16 d2, q6, q12 |
michael@0 | 2630 | vraddhn.u16 d1, q5, q11 |
michael@0 | 2631 | vraddhn.u16 d0, q4, q10 |
michael@0 | 2632 | .endm |
michael@0 | 2633 | |
michael@0 | 2634 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail |
michael@0 | 2635 | vqadd.u8 q0, q0, q2 |
michael@0 | 2636 | vqadd.u8 q1, q1, q3 |
michael@0 | 2637 | /* 32bpp result is in {d0, d1, d2, XX} */ |
michael@0 | 2638 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 |
michael@0 | 2639 | .endm |
michael@0 | 2640 | |
michael@0 | 2641 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2642 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head |
michael@0 | 2643 | fetch_mask_pixblock |
michael@0 | 2644 | pixman_composite_add_0565_8_0565_process_pixblock_tail |
michael@0 | 2645 | fetch_src_pixblock |
michael@0 | 2646 | vld1.16 {d10, d11}, [DST_R, :128]! |
michael@0 | 2647 | cache_preload 8, 8 |
michael@0 | 2648 | pixman_composite_add_0565_8_0565_process_pixblock_head |
michael@0 | 2649 | vst1.16 {d28, d29}, [DST_W, :128]! |
michael@0 | 2650 | .endm |
michael@0 | 2651 | |
michael@0 | 2652 | generate_composite_function \ |
michael@0 | 2653 | pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ |
michael@0 | 2654 | FLAG_DST_READWRITE, \ |
michael@0 | 2655 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2656 | 5, /* prefetch distance */ \ |
michael@0 | 2657 | default_init_need_all_regs, \ |
michael@0 | 2658 | default_cleanup_need_all_regs, \ |
michael@0 | 2659 | pixman_composite_add_0565_8_0565_process_pixblock_head, \ |
michael@0 | 2660 | pixman_composite_add_0565_8_0565_process_pixblock_tail, \ |
michael@0 | 2661 | pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2662 | 28, /* dst_w_basereg */ \ |
michael@0 | 2663 | 10, /* dst_r_basereg */ \ |
michael@0 | 2664 | 8, /* src_basereg */ \ |
michael@0 | 2665 | 15 /* mask_basereg */ |
michael@0 | 2666 | |
michael@0 | 2667 | /******************************************************************************/ |
michael@0 | 2668 | |
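michael@0 |  | /* |
michael@0 |  | * OUT_REVERSE with an a8 source: the destination is scaled by the |
michael@0 |  | * inverted source alpha, roughly dst = dst * (255 - s) / 255 per channel, |
michael@0 |  | * and packed back to 0565. |
michael@0 |  | */ |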
michael@0 | 2669 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_head |
michael@0 | 2670 | /* a8 source is in d15 (src_basereg is 15 for this function) */ |
michael@0 | 2671 | convert_0565_to_x888 q5, d6, d5, d4 |
michael@0 | 2672 | /* destination pixel data is in {d4, d5, d6, xx} */ |
michael@0 | 2673 | vmvn.8 d24, d15 /* get inverted alpha */ |
michael@0 | 2674 | /* now do alpha blending */ |
michael@0 | 2675 | vmull.u8 q8, d24, d4 |
michael@0 | 2676 | vmull.u8 q9, d24, d5 |
michael@0 | 2677 | vmull.u8 q10, d24, d6 |
michael@0 | 2678 | .endm |
michael@0 | 2679 | |
michael@0 | 2680 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail |
michael@0 | 2681 | vrshr.u16 q14, q8, #8 |
michael@0 | 2682 | vrshr.u16 q15, q9, #8 |
michael@0 | 2683 | vrshr.u16 q12, q10, #8 |
michael@0 | 2684 | vraddhn.u16 d0, q14, q8 |
michael@0 | 2685 | vraddhn.u16 d1, q15, q9 |
michael@0 | 2686 | vraddhn.u16 d2, q12, q10 |
michael@0 | 2687 | /* 32bpp result is in {d0, d1, d2, XX} */ |
michael@0 | 2688 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 |
michael@0 | 2689 | .endm |
michael@0 | 2690 | |
michael@0 | 2691 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2692 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head |
michael@0 | 2693 | fetch_src_pixblock |
michael@0 | 2694 | pixman_composite_out_reverse_8_0565_process_pixblock_tail |
michael@0 | 2695 | vld1.16 {d10, d11}, [DST_R, :128]! |
michael@0 | 2696 | cache_preload 8, 8 |
michael@0 | 2697 | pixman_composite_out_reverse_8_0565_process_pixblock_head |
michael@0 | 2698 | vst1.16 {d28, d29}, [DST_W, :128]! |
michael@0 | 2699 | .endm |
michael@0 | 2700 | |
michael@0 | 2701 | generate_composite_function \ |
michael@0 | 2702 | pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ |
michael@0 | 2703 | FLAG_DST_READWRITE, \ |
michael@0 | 2704 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2705 | 5, /* prefetch distance */ \ |
michael@0 | 2706 | default_init_need_all_regs, \ |
michael@0 | 2707 | default_cleanup_need_all_regs, \ |
michael@0 | 2708 | pixman_composite_out_reverse_8_0565_process_pixblock_head, \ |
michael@0 | 2709 | pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ |
michael@0 | 2710 | pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2711 | 28, /* dst_w_basereg */ \ |
michael@0 | 2712 | 10, /* dst_r_basereg */ \ |
michael@0 | 2713 | 15, /* src_basereg */ \ |
michael@0 | 2714 | 0 /* mask_basereg */ |
michael@0 | 2715 | |
michael@0 | 2716 | /******************************************************************************/ |
michael@0 | 2717 | |
michael@0 | 2718 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_head |
michael@0 | 2719 | /* src is in d0 */ |
michael@0 | 2720 | /* destination pixel data is in {d4, d5, d6, d7} */ |
michael@0 | 2721 | vmvn.8 d1, d0 /* get inverted alpha */ |
michael@0 | 2722 | /* now do alpha blending */ |
michael@0 | 2723 | vmull.u8 q8, d1, d4 |
michael@0 | 2724 | vmull.u8 q9, d1, d5 |
michael@0 | 2725 | vmull.u8 q10, d1, d6 |
michael@0 | 2726 | vmull.u8 q11, d1, d7 |
michael@0 | 2727 | .endm |
michael@0 | 2728 | |
michael@0 | 2729 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail |
michael@0 | 2730 | vrshr.u16 q14, q8, #8 |
michael@0 | 2731 | vrshr.u16 q15, q9, #8 |
michael@0 | 2732 | vrshr.u16 q12, q10, #8 |
michael@0 | 2733 | vrshr.u16 q13, q11, #8 |
michael@0 | 2734 | vraddhn.u16 d28, q14, q8 |
michael@0 | 2735 | vraddhn.u16 d29, q15, q9 |
michael@0 | 2736 | vraddhn.u16 d30, q12, q10 |
michael@0 | 2737 | vraddhn.u16 d31, q13, q11 |
michael@0 | 2738 | /* 32bpp result is in {d28, d29, d30, d31} */ |
michael@0 | 2739 | .endm |
michael@0 | 2740 | |
michael@0 | 2741 | /* TODO: expand macros and do better instruction scheduling */ |
michael@0 | 2742 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head |
michael@0 | 2743 | fetch_src_pixblock |
michael@0 | 2744 | pixman_composite_out_reverse_8_8888_process_pixblock_tail |
michael@0 | 2745 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! |
michael@0 | 2746 | cache_preload 8, 8 |
michael@0 | 2747 | pixman_composite_out_reverse_8_8888_process_pixblock_head |
michael@0 | 2748 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! |
michael@0 | 2749 | .endm |
michael@0 | 2750 | |
michael@0 | 2751 | generate_composite_function \ |
michael@0 | 2752 | pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ |
michael@0 | 2753 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2754 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2755 | 5, /* prefetch distance */ \ |
michael@0 | 2756 | default_init, \ |
michael@0 | 2757 | default_cleanup, \ |
michael@0 | 2758 | pixman_composite_out_reverse_8_8888_process_pixblock_head, \ |
michael@0 | 2759 | pixman_composite_out_reverse_8_8888_process_pixblock_tail, \ |
michael@0 | 2760 | pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ |
michael@0 | 2761 | 28, /* dst_w_basereg */ \ |
michael@0 | 2762 | 4, /* dst_r_basereg */ \ |
michael@0 | 2763 | 0, /* src_basereg */ \ |
michael@0 | 2764 | 0 /* mask_basereg */ |
michael@0 | 2765 | |
michael@0 | 2766 | /******************************************************************************/ |
michael@0 | 2767 | |
michael@0 | 2768 | generate_composite_function_nearest_scanline \ |
michael@0 | 2769 | pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ |
michael@0 | 2770 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2771 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2772 | default_init, \ |
michael@0 | 2773 | default_cleanup, \ |
michael@0 | 2774 | pixman_composite_over_8888_8888_process_pixblock_head, \ |
michael@0 | 2775 | pixman_composite_over_8888_8888_process_pixblock_tail, \ |
michael@0 | 2776 | pixman_composite_over_8888_8888_process_pixblock_tail_head |
michael@0 | 2777 | |
michael@0 | 2778 | generate_composite_function_nearest_scanline \ |
michael@0 | 2779 | pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ |
michael@0 | 2780 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2781 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2782 | default_init, \ |
michael@0 | 2783 | default_cleanup, \ |
michael@0 | 2784 | pixman_composite_over_8888_0565_process_pixblock_head, \ |
michael@0 | 2785 | pixman_composite_over_8888_0565_process_pixblock_tail, \ |
michael@0 | 2786 | pixman_composite_over_8888_0565_process_pixblock_tail_head, \ |
michael@0 | 2787 | 28, /* dst_w_basereg */ \ |
michael@0 | 2788 | 4, /* dst_r_basereg */ \ |
michael@0 | 2789 | 0, /* src_basereg */ \ |
michael@0 | 2790 | 24 /* mask_basereg */ |
michael@0 | 2791 | |
michael@0 | 2792 | generate_composite_function_nearest_scanline \ |
michael@0 | 2793 | pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ |
michael@0 | 2794 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2795 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2796 | default_init, \ |
michael@0 | 2797 | default_cleanup, \ |
michael@0 | 2798 | pixman_composite_src_8888_0565_process_pixblock_head, \ |
michael@0 | 2799 | pixman_composite_src_8888_0565_process_pixblock_tail, \ |
michael@0 | 2800 | pixman_composite_src_8888_0565_process_pixblock_tail_head |
michael@0 | 2801 | |
michael@0 | 2802 | generate_composite_function_nearest_scanline \ |
michael@0 | 2803 | pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ |
michael@0 | 2804 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2805 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2806 | default_init, \ |
michael@0 | 2807 | default_cleanup, \ |
michael@0 | 2808 | pixman_composite_src_0565_8888_process_pixblock_head, \ |
michael@0 | 2809 | pixman_composite_src_0565_8888_process_pixblock_tail, \ |
michael@0 | 2810 | pixman_composite_src_0565_8888_process_pixblock_tail_head |
michael@0 | 2811 | |
michael@0 | 2812 | generate_composite_function_nearest_scanline \ |
michael@0 | 2813 | pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ |
michael@0 | 2814 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
michael@0 | 2815 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2816 | default_init_need_all_regs, \ |
michael@0 | 2817 | default_cleanup_need_all_regs, \ |
michael@0 | 2818 | pixman_composite_over_8888_8_0565_process_pixblock_head, \ |
michael@0 | 2819 | pixman_composite_over_8888_8_0565_process_pixblock_tail, \ |
michael@0 | 2820 | pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2821 | 28, /* dst_w_basereg */ \ |
michael@0 | 2822 | 4, /* dst_r_basereg */ \ |
michael@0 | 2823 | 8, /* src_basereg */ \ |
michael@0 | 2824 | 24 /* mask_basereg */ |
michael@0 | 2825 | |
michael@0 | 2826 | generate_composite_function_nearest_scanline \ |
michael@0 | 2827 | pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ |
michael@0 | 2828 | FLAG_DST_READWRITE, \ |
michael@0 | 2829 | 8, /* number of pixels processed in a single block */ \ |
michael@0 | 2830 | default_init_need_all_regs, \ |
michael@0 | 2831 | default_cleanup_need_all_regs, \ |
michael@0 | 2832 | pixman_composite_over_0565_8_0565_process_pixblock_head, \ |
michael@0 | 2833 | pixman_composite_over_0565_8_0565_process_pixblock_tail, \ |
michael@0 | 2834 | pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ |
michael@0 | 2835 | 28, /* dst_w_basereg */ \ |
michael@0 | 2836 | 10, /* dst_r_basereg */ \ |
michael@0 | 2837 | 8, /* src_basereg */ \ |
michael@0 | 2838 | 15 /* mask_basereg */ |
michael@0 | 2839 | |
michael@0 | 2840 | /******************************************************************************/ |
michael@0 | 2841 | |
michael@0 | 2842 | /* Supplementary macro for setting function attributes */ |
michael@0 | 2843 | .macro pixman_asm_function fname |
michael@0 | 2844 | .func fname |
michael@0 | 2845 | .global fname |
michael@0 | 2846 | #ifdef __ELF__ |
michael@0 | 2847 | .hidden fname |
michael@0 | 2848 | .type fname, %function |
michael@0 | 2849 | #endif |
michael@0 | 2850 | fname: |
michael@0 | 2851 | .endm |
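michael@0 |  | /* |
michael@0 |  | * Usage: "pixman_asm_function foo" opens the definition of a global (and, |
michael@0 |  | * on ELF targets, hidden) function symbol foo; the body follows and is |
michael@0 |  | * closed with .endfunc, as in the bilinear template below. |
michael@0 |  | */ |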
michael@0 | 2852 | |
michael@0 | 2853 | /* |
michael@0 | 2854 | * Bilinear scaling support code. It provides pixel fetching, color format |
michael@0 | 2855 | * conversion, and interpolation as separate macros which can be used as |
michael@0 | 2856 | * the basic building blocks for constructing bilinear scanline functions. |
michael@0 | 2857 | */ |
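michael@0 |  | /* |
michael@0 |  | * A rough scalar model of what these macros compute for each channel of |
michael@0 |  | * an output pixel (illustrative only; BITS stands for |
michael@0 |  | * BILINEAR_INTERPOLATION_BITS, X/UX are 16.16 fixed-point coordinates, |
michael@0 |  | * and the caller-supplied vertical weights satisfy wt + wb == 1 << BITS): |
michael@0 |  | * |
michael@0 |  | *   t   = top[x] * wt + bottom[x] * wb;           vertical pass |
michael@0 |  | *   r   = top[x + 1] * wt + bottom[x + 1] * wb; |
michael@0 |  | *   d   = X >> (16 - BITS);                       horizontal weight |
michael@0 |  | *   out = (t * ((1 << BITS) - d) + r * d) >> (2 * BITS); |
michael@0 |  | */ |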
michael@0 | 2858 | |
michael@0 | 2859 | .macro bilinear_load_8888 reg1, reg2, tmp |
michael@0 | 2860 | mov TMP1, X, asr #16 |
michael@0 | 2861 | add X, X, UX |
michael@0 | 2862 | add TMP1, TOP, TMP1, asl #2 |
michael@0 | 2863 | vld1.32 {reg1}, [TMP1], STRIDE |
michael@0 | 2864 | vld1.32 {reg2}, [TMP1] |
michael@0 | 2865 | .endm |
michael@0 | 2866 | |
michael@0 | 2867 | .macro bilinear_load_0565 reg1, reg2, tmp |
michael@0 | 2868 | mov TMP1, X, asr #16 |
michael@0 | 2869 | add X, X, UX |
michael@0 | 2870 | add TMP1, TOP, TMP1, asl #1 |
michael@0 | 2871 | vld1.32 {reg2[0]}, [TMP1], STRIDE |
michael@0 | 2872 | vld1.32 {reg2[1]}, [TMP1] |
michael@0 | 2873 | convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp |
michael@0 | 2874 | .endm |
michael@0 | 2875 | |
michael@0 | 2876 | .macro bilinear_load_and_vertical_interpolate_two_8888 \ |
michael@0 | 2877 | acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 |
michael@0 | 2878 | |
michael@0 | 2879 | bilinear_load_8888 reg1, reg2, tmp1 |
michael@0 | 2880 | vmull.u8 acc1, reg1, d28 |
michael@0 | 2881 | vmlal.u8 acc1, reg2, d29 |
michael@0 | 2882 | bilinear_load_8888 reg3, reg4, tmp2 |
michael@0 | 2883 | vmull.u8 acc2, reg3, d28 |
michael@0 | 2884 | vmlal.u8 acc2, reg4, d29 |
michael@0 | 2885 | .endm |
michael@0 | 2886 | |
michael@0 | 2887 | .macro bilinear_load_and_vertical_interpolate_four_8888 \ |
michael@0 | 2888 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ |
michael@0 | 2889 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
michael@0 | 2890 | |
michael@0 | 2891 | bilinear_load_and_vertical_interpolate_two_8888 \ |
michael@0 | 2892 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi |
michael@0 | 2893 | bilinear_load_and_vertical_interpolate_two_8888 \ |
michael@0 | 2894 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
michael@0 | 2895 | .endm |
michael@0 | 2896 | |
michael@0 | 2897 | .macro bilinear_load_and_vertical_interpolate_two_0565 \ |
michael@0 | 2898 | acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi |
michael@0 | 2899 | |
michael@0 | 2900 | mov TMP1, X, asr #16 |
michael@0 | 2901 | add X, X, UX |
michael@0 | 2902 | add TMP1, TOP, TMP1, asl #1 |
michael@0 | 2903 | mov TMP2, X, asr #16 |
michael@0 | 2904 | add X, X, UX |
michael@0 | 2905 | add TMP2, TOP, TMP2, asl #1 |
michael@0 | 2906 | vld1.32 {acc2lo[0]}, [TMP1], STRIDE |
michael@0 | 2907 | vld1.32 {acc2hi[0]}, [TMP2], STRIDE |
michael@0 | 2908 | vld1.32 {acc2lo[1]}, [TMP1] |
michael@0 | 2909 | vld1.32 {acc2hi[1]}, [TMP2] |
michael@0 | 2910 | convert_0565_to_x888 acc2, reg3, reg2, reg1 |
michael@0 | 2911 | vzip.u8 reg1, reg3 |
michael@0 | 2912 | vzip.u8 reg2, reg4 |
michael@0 | 2913 | vzip.u8 reg3, reg4 |
michael@0 | 2914 | vzip.u8 reg1, reg2 |
michael@0 | 2915 | vmull.u8 acc1, reg1, d28 |
michael@0 | 2916 | vmlal.u8 acc1, reg2, d29 |
michael@0 | 2917 | vmull.u8 acc2, reg3, d28 |
michael@0 | 2918 | vmlal.u8 acc2, reg4, d29 |
michael@0 | 2919 | .endm |
michael@0 | 2920 | |
michael@0 | 2921 | .macro bilinear_load_and_vertical_interpolate_four_0565 \ |
michael@0 | 2922 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ |
michael@0 | 2923 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
michael@0 | 2924 | |
michael@0 | 2925 | mov TMP1, X, asr #16 |
michael@0 | 2926 | add X, X, UX |
michael@0 | 2927 | add TMP1, TOP, TMP1, asl #1 |
michael@0 | 2928 | mov TMP2, X, asr #16 |
michael@0 | 2929 | add X, X, UX |
michael@0 | 2930 | add TMP2, TOP, TMP2, asl #1 |
michael@0 | 2931 | vld1.32 {xacc2lo[0]}, [TMP1], STRIDE |
michael@0 | 2932 | vld1.32 {xacc2hi[0]}, [TMP2], STRIDE |
michael@0 | 2933 | vld1.32 {xacc2lo[1]}, [TMP1] |
michael@0 | 2934 | vld1.32 {xacc2hi[1]}, [TMP2] |
michael@0 | 2935 | convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 |
michael@0 | 2936 | mov TMP1, X, asr #16 |
michael@0 | 2937 | add X, X, UX |
michael@0 | 2938 | add TMP1, TOP, TMP1, asl #1 |
michael@0 | 2939 | mov TMP2, X, asr #16 |
michael@0 | 2940 | add X, X, UX |
michael@0 | 2941 | add TMP2, TOP, TMP2, asl #1 |
michael@0 | 2942 | vld1.32 {yacc2lo[0]}, [TMP1], STRIDE |
michael@0 | 2943 | vzip.u8 xreg1, xreg3 |
michael@0 | 2944 | vld1.32 {yacc2hi[0]}, [TMP2], STRIDE |
michael@0 | 2945 | vzip.u8 xreg2, xreg4 |
michael@0 | 2946 | vld1.32 {yacc2lo[1]}, [TMP1] |
michael@0 | 2947 | vzip.u8 xreg3, xreg4 |
michael@0 | 2948 | vld1.32 {yacc2hi[1]}, [TMP2] |
michael@0 | 2949 | vzip.u8 xreg1, xreg2 |
michael@0 | 2950 | convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 |
michael@0 | 2951 | vmull.u8 xacc1, xreg1, d28 |
michael@0 | 2952 | vzip.u8 yreg1, yreg3 |
michael@0 | 2953 | vmlal.u8 xacc1, xreg2, d29 |
michael@0 | 2954 | vzip.u8 yreg2, yreg4 |
michael@0 | 2955 | vmull.u8 xacc2, xreg3, d28 |
michael@0 | 2956 | vzip.u8 yreg3, yreg4 |
michael@0 | 2957 | vmlal.u8 xacc2, xreg4, d29 |
michael@0 | 2958 | vzip.u8 yreg1, yreg2 |
michael@0 | 2959 | vmull.u8 yacc1, yreg1, d28 |
michael@0 | 2960 | vmlal.u8 yacc1, yreg2, d29 |
michael@0 | 2961 | vmull.u8 yacc2, yreg3, d28 |
michael@0 | 2962 | vmlal.u8 yacc2, yreg4, d29 |
michael@0 | 2963 | .endm |
michael@0 | 2964 | |
michael@0 | 2965 | .macro bilinear_store_8888 numpix, tmp1, tmp2 |
michael@0 | 2966 | .if numpix == 4 |
michael@0 | 2967 | vst1.32 {d0, d1}, [OUT, :128]! |
michael@0 | 2968 | .elseif numpix == 2 |
michael@0 | 2969 | vst1.32 {d0}, [OUT, :64]! |
michael@0 | 2970 | .elseif numpix == 1 |
michael@0 | 2971 | vst1.32 {d0[0]}, [OUT, :32]! |
michael@0 | 2972 | .else |
michael@0 | 2973 | .error "bilinear_store_8888 numpix is unsupported" |
michael@0 | 2974 | .endif |
michael@0 | 2975 | .endm |
michael@0 | 2976 | |
michael@0 | 2977 | .macro bilinear_store_0565 numpix, tmp1, tmp2 |
michael@0 | 2978 | vuzp.u8 d0, d1 |
michael@0 | 2979 | vuzp.u8 d2, d3 |
michael@0 | 2980 | vuzp.u8 d1, d3 |
michael@0 | 2981 | vuzp.u8 d0, d2 |
michael@0 | 2982 | convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 |
michael@0 | 2983 | .if numpix == 4 |
michael@0 | 2984 | vst1.16 {d2}, [OUT, :64]! |
michael@0 | 2985 | .elseif numpix == 2 |
michael@0 | 2986 | vst1.32 {d2[0]}, [OUT, :32]! |
michael@0 | 2987 | .elseif numpix == 1 |
michael@0 | 2988 | vst1.16 {d2[0]}, [OUT, :16]! |
michael@0 | 2989 | .else |
michael@0 | 2990 | .error "bilinear_store_0565 numpix is unsupported" |
michael@0 | 2991 | .endif |
michael@0 | 2992 | .endm |
michael@0 | 2993 | |
michael@0 | 2994 | .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt |
michael@0 | 2995 | bilinear_load_&src_fmt d0, d1, d2 |
michael@0 | 2996 | vmull.u8 q1, d0, d28 |
michael@0 | 2997 | vmlal.u8 q1, d1, d29 |
michael@0 | 2998 | /* 5-cycle bubble */ |
michael@0 | 2999 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3000 | vmlsl.u16 q0, d2, d30 |
michael@0 | 3001 | vmlal.u16 q0, d3, d30 |
michael@0 | 3002 | /* 5-cycle bubble */ |
michael@0 | 3003 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3004 | /* 3-cycle bubble */ |
michael@0 | 3005 | vmovn.u16 d0, q0 |
michael@0 | 3006 | /* 1-cycle bubble */ |
michael@0 | 3007 | bilinear_store_&dst_fmt 1, q2, q3 |
michael@0 | 3008 | .endm |
michael@0 | 3009 | |
michael@0 | 3010 | .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt |
michael@0 | 3011 | bilinear_load_and_vertical_interpolate_two_&src_fmt \ |
michael@0 | 3012 | q1, q11, d0, d1, d20, d21, d22, d23 |
michael@0 | 3013 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3014 | vmlsl.u16 q0, d2, d30 |
michael@0 | 3015 | vmlal.u16 q0, d3, d30 |
michael@0 | 3016 | vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3017 | vmlsl.u16 q10, d22, d31 |
michael@0 | 3018 | vmlal.u16 q10, d23, d31 |
michael@0 | 3019 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3020 | vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3021 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3022 | vadd.u16 q12, q12, q13 |
michael@0 | 3023 | vmovn.u16 d0, q0 |
michael@0 | 3024 | bilinear_store_&dst_fmt 2, q2, q3 |
michael@0 | 3025 | .endm |
michael@0 | 3026 | |
michael@0 | 3027 | .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt |
michael@0 | 3028 | bilinear_load_and_vertical_interpolate_four_&src_fmt \ |
michael@0 | 3029 | q1, q11, d0, d1, d20, d21, d22, d23 \ |
michael@0 | 3030 | q3, q9, d4, d5, d16, d17, d18, d19 |
michael@0 | 3031 | pld [TMP1, PF_OFFS] |
michael@0 | 3032 | sub TMP1, TMP1, STRIDE |
michael@0 | 3033 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3034 | vmlsl.u16 q0, d2, d30 |
michael@0 | 3035 | vmlal.u16 q0, d3, d30 |
michael@0 | 3036 | vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3037 | vmlsl.u16 q10, d22, d31 |
michael@0 | 3038 | vmlal.u16 q10, d23, d31 |
michael@0 | 3039 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3040 | vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3041 | vmlsl.u16 q2, d6, d30 |
michael@0 | 3042 | vmlal.u16 q2, d7, d30 |
michael@0 | 3043 | vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3044 | pld [TMP2, PF_OFFS] |
michael@0 | 3045 | vmlsl.u16 q8, d18, d31 |
michael@0 | 3046 | vmlal.u16 q8, d19, d31 |
michael@0 | 3047 | vadd.u16 q12, q12, q13 |
michael@0 | 3048 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3049 | vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3050 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3051 | vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3052 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3053 | vmovn.u16 d0, q0 |
michael@0 | 3054 | vmovn.u16 d1, q2 |
michael@0 | 3055 | vadd.u16 q12, q12, q13 |
michael@0 | 3056 | bilinear_store_&dst_fmt 4, q2, q3 |
michael@0 | 3057 | .endm |
michael@0 | 3058 | |
michael@0 | 3059 | .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt |
michael@0 | 3060 | .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3061 | bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head |
michael@0 | 3062 | .else |
michael@0 | 3063 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
michael@0 | 3064 | .endif |
michael@0 | 3065 | .endm |
michael@0 | 3066 | |
michael@0 | 3067 | .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt |
michael@0 | 3068 | .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3069 | bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail |
michael@0 | 3070 | .endif |
michael@0 | 3071 | .endm |
michael@0 | 3072 | |
michael@0 | 3073 | .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3074 | .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3075 | bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head |
michael@0 | 3076 | .else |
michael@0 | 3077 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
michael@0 | 3078 | .endif |
michael@0 | 3079 | .endm |
michael@0 | 3080 | |
michael@0 | 3081 | .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt |
michael@0 | 3082 | .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3083 | bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head |
michael@0 | 3084 | .else |
michael@0 | 3085 | bilinear_interpolate_four_pixels_head src_fmt, dst_fmt |
michael@0 | 3086 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3087 | .endif |
michael@0 | 3088 | .endm |
michael@0 | 3089 | |
michael@0 | 3090 | .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt |
michael@0 | 3091 | .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3092 | bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail |
michael@0 | 3093 | .else |
michael@0 | 3094 | bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt |
michael@0 | 3095 | .endif |
michael@0 | 3096 | .endm |
michael@0 | 3097 | |
michael@0 | 3098 | .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3099 | .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt |
michael@0 | 3100 | bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head |
michael@0 | 3101 | .else |
michael@0 | 3102 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3103 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3104 | .endif |
michael@0 | 3105 | .endm |
michael@0 | 3106 | |
michael@0 | 3107 | .set BILINEAR_FLAG_UNROLL_4, 0 |
michael@0 | 3108 | .set BILINEAR_FLAG_UNROLL_8, 1 |
michael@0 | 3109 | .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 |
michael@0 | 3110 | |
michael@0 | 3111 | /* |
michael@0 | 3112 | * Main template macro for generating NEON optimized bilinear scanline |
michael@0 | 3113 | * functions. |
michael@0 | 3114 | * |
michael@0 | 3115 | * Bilinear scanline scaler macro template uses the following arguments: |
michael@0 | 3116 | * fname - name of the function to generate |
michael@0 | 3117 | * src_fmt - source color format (8888 or 0565) |
michael@0 | 3118 | * dst_fmt - destination color format (8888 or 0565) |
michael@0 | 3119 | * src_bpp_shift - (1 << src_bpp_shift) is the size of a source pixel in bytes |
michael@0 |  | * dst_bpp_shift - (1 << dst_bpp_shift) is the size of a destination pixel in bytes |
michael@0 | 3120 | * prefetch_distance - prefetch in the source image by that many |
michael@0 | 3121 | * pixels ahead |
michael@0 |  | * flags - bitmask of the BILINEAR_FLAG_* options defined above |
michael@0 | 3122 | */ |
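michael@0 |  | /* |
michael@0 |  | * For illustration, an 8888 -> 8888 scanline scaler with a prefetch |
michael@0 |  | * distance of 28 pixels could be instantiated as |
michael@0 |  | * |
michael@0 |  | *   generate_bilinear_scanline_func \ |
michael@0 |  | *       pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, \ |
michael@0 |  | *       8888, 8888, 2, 2, 28, BILINEAR_FLAG_UNROLL_4 |
michael@0 |  | */ |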
michael@0 | 3123 | |
michael@0 | 3124 | .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ |
michael@0 | 3125 | src_bpp_shift, dst_bpp_shift, \ |
michael@0 | 3126 | prefetch_distance, flags |
michael@0 | 3127 | |
michael@0 | 3128 | pixman_asm_function fname |
michael@0 | 3129 | OUT .req r0 |
michael@0 | 3130 | TOP .req r1 |
michael@0 | 3131 | BOTTOM .req r2 |
michael@0 | 3132 | WT .req r3 |
michael@0 | 3133 | WB .req r4 |
michael@0 | 3134 | X .req r5 |
michael@0 | 3135 | UX .req r6 |
michael@0 | 3136 | WIDTH .req ip |
michael@0 | 3137 | TMP1 .req r3 |
michael@0 | 3138 | TMP2 .req r4 |
michael@0 | 3139 | PF_OFFS .req r7 |
michael@0 | 3140 | TMP3 .req r8 |
michael@0 | 3141 | TMP4 .req r9 |
michael@0 | 3142 | STRIDE .req r2 |
michael@0 | 3143 | |
michael@0 | 3144 | .fnstart |
michael@0 | 3145 | mov ip, sp |
michael@0 | 3146 | .save {r4, r5, r6, r7, r8, r9} |
michael@0 | 3147 | push {r4, r5, r6, r7, r8, r9} |
michael@0 | 3148 | mov PF_OFFS, #prefetch_distance |
michael@0 | 3149 | ldmia ip, {WB, X, UX, WIDTH} |
michael@0 | 3150 | mul PF_OFFS, PF_OFFS, UX |
michael@0 | 3151 | |
michael@0 | 3152 | .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 |
michael@0 | 3153 | .vsave {d8-d15} |
michael@0 | 3154 | vpush {d8-d15} |
michael@0 | 3155 | .endif |
michael@0 | 3156 | |
michael@0 | 3157 | sub STRIDE, BOTTOM, TOP |
michael@0 | 3158 | .unreq BOTTOM |
michael@0 | 3159 | |
michael@0 | 3160 | cmp WIDTH, #0 |
michael@0 | 3161 | ble 3f |
michael@0 | 3162 | |
michael@0 | 3163 | vdup.u16 q12, X |
michael@0 | 3164 | vdup.u16 q13, UX |
michael@0 | 3165 | vdup.u8 d28, WT |
michael@0 | 3166 | vdup.u8 d29, WB |
michael@0 | 3167 | vadd.u16 d25, d25, d26 |
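michael@0 |  | /* q12 now holds X in lanes 0-3 and X + UX in lanes 4-7, so that one   */ |
michael@0 |  | /* vshr yields the horizontal weights of two adjacent pixels at once   */ |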
michael@0 | 3168 | |
michael@0 | 3169 | /* ensure good destination alignment */ |
michael@0 | 3170 | cmp WIDTH, #1 |
michael@0 | 3171 | blt 0f |
michael@0 | 3172 | tst OUT, #(1 << dst_bpp_shift) |
michael@0 | 3173 | beq 0f |
michael@0 | 3174 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3175 | vadd.u16 q12, q12, q13 |
michael@0 | 3176 | bilinear_interpolate_last_pixel src_fmt, dst_fmt |
michael@0 | 3177 | sub WIDTH, WIDTH, #1 |
michael@0 | 3178 | 0: |
michael@0 | 3179 | vadd.u16 q13, q13, q13 |
michael@0 | 3180 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3181 | vadd.u16 q12, q12, q13 |
michael@0 | 3182 | |
michael@0 | 3183 | cmp WIDTH, #2 |
michael@0 | 3184 | blt 0f |
michael@0 | 3185 | tst OUT, #(1 << (dst_bpp_shift + 1)) |
michael@0 | 3186 | beq 0f |
michael@0 | 3187 | bilinear_interpolate_two_pixels src_fmt, dst_fmt |
michael@0 | 3188 | sub WIDTH, WIDTH, #2 |
michael@0 | 3189 | 0: |
michael@0 | 3190 | .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 |
michael@0 | 3191 | /*********** 8 pixels per iteration *****************/ |
michael@0 | 3192 | cmp WIDTH, #4 |
michael@0 | 3193 | blt 0f |
michael@0 | 3194 | tst OUT, #(1 << (dst_bpp_shift + 2)) |
michael@0 | 3195 | beq 0f |
michael@0 | 3196 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
michael@0 | 3197 | sub WIDTH, WIDTH, #4 |
michael@0 | 3198 | 0: |
michael@0 | 3199 | subs WIDTH, WIDTH, #8 |
michael@0 | 3200 | blt 1f |
michael@0 | 3201 | mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) |
michael@0 | 3202 | bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt |
michael@0 | 3203 | subs WIDTH, WIDTH, #8 |
michael@0 | 3204 | blt 5f |
michael@0 | 3205 | 0: |
michael@0 | 3206 | bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3207 | subs WIDTH, WIDTH, #8 |
michael@0 | 3208 | bge 0b |
michael@0 | 3209 | 5: |
michael@0 | 3210 | bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt |
michael@0 | 3211 | 1: |
michael@0 | 3212 | tst WIDTH, #4 |
michael@0 | 3213 | beq 2f |
michael@0 | 3214 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
michael@0 | 3215 | 2: |
michael@0 | 3216 | .else |
michael@0 | 3217 | /*********** 4 pixels per iteration *****************/ |
michael@0 | 3218 | subs WIDTH, WIDTH, #4 |
michael@0 | 3219 | blt 1f |
michael@0 | 3220 | mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) |
michael@0 | 3221 | bilinear_interpolate_four_pixels_head src_fmt, dst_fmt |
michael@0 | 3222 | subs WIDTH, WIDTH, #4 |
michael@0 | 3223 | blt 5f |
michael@0 | 3224 | 0: |
michael@0 | 3225 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
michael@0 | 3226 | subs WIDTH, WIDTH, #4 |
michael@0 | 3227 | bge 0b |
michael@0 | 3228 | 5: |
michael@0 | 3229 | bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt |
michael@0 | 3230 | 1: |
michael@0 | 3231 | /****************************************************/ |
michael@0 | 3232 | .endif |
michael@0 | 3233 | /* handle the remaining trailing pixels */ |
michael@0 | 3234 | tst WIDTH, #2 |
michael@0 | 3235 | beq 2f |
michael@0 | 3236 | bilinear_interpolate_two_pixels src_fmt, dst_fmt |
michael@0 | 3237 | 2: |
michael@0 | 3238 | tst WIDTH, #1 |
michael@0 | 3239 | beq 3f |
michael@0 | 3240 | bilinear_interpolate_last_pixel src_fmt, dst_fmt |
michael@0 | 3241 | 3: |
michael@0 | 3242 | .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 |
michael@0 | 3243 | vpop {d8-d15} |
michael@0 | 3244 | .endif |
michael@0 | 3245 | pop {r4, r5, r6, r7, r8, r9} |
michael@0 | 3246 | bx lr |
michael@0 | 3247 | .fnend |
michael@0 | 3248 | |
michael@0 | 3249 | .unreq OUT |
michael@0 | 3250 | .unreq TOP |
michael@0 | 3251 | .unreq WT |
michael@0 | 3252 | .unreq WB |
michael@0 | 3253 | .unreq X |
michael@0 | 3254 | .unreq UX |
michael@0 | 3255 | .unreq WIDTH |
michael@0 | 3256 | .unreq TMP1 |
michael@0 | 3257 | .unreq TMP2 |
michael@0 | 3258 | .unreq PF_OFFS |
michael@0 | 3259 | .unreq TMP3 |
michael@0 | 3260 | .unreq TMP4 |
michael@0 | 3261 | .unreq STRIDE |
michael@0 | 3262 | .endfunc |
michael@0 | 3263 | |
michael@0 | 3264 | .endm |
michael@0 | 3265 | |
michael@0 | 3266 | /*****************************************************************************/ |
michael@0 | 3267 | |
michael@0 | 3268 | .set have_bilinear_interpolate_four_pixels_8888_8888, 1 |
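michael@0 |  | /* This symbol makes the dispatch macros above use the hand-scheduled  */ |
michael@0 |  | /* head/tail/tail_head implementation below instead of the generic one */ |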
michael@0 | 3269 | |
michael@0 | 3270 | .macro bilinear_interpolate_four_pixels_8888_8888_head |
michael@0 | 3271 | mov TMP1, X, asr #16 |
michael@0 | 3272 | add X, X, UX |
michael@0 | 3273 | add TMP1, TOP, TMP1, asl #2 |
michael@0 | 3274 | mov TMP2, X, asr #16 |
michael@0 | 3275 | add X, X, UX |
michael@0 | 3276 | add TMP2, TOP, TMP2, asl #2 |
michael@0 | 3277 | |
michael@0 | 3278 | vld1.32 {d22}, [TMP1], STRIDE |
michael@0 | 3279 | vld1.32 {d23}, [TMP1] |
michael@0 | 3280 | mov TMP3, X, asr #16 |
michael@0 | 3281 | add X, X, UX |
michael@0 | 3282 | add TMP3, TOP, TMP3, asl #2 |
michael@0 | 3283 | vmull.u8 q8, d22, d28 |
michael@0 | 3284 | vmlal.u8 q8, d23, d29 |
michael@0 | 3285 | |
michael@0 | 3286 | vld1.32 {d22}, [TMP2], STRIDE |
michael@0 | 3287 | vld1.32 {d23}, [TMP2] |
michael@0 | 3288 | mov TMP4, X, asr #16 |
michael@0 | 3289 | add X, X, UX |
michael@0 | 3290 | add TMP4, TOP, TMP4, asl #2 |
michael@0 | 3291 | vmull.u8 q9, d22, d28 |
michael@0 | 3292 | vmlal.u8 q9, d23, d29 |
michael@0 | 3293 | |
michael@0 | 3294 | vld1.32 {d22}, [TMP3], STRIDE |
michael@0 | 3295 | vld1.32 {d23}, [TMP3] |
michael@0 | 3296 | vmull.u8 q10, d22, d28 |
michael@0 | 3297 | vmlal.u8 q10, d23, d29 |
michael@0 | 3298 | |
michael@0 | 3299 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3300 | vmlsl.u16 q0, d16, d30 |
michael@0 | 3301 | vmlal.u16 q0, d17, d30 |
michael@0 | 3302 | |
michael@0 | 3303 | pld [TMP4, PF_OFFS] |
michael@0 | 3304 | vld1.32 {d16}, [TMP4], STRIDE |
michael@0 | 3305 | vld1.32 {d17}, [TMP4] |
michael@0 | 3306 | pld [TMP4, PF_OFFS] |
michael@0 | 3307 | vmull.u8 q11, d16, d28 |
michael@0 | 3308 | vmlal.u8 q11, d17, d29 |
michael@0 | 3309 | |
michael@0 | 3310 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3311 | vmlsl.u16 q1, d18, d31 |
michael@0 | 3312 | .endm |
michael@0 | 3313 | |
michael@0 | 3314 | .macro bilinear_interpolate_four_pixels_8888_8888_tail |
michael@0 | 3315 | vmlal.u16 q1, d19, d31 |
michael@0 | 3316 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3317 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3318 | vmlsl.u16 q2, d20, d30 |
michael@0 | 3319 | vmlal.u16 q2, d21, d30 |
michael@0 | 3320 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS |
michael@0 | 3321 | vmlsl.u16 q3, d22, d31 |
michael@0 | 3322 | vmlal.u16 q3, d23, d31 |
michael@0 | 3323 | vadd.u16 q12, q12, q13 |
michael@0 | 3324 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3325 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3326 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3327 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3328 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 | 3329 | vmovn.u16 d6, q0 |
michael@0 | 3330 | vmovn.u16 d7, q2 |
michael@0 | 3331 | vadd.u16 q12, q12, q13 |
michael@0 | 3332 | vst1.32 {d6, d7}, [OUT, :128]! |
michael@0 | 3333 | .endm |

.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vst1.32   {d6, d7}, [OUT, :128]!
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
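
/*
 * The tail_head macro is the software pipelined core of the scanline
 * loop: it finishes the arithmetic for the previous four pixels (the
 * "tail") while already issuing the address calculations and vld1
 * loads for the next four (the "head"). Interleaving the two lets the
 * NEON multiplies overlap with memory accesses, and the vst1.32 of the
 * completed pixels is buried in the middle of the instruction stream
 * instead of serializing the end of each iteration.
 */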

/*****************************************************************************/

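/*
 * This flag tells the common bilinear scanline generator that a custom
 * unrolled-by-8 code path is provided for 8888 -> 0565 scaling, so the
 * *_eight_pixels_* macros below are used instead of the generic
 * four-pixel ones.
 */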
.set have_bilinear_interpolate_eight_pixels_8888_0565, 1

.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
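
/*
 * The eight-pixel head is essentially two four-pixel 8888 heads fused
 * back to back; note that the results are narrowed into d8/d9 (and
 * later d10/d11) rather than d6/d7, so a whole batch of eight a8r8g8b8
 * pixels is collected in d8-d11 before the tail converts it to r5g6b5.
 */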

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm
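
/*
 * The vuzp.u8 sequence above deinterleaves the a/r/g/b bytes of the
 * eight pixels into per-channel planes (blue ends up in d8, green in
 * d9, red in d10 and alpha in d11, which is discarded), and the
 * vshll/vsri combination then packs the top bits of each plane into
 * r5g6b5. Rough per-pixel C equivalent (illustrative sketch only):
 *
 *     static inline uint16_t
 *     pack_8888_to_0565 (uint32_t argb)
 *     {
 *         uint32_t r = (argb >> 16) & 0xff;
 *         uint32_t g = (argb >> 8) & 0xff;
 *         uint32_t b = argb & 0xff;
 *         return (uint16_t)(((r & 0xf8) << 8) |   // red   -> bits 15:11
 *                           ((g & 0xfc) << 3) |   // green -> bits 10:5
 *                           (b >> 3));            // blue  -> bits  4:0
 *     }
 */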

.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8   d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.u8   d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8   d9, d11
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8   d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32   {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm
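
/*
 * Unlike the four-pixel variant, this pipelined loop keeps the
 * partially packed batch live in d8-d15 (q4-q7) across iterations.
 * Those registers are callee-saved under the AAPCS, which is
 * presumably why this code path is generated below with
 * BILINEAR_FLAG_USE_ALL_NEON_REGS, so that the function
 * prologue/epilogue preserves them.
 */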
/*****************************************************************************/

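/*
 * Instantiate the bilinear scanline scaling functions. After the
 * source and destination formats, the arguments are the log2 of the
 * source and destination bytes per pixel, the prefetch distance, and
 * flags selecting the 4x or 8x unrolled core (plus, for the
 * 8888 -> 0565 case, permission to use the callee-saved NEON
 * registers).
 */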
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4