--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1204 @@
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images in 8bpp, 16bpp,
 * 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which implement the basic code chunks responsible for pixel
 * processing. See the 'pixman-arm-neon-asm.S' file for usage examples.
 *
 * TODO:
 *  - try overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

/*
 * Offset into the stack where the mask and source pointer/stride can be
 * accessed from the 'init' macro. This is useful for special handling
 * of a solid mask.
 */
.set ARGS_STACK_OFFSET,        40

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
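/*
 * Illustrative note: judging from the register assignments and stack
 * offsets used further below (r0-r3 plus stacked arguments starting at
 * ARGS_STACK_OFFSET), a function emitted by 'generate_composite_function'
 * follows roughly the C-level calling convention sketched here. This is a
 * hypothetical prototype, not an actual pixman declaration; the pointer
 * element types depend on the bpp of each image, the strides appear to be
 * in pixels (they are shifted by the bpp when advancing scanlines), and
 * the source/mask arguments are only meaningful when the corresponding
 * image is used by the particular variant:
 *
 *     #include <stdint.h>
 *
 *     void composite_func(int32_t   width,        // r0
 *                         int32_t   height,       // r1
 *                         uint32_t *dst,          // r2
 *                         int32_t   dst_stride,   // r3
 *                         uint32_t *src,          // [sp, #40] after the push
 *                         int32_t   src_stride,   // [sp, #44]
 *                         uint32_t *mask,         // [sp, #48]
 *                         int32_t   mask_stride); // [sp, #52]
 */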
/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store
 * of pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                            %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm

/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
 * aliases to be defined)
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[0]}, [TMP1, :16]
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[1]}, [TMP2, :16]
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[2]}, [TMP1, :16]
    vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    vld1.32 {d&reg1&[1]}, [TMP2, :32]
.else
    .error "unsupported"
.endif
.endm
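/*
 * Illustrative note: VX holds the source x coordinate and UNIT_X the
 * per-pixel increment, both in 16.16 fixed-point format, with
 * SRC_WIDTH_FIXED being the source width in the same format. A scalar C
 * sketch of one fetch step performed above (the helper name is
 * hypothetical; the subpls/bpl pair additionally wraps the coordinate by
 * the source width, apparently to support repeating sources):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t fetch_nearest_u32(const uint32_t *src, int32_t *vx,
 *                                       int32_t unit_x)
 *     {
 *         int32_t x = *vx >> 16;   // integer part selects the source pixel
 *         *vx += unit_x;           // step to the next source position
 *         return src[x];
 *     }
 */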

.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* elem_size == 32 */
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    sub     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg2&[0]}, [TMP2, :32]
    mov     TMP2, VX, asr #16
    add     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[1]}, [TMP1, :32]
    vld1.32 {d&reg2&[1]}, [TMP2, :32]
.else
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endif
.endm

.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
.endm

.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
    .if elem_size == 32
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .elseif elem_size == 16
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 4, mem_operand
        pixld0_s elem_size, %(basereg+0), 5, mem_operand
        pixld0_s elem_size, %(basereg+0), 6, mem_operand
        pixld0_s elem_size, %(basereg+0), 7, mem_operand
    .endif
.elseif numbytes == 2
    .if elem_size == 16
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm
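/*
 * Illustrative note: for 32bpp data the deinterleaved layout keeps each
 * color channel of eight pixels in its own 64-bit register, which is what
 * vld4.8 or the vuzp8 sequence above produce for little-endian a8r8g8b8
 * data. A scalar C sketch of the same transformation (the helper name is
 * hypothetical):
 *
 *     #include <stdint.h>
 *
 *     static void deinterleave_8888(const uint8_t px[32], // 8 packed pixels
 *                                   uint8_t b[8], uint8_t g[8],
 *                                   uint8_t r[8], uint8_t a[8])
 *     {
 *         for (int i = 0; i < 8; i++) {
 *             b[i] = px[4 * i + 0];
 *             g[i] = px[4 * i + 1];
 *             r[i] = px[4 * i + 2];
 *             a[i] = px[4 * i + 3];
 *         }
 *     }
 */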

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent from the rest of the pixel
 * processing code. It starts at the top left pixel, moves forward across
 * pixels and can jump across scanlines. The prefetch distance is handled
 * in an 'incremental' way: it starts from 0 and advances to the optimal
 * distance over time. After reaching the optimal prefetch distance, it is
 * kept constant. There are some checks which prevent prefetching unneeded
 * pixel lines below the image (it can still prefetch a bit past the right
 * edge of the image, which is not a big issue and may actually be helpful
 * when rendering text glyphs). An additional trick is the use of an LDR
 * instruction for prefetch instead of PLD when moving to the next line:
 * there is a high chance of a TLB miss in this case, and PLD would be
 * useless. (A scalar C sketch of this prefetch state machine is given in
 * a comment just before the main processing loop further below.)
 *
 * This sounds like it may introduce a noticeable overhead (when working
 * with fully cached data). But in reality, thanks to the separate pipeline
 * and instruction queue of the NEON unit in the ARM Cortex-A8, normal ARM
 * code can execute simultaneously with NEON code and be completely shadowed
 * by it. Thus we get no performance overhead at all (*). This looks like a
 * very nice feature of the Cortex-A8, if used wisely. We don't have a
 * hardware prefetcher, but can still implement some rather advanced
 * prefetch logic in software for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 *     pixel processing, such as a simple copy. Anyway, having prefetch is
 *     a must when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1.432 +.endif 1.433 +.if mask_bpp_shift >= 0 1.434 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1.435 +.endif 1.436 +.endif 1.437 +.endm 1.438 + 1.439 +.macro cache_preload_simple 1.440 +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) 1.441 +.if src_bpp > 0 1.442 + pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] 1.443 +.endif 1.444 +.if dst_r_bpp > 0 1.445 + pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] 1.446 +.endif 1.447 +.if mask_bpp > 0 1.448 + pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] 1.449 +.endif 1.450 +.endif 1.451 +.endm 1.452 + 1.453 +.macro fetch_mask_pixblock 1.454 + pixld pixblock_size, mask_bpp, \ 1.455 + (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1.456 +.endm 1.457 + 1.458 +/* 1.459 + * Macro which is used to process leading pixels until destination 1.460 + * pointer is properly aligned (at 16 bytes boundary). When destination 1.461 + * buffer uses 16bpp format, this is unnecessary, or even pointless. 1.462 + */ 1.463 +.macro ensure_destination_ptr_alignment process_pixblock_head, \ 1.464 + process_pixblock_tail, \ 1.465 + process_pixblock_tail_head 1.466 +.if dst_w_bpp != 24 1.467 + tst DST_R, #0xF 1.468 + beq 2f 1.469 + 1.470 +.irp lowbit, 1, 2, 4, 8, 16 1.471 +local skip1 1.472 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 1.473 +.if lowbit < 16 /* we don't need more than 16-byte alignment */ 1.474 + tst DST_R, #lowbit 1.475 + beq 1f 1.476 +.endif 1.477 + pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC 1.478 + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK 1.479 +.if dst_r_bpp > 0 1.480 + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R 1.481 +.else 1.482 + add DST_R, DST_R, #lowbit 1.483 +.endif 1.484 + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) 1.485 + sub W, W, #(lowbit * 8 / dst_w_bpp) 1.486 +1: 1.487 +.endif 1.488 +.endr 1.489 + pixdeinterleave src_bpp, src_basereg 1.490 + pixdeinterleave mask_bpp, mask_basereg 1.491 + pixdeinterleave dst_r_bpp, dst_r_basereg 1.492 + 1.493 + process_pixblock_head 1.494 + cache_preload 0, pixblock_size 1.495 + cache_preload_simple 1.496 + process_pixblock_tail 1.497 + 1.498 + pixinterleave dst_w_bpp, dst_w_basereg 1.499 +.irp lowbit, 1, 2, 4, 8, 16 1.500 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 1.501 +.if lowbit < 16 /* we don't need more than 16-byte alignment */ 1.502 + tst DST_W, #lowbit 1.503 + beq 1f 1.504 +.endif 1.505 + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 1.506 +1: 1.507 +.endif 1.508 +.endr 1.509 +.endif 1.510 +2: 1.511 +.endm 1.512 + 1.513 +/* 1.514 + * Special code for processing up to (pixblock_size - 1) remaining 1.515 + * trailing pixels. As SIMD processing performs operation on 1.516 + * pixblock_size pixels, anything smaller than this has to be loaded 1.517 + * and stored in a special way. Loading and storing of pixel data is 1.518 + * performed in such a way that we fill some 'slots' in the NEON 1.519 + * registers (some slots naturally are unused), then perform compositing 1.520 + * operation as usual. In the end, the data is taken from these 'slots' 1.521 + * and saved to memory. 
1.522 + * 1.523 + * cache_preload_flag - allows to suppress prefetch if 1.524 + * set to 0 1.525 + * dst_aligned_flag - selects whether destination buffer 1.526 + * is aligned 1.527 + */ 1.528 +.macro process_trailing_pixels cache_preload_flag, \ 1.529 + dst_aligned_flag, \ 1.530 + process_pixblock_head, \ 1.531 + process_pixblock_tail, \ 1.532 + process_pixblock_tail_head 1.533 + tst W, #(pixblock_size - 1) 1.534 + beq 2f 1.535 +.irp chunk_size, 16, 8, 4, 2, 1 1.536 +.if pixblock_size > chunk_size 1.537 + tst W, #chunk_size 1.538 + beq 1f 1.539 + pixld_src chunk_size, src_bpp, src_basereg, SRC 1.540 + pixld chunk_size, mask_bpp, mask_basereg, MASK 1.541 +.if dst_aligned_flag != 0 1.542 + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R 1.543 +.else 1.544 + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R 1.545 +.endif 1.546 +.if cache_preload_flag != 0 1.547 + PF add PF_X, PF_X, #chunk_size 1.548 +.endif 1.549 +1: 1.550 +.endif 1.551 +.endr 1.552 + pixdeinterleave src_bpp, src_basereg 1.553 + pixdeinterleave mask_bpp, mask_basereg 1.554 + pixdeinterleave dst_r_bpp, dst_r_basereg 1.555 + 1.556 + process_pixblock_head 1.557 +.if cache_preload_flag != 0 1.558 + cache_preload 0, pixblock_size 1.559 + cache_preload_simple 1.560 +.endif 1.561 + process_pixblock_tail 1.562 + pixinterleave dst_w_bpp, dst_w_basereg 1.563 +.irp chunk_size, 16, 8, 4, 2, 1 1.564 +.if pixblock_size > chunk_size 1.565 + tst W, #chunk_size 1.566 + beq 1f 1.567 +.if dst_aligned_flag != 0 1.568 + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W 1.569 +.else 1.570 + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W 1.571 +.endif 1.572 +1: 1.573 +.endif 1.574 +.endr 1.575 +2: 1.576 +.endm 1.577 + 1.578 +/* 1.579 + * Macro, which performs all the needed operations to switch to the next 1.580 + * scanline and start the next loop iteration unless all the scanlines 1.581 + * are already processed. 
1.582 + */ 1.583 +.macro advance_to_next_scanline start_of_loop_label 1.584 +.if regs_shortage 1.585 + ldrd W, [sp] /* load W and H (width and height) from stack */ 1.586 +.else 1.587 + mov W, ORIG_W 1.588 +.endif 1.589 + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift 1.590 +.if src_bpp != 0 1.591 + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift 1.592 +.endif 1.593 +.if mask_bpp != 0 1.594 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift 1.595 +.endif 1.596 +.if (dst_w_bpp != 24) 1.597 + sub DST_W, DST_W, W, lsl #dst_bpp_shift 1.598 +.endif 1.599 +.if (src_bpp != 24) && (src_bpp != 0) 1.600 + sub SRC, SRC, W, lsl #src_bpp_shift 1.601 +.endif 1.602 +.if (mask_bpp != 24) && (mask_bpp != 0) 1.603 + sub MASK, MASK, W, lsl #mask_bpp_shift 1.604 +.endif 1.605 + subs H, H, #1 1.606 + mov DST_R, DST_W 1.607 +.if regs_shortage 1.608 + str H, [sp, #4] /* save updated height to stack */ 1.609 +.endif 1.610 + bge start_of_loop_label 1.611 +.endm 1.612 + 1.613 +/* 1.614 + * Registers are allocated in the following way by default: 1.615 + * d0, d1, d2, d3 - reserved for loading source pixel data 1.616 + * d4, d5, d6, d7 - reserved for loading destination pixel data 1.617 + * d24, d25, d26, d27 - reserved for loading mask pixel data 1.618 + * d28, d29, d30, d31 - final destination pixel data for writeback to memory 1.619 + */ 1.620 +.macro generate_composite_function fname, \ 1.621 + src_bpp_, \ 1.622 + mask_bpp_, \ 1.623 + dst_w_bpp_, \ 1.624 + flags, \ 1.625 + pixblock_size_, \ 1.626 + prefetch_distance, \ 1.627 + init, \ 1.628 + cleanup, \ 1.629 + process_pixblock_head, \ 1.630 + process_pixblock_tail, \ 1.631 + process_pixblock_tail_head, \ 1.632 + dst_w_basereg_ = 28, \ 1.633 + dst_r_basereg_ = 4, \ 1.634 + src_basereg_ = 0, \ 1.635 + mask_basereg_ = 24 1.636 + 1.637 + .func fname 1.638 + .global fname 1.639 + /* For ELF format also set function visibility to hidden */ 1.640 +#ifdef __ELF__ 1.641 + .hidden fname 1.642 + .type fname, %function 1.643 +#endif 1.644 +fname: 1.645 + .fnstart 1.646 + .save {r4-r12, lr} 1.647 + push {r4-r12, lr} /* save all registers */ 1.648 + 1.649 +/* 1.650 + * Select prefetch type for this function. If prefetch distance is 1.651 + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch 1.652 + * has to be used instead of ADVANCED. 
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

    .macro pixld_src x:vararg
        pixld x
    .endm
    .macro fetch_src_pixblock
        pixld_src pixblock_size, src_bpp, \
                  (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm
/*
 * Assign symbolic names to registers
 */
    W           .req r0     /* width (is updated during processing) */
    H           .req r1     /* height (is updated during processing) */
    DST_W       .req r2     /* destination buffer pointer for writes */
    DST_STRIDE  .req r3     /* destination image stride */
    SRC         .req r4     /* source buffer pointer */
    SRC_STRIDE  .req r5     /* source image stride */
    DST_R       .req r6     /* destination buffer pointer for reads */

    MASK        .req r7     /* mask pointer */
    MASK_STRIDE .req r8     /* mask stride */

    PF_CTL      .req r9     /* combined lines counter and prefetch */
                            /* distance increment counter */
    PF_X        .req r10    /* pixel index in a scanline for current */
                            /* prefetch position */
    PF_SRC      .req r11    /* pointer to source scanline start */
                            /* for prefetch purposes */
    PF_DST      .req r12    /* pointer to destination scanline start */
                            /* for prefetch purposes */
    PF_MASK     .req r14    /* pointer to mask scanline start */
                            /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether a source or mask image is used at all.
1.713 + */ 1.714 +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) 1.715 + ORIG_W .req r10 /* saved original width */ 1.716 + DUMMY .req r12 /* temporary register */ 1.717 + .set regs_shortage, 0 1.718 +.elseif mask_bpp == 0 1.719 + ORIG_W .req r7 /* saved original width */ 1.720 + DUMMY .req r8 /* temporary register */ 1.721 + .set regs_shortage, 0 1.722 +.elseif src_bpp == 0 1.723 + ORIG_W .req r4 /* saved original width */ 1.724 + DUMMY .req r5 /* temporary register */ 1.725 + .set regs_shortage, 0 1.726 +.else 1.727 + ORIG_W .req r1 /* saved original width */ 1.728 + DUMMY .req r1 /* temporary register */ 1.729 + .set regs_shortage, 1 1.730 +.endif 1.731 + 1.732 + .set mask_bpp_shift, -1 1.733 +.if src_bpp == 32 1.734 + .set src_bpp_shift, 2 1.735 +.elseif src_bpp == 24 1.736 + .set src_bpp_shift, 0 1.737 +.elseif src_bpp == 16 1.738 + .set src_bpp_shift, 1 1.739 +.elseif src_bpp == 8 1.740 + .set src_bpp_shift, 0 1.741 +.elseif src_bpp == 0 1.742 + .set src_bpp_shift, -1 1.743 +.else 1.744 + .error "requested src bpp (src_bpp) is not supported" 1.745 +.endif 1.746 +.if mask_bpp == 32 1.747 + .set mask_bpp_shift, 2 1.748 +.elseif mask_bpp == 24 1.749 + .set mask_bpp_shift, 0 1.750 +.elseif mask_bpp == 8 1.751 + .set mask_bpp_shift, 0 1.752 +.elseif mask_bpp == 0 1.753 + .set mask_bpp_shift, -1 1.754 +.else 1.755 + .error "requested mask bpp (mask_bpp) is not supported" 1.756 +.endif 1.757 +.if dst_w_bpp == 32 1.758 + .set dst_bpp_shift, 2 1.759 +.elseif dst_w_bpp == 24 1.760 + .set dst_bpp_shift, 0 1.761 +.elseif dst_w_bpp == 16 1.762 + .set dst_bpp_shift, 1 1.763 +.elseif dst_w_bpp == 8 1.764 + .set dst_bpp_shift, 0 1.765 +.else 1.766 + .error "requested dst bpp (dst_w_bpp) is not supported" 1.767 +.endif 1.768 + 1.769 +.if (((flags) & FLAG_DST_READWRITE) != 0) 1.770 + .set dst_r_bpp, dst_w_bpp 1.771 +.else 1.772 + .set dst_r_bpp, 0 1.773 +.endif 1.774 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 1.775 + .set DEINTERLEAVE_32BPP_ENABLED, 1 1.776 +.else 1.777 + .set DEINTERLEAVE_32BPP_ENABLED, 0 1.778 +.endif 1.779 + 1.780 +.if prefetch_distance < 0 || prefetch_distance > 15 1.781 + .error "invalid prefetch distance (prefetch_distance)" 1.782 +.endif 1.783 + 1.784 +.if src_bpp > 0 1.785 + ldr SRC, [sp, #40] 1.786 +.endif 1.787 +.if mask_bpp > 0 1.788 + ldr MASK, [sp, #48] 1.789 +.endif 1.790 + PF mov PF_X, #0 1.791 +.if src_bpp > 0 1.792 + ldr SRC_STRIDE, [sp, #44] 1.793 +.endif 1.794 +.if mask_bpp > 0 1.795 + ldr MASK_STRIDE, [sp, #52] 1.796 +.endif 1.797 + mov DST_R, DST_W 1.798 + 1.799 +.if src_bpp == 24 1.800 + sub SRC_STRIDE, SRC_STRIDE, W 1.801 + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 1.802 +.endif 1.803 +.if mask_bpp == 24 1.804 + sub MASK_STRIDE, MASK_STRIDE, W 1.805 + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 1.806 +.endif 1.807 +.if dst_w_bpp == 24 1.808 + sub DST_STRIDE, DST_STRIDE, W 1.809 + sub DST_STRIDE, DST_STRIDE, W, lsl #1 1.810 +.endif 1.811 + 1.812 +/* 1.813 + * Setup advanced prefetcher initial state 1.814 + */ 1.815 + PF mov PF_SRC, SRC 1.816 + PF mov PF_DST, DST_R 1.817 + PF mov PF_MASK, MASK 1.818 + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ 1.819 + PF mov PF_CTL, H, lsl #4 1.820 + PF add PF_CTL, #(prefetch_distance - 0x10) 1.821 + 1.822 + init 1.823 +.if regs_shortage 1.824 + .save {r0, r1} 1.825 + push {r0, r1} 1.826 +.endif 1.827 + subs H, H, #1 1.828 +.if regs_shortage 1.829 + str H, [sp, #4] /* save updated height to stack */ 1.830 +.else 1.831 + mov ORIG_W, W 1.832 +.endif 1.833 + blt 9f 1.834 + cmp W, #(pixblock_size * 2) 1.835 + blt 8f 1.836 
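/*
 * Illustrative note: the advanced prefetcher state initialized above
 * (PF_X, PF_CTL, PF_SRC, PF_DST, PF_MASK) is updated by 'cache_preload'
 * once per processed pixel block. Roughly, in scalar C terms (the
 * lowercase names are illustrative stand-ins for the register aliases;
 * scanline pointers are assumed to be uint8_t *, strides are in pixels,
 * and each stream is only handled when that image is actually used):
 *
 *     pf_x += std_increment;              // nominal advance per block
 *     if (pf_ctl & 0xf) {                 // still ramping up the distance
 *         pf_x += boost_increment;
 *         pf_ctl -= 1;
 *     }
 *     __builtin_prefetch(pf_src + (pf_x << src_bpp_shift));   // the PLDs
 *     __builtin_prefetch(pf_dst + (pf_x << dst_bpp_shift));
 *     __builtin_prefetch(pf_mask + (pf_x << mask_bpp_shift));
 *     if (pf_x >= orig_w) {               // ran past the current scanline
 *         pf_x   -= orig_w;
 *         pf_ctl -= 0x10;                 // one scanline fewer remaining
 *         if (pf_ctl >= 0) {              // don't prefetch below the image
 *             pf_src  += src_stride  << src_bpp_shift;
 *             dummy = *pf_src;            // real LDR, also warms up the TLB
 *             pf_dst  += dst_stride  << dst_bpp_shift;
 *             dummy = *pf_dst;
 *             pf_mask += mask_stride << mask_bpp_shift;
 *             dummy = *pf_mask;
 *         }
 *     }
 */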
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop, designed to process images with small
 * width (less than pixblock_size * 2 pixels). In this case neither
 * pipelining nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
    .fnend

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of the function generation template for single
 * scanline processing (for implementing pixman combine functions)
 */
.macro generate_composite_function_scanline use_nearest_scaling, \
                                             fname, \
                                             src_bpp_, \
                                             mask_bpp_, \
                                             dst_w_bpp_, \
                                             flags, \
                                             pixblock_size_, \
                                             init, \
                                             cleanup, \
                                             process_pixblock_head, \
                                             process_pixblock_tail, \
                                             process_pixblock_tail_head, \
                                             dst_w_basereg_ = 28, \
dst_r_basereg_ = 4, \ 1.952 + src_basereg_ = 0, \ 1.953 + mask_basereg_ = 24 1.954 + 1.955 + .func fname 1.956 + .global fname 1.957 + /* For ELF format also set function visibility to hidden */ 1.958 +#ifdef __ELF__ 1.959 + .hidden fname 1.960 + .type fname, %function 1.961 +#endif 1.962 +fname: 1.963 + .fnstart 1.964 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 1.965 +/* 1.966 + * Make some macro arguments globally visible and accessible 1.967 + * from other macros 1.968 + */ 1.969 + .set src_bpp, src_bpp_ 1.970 + .set mask_bpp, mask_bpp_ 1.971 + .set dst_w_bpp, dst_w_bpp_ 1.972 + .set pixblock_size, pixblock_size_ 1.973 + .set dst_w_basereg, dst_w_basereg_ 1.974 + .set dst_r_basereg, dst_r_basereg_ 1.975 + .set src_basereg, src_basereg_ 1.976 + .set mask_basereg, mask_basereg_ 1.977 + 1.978 +.if use_nearest_scaling != 0 1.979 + /* 1.980 + * Assign symbolic names to registers for nearest scaling 1.981 + */ 1.982 + W .req r0 1.983 + DST_W .req r1 1.984 + SRC .req r2 1.985 + VX .req r3 1.986 + UNIT_X .req ip 1.987 + MASK .req lr 1.988 + TMP1 .req r4 1.989 + TMP2 .req r5 1.990 + DST_R .req r6 1.991 + SRC_WIDTH_FIXED .req r7 1.992 + 1.993 + .macro pixld_src x:vararg 1.994 + pixld_s x 1.995 + .endm 1.996 + 1.997 + ldr UNIT_X, [sp] 1.998 + .save {r4-r8, lr} 1.999 + push {r4-r8, lr} 1.1000 + ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] 1.1001 + .if mask_bpp != 0 1.1002 + ldr MASK, [sp, #(24 + 8)] 1.1003 + .endif 1.1004 +.else 1.1005 + /* 1.1006 + * Assign symbolic names to registers 1.1007 + */ 1.1008 + W .req r0 /* width (is updated during processing) */ 1.1009 + DST_W .req r1 /* destination buffer pointer for writes */ 1.1010 + SRC .req r2 /* source buffer pointer */ 1.1011 + DST_R .req ip /* destination buffer pointer for reads */ 1.1012 + MASK .req r3 /* mask pointer */ 1.1013 + 1.1014 + .macro pixld_src x:vararg 1.1015 + pixld x 1.1016 + .endm 1.1017 +.endif 1.1018 + 1.1019 +.if (((flags) & FLAG_DST_READWRITE) != 0) 1.1020 + .set dst_r_bpp, dst_w_bpp 1.1021 +.else 1.1022 + .set dst_r_bpp, 0 1.1023 +.endif 1.1024 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 1.1025 + .set DEINTERLEAVE_32BPP_ENABLED, 1 1.1026 +.else 1.1027 + .set DEINTERLEAVE_32BPP_ENABLED, 0 1.1028 +.endif 1.1029 + 1.1030 + .macro fetch_src_pixblock 1.1031 + pixld_src pixblock_size, src_bpp, \ 1.1032 + (src_basereg - pixblock_size * src_bpp / 64), SRC 1.1033 + .endm 1.1034 + 1.1035 + init 1.1036 + mov DST_R, DST_W 1.1037 + 1.1038 + cmp W, #pixblock_size 1.1039 + blt 8f 1.1040 + 1.1041 + ensure_destination_ptr_alignment process_pixblock_head, \ 1.1042 + process_pixblock_tail, \ 1.1043 + process_pixblock_tail_head 1.1044 + 1.1045 + subs W, W, #pixblock_size 1.1046 + blt 7f 1.1047 + 1.1048 + /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ 1.1049 + pixld_a pixblock_size, dst_r_bpp, \ 1.1050 + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 1.1051 + fetch_src_pixblock 1.1052 + pixld pixblock_size, mask_bpp, \ 1.1053 + (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1.1054 + process_pixblock_head 1.1055 + subs W, W, #pixblock_size 1.1056 + blt 2f 1.1057 +1: 1.1058 + process_pixblock_tail_head 1.1059 + subs W, W, #pixblock_size 1.1060 + bge 1b 1.1061 +2: 1.1062 + process_pixblock_tail 1.1063 + pixst_a pixblock_size, dst_w_bpp, \ 1.1064 + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 1.1065 +7: 1.1066 + /* Process the remaining trailing pixels in the scanline (dst aligned) */ 1.1067 + process_trailing_pixels 0, 1, \ 1.1068 + process_pixblock_head, \ 1.1069 + process_pixblock_tail, \ 1.1070 + process_pixblock_tail_head 1.1071 + 1.1072 + cleanup 1.1073 +.if use_nearest_scaling != 0 1.1074 + pop {r4-r8, pc} /* exit */ 1.1075 +.else 1.1076 + bx lr /* exit */ 1.1077 +.endif 1.1078 +8: 1.1079 + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ 1.1080 + process_trailing_pixels 0, 0, \ 1.1081 + process_pixblock_head, \ 1.1082 + process_pixblock_tail, \ 1.1083 + process_pixblock_tail_head 1.1084 + 1.1085 + cleanup 1.1086 + 1.1087 +.if use_nearest_scaling != 0 1.1088 + pop {r4-r8, pc} /* exit */ 1.1089 + 1.1090 + .unreq DST_R 1.1091 + .unreq SRC 1.1092 + .unreq W 1.1093 + .unreq VX 1.1094 + .unreq UNIT_X 1.1095 + .unreq TMP1 1.1096 + .unreq TMP2 1.1097 + .unreq DST_W 1.1098 + .unreq MASK 1.1099 + .unreq SRC_WIDTH_FIXED 1.1100 + 1.1101 +.else 1.1102 + bx lr /* exit */ 1.1103 + 1.1104 + .unreq SRC 1.1105 + .unreq MASK 1.1106 + .unreq DST_R 1.1107 + .unreq DST_W 1.1108 + .unreq W 1.1109 +.endif 1.1110 + 1.1111 + .purgem fetch_src_pixblock 1.1112 + .purgem pixld_src 1.1113 + 1.1114 + .fnend 1.1115 + .endfunc 1.1116 +.endm 1.1117 + 1.1118 +.macro generate_composite_function_single_scanline x:vararg 1.1119 + generate_composite_function_scanline 0, x 1.1120 +.endm 1.1121 + 1.1122 +.macro generate_composite_function_nearest_scanline x:vararg 1.1123 + generate_composite_function_scanline 1, x 1.1124 +.endm 1.1125 + 1.1126 +/* Default prologue/epilogue, nothing special needs to be done */ 1.1127 + 1.1128 +.macro default_init 1.1129 +.endm 1.1130 + 1.1131 +.macro default_cleanup 1.1132 +.endm 1.1133 + 1.1134 +/* 1.1135 + * Prologue/epilogue variant which additionally saves/restores d8-d15 1.1136 + * registers (they need to be saved/restored by callee according to ABI). 1.1137 + * This is required if the code needs to use all the NEON registers. 1.1138 + */ 1.1139 + 1.1140 +.macro default_init_need_all_regs 1.1141 + .vsave {d8-d15} 1.1142 + vpush {d8-d15} 1.1143 +.endm 1.1144 + 1.1145 +.macro default_cleanup_need_all_regs 1.1146 + vpop {d8-d15} 1.1147 +.endm 1.1148 + 1.1149 +/******************************************************************************/ 1.1150 + 1.1151 +/* 1.1152 + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) 1.1153 + * into a planar a8r8g8b8 format (with a, r, g, b color components 1.1154 + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). 1.1155 + * 1.1156 + * Warning: the conversion is destructive and the original 1.1157 + * value (in) is lost. 
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vmov.u8     out_a, #255
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm

.macro convert_0565_to_x888 in, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm

/*
 * Conversion from planar a8r8g8b8 format (with the r, g, b color components
 * in 64-bit registers in_r, in_g, in_b respectively; alpha is not needed)
 * into 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two
 * temporary 128-bit registers (tmp1, tmp2).
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    vshll.u8    tmp1, in_g, #8
    vshll.u8    out,  in_r, #8
    vshll.u8    tmp2, in_b, #8
    vsri.u16    out,  tmp1, #5
    vsri.u16    out,  tmp2, #11
.endm

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in the (out0, out1) register pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
 * value of 'in' is lost.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    vshl.u16    out0, in,   #5  /* G top 6 bits */
    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
    vsri.u16    in,   in,   #5  /* R is ready in top bits */
    vsri.u16    out0, out0, #6  /* G is ready in top bits */
    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
    vshr.u16    out1, in,   #8  /* R is in place */
    vsri.u16    out0, tmp,  #8  /* G & B is in place */
    vzip.u16    out0, out1      /* everything is in place */
.endm
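
/*
 * Illustrative note: per pixel, the conversion macros above perform the
 * usual r5g6b5 <-> 8-bit channel expansion and packing, replicating the
 * top bits of each channel when widening. A scalar C reference of the same
 * math (the helper names are hypothetical):
 *
 *     #include <stdint.h>
 *
 *     static void r5g6b5_to_888(uint16_t s, uint8_t *r, uint8_t *g, uint8_t *b)
 *     {
 *         *r = ((s >> 11) & 0x1f) << 3;  *r |= *r >> 5;  // replicate top bits
 *         *g = ((s >>  5) & 0x3f) << 2;  *g |= *g >> 6;
 *         *b = ( s        & 0x1f) << 3;  *b |= *b >> 5;
 *     }
 *
 *     static uint16_t r888_to_r5g6b5(uint8_t r, uint8_t g, uint8_t b)
 *     {
 *         return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
 *     }
 */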