gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1368 @@
     1.4 +/*
     1.5 + * Copyright © 2011 SCore Corporation
     1.6 + *
     1.7 + * Permission is hereby granted, free of charge, to any person obtaining a
     1.8 + * copy of this software and associated documentation files (the "Software"),
     1.9 + * to deal in the Software without restriction, including without limitation
    1.10 + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.11 + * and/or sell copies of the Software, and to permit persons to whom the
    1.12 + * Software is furnished to do so, subject to the following conditions:
    1.13 + *
    1.14 + * The above copyright notice and this permission notice (including the next
    1.15 + * paragraph) shall be included in all copies or substantial portions of the
    1.16 + * Software.
    1.17 + *
    1.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    1.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    1.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    1.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    1.24 + * DEALINGS IN THE SOFTWARE.
    1.25 + *
    1.26 + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    1.27 + * Author:  Taekyun Kim (tkq.kim@samsung.com)
    1.28 + */
    1.29 +
    1.30 +/*
    1.31 + * This file contains scaled bilinear scanline functions implemented
     1.32 + * using Siarhei's older bilinear macro template.
    1.33 + *
    1.34 + * << General scanline function procedures >>
    1.35 + *  1. bilinear interpolate source pixels
    1.36 + *  2. load mask pixels
    1.37 + *  3. load destination pixels
    1.38 + *  4. duplicate mask to fill whole register
    1.39 + *  5. interleave source & destination pixels
    1.40 + *  6. apply mask to source pixels
    1.41 + *  7. combine source & destination pixels
     1.42 + *  8. deinterleave final result
    1.43 + *  9. store destination pixels
    1.44 + *
     1.45 + * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
     1.46 + * Registers with double numbers (src01, dst01) are 128-bit registers.
     1.47 + * All temp registers can be used freely outside the code block.
     1.48 + * The symbols (register .req) OUT and MASK are assumed to be defined by the caller of these macro blocks.
    1.49 + *
    1.50 + * Remarks
     1.51 + *  There can be lots of pipeline stalls inside and between these code blocks.
     1.52 + *  Further optimizations will be done by new macro templates using the head/tail_head/tail scheme.
    1.53 + */
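+
+/*
+ * Illustrative C sketch (not part of the original code): for one destination
+ * pixel the nine steps above amount roughly to the following, where the
+ * helpers bilinear_interpolate(), in_mask() and combine() are hypothetical
+ * stand-ins for what the NEON macros in this file do per channel:
+ *
+ *     uint32_t s = bilinear_interpolate (top, bottom, x, wt, wb);  // step 1
+ *     uint8_t  m = *mask++;                                        // step 2
+ *     uint32_t d = *dst;                                           // step 3
+ *     s = in_mask (s, m);        // steps 4-6: s = s * m / 255 per channel
+ *     *dst++ = combine (s, d);   // steps 7-9: e.g. SRC, OVER or ADD
+ *     x += ux;
+ */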
    1.54 +
    1.55 +/* Prevent the stack from becoming executable for no reason... */
    1.56 +#if defined(__linux__) && defined (__ELF__)
    1.57 +.section .note.GNU-stack,"",%progbits
    1.58 +#endif
    1.59 +
    1.60 +.text
    1.61 +.fpu neon
    1.62 +.arch armv7a
    1.63 +.object_arch armv4
    1.64 +.eabi_attribute 10, 0
    1.65 +.eabi_attribute 12, 0
    1.66 +.arm
    1.67 +.altmacro
    1.68 +.p2align 2
    1.69 +
    1.70 +#include "pixman-private.h"
    1.71 +#include "pixman-arm-neon-asm.h"
    1.72 +
    1.73 +/*
    1.74 + * Bilinear macros from pixman-arm-neon-asm.S
    1.75 + */
    1.76 +
    1.77 +/* Supplementary macro for setting function attributes */
    1.78 +.macro pixman_asm_function fname
    1.79 +    .func fname
    1.80 +    .global fname
    1.81 +#ifdef __ELF__
    1.82 +    .hidden fname
    1.83 +    .type fname, %function
    1.84 +#endif
    1.85 +fname:
    1.86 +.endm
    1.87 +
    1.88 +/*
    1.89 + * Bilinear scaling support code which tries to provide pixel fetching, color
    1.90 + * format conversion, and interpolation as separate macros which can be used
    1.91 + * as the basic building blocks for constructing bilinear scanline functions.
    1.92 + */
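+
+/*
+ * Illustrative note (inferred from the code below): X and UX are 16.16
+ * fixed-point values and STRIDE is set to BOTTOM - TOP in bytes, so a fetch
+ * such as bilinear_load_8888 reads the 2x2 pixel neighbourhood of the current
+ * source coordinate, roughly as in this C sketch:
+ *
+ *     #include <stdint.h>
+ *
+ *     static void load_2x2 (const uint32_t *top, const uint32_t *bottom,
+ *                           uint32_t x, uint32_t p[4])
+ *     {
+ *         uint32_t i = x >> 16;                    // integer part of X
+ *         p[0] = top[i];    p[1] = top[i + 1];     // reg1: top pair
+ *         p[2] = bottom[i]; p[3] = bottom[i + 1];  // reg2: bottom pair
+ *     }
+ *
+ * after which the coordinate advances by x += ux ("add X, X, UX").
+ */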
    1.93 +
    1.94 +.macro bilinear_load_8888 reg1, reg2, tmp
    1.95 +    mov       TMP1, X, asr #16
    1.96 +    add       X, X, UX
    1.97 +    add       TMP1, TOP, TMP1, asl #2
    1.98 +    vld1.32   {reg1}, [TMP1], STRIDE
    1.99 +    vld1.32   {reg2}, [TMP1]
   1.100 +.endm
   1.101 +
   1.102 +.macro bilinear_load_0565 reg1, reg2, tmp
   1.103 +    mov       TMP1, X, asr #16
   1.104 +    add       X, X, UX
   1.105 +    add       TMP1, TOP, TMP1, asl #1
   1.106 +    vld1.32   {reg2[0]}, [TMP1], STRIDE
   1.107 +    vld1.32   {reg2[1]}, [TMP1]
   1.108 +    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
   1.109 +.endm
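+
+/*
+ * Illustrative note: convert_four_0565_to_x888_packed expands r5g6b5 to
+ * x8r8g8b8 by shifting each component up and replicating its high bits into
+ * the low bits, matching pixman's C convert_0565_to_0888().  Per pixel,
+ * roughly:
+ *
+ *     static uint32_t expand_0565 (uint16_t s)
+ *     {
+ *         uint32_t r = (s >> 11) & 0x1f, g = (s >> 5) & 0x3f, b = s & 0x1f;
+ *         r = (r << 3) | (r >> 2);              // 5 -> 8 bits
+ *         g = (g << 2) | (g >> 4);              // 6 -> 8 bits
+ *         b = (b << 3) | (b >> 2);              // 5 -> 8 bits
+ *         return (r << 16) | (g << 8) | b;      // alpha byte left undefined
+ *     }
+ */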
   1.110 +
   1.111 +.macro bilinear_load_and_vertical_interpolate_two_8888 \
   1.112 +                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
   1.113 +
   1.114 +    bilinear_load_8888 reg1, reg2, tmp1
   1.115 +    vmull.u8  acc1, reg1, d28
   1.116 +    vmlal.u8  acc1, reg2, d29
   1.117 +    bilinear_load_8888 reg3, reg4, tmp2
   1.118 +    vmull.u8  acc2, reg3, d28
   1.119 +    vmlal.u8  acc2, reg4, d29
   1.120 +.endm
   1.121 +
   1.122 +.macro bilinear_load_and_vertical_interpolate_four_8888 \
   1.123 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   1.124 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   1.125 +
   1.126 +    bilinear_load_and_vertical_interpolate_two_8888 \
   1.127 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
   1.128 +    bilinear_load_and_vertical_interpolate_two_8888 \
   1.129 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   1.130 +.endm
   1.131 +
   1.132 +.macro bilinear_load_and_vertical_interpolate_two_0565 \
   1.133 +                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
   1.134 +
   1.135 +    mov       TMP1, X, asr #16
   1.136 +    add       X, X, UX
   1.137 +    add       TMP1, TOP, TMP1, asl #1
   1.138 +    mov       TMP2, X, asr #16
   1.139 +    add       X, X, UX
   1.140 +    add       TMP2, TOP, TMP2, asl #1
   1.141 +    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
   1.142 +    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
   1.143 +    vld1.32   {acc2lo[1]}, [TMP1]
   1.144 +    vld1.32   {acc2hi[1]}, [TMP2]
   1.145 +    convert_0565_to_x888 acc2, reg3, reg2, reg1
   1.146 +    vzip.u8   reg1, reg3
   1.147 +    vzip.u8   reg2, reg4
   1.148 +    vzip.u8   reg3, reg4
   1.149 +    vzip.u8   reg1, reg2
   1.150 +    vmull.u8  acc1, reg1, d28
   1.151 +    vmlal.u8  acc1, reg2, d29
   1.152 +    vmull.u8  acc2, reg3, d28
   1.153 +    vmlal.u8  acc2, reg4, d29
   1.154 +.endm
   1.155 +
   1.156 +.macro bilinear_load_and_vertical_interpolate_four_0565 \
   1.157 +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   1.158 +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   1.159 +
   1.160 +    mov       TMP1, X, asr #16
   1.161 +    add       X, X, UX
   1.162 +    add       TMP1, TOP, TMP1, asl #1
   1.163 +    mov       TMP2, X, asr #16
   1.164 +    add       X, X, UX
   1.165 +    add       TMP2, TOP, TMP2, asl #1
   1.166 +    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
   1.167 +    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
   1.168 +    vld1.32   {xacc2lo[1]}, [TMP1]
   1.169 +    vld1.32   {xacc2hi[1]}, [TMP2]
   1.170 +    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
   1.171 +    mov       TMP1, X, asr #16
   1.172 +    add       X, X, UX
   1.173 +    add       TMP1, TOP, TMP1, asl #1
   1.174 +    mov       TMP2, X, asr #16
   1.175 +    add       X, X, UX
   1.176 +    add       TMP2, TOP, TMP2, asl #1
   1.177 +    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
   1.178 +    vzip.u8   xreg1, xreg3
   1.179 +    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
   1.180 +    vzip.u8   xreg2, xreg4
   1.181 +    vld1.32   {yacc2lo[1]}, [TMP1]
   1.182 +    vzip.u8   xreg3, xreg4
   1.183 +    vld1.32   {yacc2hi[1]}, [TMP2]
   1.184 +    vzip.u8   xreg1, xreg2
   1.185 +    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
   1.186 +    vmull.u8  xacc1, xreg1, d28
   1.187 +    vzip.u8   yreg1, yreg3
   1.188 +    vmlal.u8  xacc1, xreg2, d29
   1.189 +    vzip.u8   yreg2, yreg4
   1.190 +    vmull.u8  xacc2, xreg3, d28
   1.191 +    vzip.u8   yreg3, yreg4
   1.192 +    vmlal.u8  xacc2, xreg4, d29
   1.193 +    vzip.u8   yreg1, yreg2
   1.194 +    vmull.u8  yacc1, yreg1, d28
   1.195 +    vmlal.u8  yacc1, yreg2, d29
   1.196 +    vmull.u8  yacc2, yreg3, d28
   1.197 +    vmlal.u8  yacc2, yreg4, d29
   1.198 +.endm
   1.199 +
   1.200 +.macro bilinear_store_8888 numpix, tmp1, tmp2
   1.201 +.if numpix == 4
   1.202 +    vst1.32   {d0, d1}, [OUT]!
   1.203 +.elseif numpix == 2
   1.204 +    vst1.32   {d0}, [OUT]!
   1.205 +.elseif numpix == 1
   1.206 +    vst1.32   {d0[0]}, [OUT, :32]!
   1.207 +.else
   1.208 +    .error bilinear_store_8888 numpix is unsupported
   1.209 +.endif
   1.210 +.endm
   1.211 +
   1.212 +.macro bilinear_store_0565 numpix, tmp1, tmp2
   1.213 +    vuzp.u8 d0, d1
   1.214 +    vuzp.u8 d2, d3
   1.215 +    vuzp.u8 d1, d3
   1.216 +    vuzp.u8 d0, d2
   1.217 +    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
   1.218 +.if numpix == 4
   1.219 +    vst1.16   {d2}, [OUT]!
   1.220 +.elseif numpix == 2
   1.221 +    vst1.32   {d2[0]}, [OUT]!
   1.222 +.elseif numpix == 1
   1.223 +    vst1.16   {d2[0]}, [OUT]!
   1.224 +.else
   1.225 +    .error bilinear_store_0565 numpix is unsupported
   1.226 +.endif
   1.227 +.endm
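+
+/*
+ * Illustrative note: the 8888 -> 0565 packing performed by
+ * convert_8888_to_0565 above keeps the top 5/6/5 bits of each component
+ * (truncation, no rounding), roughly:
+ *
+ *     static uint16_t pack_0565 (uint32_t p)    // a8r8g8b8 -> r5g6b5
+ *     {
+ *         return ((p >> 8) & 0xf800) |          // top 5 bits of red
+ *                ((p >> 5) & 0x07e0) |          // top 6 bits of green
+ *                ((p >> 3) & 0x001f);           // top 5 bits of blue
+ *     }
+ */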
   1.228 +
   1.229 +
   1.230 +/*
   1.231 + * Macros for loading mask pixels into register 'mask'.
    1.232 + * vdup must be done somewhere else.
   1.233 + */
   1.234 +.macro bilinear_load_mask_x numpix, mask
   1.235 +.endm
   1.236 +
   1.237 +.macro bilinear_load_mask_8 numpix, mask
   1.238 +.if numpix == 4
   1.239 +    vld1.32     {mask[0]}, [MASK]!
   1.240 +.elseif numpix == 2
   1.241 +    vld1.16     {mask[0]}, [MASK]!
   1.242 +.elseif numpix == 1
   1.243 +    vld1.8      {mask[0]}, [MASK]!
   1.244 +.else
   1.245 +    .error bilinear_load_mask_8 numpix is unsupported
   1.246 +.endif
   1.247 +    pld         [MASK, #prefetch_offset]
   1.248 +.endm
   1.249 +
   1.250 +.macro bilinear_load_mask mask_fmt, numpix, mask
   1.251 +    bilinear_load_mask_&mask_fmt numpix, mask
   1.252 +.endm
   1.253 +
   1.254 +
   1.255 +/*
    1.256 + * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
   1.257 + * Interleave should be done somewhere else.
   1.258 + */
   1.259 +.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
   1.260 +.endm
   1.261 +
   1.262 +.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
   1.263 +.endm
   1.264 +
   1.265 +.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   1.266 +.if numpix == 4
   1.267 +    vld1.32     {dst0, dst1}, [OUT]
   1.268 +.elseif numpix == 2
   1.269 +    vld1.32     {dst0}, [OUT]
   1.270 +.elseif numpix == 1
   1.271 +    vld1.32     {dst0[0]}, [OUT]
   1.272 +.else
   1.273 +    .error bilinear_load_dst_8888 numpix is unsupported
   1.274 +.endif
   1.275 +    pld         [OUT, #(prefetch_offset * 4)]
   1.276 +.endm
   1.277 +
   1.278 +.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
   1.279 +    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   1.280 +.endm
   1.281 +
   1.282 +.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
   1.283 +    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   1.284 +.endm
   1.285 +
   1.286 +.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
   1.287 +    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
   1.288 +.endm
   1.289 +
   1.290 +/*
   1.291 + * Macros for duplicating partially loaded mask to fill entire register.
    1.292 + * We will apply the mask to interleaved source pixels, that is
    1.293 + *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    1.294 + *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
    1.295 + * So we need to duplicate the loaded mask to fill the whole register.
    1.296 + *
    1.297 + * For the two-pixel case
    1.298 + *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    1.299 + *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
    1.300 + * We can do some optimizations for this, including the last-pixel case.
   1.301 + */
   1.302 +.macro bilinear_duplicate_mask_x numpix, mask
   1.303 +.endm
   1.304 +
   1.305 +.macro bilinear_duplicate_mask_8 numpix, mask
   1.306 +.if numpix == 4
   1.307 +    vdup.32     mask, mask[0]
   1.308 +.elseif numpix == 2
   1.309 +    vdup.16     mask, mask[0]
   1.310 +.elseif numpix == 1
   1.311 +    vdup.8      mask, mask[0]
   1.312 +.else
    1.313 +    .error bilinear_duplicate_mask_8 numpix is unsupported
   1.314 +.endif
   1.315 +.endm
   1.316 +
   1.317 +.macro bilinear_duplicate_mask mask_fmt, numpix, mask
   1.318 +    bilinear_duplicate_mask_&mask_fmt numpix, mask
   1.319 +.endm
   1.320 +
   1.321 +/*
    1.322 + * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
    1.323 + * Interleaving should be done when a mask is used or the operator is 'over'.
   1.324 + */
   1.325 +.macro bilinear_interleave src0, src1, dst0, dst1
   1.326 +    vuzp.8      src0, src1
   1.327 +    vuzp.8      dst0, dst1
   1.328 +    vuzp.8      src0, src1
   1.329 +    vuzp.8      dst0, dst1
   1.330 +.endm
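+
+/*
+ * Note: two vuzp.8 passes over a register pair turn the byte-per-pixel
+ * layout (x0 y0 z0 w0 x1 y1 z1 w1 ...) into the planar layout
+ * (x0 x1 x2 x3 ... w0 w1 w2 w3) used by the mask/combine math, and running
+ * the same two passes again (see bilinear_deinterleave below) restores the
+ * original interleaved order.
+ */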
   1.331 +
   1.332 +.macro bilinear_interleave_src_dst_x_src \
   1.333 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.334 +.endm
   1.335 +
   1.336 +.macro bilinear_interleave_src_dst_x_over \
   1.337 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.338 +
   1.339 +    bilinear_interleave src0, src1, dst0, dst1
   1.340 +.endm
   1.341 +
   1.342 +.macro bilinear_interleave_src_dst_x_add \
   1.343 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.344 +.endm
   1.345 +
   1.346 +.macro bilinear_interleave_src_dst_8_src \
   1.347 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.348 +
   1.349 +    bilinear_interleave src0, src1, dst0, dst1
   1.350 +.endm
   1.351 +
   1.352 +.macro bilinear_interleave_src_dst_8_over \
   1.353 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.354 +
   1.355 +    bilinear_interleave src0, src1, dst0, dst1
   1.356 +.endm
   1.357 +
   1.358 +.macro bilinear_interleave_src_dst_8_add \
   1.359 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.360 +
   1.361 +    bilinear_interleave src0, src1, dst0, dst1
   1.362 +.endm
   1.363 +
   1.364 +.macro bilinear_interleave_src_dst \
   1.365 +                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
   1.366 +
   1.367 +    bilinear_interleave_src_dst_&mask_fmt&_&op \
   1.368 +                numpix, src0, src1, src01, dst0, dst1, dst01
   1.369 +.endm
   1.370 +
   1.371 +
   1.372 +/*
    1.373 + * Macros for applying the mask to src pixels (see the combine_mask_u() function).
    1.374 + * src and dst should be in interleaved form.
    1.375 + * The mask register should be in the form (m0, m1, m2, m3).
   1.376 + */
   1.377 +.macro bilinear_apply_mask_to_src_x \
   1.378 +                numpix, src0, src1, src01, mask, \
   1.379 +                tmp01, tmp23, tmp45, tmp67
   1.380 +.endm
   1.381 +
   1.382 +.macro bilinear_apply_mask_to_src_8 \
   1.383 +                numpix, src0, src1, src01, mask, \
   1.384 +                tmp01, tmp23, tmp45, tmp67
   1.385 +
   1.386 +    vmull.u8        tmp01, src0, mask
   1.387 +    vmull.u8        tmp23, src1, mask
   1.388 +    /* bubbles */
   1.389 +    vrshr.u16       tmp45, tmp01, #8
   1.390 +    vrshr.u16       tmp67, tmp23, #8
   1.391 +    /* bubbles */
   1.392 +    vraddhn.u16     src0, tmp45, tmp01
   1.393 +    vraddhn.u16     src1, tmp67, tmp23
   1.394 +.endm
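+
+/*
+ * Illustrative note: the vmull/vrshr/vraddhn sequence above is the usual
+ * pixman trick for a correctly rounded multiply-and-divide by 255; per byte
+ * it computes the same value as this C sketch:
+ *
+ *     static uint8_t mul_un8 (uint8_t a, uint8_t b)   // round (a * b / 255)
+ *     {
+ *         uint32_t t = (uint32_t) a * b + 0x80;
+ *         return (t + (t >> 8)) >> 8;
+ *     }
+ *
+ * i.e. every interpolated source channel is scaled by mask/255.
+ */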
   1.395 +
   1.396 +.macro bilinear_apply_mask_to_src \
   1.397 +                mask_fmt, numpix, src0, src1, src01, mask, \
   1.398 +                tmp01, tmp23, tmp45, tmp67
   1.399 +
   1.400 +    bilinear_apply_mask_to_src_&mask_fmt \
   1.401 +                numpix, src0, src1, src01, mask, \
   1.402 +                tmp01, tmp23, tmp45, tmp67
   1.403 +.endm
   1.404 +
   1.405 +
   1.406 +/*
   1.407 + * Macros for combining src and destination pixels.
    1.408 + * Whether interleaving is needed depends on the operator 'op'.
   1.409 + */
   1.410 +.macro bilinear_combine_src \
   1.411 +                numpix, src0, src1, src01, dst0, dst1, dst01, \
   1.412 +                tmp01, tmp23, tmp45, tmp67, tmp8
   1.413 +.endm
   1.414 +
   1.415 +.macro bilinear_combine_over \
   1.416 +                numpix, src0, src1, src01, dst0, dst1, dst01, \
   1.417 +                tmp01, tmp23, tmp45, tmp67, tmp8
   1.418 +
   1.419 +    vdup.32     tmp8, src1[1]
   1.420 +    /* bubbles */
   1.421 +    vmvn.8      tmp8, tmp8
   1.422 +    /* bubbles */
   1.423 +    vmull.u8    tmp01, dst0, tmp8
   1.424 +    /* bubbles */
   1.425 +    vmull.u8    tmp23, dst1, tmp8
   1.426 +    /* bubbles */
   1.427 +    vrshr.u16   tmp45, tmp01, #8
   1.428 +    vrshr.u16   tmp67, tmp23, #8
   1.429 +    /* bubbles */
   1.430 +    vraddhn.u16 dst0, tmp45, tmp01
   1.431 +    vraddhn.u16 dst1, tmp67, tmp23
   1.432 +    /* bubbles */
   1.433 +    vqadd.u8    src01, dst01, src01
   1.434 +.endm
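+
+/*
+ * Illustrative note: the sequence above is the premultiplied OVER operator,
+ * dst = src + dst * (255 - src.alpha) / 255, with rounded division and a
+ * saturating final add (vqadd.u8).  A scalar C sketch of the same math:
+ *
+ *     static uint32_t over (uint32_t s, uint32_t d)
+ *     {
+ *         uint32_t ia = 255 - (s >> 24), r = 0;
+ *         int i;
+ *         for (i = 0; i < 32; i += 8)
+ *         {
+ *             uint32_t c = ((d >> i) & 0xff) * ia + 0x80;
+ *             c = ((c + (c >> 8)) >> 8) + ((s >> i) & 0xff);
+ *             r |= (c > 255 ? 255 : c) << i;
+ *         }
+ *         return r;
+ *     }
+ */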
   1.435 +
   1.436 +.macro bilinear_combine_add \
   1.437 +                numpix, src0, src1, src01, dst0, dst1, dst01, \
   1.438 +                tmp01, tmp23, tmp45, tmp67, tmp8
   1.439 +
   1.440 +    vqadd.u8    src01, dst01, src01
   1.441 +.endm
   1.442 +
   1.443 +.macro bilinear_combine \
   1.444 +                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
   1.445 +                tmp01, tmp23, tmp45, tmp67, tmp8
   1.446 +
   1.447 +    bilinear_combine_&op \
   1.448 +                numpix, src0, src1, src01, dst0, dst1, dst01, \
   1.449 +                tmp01, tmp23, tmp45, tmp67, tmp8
   1.450 +.endm
   1.451 +
   1.452 +/*
   1.453 + * Macros for final deinterleaving of destination pixels if needed.
   1.454 + */
   1.455 +.macro bilinear_deinterleave numpix, dst0, dst1, dst01
   1.456 +    vuzp.8      dst0, dst1
   1.457 +    /* bubbles */
   1.458 +    vuzp.8      dst0, dst1
   1.459 +.endm
   1.460 +
   1.461 +.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
   1.462 +.endm
   1.463 +
   1.464 +.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
   1.465 +    bilinear_deinterleave numpix, dst0, dst1, dst01
   1.466 +.endm
   1.467 +
   1.468 +.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
   1.469 +.endm
   1.470 +
   1.471 +.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
   1.472 +    bilinear_deinterleave numpix, dst0, dst1, dst01
   1.473 +.endm
   1.474 +
   1.475 +.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
   1.476 +    bilinear_deinterleave numpix, dst0, dst1, dst01
   1.477 +.endm
   1.478 +
   1.479 +.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
   1.480 +    bilinear_deinterleave numpix, dst0, dst1, dst01
   1.481 +.endm
   1.482 +
   1.483 +.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
   1.484 +    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
   1.485 +.endm
   1.486 +
   1.487 +
   1.488 +.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
   1.489 +    bilinear_load_&src_fmt d0, d1, d2
   1.490 +    bilinear_load_mask mask_fmt, 1, d4
   1.491 +    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
   1.492 +    vmull.u8  q1, d0, d28
   1.493 +    vmlal.u8  q1, d1, d29
   1.494 +    /* 5 cycles bubble */
   1.495 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   1.496 +    vmlsl.u16 q0, d2, d30
   1.497 +    vmlal.u16 q0, d3, d30
   1.498 +    /* 5 cycles bubble */
   1.499 +    bilinear_duplicate_mask mask_fmt, 1, d4
   1.500 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.501 +    /* 3 cycles bubble */
   1.502 +    vmovn.u16 d0, q0
   1.503 +    /* 1 cycle bubble */
   1.504 +    bilinear_interleave_src_dst \
   1.505 +                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
   1.506 +    bilinear_apply_mask_to_src \
   1.507 +                mask_fmt, 1, d0, d1, q0, d4, \
   1.508 +                q3, q8, q10, q11
   1.509 +    bilinear_combine \
   1.510 +                op, 1, d0, d1, q0, d18, d19, q9, \
   1.511 +                q3, q8, q10, q11, d5
   1.512 +    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
   1.513 +    bilinear_store_&dst_fmt 1, q2, q3
   1.514 +.endm
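+
+/*
+ * Illustrative note: per channel, the interpolation above computes the
+ * following (assuming the usual pixman convention that the vertical weights
+ * satisfy wt + wb == 1 << BILINEAR_INTERPOLATION_BITS; the shifts truncate):
+ *
+ *     static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
+ *                                      uint8_t bl, uint8_t br,
+ *                                      int wt, int wb, int wx)
+ *     {
+ *         int range = 1 << BILINEAR_INTERPOLATION_BITS;
+ *         int l = tl * wt + bl * wb;            // vertical pass (vmull/vmlal)
+ *         int r = tr * wt + br * wb;
+ *         return (l * (range - wx) + r * wx)    // horizontal pass
+ *                >> (2 * BILINEAR_INTERPOLATION_BITS);
+ *     }
+ *
+ * where wx is the fractional part of X reduced to BILINEAR_INTERPOLATION_BITS
+ * bits (the "vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)" step).
+ */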
   1.515 +
   1.516 +.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
   1.517 +    bilinear_load_and_vertical_interpolate_two_&src_fmt \
   1.518 +                q1, q11, d0, d1, d20, d21, d22, d23
   1.519 +    bilinear_load_mask mask_fmt, 2, d4
   1.520 +    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
   1.521 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   1.522 +    vmlsl.u16 q0, d2, d30
   1.523 +    vmlal.u16 q0, d3, d30
   1.524 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   1.525 +    vmlsl.u16 q10, d22, d31
   1.526 +    vmlal.u16 q10, d23, d31
   1.527 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.528 +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.529 +    bilinear_duplicate_mask mask_fmt, 2, d4
   1.530 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.531 +    vadd.u16  q12, q12, q13
   1.532 +    vmovn.u16 d0, q0
   1.533 +    bilinear_interleave_src_dst \
   1.534 +                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
   1.535 +    bilinear_apply_mask_to_src \
   1.536 +                mask_fmt, 2, d0, d1, q0, d4, \
   1.537 +                q3, q8, q10, q11
   1.538 +    bilinear_combine \
   1.539 +                op, 2, d0, d1, q0, d18, d19, q9, \
   1.540 +                q3, q8, q10, q11, d5
   1.541 +    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
   1.542 +    bilinear_store_&dst_fmt 2, q2, q3
   1.543 +.endm
   1.544 +
   1.545 +.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
   1.546 +    bilinear_load_and_vertical_interpolate_four_&src_fmt \
   1.547 +                q1, q11, d0, d1, d20, d21, d22, d23 \
   1.548 +                q3, q9,  d4, d5, d16, d17, d18, d19
   1.549 +    pld       [TMP1, PF_OFFS]
   1.550 +    sub       TMP1, TMP1, STRIDE
   1.551 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   1.552 +    vmlsl.u16 q0, d2, d30
   1.553 +    vmlal.u16 q0, d3, d30
   1.554 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   1.555 +    vmlsl.u16 q10, d22, d31
   1.556 +    vmlal.u16 q10, d23, d31
   1.557 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.558 +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
   1.559 +    vmlsl.u16 q2, d6, d30
   1.560 +    vmlal.u16 q2, d7, d30
   1.561 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
   1.562 +    bilinear_load_mask mask_fmt, 4, d22
   1.563 +    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
   1.564 +    pld       [TMP1, PF_OFFS]
   1.565 +    vmlsl.u16 q8, d18, d31
   1.566 +    vmlal.u16 q8, d19, d31
   1.567 +    vadd.u16  q12, q12, q13
   1.568 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.569 +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.570 +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.571 +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.572 +    bilinear_duplicate_mask mask_fmt, 4, d22
   1.573 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.574 +    vmovn.u16 d0, q0
   1.575 +    vmovn.u16 d1, q2
   1.576 +    vadd.u16  q12, q12, q13
   1.577 +    bilinear_interleave_src_dst \
   1.578 +                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
   1.579 +    bilinear_apply_mask_to_src \
   1.580 +                mask_fmt, 4, d0, d1, q0, d22, \
   1.581 +                q3, q8, q9, q10
   1.582 +    bilinear_combine \
   1.583 +                op, 4, d0, d1, q0, d2, d3, q1, \
   1.584 +                q3, q8, q9, q10, d23
   1.585 +    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
   1.586 +    bilinear_store_&dst_fmt 4, q2, q3
   1.587 +.endm
   1.588 +
   1.589 +.set BILINEAR_FLAG_USE_MASK,		1
   1.590 +.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
   1.591 +
   1.592 +/*
   1.593 + * Main template macro for generating NEON optimized bilinear scanline functions.
   1.594 + *
    1.595 + * The bilinear scanline generator macro takes the following arguments:
   1.596 + *  fname			- name of the function to generate
   1.597 + *  src_fmt			- source color format (8888 or 0565)
   1.598 + *  dst_fmt			- destination color format (8888 or 0565)
   1.599 + *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes
    1.600 + *  process_last_pixel		- code block that interpolates one pixel and does not
    1.601 + *				  update the horizontal weight
    1.602 + *  process_two_pixels		- code block that interpolates two pixels and updates
    1.603 + *				  the horizontal weight
    1.604 + *  process_four_pixels		- code block that interpolates four pixels and updates
    1.605 + *				  the horizontal weight
   1.606 + *  process_pixblock_head	- head part of middle loop
   1.607 + *  process_pixblock_tail	- tail part of middle loop
   1.608 + *  process_pixblock_tail_head	- tail_head of middle loop
   1.609 + *  pixblock_size		- number of pixels processed in a single middle loop
   1.610 + *  prefetch_distance		- prefetch in the source image by that many pixels ahead
   1.611 + */
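+
+/*
+ * Note (inferred from the register/stack layout below, not quoted from the
+ * pixman headers): the generated functions are callable from C roughly as
+ *
+ *     void fname (dst_t *out, const src_t *top, const src_t *bottom,
+ *                 int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux,
+ *                 int width);
+ *
+ * with an additional 'const uint8_t *mask' argument right after 'out' when
+ * BILINEAR_FLAG_USE_MASK is set.
+ */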
   1.612 +
   1.613 +.macro generate_bilinear_scanline_func \
   1.614 +	fname, \
   1.615 +	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
   1.616 +	bilinear_process_last_pixel, \
   1.617 +	bilinear_process_two_pixels, \
   1.618 +	bilinear_process_four_pixels, \
   1.619 +	bilinear_process_pixblock_head, \
   1.620 +	bilinear_process_pixblock_tail, \
   1.621 +	bilinear_process_pixblock_tail_head, \
   1.622 +	pixblock_size, \
   1.623 +	prefetch_distance, \
   1.624 +	flags
   1.625 +
   1.626 +pixman_asm_function fname
   1.627 +.if pixblock_size == 8
   1.628 +.elseif pixblock_size == 4
   1.629 +.else
   1.630 +    .error unsupported pixblock size
   1.631 +.endif
   1.632 +
   1.633 +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
   1.634 +    OUT       .req    r0
   1.635 +    TOP       .req    r1
   1.636 +    BOTTOM    .req    r2
   1.637 +    WT        .req    r3
   1.638 +    WB        .req    r4
   1.639 +    X         .req    r5
   1.640 +    UX        .req    r6
   1.641 +    WIDTH     .req    ip
   1.642 +    TMP1      .req    r3
   1.643 +    TMP2      .req    r4
   1.644 +    PF_OFFS   .req    r7
   1.645 +    TMP3      .req    r8
   1.646 +    TMP4      .req    r9
   1.647 +    STRIDE    .req    r2
   1.648 +
   1.649 +    mov		ip, sp
   1.650 +    push	{r4, r5, r6, r7, r8, r9}
   1.651 +    mov		PF_OFFS, #prefetch_distance
   1.652 +    ldmia	ip, {WB, X, UX, WIDTH}
   1.653 +.else
   1.654 +    OUT       .req      r0
   1.655 +    MASK      .req      r1
   1.656 +    TOP       .req      r2
   1.657 +    BOTTOM    .req      r3
   1.658 +    WT        .req      r4
   1.659 +    WB        .req      r5
   1.660 +    X         .req      r6
   1.661 +    UX        .req      r7
   1.662 +    WIDTH     .req      ip
   1.663 +    TMP1      .req      r4
   1.664 +    TMP2      .req      r5
   1.665 +    PF_OFFS   .req      r8
   1.666 +    TMP3      .req      r9
   1.667 +    TMP4      .req      r10
   1.668 +    STRIDE    .req      r3
   1.669 +
   1.670 +    .set prefetch_offset, prefetch_distance
   1.671 +
   1.672 +    mov       ip, sp
   1.673 +    push      {r4, r5, r6, r7, r8, r9, r10, ip}
   1.674 +    mov       PF_OFFS, #prefetch_distance
   1.675 +    ldmia     ip, {WT, WB, X, UX, WIDTH}
   1.676 +.endif
   1.677 +
   1.678 +    mul       PF_OFFS, PF_OFFS, UX
   1.679 +
   1.680 +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   1.681 +    vpush     {d8-d15}
   1.682 +.endif
   1.683 +
   1.684 +    sub	      STRIDE, BOTTOM, TOP
   1.685 +    .unreq    BOTTOM
   1.686 +
   1.687 +    cmp       WIDTH, #0
   1.688 +    ble       3f
   1.689 +
   1.690 +    vdup.u16  q12, X
   1.691 +    vdup.u16  q13, UX
   1.692 +    vdup.u8   d28, WT
   1.693 +    vdup.u8   d29, WB
   1.694 +    vadd.u16  d25, d25, d26
   1.695 +
   1.696 +    /* ensure good destination alignment  */
   1.697 +    cmp       WIDTH, #1
   1.698 +    blt       0f
   1.699 +    tst       OUT, #(1 << dst_bpp_shift)
   1.700 +    beq       0f
   1.701 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.702 +    vadd.u16  q12, q12, q13
   1.703 +    bilinear_process_last_pixel
   1.704 +    sub       WIDTH, WIDTH, #1
   1.705 +0:
   1.706 +    vadd.u16  q13, q13, q13
   1.707 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.708 +    vadd.u16  q12, q12, q13
   1.709 +
   1.710 +    cmp       WIDTH, #2
   1.711 +    blt       0f
   1.712 +    tst       OUT, #(1 << (dst_bpp_shift + 1))
   1.713 +    beq       0f
   1.714 +    bilinear_process_two_pixels
   1.715 +    sub       WIDTH, WIDTH, #2
   1.716 +0:
   1.717 +.if pixblock_size == 8
   1.718 +    cmp       WIDTH, #4
   1.719 +    blt       0f
   1.720 +    tst       OUT, #(1 << (dst_bpp_shift + 2))
   1.721 +    beq       0f
   1.722 +    bilinear_process_four_pixels
   1.723 +    sub       WIDTH, WIDTH, #4
   1.724 +0:
   1.725 +.endif
   1.726 +    subs      WIDTH, WIDTH, #pixblock_size
   1.727 +    blt       1f
   1.728 +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   1.729 +    bilinear_process_pixblock_head
   1.730 +    subs      WIDTH, WIDTH, #pixblock_size
   1.731 +    blt       5f
   1.732 +0:
   1.733 +    bilinear_process_pixblock_tail_head
   1.734 +    subs      WIDTH, WIDTH, #pixblock_size
   1.735 +    bge       0b
   1.736 +5:
   1.737 +    bilinear_process_pixblock_tail
   1.738 +1:
   1.739 +.if pixblock_size == 8
   1.740 +    tst       WIDTH, #4
   1.741 +    beq       2f
   1.742 +    bilinear_process_four_pixels
   1.743 +2:
   1.744 +.endif
   1.745 +    /* handle the remaining trailing pixels */
   1.746 +    tst       WIDTH, #2
   1.747 +    beq       2f
   1.748 +    bilinear_process_two_pixels
   1.749 +2:
   1.750 +    tst       WIDTH, #1
   1.751 +    beq       3f
   1.752 +    bilinear_process_last_pixel
   1.753 +3:
   1.754 +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   1.755 +    vpop      {d8-d15}
   1.756 +.endif
   1.757 +
   1.758 +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
   1.759 +    pop       {r4, r5, r6, r7, r8, r9}
   1.760 +.else
   1.761 +    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
   1.762 +.endif
   1.763 +    bx        lr
   1.764 +
   1.765 +    .unreq    OUT
   1.766 +    .unreq    TOP
   1.767 +    .unreq    WT
   1.768 +    .unreq    WB
   1.769 +    .unreq    X
   1.770 +    .unreq    UX
   1.771 +    .unreq    WIDTH
   1.772 +    .unreq    TMP1
   1.773 +    .unreq    TMP2
   1.774 +    .unreq    PF_OFFS
   1.775 +    .unreq    TMP3
   1.776 +    .unreq    TMP4
   1.777 +    .unreq    STRIDE
   1.778 +.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
   1.779 +    .unreq    MASK
   1.780 +.endif
   1.781 +
   1.782 +.endfunc
   1.783 +
   1.784 +.endm
   1.785 +
   1.786 +/* src_8888_8_8888 */
   1.787 +.macro bilinear_src_8888_8_8888_process_last_pixel
   1.788 +    bilinear_interpolate_last_pixel 8888, 8, 8888, src
   1.789 +.endm
   1.790 +
   1.791 +.macro bilinear_src_8888_8_8888_process_two_pixels
   1.792 +    bilinear_interpolate_two_pixels 8888, 8, 8888, src
   1.793 +.endm
   1.794 +
   1.795 +.macro bilinear_src_8888_8_8888_process_four_pixels
   1.796 +    bilinear_interpolate_four_pixels 8888, 8, 8888, src
   1.797 +.endm
   1.798 +
   1.799 +.macro bilinear_src_8888_8_8888_process_pixblock_head
   1.800 +    bilinear_src_8888_8_8888_process_four_pixels
   1.801 +.endm
   1.802 +
   1.803 +.macro bilinear_src_8888_8_8888_process_pixblock_tail
   1.804 +.endm
   1.805 +
   1.806 +.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
   1.807 +    bilinear_src_8888_8_8888_process_pixblock_tail
   1.808 +    bilinear_src_8888_8_8888_process_pixblock_head
   1.809 +.endm
   1.810 +
   1.811 +/* src_8888_8_0565 */
   1.812 +.macro bilinear_src_8888_8_0565_process_last_pixel
   1.813 +    bilinear_interpolate_last_pixel 8888, 8, 0565, src
   1.814 +.endm
   1.815 +
   1.816 +.macro bilinear_src_8888_8_0565_process_two_pixels
   1.817 +    bilinear_interpolate_two_pixels 8888, 8, 0565, src
   1.818 +.endm
   1.819 +
   1.820 +.macro bilinear_src_8888_8_0565_process_four_pixels
   1.821 +    bilinear_interpolate_four_pixels 8888, 8, 0565, src
   1.822 +.endm
   1.823 +
   1.824 +.macro bilinear_src_8888_8_0565_process_pixblock_head
   1.825 +    bilinear_src_8888_8_0565_process_four_pixels
   1.826 +.endm
   1.827 +
   1.828 +.macro bilinear_src_8888_8_0565_process_pixblock_tail
   1.829 +.endm
   1.830 +
   1.831 +.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
   1.832 +    bilinear_src_8888_8_0565_process_pixblock_tail
   1.833 +    bilinear_src_8888_8_0565_process_pixblock_head
   1.834 +.endm
   1.835 +
   1.836 +/* src_0565_8_x888 */
   1.837 +.macro bilinear_src_0565_8_x888_process_last_pixel
   1.838 +    bilinear_interpolate_last_pixel 0565, 8, 8888, src
   1.839 +.endm
   1.840 +
   1.841 +.macro bilinear_src_0565_8_x888_process_two_pixels
   1.842 +    bilinear_interpolate_two_pixels 0565, 8, 8888, src
   1.843 +.endm
   1.844 +
   1.845 +.macro bilinear_src_0565_8_x888_process_four_pixels
   1.846 +    bilinear_interpolate_four_pixels 0565, 8, 8888, src
   1.847 +.endm
   1.848 +
   1.849 +.macro bilinear_src_0565_8_x888_process_pixblock_head
   1.850 +    bilinear_src_0565_8_x888_process_four_pixels
   1.851 +.endm
   1.852 +
   1.853 +.macro bilinear_src_0565_8_x888_process_pixblock_tail
   1.854 +.endm
   1.855 +
   1.856 +.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
   1.857 +    bilinear_src_0565_8_x888_process_pixblock_tail
   1.858 +    bilinear_src_0565_8_x888_process_pixblock_head
   1.859 +.endm
   1.860 +
   1.861 +/* src_0565_8_0565 */
   1.862 +.macro bilinear_src_0565_8_0565_process_last_pixel
   1.863 +    bilinear_interpolate_last_pixel 0565, 8, 0565, src
   1.864 +.endm
   1.865 +
   1.866 +.macro bilinear_src_0565_8_0565_process_two_pixels
   1.867 +    bilinear_interpolate_two_pixels 0565, 8, 0565, src
   1.868 +.endm
   1.869 +
   1.870 +.macro bilinear_src_0565_8_0565_process_four_pixels
   1.871 +    bilinear_interpolate_four_pixels 0565, 8, 0565, src
   1.872 +.endm
   1.873 +
   1.874 +.macro bilinear_src_0565_8_0565_process_pixblock_head
   1.875 +    bilinear_src_0565_8_0565_process_four_pixels
   1.876 +.endm
   1.877 +
   1.878 +.macro bilinear_src_0565_8_0565_process_pixblock_tail
   1.879 +.endm
   1.880 +
   1.881 +.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
   1.882 +    bilinear_src_0565_8_0565_process_pixblock_tail
   1.883 +    bilinear_src_0565_8_0565_process_pixblock_head
   1.884 +.endm
   1.885 +
   1.886 +/* over_8888_8888 */
   1.887 +.macro bilinear_over_8888_8888_process_last_pixel
   1.888 +    bilinear_interpolate_last_pixel 8888, x, 8888, over
   1.889 +.endm
   1.890 +
   1.891 +.macro bilinear_over_8888_8888_process_two_pixels
   1.892 +    bilinear_interpolate_two_pixels 8888, x, 8888, over
   1.893 +.endm
   1.894 +
   1.895 +.macro bilinear_over_8888_8888_process_four_pixels
   1.896 +    bilinear_interpolate_four_pixels 8888, x, 8888, over
   1.897 +.endm
   1.898 +
   1.899 +.macro bilinear_over_8888_8888_process_pixblock_head
   1.900 +    mov         TMP1, X, asr #16
   1.901 +    add         X, X, UX
   1.902 +    add         TMP1, TOP, TMP1, asl #2
   1.903 +    mov         TMP2, X, asr #16
   1.904 +    add         X, X, UX
   1.905 +    add         TMP2, TOP, TMP2, asl #2
   1.906 +
   1.907 +    vld1.32     {d22}, [TMP1], STRIDE
   1.908 +    vld1.32     {d23}, [TMP1]
   1.909 +    mov         TMP3, X, asr #16
   1.910 +    add         X, X, UX
   1.911 +    add         TMP3, TOP, TMP3, asl #2
   1.912 +    vmull.u8    q8, d22, d28
   1.913 +    vmlal.u8    q8, d23, d29
   1.914 +
   1.915 +    vld1.32     {d22}, [TMP2], STRIDE
   1.916 +    vld1.32     {d23}, [TMP2]
   1.917 +    mov         TMP4, X, asr #16
   1.918 +    add         X, X, UX
   1.919 +    add         TMP4, TOP, TMP4, asl #2
   1.920 +    vmull.u8    q9, d22, d28
   1.921 +    vmlal.u8    q9, d23, d29
   1.922 +
   1.923 +    vld1.32     {d22}, [TMP3], STRIDE
   1.924 +    vld1.32     {d23}, [TMP3]
   1.925 +    vmull.u8    q10, d22, d28
   1.926 +    vmlal.u8    q10, d23, d29
   1.927 +
   1.928 +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
   1.929 +    vmlsl.u16   q0, d16, d30
   1.930 +    vmlal.u16   q0, d17, d30
   1.931 +
   1.932 +    pld         [TMP4, PF_OFFS]
   1.933 +    vld1.32     {d16}, [TMP4], STRIDE
   1.934 +    vld1.32     {d17}, [TMP4]
   1.935 +    pld         [TMP4, PF_OFFS]
   1.936 +    vmull.u8    q11, d16, d28
   1.937 +    vmlal.u8    q11, d17, d29
   1.938 +
   1.939 +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
   1.940 +    vmlsl.u16   q1, d18, d31
   1.941 +    vmlal.u16   q1, d19, d31
   1.942 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.943 +    vadd.u16    q12, q12, q13
   1.944 +.endm
   1.945 +
   1.946 +.macro bilinear_over_8888_8888_process_pixblock_tail
   1.947 +    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
   1.948 +    vmlsl.u16   q2, d20, d30
   1.949 +    vmlal.u16   q2, d21, d30
   1.950 +    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
   1.951 +    vmlsl.u16   q3, d22, d31
   1.952 +    vmlal.u16   q3, d23, d31
   1.953 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.954 +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.955 +    vld1.32     {d2, d3}, [OUT, :128]
   1.956 +    pld         [OUT, #(prefetch_offset * 4)]
   1.957 +    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.958 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   1.959 +    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.960 +    vmovn.u16   d6, q0
   1.961 +    vmovn.u16   d7, q2
   1.962 +    vuzp.8      d6, d7
   1.963 +    vuzp.8      d2, d3
   1.964 +    vuzp.8      d6, d7
   1.965 +    vuzp.8      d2, d3
   1.966 +    vdup.32     d4, d7[1]
   1.967 +    vmvn.8      d4, d4
   1.968 +    vmull.u8    q11, d2, d4
   1.969 +    vmull.u8    q2, d3, d4
   1.970 +    vrshr.u16   q1, q11, #8
   1.971 +    vrshr.u16   q10, q2, #8
   1.972 +    vraddhn.u16 d2, q1, q11
   1.973 +    vraddhn.u16 d3, q10, q2
   1.974 +    vqadd.u8    q3, q1, q3
   1.975 +    vuzp.8      d6, d7
   1.976 +    vuzp.8      d6, d7
   1.977 +    vadd.u16    q12, q12, q13
   1.978 +    vst1.32     {d6, d7}, [OUT, :128]!
   1.979 +.endm
   1.980 +
   1.981 +.macro bilinear_over_8888_8888_process_pixblock_tail_head
   1.982 +                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
   1.983 +    mov         TMP1, X, asr #16
   1.984 +    add         X, X, UX
   1.985 +    add         TMP1, TOP, TMP1, asl #2
   1.986 +                                            vmlsl.u16   q2, d20, d30
   1.987 +    mov         TMP2, X, asr #16
   1.988 +    add         X, X, UX
   1.989 +    add         TMP2, TOP, TMP2, asl #2
   1.990 +                                            vmlal.u16   q2, d21, d30
   1.991 +                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
   1.992 +    vld1.32     {d20}, [TMP1], STRIDE
   1.993 +                                            vmlsl.u16   q3, d22, d31
   1.994 +                                            vmlal.u16   q3, d23, d31
   1.995 +    vld1.32     {d21}, [TMP1]
   1.996 +    vmull.u8    q8, d20, d28
   1.997 +    vmlal.u8    q8, d21, d29
   1.998 +                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   1.999 +                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1000 +                                            vld1.32     {d2, d3}, [OUT, :128]
  1.1001 +                                            pld         [OUT, PF_OFFS]
  1.1002 +                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1003 +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1004 +    vld1.32     {d22}, [TMP2], STRIDE
  1.1005 +                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1006 +                                            vmovn.u16   d6, q0
  1.1007 +    vld1.32     {d23}, [TMP2]
  1.1008 +    vmull.u8    q9, d22, d28
  1.1009 +    mov         TMP3, X, asr #16
  1.1010 +    add         X, X, UX
  1.1011 +    add         TMP3, TOP, TMP3, asl #2
  1.1012 +    mov         TMP4, X, asr #16
  1.1013 +    add         X, X, UX
  1.1014 +    add         TMP4, TOP, TMP4, asl #2
  1.1015 +    vmlal.u8    q9, d23, d29
  1.1016 +                                            vmovn.u16   d7, q2
  1.1017 +    vld1.32     {d22}, [TMP3], STRIDE
  1.1018 +                                            vuzp.8      d6, d7
  1.1019 +                                            vuzp.8      d2, d3
  1.1020 +                                            vuzp.8      d6, d7
  1.1021 +                                            vuzp.8      d2, d3
  1.1022 +                                            vdup.32     d4, d7[1]
  1.1023 +    vld1.32     {d23}, [TMP3]
  1.1024 +                                            vmvn.8      d4, d4
  1.1025 +    vmull.u8    q10, d22, d28
  1.1026 +    vmlal.u8    q10, d23, d29
  1.1027 +                                            vmull.u8    q11, d2, d4
  1.1028 +                                            vmull.u8    q2, d3, d4
  1.1029 +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
  1.1030 +    vmlsl.u16   q0, d16, d30
  1.1031 +                                            vrshr.u16   q1, q11, #8
  1.1032 +    vmlal.u16   q0, d17, d30
  1.1033 +                                            vrshr.u16   q8, q2, #8
  1.1034 +                                            vraddhn.u16 d2, q1, q11
  1.1035 +                                            vraddhn.u16 d3, q8, q2
  1.1036 +    pld         [TMP4, PF_OFFS]
  1.1037 +    vld1.32     {d16}, [TMP4], STRIDE
  1.1038 +                                            vqadd.u8    q3, q1, q3
  1.1039 +    vld1.32     {d17}, [TMP4]
  1.1040 +    pld         [TMP4, PF_OFFS]
  1.1041 +    vmull.u8    q11, d16, d28
  1.1042 +    vmlal.u8    q11, d17, d29
  1.1043 +                                            vuzp.8      d6, d7
  1.1044 +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
  1.1045 +                                            vuzp.8      d6, d7
  1.1046 +    vmlsl.u16   q1, d18, d31
  1.1047 +                                            vadd.u16    q12, q12, q13
  1.1048 +    vmlal.u16   q1, d19, d31
  1.1049 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1050 +    vadd.u16    q12, q12, q13
  1.1051 +                                            vst1.32     {d6, d7}, [OUT, :128]!
  1.1052 +.endm
  1.1053 +
  1.1054 +/* over_8888_8_8888 */
  1.1055 +.macro bilinear_over_8888_8_8888_process_last_pixel
  1.1056 +    bilinear_interpolate_last_pixel 8888, 8, 8888, over
  1.1057 +.endm
  1.1058 +
  1.1059 +.macro bilinear_over_8888_8_8888_process_two_pixels
  1.1060 +    bilinear_interpolate_two_pixels 8888, 8, 8888, over
  1.1061 +.endm
  1.1062 +
  1.1063 +.macro bilinear_over_8888_8_8888_process_four_pixels
  1.1064 +    bilinear_interpolate_four_pixels 8888, 8, 8888, over
  1.1065 +.endm
  1.1066 +
  1.1067 +.macro bilinear_over_8888_8_8888_process_pixblock_head
  1.1068 +    mov         TMP1, X, asr #16
  1.1069 +    add         X, X, UX
  1.1070 +    add         TMP1, TOP, TMP1, asl #2
  1.1071 +    vld1.32     {d0}, [TMP1], STRIDE
  1.1072 +    mov         TMP2, X, asr #16
  1.1073 +    add         X, X, UX
  1.1074 +    add         TMP2, TOP, TMP2, asl #2
  1.1075 +    vld1.32     {d1}, [TMP1]
  1.1076 +    mov         TMP3, X, asr #16
  1.1077 +    add         X, X, UX
  1.1078 +    add         TMP3, TOP, TMP3, asl #2
  1.1079 +    vld1.32     {d2}, [TMP2], STRIDE
  1.1080 +    mov         TMP4, X, asr #16
  1.1081 +    add         X, X, UX
  1.1082 +    add         TMP4, TOP, TMP4, asl #2
  1.1083 +    vld1.32     {d3}, [TMP2]
  1.1084 +    vmull.u8    q2, d0, d28
  1.1085 +    vmull.u8    q3, d2, d28
  1.1086 +    vmlal.u8    q2, d1, d29
  1.1087 +    vmlal.u8    q3, d3, d29
  1.1088 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
  1.1089 +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
  1.1090 +    vmlsl.u16   q0, d4, d30
  1.1091 +    vmlsl.u16   q1, d6, d31
  1.1092 +    vmlal.u16   q0, d5, d30
  1.1093 +    vmlal.u16   q1, d7, d31
  1.1094 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1095 +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1096 +    vld1.32     {d2}, [TMP3], STRIDE
  1.1097 +    vld1.32     {d3}, [TMP3]
  1.1098 +    pld         [TMP4, PF_OFFS]
  1.1099 +    vld1.32     {d4}, [TMP4], STRIDE
  1.1100 +    vld1.32     {d5}, [TMP4]
  1.1101 +    pld         [TMP4, PF_OFFS]
  1.1102 +    vmull.u8    q3, d2, d28
  1.1103 +    vmlal.u8    q3, d3, d29
  1.1104 +    vmull.u8    q1, d4, d28
  1.1105 +    vmlal.u8    q1, d5, d29
  1.1106 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1107 +    vld1.32     {d22[0]}, [MASK]!
  1.1108 +    pld         [MASK, #prefetch_offset]
  1.1109 +    vadd.u16    q12, q12, q13
  1.1110 +    vmovn.u16   d16, q0
  1.1111 +.endm
  1.1112 +
  1.1113 +.macro bilinear_over_8888_8_8888_process_pixblock_tail
  1.1114 +    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
  1.1115 +    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
  1.1116 +    vmlsl.u16   q9, d6, d30
  1.1117 +    vmlsl.u16   q10, d2, d31
  1.1118 +    vmlal.u16   q9, d7, d30
  1.1119 +    vmlal.u16   q10, d3, d31
  1.1120 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1121 +    vadd.u16    q12, q12, q13
  1.1122 +    vdup.32     d22, d22[0]
  1.1123 +    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1124 +    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1125 +    vmovn.u16   d17, q9
  1.1126 +    vld1.32     {d18, d19}, [OUT, :128]
  1.1127 +    pld         [OUT, PF_OFFS]
  1.1128 +    vuzp.8      d16, d17
  1.1129 +    vuzp.8      d18, d19
  1.1130 +    vuzp.8      d16, d17
  1.1131 +    vuzp.8      d18, d19
  1.1132 +    vmull.u8    q10, d16, d22
  1.1133 +    vmull.u8    q11, d17, d22
  1.1134 +    vrsra.u16   q10, q10, #8
  1.1135 +    vrsra.u16   q11, q11, #8
  1.1136 +    vrshrn.u16  d16, q10, #8
  1.1137 +    vrshrn.u16  d17, q11, #8
  1.1138 +    vdup.32     d22, d17[1]
  1.1139 +    vmvn.8      d22, d22
  1.1140 +    vmull.u8    q10, d18, d22
  1.1141 +    vmull.u8    q11, d19, d22
  1.1142 +    vrshr.u16   q9, q10, #8
  1.1143 +    vrshr.u16   q0, q11, #8
  1.1144 +    vraddhn.u16 d18, q9, q10
  1.1145 +    vraddhn.u16 d19, q0, q11
  1.1146 +    vqadd.u8    q9, q8, q9
  1.1147 +    vuzp.8      d18, d19
  1.1148 +    vuzp.8      d18, d19
  1.1149 +    vst1.32     {d18, d19}, [OUT, :128]!
  1.1150 +.endm
  1.1151 +
  1.1152 +.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
  1.1153 +                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
  1.1154 +    mov         TMP1, X, asr #16
  1.1155 +    add         X, X, UX
  1.1156 +    add         TMP1, TOP, TMP1, asl #2
  1.1157 +                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
  1.1158 +    vld1.32     {d0}, [TMP1], STRIDE
  1.1159 +    mov         TMP2, X, asr #16
  1.1160 +    add         X, X, UX
  1.1161 +    add         TMP2, TOP, TMP2, asl #2
  1.1162 +                                            vmlsl.u16   q9, d6, d30
  1.1163 +                                            vmlsl.u16   q10, d2, d31
  1.1164 +    vld1.32     {d1}, [TMP1]
  1.1165 +    mov         TMP3, X, asr #16
  1.1166 +    add         X, X, UX
  1.1167 +    add         TMP3, TOP, TMP3, asl #2
  1.1168 +                                            vmlal.u16   q9, d7, d30
  1.1169 +                                            vmlal.u16   q10, d3, d31
  1.1170 +    vld1.32     {d2}, [TMP2], STRIDE
  1.1171 +    mov         TMP4, X, asr #16
  1.1172 +    add         X, X, UX
  1.1173 +    add         TMP4, TOP, TMP4, asl #2
  1.1174 +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1175 +                                            vadd.u16    q12, q12, q13
  1.1176 +    vld1.32     {d3}, [TMP2]
  1.1177 +                                            vdup.32     d22, d22[0]
  1.1178 +                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1179 +                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1180 +    vmull.u8    q2, d0, d28
  1.1181 +    vmull.u8    q3, d2, d28
  1.1182 +                                            vmovn.u16   d17, q9
  1.1183 +                                            vld1.32     {d18, d19}, [OUT, :128]
  1.1184 +                                            pld         [OUT, #(prefetch_offset * 4)]
  1.1185 +    vmlal.u8    q2, d1, d29
  1.1186 +    vmlal.u8    q3, d3, d29
  1.1187 +                                            vuzp.8      d16, d17
  1.1188 +                                            vuzp.8      d18, d19
  1.1189 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
  1.1190 +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
  1.1191 +                                            vuzp.8      d16, d17
  1.1192 +                                            vuzp.8      d18, d19
  1.1193 +    vmlsl.u16   q0, d4, d30
  1.1194 +    vmlsl.u16   q1, d6, d31
  1.1195 +                                            vmull.u8    q10, d16, d22
  1.1196 +                                            vmull.u8    q11, d17, d22
  1.1197 +    vmlal.u16   q0, d5, d30
  1.1198 +    vmlal.u16   q1, d7, d31
  1.1199 +                                            vrsra.u16   q10, q10, #8
  1.1200 +                                            vrsra.u16   q11, q11, #8
  1.1201 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1202 +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1.1203 +                                            vrshrn.u16  d16, q10, #8
  1.1204 +                                            vrshrn.u16  d17, q11, #8
  1.1205 +    vld1.32     {d2}, [TMP3], STRIDE
  1.1206 +                                            vdup.32     d22, d17[1]
  1.1207 +    vld1.32     {d3}, [TMP3]
  1.1208 +                                            vmvn.8      d22, d22
  1.1209 +    pld         [TMP4, PF_OFFS]
  1.1210 +    vld1.32     {d4}, [TMP4], STRIDE
  1.1211 +                                            vmull.u8    q10, d18, d22
  1.1212 +                                            vmull.u8    q11, d19, d22
  1.1213 +    vld1.32     {d5}, [TMP4]
  1.1214 +    pld         [TMP4, PF_OFFS]
  1.1215 +    vmull.u8    q3, d2, d28
  1.1216 +                                            vrshr.u16   q9, q10, #8
  1.1217 +                                            vrshr.u16   q15, q11, #8
  1.1218 +    vmlal.u8    q3, d3, d29
  1.1219 +    vmull.u8    q1, d4, d28
  1.1220 +                                            vraddhn.u16 d18, q9, q10
  1.1221 +                                            vraddhn.u16 d19, q15, q11
  1.1222 +    vmlal.u8    q1, d5, d29
  1.1223 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1.1224 +                                            vqadd.u8    q9, q8, q9
  1.1225 +    vld1.32     {d22[0]}, [MASK]!
  1.1226 +                                            vuzp.8      d18, d19
  1.1227 +    vadd.u16    q12, q12, q13
  1.1228 +                                            vuzp.8      d18, d19
  1.1229 +    vmovn.u16   d16, q0
  1.1230 +                                            vst1.32     {d18, d19}, [OUT, :128]!
  1.1231 +.endm
  1.1232 +
  1.1233 +/* add_8888_8888 */
  1.1234 +.macro bilinear_add_8888_8888_process_last_pixel
  1.1235 +    bilinear_interpolate_last_pixel 8888, x, 8888, add
  1.1236 +.endm
  1.1237 +
  1.1238 +.macro bilinear_add_8888_8888_process_two_pixels
  1.1239 +    bilinear_interpolate_two_pixels 8888, x, 8888, add
  1.1240 +.endm
  1.1241 +
  1.1242 +.macro bilinear_add_8888_8888_process_four_pixels
  1.1243 +    bilinear_interpolate_four_pixels 8888, x, 8888, add
  1.1244 +.endm
  1.1245 +
  1.1246 +.macro bilinear_add_8888_8888_process_pixblock_head
  1.1247 +    bilinear_add_8888_8888_process_four_pixels
  1.1248 +.endm
  1.1249 +
  1.1250 +.macro bilinear_add_8888_8888_process_pixblock_tail
  1.1251 +.endm
  1.1252 +
  1.1253 +.macro bilinear_add_8888_8888_process_pixblock_tail_head
  1.1254 +    bilinear_add_8888_8888_process_pixblock_tail
  1.1255 +    bilinear_add_8888_8888_process_pixblock_head
  1.1256 +.endm
  1.1257 +
  1.1258 +/* add_8888_8_8888 */
  1.1259 +.macro bilinear_add_8888_8_8888_process_last_pixel
  1.1260 +    bilinear_interpolate_last_pixel 8888, 8, 8888, add
  1.1261 +.endm
  1.1262 +
  1.1263 +.macro bilinear_add_8888_8_8888_process_two_pixels
  1.1264 +    bilinear_interpolate_two_pixels 8888, 8, 8888, add
  1.1265 +.endm
  1.1266 +
  1.1267 +.macro bilinear_add_8888_8_8888_process_four_pixels
  1.1268 +    bilinear_interpolate_four_pixels 8888, 8, 8888, add
  1.1269 +.endm
  1.1270 +
  1.1271 +.macro bilinear_add_8888_8_8888_process_pixblock_head
  1.1272 +    bilinear_add_8888_8_8888_process_four_pixels
  1.1273 +.endm
  1.1274 +
  1.1275 +.macro bilinear_add_8888_8_8888_process_pixblock_tail
  1.1276 +.endm
  1.1277 +
  1.1278 +.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
  1.1279 +    bilinear_add_8888_8_8888_process_pixblock_tail
  1.1280 +    bilinear_add_8888_8_8888_process_pixblock_head
  1.1281 +.endm
  1.1282 +
  1.1283 +
  1.1284 +/* Bilinear scanline functions */
  1.1285 +generate_bilinear_scanline_func \
  1.1286 +    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
  1.1287 +    8888, 8888, 2, 2, \
  1.1288 +    bilinear_src_8888_8_8888_process_last_pixel, \
  1.1289 +    bilinear_src_8888_8_8888_process_two_pixels, \
  1.1290 +    bilinear_src_8888_8_8888_process_four_pixels, \
  1.1291 +    bilinear_src_8888_8_8888_process_pixblock_head, \
  1.1292 +    bilinear_src_8888_8_8888_process_pixblock_tail, \
  1.1293 +    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
  1.1294 +    4, 28, BILINEAR_FLAG_USE_MASK
  1.1295 +
  1.1296 +generate_bilinear_scanline_func \
  1.1297 +    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
  1.1298 +    8888, 0565, 2, 1, \
  1.1299 +    bilinear_src_8888_8_0565_process_last_pixel, \
  1.1300 +    bilinear_src_8888_8_0565_process_two_pixels, \
  1.1301 +    bilinear_src_8888_8_0565_process_four_pixels, \
  1.1302 +    bilinear_src_8888_8_0565_process_pixblock_head, \
  1.1303 +    bilinear_src_8888_8_0565_process_pixblock_tail, \
  1.1304 +    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
  1.1305 +    4, 28, BILINEAR_FLAG_USE_MASK
  1.1306 +
  1.1307 +generate_bilinear_scanline_func \
  1.1308 +    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
  1.1309 +    0565, 8888, 1, 2, \
  1.1310 +    bilinear_src_0565_8_x888_process_last_pixel, \
  1.1311 +    bilinear_src_0565_8_x888_process_two_pixels, \
  1.1312 +    bilinear_src_0565_8_x888_process_four_pixels, \
  1.1313 +    bilinear_src_0565_8_x888_process_pixblock_head, \
  1.1314 +    bilinear_src_0565_8_x888_process_pixblock_tail, \
  1.1315 +    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
  1.1316 +    4, 28, BILINEAR_FLAG_USE_MASK
  1.1317 +
  1.1318 +generate_bilinear_scanline_func \
  1.1319 +    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
  1.1320 +    0565, 0565, 1, 1, \
  1.1321 +    bilinear_src_0565_8_0565_process_last_pixel, \
  1.1322 +    bilinear_src_0565_8_0565_process_two_pixels, \
  1.1323 +    bilinear_src_0565_8_0565_process_four_pixels, \
  1.1324 +    bilinear_src_0565_8_0565_process_pixblock_head, \
  1.1325 +    bilinear_src_0565_8_0565_process_pixblock_tail, \
  1.1326 +    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
  1.1327 +    4, 28, BILINEAR_FLAG_USE_MASK
  1.1328 +
  1.1329 +generate_bilinear_scanline_func \
  1.1330 +    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
  1.1331 +    8888, 8888, 2, 2, \
  1.1332 +    bilinear_over_8888_8888_process_last_pixel, \
  1.1333 +    bilinear_over_8888_8888_process_two_pixels, \
  1.1334 +    bilinear_over_8888_8888_process_four_pixels, \
  1.1335 +    bilinear_over_8888_8888_process_pixblock_head, \
  1.1336 +    bilinear_over_8888_8888_process_pixblock_tail, \
  1.1337 +    bilinear_over_8888_8888_process_pixblock_tail_head, \
  1.1338 +    4, 28, 0
  1.1339 +
  1.1340 +generate_bilinear_scanline_func \
  1.1341 +    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
  1.1342 +    8888, 8888, 2, 2, \
  1.1343 +    bilinear_over_8888_8_8888_process_last_pixel, \
  1.1344 +    bilinear_over_8888_8_8888_process_two_pixels, \
  1.1345 +    bilinear_over_8888_8_8888_process_four_pixels, \
  1.1346 +    bilinear_over_8888_8_8888_process_pixblock_head, \
  1.1347 +    bilinear_over_8888_8_8888_process_pixblock_tail, \
  1.1348 +    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
  1.1349 +    4, 28, BILINEAR_FLAG_USE_MASK
  1.1350 +
  1.1351 +generate_bilinear_scanline_func \
  1.1352 +    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
  1.1353 +    8888, 8888, 2, 2, \
  1.1354 +    bilinear_add_8888_8888_process_last_pixel, \
  1.1355 +    bilinear_add_8888_8888_process_two_pixels, \
  1.1356 +    bilinear_add_8888_8888_process_four_pixels, \
  1.1357 +    bilinear_add_8888_8888_process_pixblock_head, \
  1.1358 +    bilinear_add_8888_8888_process_pixblock_tail, \
  1.1359 +    bilinear_add_8888_8888_process_pixblock_tail_head, \
  1.1360 +    4, 28, 0
  1.1361 +
  1.1362 +generate_bilinear_scanline_func \
  1.1363 +    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
  1.1364 +    8888, 8888, 2, 2, \
  1.1365 +    bilinear_add_8888_8_8888_process_last_pixel, \
  1.1366 +    bilinear_add_8888_8_8888_process_two_pixels, \
  1.1367 +    bilinear_add_8888_8_8888_process_four_pixels, \
  1.1368 +    bilinear_add_8888_8_8888_process_pixblock_head, \
  1.1369 +    bilinear_add_8888_8_8888_process_pixblock_tail, \
  1.1370 +    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
  1.1371 +    4, 28, BILINEAR_FLAG_USE_MASK
