gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright © 2011 SCore Corporation
     3  *
     4  * Permission is hereby granted, free of charge, to any person obtaining a
     5  * copy of this software and associated documentation files (the "Software"),
     6  * to deal in the Software without restriction, including without limitation
     7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     8  * and/or sell copies of the Software, and to permit persons to whom the
     9  * Software is furnished to do so, subject to the following conditions:
    10  *
    11  * The above copyright notice and this permission notice (including the next
    12  * paragraph) shall be included in all copies or substantial portions of the
    13  * Software.
    14  *
    15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    21  * DEALINGS IN THE SOFTWARE.
    22  *
    23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    24  * Author:  Taekyun Kim (tkq.kim@samsung.com)
    25  */
    27 /*
    28  * This file contains scaled bilinear scanline functions implemented
    29  * using Siarhei's older bilinear macro template.
    30  *
    31  * << General scanline function procedures >>
    32  *  1. bilinear interpolate source pixels
    33  *  2. load mask pixels
    34  *  3. load destination pixels
    35  *  4. duplicate mask to fill whole register
    36  *  5. interleave source & destination pixels
    37  *  6. apply mask to source pixels
    38  *  7. combine source & destination pixels
    39  *  8. deinterleave final result
    40  *  9. store destination pixels
    41  *
    42  * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
    43  * Registers with double numbers (src01, dst01) are 128-bit registers.
    44  * All temp registers can be used freely outside the code block.
    45  * Assume that the symbols (register .req) OUT and MASK are defined by the caller of these macro blocks.
    46  *
    47  * Remarks
    48  *  There can be lots of pipeline stalls inside a code block and between code blocks.
    49  *  Further optimizations will be done with new macro templates using the head/tail_head/tail scheme.
    50  */
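       /*
        * A rough per-channel sketch of what the interpolation amounts to
        * (illustrative C-style pseudocode, not part of the original code):
        *
        *   vert_l = top_l * wt + bot_l * wb;
        *   vert_r = top_r * wt + bot_r * wb;
        *   out    = (vert_l * (SIZE - wx) + vert_r * wx) >> (2 * BITS);
        *
        * where BITS = BILINEAR_INTERPOLATION_BITS, SIZE = 1 << BITS, the
        * vertical weights wt/wb and the horizontal weights (SIZE - wx)/wx
        * each sum to SIZE, and masking/combining with the destination then
        * follows steps 2-9 above.
        */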
    52 /* Prevent the stack from becoming executable for no reason... */
    53 #if defined(__linux__) && defined (__ELF__)
    54 .section .note.GNU-stack,"",%progbits
    55 #endif
    57 .text
    58 .fpu neon
    59 .arch armv7a
    60 .object_arch armv4
    61 .eabi_attribute 10, 0
    62 .eabi_attribute 12, 0
    63 .arm
    64 .altmacro
    65 .p2align 2
    67 #include "pixman-private.h"
    68 #include "pixman-arm-neon-asm.h"
    70 /*
    71  * Bilinear macros from pixman-arm-neon-asm.S
    72  */
    74 /* Supplementary macro for setting function attributes */
    75 .macro pixman_asm_function fname
    76     .func fname
    77     .global fname
    78 #ifdef __ELF__
    79     .hidden fname
    80     .type fname, %function
    81 #endif
    82 fname:
    83 .endm
    85 /*
    86  * Bilinear scaling support code which tries to provide pixel fetching, color
    87  * format conversion, and interpolation as separate macros which can be used
    88  * as the basic building blocks for constructing bilinear scanline functions.
    89  */
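       /*
        * Register conventions assumed by these macros (as set up by the
        * generate_bilinear_scanline_func macro further down): d28/d29 hold
        * the vertical weights WT/WB, q12 the horizontal coordinate
        * accumulator X, q13 the per-pixel increment UX, and q15 (d30/d31)
        * the current horizontal weights derived from q12.
        */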
    91 .macro bilinear_load_8888 reg1, reg2, tmp
    92     mov       TMP1, X, asr #16
    93     add       X, X, UX
    94     add       TMP1, TOP, TMP1, asl #2
    95     vld1.32   {reg1}, [TMP1], STRIDE
    96     vld1.32   {reg2}, [TMP1]
    97 .endm
    99 .macro bilinear_load_0565 reg1, reg2, tmp
   100     mov       TMP1, X, asr #16
   101     add       X, X, UX
   102     add       TMP1, TOP, TMP1, asl #1
   103     vld1.32   {reg2[0]}, [TMP1], STRIDE
   104     vld1.32   {reg2[1]}, [TMP1]
   105     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
   106 .endm
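       /*
        * Both load macros above fetch the two horizontally adjacent source
        * pixels from the top row and then, via STRIDE (= BOTTOM - TOP), the
        * matching pair from the bottom row, advancing X by UX.  The 0565
        * variant widens the pixels to x888 first so that the same 8-bit
        * weighted arithmetic can be used for both source formats.
        */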
   108 .macro bilinear_load_and_vertical_interpolate_two_8888 \
   109                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
   111     bilinear_load_8888 reg1, reg2, tmp1
   112     vmull.u8  acc1, reg1, d28
   113     vmlal.u8  acc1, reg2, d29
   114     bilinear_load_8888 reg3, reg4, tmp2
   115     vmull.u8  acc2, reg3, d28
   116     vmlal.u8  acc2, reg4, d29
   117 .endm
   119 .macro bilinear_load_and_vertical_interpolate_four_8888 \
   120                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   121                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   123     bilinear_load_and_vertical_interpolate_two_8888 \
   124                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
   125     bilinear_load_and_vertical_interpolate_two_8888 \
   126                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   127 .endm
   129 .macro bilinear_load_and_vertical_interpolate_two_0565 \
   130                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
   132     mov       TMP1, X, asr #16
   133     add       X, X, UX
   134     add       TMP1, TOP, TMP1, asl #1
   135     mov       TMP2, X, asr #16
   136     add       X, X, UX
   137     add       TMP2, TOP, TMP2, asl #1
   138     vld1.32   {acc2lo[0]}, [TMP1], STRIDE
   139     vld1.32   {acc2hi[0]}, [TMP2], STRIDE
   140     vld1.32   {acc2lo[1]}, [TMP1]
   141     vld1.32   {acc2hi[1]}, [TMP2]
   142     convert_0565_to_x888 acc2, reg3, reg2, reg1
   143     vzip.u8   reg1, reg3
   144     vzip.u8   reg2, reg4
   145     vzip.u8   reg3, reg4
   146     vzip.u8   reg1, reg2
   147     vmull.u8  acc1, reg1, d28
   148     vmlal.u8  acc1, reg2, d29
   149     vmull.u8  acc2, reg3, d28
   150     vmlal.u8  acc2, reg4, d29
   151 .endm
   153 .macro bilinear_load_and_vertical_interpolate_four_0565 \
   154                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   155                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   157     mov       TMP1, X, asr #16
   158     add       X, X, UX
   159     add       TMP1, TOP, TMP1, asl #1
   160     mov       TMP2, X, asr #16
   161     add       X, X, UX
   162     add       TMP2, TOP, TMP2, asl #1
   163     vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
   164     vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
   165     vld1.32   {xacc2lo[1]}, [TMP1]
   166     vld1.32   {xacc2hi[1]}, [TMP2]
   167     convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
   168     mov       TMP1, X, asr #16
   169     add       X, X, UX
   170     add       TMP1, TOP, TMP1, asl #1
   171     mov       TMP2, X, asr #16
   172     add       X, X, UX
   173     add       TMP2, TOP, TMP2, asl #1
   174     vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
   175     vzip.u8   xreg1, xreg3
   176     vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
   177     vzip.u8   xreg2, xreg4
   178     vld1.32   {yacc2lo[1]}, [TMP1]
   179     vzip.u8   xreg3, xreg4
   180     vld1.32   {yacc2hi[1]}, [TMP2]
   181     vzip.u8   xreg1, xreg2
   182     convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
   183     vmull.u8  xacc1, xreg1, d28
   184     vzip.u8   yreg1, yreg3
   185     vmlal.u8  xacc1, xreg2, d29
   186     vzip.u8   yreg2, yreg4
   187     vmull.u8  xacc2, xreg3, d28
   188     vzip.u8   yreg3, yreg4
   189     vmlal.u8  xacc2, xreg4, d29
   190     vzip.u8   yreg1, yreg2
   191     vmull.u8  yacc1, yreg1, d28
   192     vmlal.u8  yacc1, yreg2, d29
   193     vmull.u8  yacc2, yreg3, d28
   194     vmlal.u8  yacc2, yreg4, d29
   195 .endm
   197 .macro bilinear_store_8888 numpix, tmp1, tmp2
   198 .if numpix == 4
   199     vst1.32   {d0, d1}, [OUT]!
   200 .elseif numpix == 2
   201     vst1.32   {d0}, [OUT]!
   202 .elseif numpix == 1
   203     vst1.32   {d0[0]}, [OUT, :32]!
   204 .else
   205     .error bilinear_store_8888 numpix is unsupported
   206 .endif
   207 .endm
   209 .macro bilinear_store_0565 numpix, tmp1, tmp2
   210     vuzp.u8 d0, d1
   211     vuzp.u8 d2, d3
   212     vuzp.u8 d1, d3
   213     vuzp.u8 d0, d2
   214     convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
   215 .if numpix == 4
   216     vst1.16   {d2}, [OUT]!
   217 .elseif numpix == 2
   218     vst1.32   {d2[0]}, [OUT]!
   219 .elseif numpix == 1
   220     vst1.16   {d2[0]}, [OUT]!
   221 .else
   222     .error bilinear_store_0565 numpix is unsupported
   223 .endif
   224 .endm
   227 /*
   228  * Macros for loading mask pixels into register 'mask'.
   230  * vdup must be done somewhere else.
   230  */
   231 .macro bilinear_load_mask_x numpix, mask
   232 .endm
   234 .macro bilinear_load_mask_8 numpix, mask
   235 .if numpix == 4
   236     vld1.32     {mask[0]}, [MASK]!
   237 .elseif numpix == 2
   238     vld1.16     {mask[0]}, [MASK]!
   239 .elseif numpix == 1
   240     vld1.8      {mask[0]}, [MASK]!
   241 .else
   242     .error bilinear_load_mask_8 numpix is unsupported
   243 .endif
   244     pld         [MASK, #prefetch_offset]
   245 .endm
   247 .macro bilinear_load_mask mask_fmt, numpix, mask
   248     bilinear_load_mask_&mask_fmt numpix, mask
   249 .endm
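       /*
        * With .altmacro in effect, the '&' concatenation above dispatches on
        * the mask format: mask_fmt=8 expands to bilinear_load_mask_8, while
        * mask_fmt=x expands to the empty bilinear_load_mask_x (no mask).
        */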
   252 /*
   253  * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
   254  * Interleave should be done somewhere else.
   255  */
   256 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
   257 .endm
   259 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
   260 .endm
   262 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   263 .if numpix == 4
   264     vld1.32     {dst0, dst1}, [OUT]
   265 .elseif numpix == 2
   266     vld1.32     {dst0}, [OUT]
   267 .elseif numpix == 1
   268     vld1.32     {dst0[0]}, [OUT]
   269 .else
   270     .error bilinear_load_dst_8888 numpix is unsupported
   271 .endif
   272     pld         [OUT, #(prefetch_offset * 4)]
   273 .endm
   275 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
   276     bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   277 .endm
   279 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
   280     bilinear_load_dst_8888 numpix, dst0, dst1, dst01
   281 .endm
   283 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
   284     bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
   285 .endm
   287 /*
   288  * Macros for duplicating a partially loaded mask to fill the entire register.
   289  * We will apply the mask to interleaved source pixels, that is:
   290  *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
   291  *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
   292  * So we need to duplicate the loaded mask across the whole register.
   293  *
   294  * For the two-pixel case:
   295  *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
   296  *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
   297  * We can do some optimizations for this, including the last-pixel case.
   298  */
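       /*
        * E.g. for four pixels the vdup.32 below turns a mask loaded as
        * (m0, m1, m2, m3, x, x, x, x) into (m0, m1, m2, m3, m0, m1, m2, m3),
        * which lines up with the interleaved (r0..r3, g0..g3) source lanes.
        */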
   299 .macro bilinear_duplicate_mask_x numpix, mask
   300 .endm
   302 .macro bilinear_duplicate_mask_8 numpix, mask
   303 .if numpix == 4
   304     vdup.32     mask, mask[0]
   305 .elseif numpix == 2
   306     vdup.16     mask, mask[0]
   307 .elseif numpix == 1
   308     vdup.8      mask, mask[0]
   309 .else
   310     .error bilinear_duplicate_mask_8 is unsupported
   311 .endif
   312 .endm
   314 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
   315     bilinear_duplicate_mask_&mask_fmt numpix, mask
   316 .endm
   318 /*
   319  * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
   320  * Interleaving should be done when the mask is enabled or the operator is 'over'.
   321  */
   322 .macro bilinear_interleave src0, src1, dst0, dst1
   323     vuzp.8      src0, src1
   324     vuzp.8      dst0, dst1
   325     vuzp.8      src0, src1
   326     vuzp.8      dst0, dst1
   327 .endm
   329 .macro bilinear_interleave_src_dst_x_src \
   330                 numpix, src0, src1, src01, dst0, dst1, dst01
   331 .endm
   333 .macro bilinear_interleave_src_dst_x_over \
   334                 numpix, src0, src1, src01, dst0, dst1, dst01
   336     bilinear_interleave src0, src1, dst0, dst1
   337 .endm
   339 .macro bilinear_interleave_src_dst_x_add \
   340                 numpix, src0, src1, src01, dst0, dst1, dst01
   341 .endm
   343 .macro bilinear_interleave_src_dst_8_src \
   344                 numpix, src0, src1, src01, dst0, dst1, dst01
   346     bilinear_interleave src0, src1, dst0, dst1
   347 .endm
   349 .macro bilinear_interleave_src_dst_8_over \
   350                 numpix, src0, src1, src01, dst0, dst1, dst01
   352     bilinear_interleave src0, src1, dst0, dst1
   353 .endm
   355 .macro bilinear_interleave_src_dst_8_add \
   356                 numpix, src0, src1, src01, dst0, dst1, dst01
   358     bilinear_interleave src0, src1, dst0, dst1
   359 .endm
   361 .macro bilinear_interleave_src_dst \
   362                 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
   364     bilinear_interleave_src_dst_&mask_fmt&_&op \
   365                 numpix, src0, src1, src01, dst0, dst1, dst01
   366 .endm
   369 /*
   370  * Macros for applying masks to src pixels. (see combine_mask_u() function)
   371  * src and dst should be in interleaved form.
   372  * The mask register should be in the form (m0, m1, m2, m3).
   373  */
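       /*
        * The vmull/vrshr/vraddhn sequence below is the usual NEON idiom for a
        * rounded multiply of each byte by the mask, i.e. roughly s = s * m / 255,
        * computed as  t = s * m;  s = (t + ((t + 128) >> 8) + 128) >> 8.
        */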
   374 .macro bilinear_apply_mask_to_src_x \
   375                 numpix, src0, src1, src01, mask, \
   376                 tmp01, tmp23, tmp45, tmp67
   377 .endm
   379 .macro bilinear_apply_mask_to_src_8 \
   380                 numpix, src0, src1, src01, mask, \
   381                 tmp01, tmp23, tmp45, tmp67
   383     vmull.u8        tmp01, src0, mask
   384     vmull.u8        tmp23, src1, mask
   385     /* bubbles */
   386     vrshr.u16       tmp45, tmp01, #8
   387     vrshr.u16       tmp67, tmp23, #8
   388     /* bubbles */
   389     vraddhn.u16     src0, tmp45, tmp01
   390     vraddhn.u16     src1, tmp67, tmp23
   391 .endm
   393 .macro bilinear_apply_mask_to_src \
   394                 mask_fmt, numpix, src0, src1, src01, mask, \
   395                 tmp01, tmp23, tmp45, tmp67
   397     bilinear_apply_mask_to_src_&mask_fmt \
   398                 numpix, src0, src1, src01, mask, \
   399                 tmp01, tmp23, tmp45, tmp67
   400 .endm
   403 /*
   404  * Macros for combining src and destination pixels.
   405  * Whether to interleave depends on the operator 'op'.
   406  */
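       /*
        * For the 'over' operator this implements the usual premultiplied-alpha
        * blend dst = src + dst * (255 - src_alpha) / 255: the source alpha
        * lanes are duplicated and inverted (vdup.32 / vmvn), the destination
        * is scaled with the same division-by-255 idiom as above, and the
        * result is added to the source with a saturating vqadd.  The 'add'
        * operator is just the saturating add, and 'src' needs no combining.
        */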
   407 .macro bilinear_combine_src \
   408                 numpix, src0, src1, src01, dst0, dst1, dst01, \
   409                 tmp01, tmp23, tmp45, tmp67, tmp8
   410 .endm
   412 .macro bilinear_combine_over \
   413                 numpix, src0, src1, src01, dst0, dst1, dst01, \
   414                 tmp01, tmp23, tmp45, tmp67, tmp8
   416     vdup.32     tmp8, src1[1]
   417     /* bubbles */
   418     vmvn.8      tmp8, tmp8
   419     /* bubbles */
   420     vmull.u8    tmp01, dst0, tmp8
   421     /* bubbles */
   422     vmull.u8    tmp23, dst1, tmp8
   423     /* bubbles */
   424     vrshr.u16   tmp45, tmp01, #8
   425     vrshr.u16   tmp67, tmp23, #8
   426     /* bubbles */
   427     vraddhn.u16 dst0, tmp45, tmp01
   428     vraddhn.u16 dst1, tmp67, tmp23
   429     /* bubbles */
   430     vqadd.u8    src01, dst01, src01
   431 .endm
   433 .macro bilinear_combine_add \
   434                 numpix, src0, src1, src01, dst0, dst1, dst01, \
   435                 tmp01, tmp23, tmp45, tmp67, tmp8
   437     vqadd.u8    src01, dst01, src01
   438 .endm
   440 .macro bilinear_combine \
   441                 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
   442                 tmp01, tmp23, tmp45, tmp67, tmp8
   444     bilinear_combine_&op \
   445                 numpix, src0, src1, src01, dst0, dst1, dst01, \
   446                 tmp01, tmp23, tmp45, tmp67, tmp8
   447 .endm
   449 /*
   450  * Macros for final deinterleaving of destination pixels if needed.
   451  */
   452 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
   453     vuzp.8      dst0, dst1
   454     /* bubbles */
   455     vuzp.8      dst0, dst1
   456 .endm
   458 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
   459 .endm
   461 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
   462     bilinear_deinterleave numpix, dst0, dst1, dst01
   463 .endm
   465 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
   466 .endm
   468 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
   469     bilinear_deinterleave numpix, dst0, dst1, dst01
   470 .endm
   472 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
   473     bilinear_deinterleave numpix, dst0, dst1, dst01
   474 .endm
   476 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
   477     bilinear_deinterleave numpix, dst0, dst1, dst01
   478 .endm
   480 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
   481     bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
   482 .endm
   485 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
   486     bilinear_load_&src_fmt d0, d1, d2
   487     bilinear_load_mask mask_fmt, 1, d4
   488     bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
   489     vmull.u8  q1, d0, d28
   490     vmlal.u8  q1, d1, d29
   491     /* 5 cycles bubble */
   492     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   493     vmlsl.u16 q0, d2, d30
   494     vmlal.u16 q0, d3, d30
   495     /* 5 cycles bubble */
   496     bilinear_duplicate_mask mask_fmt, 1, d4
   497     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   498     /* 3 cycles bubble */
   499     vmovn.u16 d0, q0
   500     /* 1 cycle bubble */
   501     bilinear_interleave_src_dst \
   502                 mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
   503     bilinear_apply_mask_to_src \
   504                 mask_fmt, 1, d0, d1, q0, d4, \
   505                 q3, q8, q10, q11
   506     bilinear_combine \
   507                 op, 1, d0, d1, q0, d18, d19, q9, \
   508                 q3, q8, q10, q11, d5
   509     bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
   510     bilinear_store_&dst_fmt 1, q2, q3
   511 .endm
   513 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
   514     bilinear_load_and_vertical_interpolate_two_&src_fmt \
   515                 q1, q11, d0, d1, d20, d21, d22, d23
   516     bilinear_load_mask mask_fmt, 2, d4
   517     bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
   518     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   519     vmlsl.u16 q0, d2, d30
   520     vmlal.u16 q0, d3, d30
   521     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   522     vmlsl.u16 q10, d22, d31
   523     vmlal.u16 q10, d23, d31
   524     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   525     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   526     bilinear_duplicate_mask mask_fmt, 2, d4
   527     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   528     vadd.u16  q12, q12, q13
   529     vmovn.u16 d0, q0
   530     bilinear_interleave_src_dst \
   531                 mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
   532     bilinear_apply_mask_to_src \
   533                 mask_fmt, 2, d0, d1, q0, d4, \
   534                 q3, q8, q10, q11
   535     bilinear_combine \
   536                 op, 2, d0, d1, q0, d18, d19, q9, \
   537                 q3, q8, q10, q11, d5
   538     bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
   539     bilinear_store_&dst_fmt 2, q2, q3
   540 .endm
   542 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
   543     bilinear_load_and_vertical_interpolate_four_&src_fmt \
   544                 q1, q11, d0, d1, d20, d21, d22, d23 \
   545                 q3, q9,  d4, d5, d16, d17, d18, d19
   546     pld       [TMP1, PF_OFFS]
   547     sub       TMP1, TMP1, STRIDE
   548     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   549     vmlsl.u16 q0, d2, d30
   550     vmlal.u16 q0, d3, d30
   551     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   552     vmlsl.u16 q10, d22, d31
   553     vmlal.u16 q10, d23, d31
   554     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   555     vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
   556     vmlsl.u16 q2, d6, d30
   557     vmlal.u16 q2, d7, d30
   558     vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
   559     bilinear_load_mask mask_fmt, 4, d22
   560     bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
   561     pld       [TMP1, PF_OFFS]
   562     vmlsl.u16 q8, d18, d31
   563     vmlal.u16 q8, d19, d31
   564     vadd.u16  q12, q12, q13
   565     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   566     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   567     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   568     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
   569     bilinear_duplicate_mask mask_fmt, 4, d22
   570     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   571     vmovn.u16 d0, q0
   572     vmovn.u16 d1, q2
   573     vadd.u16  q12, q12, q13
   574     bilinear_interleave_src_dst \
   575                 mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
   576     bilinear_apply_mask_to_src \
   577                 mask_fmt, 4, d0, d1, q0, d22, \
   578                 q3, q8, q9, q10
   579     bilinear_combine \
   580                 op, 4, d0, d1, q0, d2, d3, q1, \
   581                 q3, q8, q9, q10, d23
   582     bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
   583     bilinear_store_&dst_fmt 4, q2, q3
   584 .endm
   586 .set BILINEAR_FLAG_USE_MASK,		1
   587 .set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
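       /*
        * BILINEAR_FLAG_USE_MASK selects the register layout with the extra
        * MASK argument in r1 (used by the *_8_* variants below), while
        * BILINEAR_FLAG_USE_ALL_NEON_REGS makes the generated function
        * save and restore d8-d15 around its body.
        */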
   589 /*
   590  * Main template macro for generating NEON optimized bilinear scanline functions.
   591  *
   592  * The bilinear scanline generator macro takes the following arguments:
   593  *  fname			- name of the function to generate
   594  *  src_fmt			- source color format (8888 or 0565)
   595  *  dst_fmt			- destination color format (8888 or 0565)
   596  *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of a src/dst pixel in bytes
   597  *  process_last_pixel		- code block that interpolates one pixel and does not
   598  *				  update the horizontal weight
   599  *  process_two_pixels		- code block that interpolates two pixels and updates
   600  *				  the horizontal weight
   601  *  process_four_pixels		- code block that interpolates four pixels and updates
   602  *				  the horizontal weight
   603  *  process_pixblock_head	- head part of the middle loop
   604  *  process_pixblock_tail	- tail part of the middle loop
   605  *  process_pixblock_tail_head	- tail_head part of the middle loop
   606  *  pixblock_size		- number of pixels processed in a single middle loop
   607  *  prefetch_distance		- prefetch in the source image by that many pixels ahead
   608  */
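       /*
        * Judging from the argument registers set up below, the generated
        * functions take (roughly, in C terms; see the pixman C-side bindings
        * for the authoritative declarations):
        *
        *   out, top, bottom, wt, wb, x, ux, width         (without a mask)
        *   out, mask, top, bottom, wt, wb, x, ux, width   (with a mask)
        */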
   610 .macro generate_bilinear_scanline_func \
   611 	fname, \
   612 	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
   613 	bilinear_process_last_pixel, \
   614 	bilinear_process_two_pixels, \
   615 	bilinear_process_four_pixels, \
   616 	bilinear_process_pixblock_head, \
   617 	bilinear_process_pixblock_tail, \
   618 	bilinear_process_pixblock_tail_head, \
   619 	pixblock_size, \
   620 	prefetch_distance, \
   621 	flags
   623 pixman_asm_function fname
   624 .if pixblock_size == 8
   625 .elseif pixblock_size == 4
   626 .else
   627     .error unsupported pixblock size
   628 .endif
   630 .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
   631     OUT       .req    r0
   632     TOP       .req    r1
   633     BOTTOM    .req    r2
   634     WT        .req    r3
   635     WB        .req    r4
   636     X         .req    r5
   637     UX        .req    r6
   638     WIDTH     .req    ip
   639     TMP1      .req    r3
   640     TMP2      .req    r4
   641     PF_OFFS   .req    r7
   642     TMP3      .req    r8
   643     TMP4      .req    r9
   644     STRIDE    .req    r2
   646     mov		ip, sp
   647     push	{r4, r5, r6, r7, r8, r9}
   648     mov		PF_OFFS, #prefetch_distance
   649     ldmia	ip, {WB, X, UX, WIDTH}
   650 .else
   651     OUT       .req      r0
   652     MASK      .req      r1
   653     TOP       .req      r2
   654     BOTTOM    .req      r3
   655     WT        .req      r4
   656     WB        .req      r5
   657     X         .req      r6
   658     UX        .req      r7
   659     WIDTH     .req      ip
   660     TMP1      .req      r4
   661     TMP2      .req      r5
   662     PF_OFFS   .req      r8
   663     TMP3      .req      r9
   664     TMP4      .req      r10
   665     STRIDE    .req      r3
   667     .set prefetch_offset, prefetch_distance
   669     mov       ip, sp
   670     push      {r4, r5, r6, r7, r8, r9, r10, ip}
   671     mov       PF_OFFS, #prefetch_distance
   672     ldmia     ip, {WT, WB, X, UX, WIDTH}
   673 .endif
   675     mul       PF_OFFS, PF_OFFS, UX
   677 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   678     vpush     {d8-d15}
   679 .endif
   681     sub	      STRIDE, BOTTOM, TOP
   682     .unreq    BOTTOM
   684     cmp       WIDTH, #0
   685     ble       3f
   687     vdup.u16  q12, X
   688     vdup.u16  q13, UX
   689     vdup.u8   d28, WT
   690     vdup.u8   d29, WB
   691     vadd.u16  d25, d25, d26
   693     /* ensure good destination alignment  */
   694     cmp       WIDTH, #1
   695     blt       0f
   696     tst       OUT, #(1 << dst_bpp_shift)
   697     beq       0f
   698     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   699     vadd.u16  q12, q12, q13
   700     bilinear_process_last_pixel
   701     sub       WIDTH, WIDTH, #1
   702 0:
   703     vadd.u16  q13, q13, q13
   704     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   705     vadd.u16  q12, q12, q13
   707     cmp       WIDTH, #2
   708     blt       0f
   709     tst       OUT, #(1 << (dst_bpp_shift + 1))
   710     beq       0f
   711     bilinear_process_two_pixels
   712     sub       WIDTH, WIDTH, #2
   713 0:
   714 .if pixblock_size == 8
   715     cmp       WIDTH, #4
   716     blt       0f
   717     tst       OUT, #(1 << (dst_bpp_shift + 2))
   718     beq       0f
   719     bilinear_process_four_pixels
   720     sub       WIDTH, WIDTH, #4
   721 0:
   722 .endif
   723     subs      WIDTH, WIDTH, #pixblock_size
   724     blt       1f
   725     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   726     bilinear_process_pixblock_head
   727     subs      WIDTH, WIDTH, #pixblock_size
   728     blt       5f
   729 0:
   730     bilinear_process_pixblock_tail_head
   731     subs      WIDTH, WIDTH, #pixblock_size
   732     bge       0b
   733 5:
   734     bilinear_process_pixblock_tail
   735 1:
   736 .if pixblock_size == 8
   737     tst       WIDTH, #4
   738     beq       2f
   739     bilinear_process_four_pixels
   740 2:
   741 .endif
   742     /* handle the remaining trailing pixels */
   743     tst       WIDTH, #2
   744     beq       2f
   745     bilinear_process_two_pixels
   746 2:
   747     tst       WIDTH, #1
   748     beq       3f
   749     bilinear_process_last_pixel
   750 3:
   751 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   752     vpop      {d8-d15}
   753 .endif
   755 .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
   756     pop       {r4, r5, r6, r7, r8, r9}
   757 .else
   758     pop       {r4, r5, r6, r7, r8, r9, r10, ip}
   759 .endif
   760     bx        lr
   762     .unreq    OUT
   763     .unreq    TOP
   764     .unreq    WT
   765     .unreq    WB
   766     .unreq    X
   767     .unreq    UX
   768     .unreq    WIDTH
   769     .unreq    TMP1
   770     .unreq    TMP2
   771     .unreq    PF_OFFS
   772     .unreq    TMP3
   773     .unreq    TMP4
   774     .unreq    STRIDE
   775 .if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
   776     .unreq    MASK
   777 .endif
   779 .endfunc
   781 .endm
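       /*
        * The per-operation wrappers below mostly just instantiate the generic
        * last/two/four pixel macros from above; only the over_8888_8888 and
        * over_8888_8_8888 variants provide hand-scheduled pixblock
        * head/tail/tail_head code of their own.
        */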
   783 /* src_8888_8_8888 */
   784 .macro bilinear_src_8888_8_8888_process_last_pixel
   785     bilinear_interpolate_last_pixel 8888, 8, 8888, src
   786 .endm
   788 .macro bilinear_src_8888_8_8888_process_two_pixels
   789     bilinear_interpolate_two_pixels 8888, 8, 8888, src
   790 .endm
   792 .macro bilinear_src_8888_8_8888_process_four_pixels
   793     bilinear_interpolate_four_pixels 8888, 8, 8888, src
   794 .endm
   796 .macro bilinear_src_8888_8_8888_process_pixblock_head
   797     bilinear_src_8888_8_8888_process_four_pixels
   798 .endm
   800 .macro bilinear_src_8888_8_8888_process_pixblock_tail
   801 .endm
   803 .macro bilinear_src_8888_8_8888_process_pixblock_tail_head
   804     bilinear_src_8888_8_8888_process_pixblock_tail
   805     bilinear_src_8888_8_8888_process_pixblock_head
   806 .endm
   808 /* src_8888_8_0565 */
   809 .macro bilinear_src_8888_8_0565_process_last_pixel
   810     bilinear_interpolate_last_pixel 8888, 8, 0565, src
   811 .endm
   813 .macro bilinear_src_8888_8_0565_process_two_pixels
   814     bilinear_interpolate_two_pixels 8888, 8, 0565, src
   815 .endm
   817 .macro bilinear_src_8888_8_0565_process_four_pixels
   818     bilinear_interpolate_four_pixels 8888, 8, 0565, src
   819 .endm
   821 .macro bilinear_src_8888_8_0565_process_pixblock_head
   822     bilinear_src_8888_8_0565_process_four_pixels
   823 .endm
   825 .macro bilinear_src_8888_8_0565_process_pixblock_tail
   826 .endm
   828 .macro bilinear_src_8888_8_0565_process_pixblock_tail_head
   829     bilinear_src_8888_8_0565_process_pixblock_tail
   830     bilinear_src_8888_8_0565_process_pixblock_head
   831 .endm
   833 /* src_0565_8_x888 */
   834 .macro bilinear_src_0565_8_x888_process_last_pixel
   835     bilinear_interpolate_last_pixel 0565, 8, 8888, src
   836 .endm
   838 .macro bilinear_src_0565_8_x888_process_two_pixels
   839     bilinear_interpolate_two_pixels 0565, 8, 8888, src
   840 .endm
   842 .macro bilinear_src_0565_8_x888_process_four_pixels
   843     bilinear_interpolate_four_pixels 0565, 8, 8888, src
   844 .endm
   846 .macro bilinear_src_0565_8_x888_process_pixblock_head
   847     bilinear_src_0565_8_x888_process_four_pixels
   848 .endm
   850 .macro bilinear_src_0565_8_x888_process_pixblock_tail
   851 .endm
   853 .macro bilinear_src_0565_8_x888_process_pixblock_tail_head
   854     bilinear_src_0565_8_x888_process_pixblock_tail
   855     bilinear_src_0565_8_x888_process_pixblock_head
   856 .endm
   858 /* src_0565_8_0565 */
   859 .macro bilinear_src_0565_8_0565_process_last_pixel
   860     bilinear_interpolate_last_pixel 0565, 8, 0565, src
   861 .endm
   863 .macro bilinear_src_0565_8_0565_process_two_pixels
   864     bilinear_interpolate_two_pixels 0565, 8, 0565, src
   865 .endm
   867 .macro bilinear_src_0565_8_0565_process_four_pixels
   868     bilinear_interpolate_four_pixels 0565, 8, 0565, src
   869 .endm
   871 .macro bilinear_src_0565_8_0565_process_pixblock_head
   872     bilinear_src_0565_8_0565_process_four_pixels
   873 .endm
   875 .macro bilinear_src_0565_8_0565_process_pixblock_tail
   876 .endm
   878 .macro bilinear_src_0565_8_0565_process_pixblock_tail_head
   879     bilinear_src_0565_8_0565_process_pixblock_tail
   880     bilinear_src_0565_8_0565_process_pixblock_head
   881 .endm
   883 /* over_8888_8888 */
   884 .macro bilinear_over_8888_8888_process_last_pixel
   885     bilinear_interpolate_last_pixel 8888, x, 8888, over
   886 .endm
   888 .macro bilinear_over_8888_8888_process_two_pixels
   889     bilinear_interpolate_two_pixels 8888, x, 8888, over
   890 .endm
   892 .macro bilinear_over_8888_8888_process_four_pixels
   893     bilinear_interpolate_four_pixels 8888, x, 8888, over
   894 .endm
   896 .macro bilinear_over_8888_8888_process_pixblock_head
   897     mov         TMP1, X, asr #16
   898     add         X, X, UX
   899     add         TMP1, TOP, TMP1, asl #2
   900     mov         TMP2, X, asr #16
   901     add         X, X, UX
   902     add         TMP2, TOP, TMP2, asl #2
   904     vld1.32     {d22}, [TMP1], STRIDE
   905     vld1.32     {d23}, [TMP1]
   906     mov         TMP3, X, asr #16
   907     add         X, X, UX
   908     add         TMP3, TOP, TMP3, asl #2
   909     vmull.u8    q8, d22, d28
   910     vmlal.u8    q8, d23, d29
   912     vld1.32     {d22}, [TMP2], STRIDE
   913     vld1.32     {d23}, [TMP2]
   914     mov         TMP4, X, asr #16
   915     add         X, X, UX
   916     add         TMP4, TOP, TMP4, asl #2
   917     vmull.u8    q9, d22, d28
   918     vmlal.u8    q9, d23, d29
   920     vld1.32     {d22}, [TMP3], STRIDE
   921     vld1.32     {d23}, [TMP3]
   922     vmull.u8    q10, d22, d28
   923     vmlal.u8    q10, d23, d29
   925     vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
   926     vmlsl.u16   q0, d16, d30
   927     vmlal.u16   q0, d17, d30
   929     pld         [TMP4, PF_OFFS]
   930     vld1.32     {d16}, [TMP4], STRIDE
   931     vld1.32     {d17}, [TMP4]
   932     pld         [TMP4, PF_OFFS]
   933     vmull.u8    q11, d16, d28
   934     vmlal.u8    q11, d17, d29
   936     vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
   937     vmlsl.u16   q1, d18, d31
   938     vmlal.u16   q1, d19, d31
   939     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   940     vadd.u16    q12, q12, q13
   941 .endm
   943 .macro bilinear_over_8888_8888_process_pixblock_tail
   944     vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
   945     vmlsl.u16   q2, d20, d30
   946     vmlal.u16   q2, d21, d30
   947     vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
   948     vmlsl.u16   q3, d22, d31
   949     vmlal.u16   q3, d23, d31
   950     vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   951     vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   952     vld1.32     {d2, d3}, [OUT, :128]
   953     pld         [OUT, #(prefetch_offset * 4)]
   954     vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   955     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   956     vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   957     vmovn.u16   d6, q0
   958     vmovn.u16   d7, q2
   959     vuzp.8      d6, d7
   960     vuzp.8      d2, d3
   961     vuzp.8      d6, d7
   962     vuzp.8      d2, d3
   963     vdup.32     d4, d7[1]
   964     vmvn.8      d4, d4
   965     vmull.u8    q11, d2, d4
   966     vmull.u8    q2, d3, d4
   967     vrshr.u16   q1, q11, #8
   968     vrshr.u16   q10, q2, #8
   969     vraddhn.u16 d2, q1, q11
   970     vraddhn.u16 d3, q10, q2
   971     vqadd.u8    q3, q1, q3
   972     vuzp.8      d6, d7
   973     vuzp.8      d6, d7
   974     vadd.u16    q12, q12, q13
   975     vst1.32     {d6, d7}, [OUT, :128]!
   976 .endm
   978 .macro bilinear_over_8888_8888_process_pixblock_tail_head
   979                                             vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
   980     mov         TMP1, X, asr #16
   981     add         X, X, UX
   982     add         TMP1, TOP, TMP1, asl #2
   983                                             vmlsl.u16   q2, d20, d30
   984     mov         TMP2, X, asr #16
   985     add         X, X, UX
   986     add         TMP2, TOP, TMP2, asl #2
   987                                             vmlal.u16   q2, d21, d30
   988                                             vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
   989     vld1.32     {d20}, [TMP1], STRIDE
   990                                             vmlsl.u16   q3, d22, d31
   991                                             vmlal.u16   q3, d23, d31
   992     vld1.32     {d21}, [TMP1]
   993     vmull.u8    q8, d20, d28
   994     vmlal.u8    q8, d21, d29
   995                                             vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   996                                             vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   997                                             vld1.32     {d2, d3}, [OUT, :128]
   998                                             pld         [OUT, PF_OFFS]
   999                                             vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
  1000                                             vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1001     vld1.32     {d22}, [TMP2], STRIDE
  1002                                             vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
  1003                                             vmovn.u16   d6, q0
  1004     vld1.32     {d23}, [TMP2]
  1005     vmull.u8    q9, d22, d28
  1006     mov         TMP3, X, asr #16
  1007     add         X, X, UX
  1008     add         TMP3, TOP, TMP3, asl #2
  1009     mov         TMP4, X, asr #16
  1010     add         X, X, UX
  1011     add         TMP4, TOP, TMP4, asl #2
  1012     vmlal.u8    q9, d23, d29
  1013                                             vmovn.u16   d7, q2
  1014     vld1.32     {d22}, [TMP3], STRIDE
  1015                                             vuzp.8      d6, d7
  1016                                             vuzp.8      d2, d3
  1017                                             vuzp.8      d6, d7
  1018                                             vuzp.8      d2, d3
  1019                                             vdup.32     d4, d7[1]
  1020     vld1.32     {d23}, [TMP3]
  1021                                             vmvn.8      d4, d4
  1022     vmull.u8    q10, d22, d28
  1023     vmlal.u8    q10, d23, d29
  1024                                             vmull.u8    q11, d2, d4
  1025                                             vmull.u8    q2, d3, d4
  1026     vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
  1027     vmlsl.u16   q0, d16, d30
  1028                                             vrshr.u16   q1, q11, #8
  1029     vmlal.u16   q0, d17, d30
  1030                                             vrshr.u16   q8, q2, #8
  1031                                             vraddhn.u16 d2, q1, q11
  1032                                             vraddhn.u16 d3, q8, q2
  1033     pld         [TMP4, PF_OFFS]
  1034     vld1.32     {d16}, [TMP4], STRIDE
  1035                                             vqadd.u8    q3, q1, q3
  1036     vld1.32     {d17}, [TMP4]
  1037     pld         [TMP4, PF_OFFS]
  1038     vmull.u8    q11, d16, d28
  1039     vmlal.u8    q11, d17, d29
  1040                                             vuzp.8      d6, d7
  1041     vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
  1042                                             vuzp.8      d6, d7
  1043     vmlsl.u16   q1, d18, d31
  1044                                             vadd.u16    q12, q12, q13
  1045     vmlal.u16   q1, d19, d31
  1046     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1047     vadd.u16    q12, q12, q13
  1048                                             vst1.32     {d6, d7}, [OUT, :128]!
  1049 .endm
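       /*
        * In the tail_head macro above, the right-shifted instructions are the
        * tail of the previous pixel block interleaved with the head of the
        * next one, so that loads and arithmetic overlap and hide each other's
        * latency (the head/tail_head/tail scheme mentioned at the top of this
        * file).
        */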
  1051 /* over_8888_8_8888 */
  1052 .macro bilinear_over_8888_8_8888_process_last_pixel
  1053     bilinear_interpolate_last_pixel 8888, 8, 8888, over
  1054 .endm
  1056 .macro bilinear_over_8888_8_8888_process_two_pixels
  1057     bilinear_interpolate_two_pixels 8888, 8, 8888, over
  1058 .endm
  1060 .macro bilinear_over_8888_8_8888_process_four_pixels
  1061     bilinear_interpolate_four_pixels 8888, 8, 8888, over
  1062 .endm
  1064 .macro bilinear_over_8888_8_8888_process_pixblock_head
  1065     mov         TMP1, X, asr #16
  1066     add         X, X, UX
  1067     add         TMP1, TOP, TMP1, asl #2
  1068     vld1.32     {d0}, [TMP1], STRIDE
  1069     mov         TMP2, X, asr #16
  1070     add         X, X, UX
  1071     add         TMP2, TOP, TMP2, asl #2
  1072     vld1.32     {d1}, [TMP1]
  1073     mov         TMP3, X, asr #16
  1074     add         X, X, UX
  1075     add         TMP3, TOP, TMP3, asl #2
  1076     vld1.32     {d2}, [TMP2], STRIDE
  1077     mov         TMP4, X, asr #16
  1078     add         X, X, UX
  1079     add         TMP4, TOP, TMP4, asl #2
  1080     vld1.32     {d3}, [TMP2]
  1081     vmull.u8    q2, d0, d28
  1082     vmull.u8    q3, d2, d28
  1083     vmlal.u8    q2, d1, d29
  1084     vmlal.u8    q3, d3, d29
  1085     vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
  1086     vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
  1087     vmlsl.u16   q0, d4, d30
  1088     vmlsl.u16   q1, d6, d31
  1089     vmlal.u16   q0, d5, d30
  1090     vmlal.u16   q1, d7, d31
  1091     vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1092     vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1093     vld1.32     {d2}, [TMP3], STRIDE
  1094     vld1.32     {d3}, [TMP3]
  1095     pld         [TMP4, PF_OFFS]
  1096     vld1.32     {d4}, [TMP4], STRIDE
  1097     vld1.32     {d5}, [TMP4]
  1098     pld         [TMP4, PF_OFFS]
  1099     vmull.u8    q3, d2, d28
  1100     vmlal.u8    q3, d3, d29
  1101     vmull.u8    q1, d4, d28
  1102     vmlal.u8    q1, d5, d29
  1103     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1104     vld1.32     {d22[0]}, [MASK]!
  1105     pld         [MASK, #prefetch_offset]
  1106     vadd.u16    q12, q12, q13
  1107     vmovn.u16   d16, q0
  1108 .endm
  1110 .macro bilinear_over_8888_8_8888_process_pixblock_tail
  1111     vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
  1112     vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
  1113     vmlsl.u16   q9, d6, d30
  1114     vmlsl.u16   q10, d2, d31
  1115     vmlal.u16   q9, d7, d30
  1116     vmlal.u16   q10, d3, d31
  1117     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1118     vadd.u16    q12, q12, q13
  1119     vdup.32     d22, d22[0]
  1120     vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
  1121     vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1122     vmovn.u16   d17, q9
  1123     vld1.32     {d18, d19}, [OUT, :128]
  1124     pld         [OUT, PF_OFFS]
  1125     vuzp.8      d16, d17
  1126     vuzp.8      d18, d19
  1127     vuzp.8      d16, d17
  1128     vuzp.8      d18, d19
  1129     vmull.u8    q10, d16, d22
  1130     vmull.u8    q11, d17, d22
  1131     vrsra.u16   q10, q10, #8
  1132     vrsra.u16   q11, q11, #8
  1133     vrshrn.u16  d16, q10, #8
  1134     vrshrn.u16  d17, q11, #8
  1135     vdup.32     d22, d17[1]
  1136     vmvn.8      d22, d22
  1137     vmull.u8    q10, d18, d22
  1138     vmull.u8    q11, d19, d22
  1139     vrshr.u16   q9, q10, #8
  1140     vrshr.u16   q0, q11, #8
  1141     vraddhn.u16 d18, q9, q10
  1142     vraddhn.u16 d19, q0, q11
  1143     vqadd.u8    q9, q8, q9
  1144     vuzp.8      d18, d19
  1145     vuzp.8      d18, d19
  1146     vst1.32     {d18, d19}, [OUT, :128]!
  1147 .endm
  1149 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
  1150                                             vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
  1151     mov         TMP1, X, asr #16
  1152     add         X, X, UX
  1153     add         TMP1, TOP, TMP1, asl #2
  1154                                             vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
  1155     vld1.32     {d0}, [TMP1], STRIDE
  1156     mov         TMP2, X, asr #16
  1157     add         X, X, UX
  1158     add         TMP2, TOP, TMP2, asl #2
  1159                                             vmlsl.u16   q9, d6, d30
  1160                                             vmlsl.u16   q10, d2, d31
  1161     vld1.32     {d1}, [TMP1]
  1162     mov         TMP3, X, asr #16
  1163     add         X, X, UX
  1164     add         TMP3, TOP, TMP3, asl #2
  1165                                             vmlal.u16   q9, d7, d30
  1166                                             vmlal.u16   q10, d3, d31
  1167     vld1.32     {d2}, [TMP2], STRIDE
  1168     mov         TMP4, X, asr #16
  1169     add         X, X, UX
  1170     add         TMP4, TOP, TMP4, asl #2
  1171                                             vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1172                                             vadd.u16    q12, q12, q13
  1173     vld1.32     {d3}, [TMP2]
  1174                                             vdup.32     d22, d22[0]
  1175                                             vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
  1176                                             vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
  1177     vmull.u8    q2, d0, d28
  1178     vmull.u8    q3, d2, d28
  1179                                             vmovn.u16   d17, q9
  1180                                             vld1.32     {d18, d19}, [OUT, :128]
  1181                                             pld         [OUT, #(prefetch_offset * 4)]
  1182     vmlal.u8    q2, d1, d29
  1183     vmlal.u8    q3, d3, d29
  1184                                             vuzp.8      d16, d17
  1185                                             vuzp.8      d18, d19
  1186     vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
  1187     vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
  1188                                             vuzp.8      d16, d17
  1189                                             vuzp.8      d18, d19
  1190     vmlsl.u16   q0, d4, d30
  1191     vmlsl.u16   q1, d6, d31
  1192                                             vmull.u8    q10, d16, d22
  1193                                             vmull.u8    q11, d17, d22
  1194     vmlal.u16   q0, d5, d30
  1195     vmlal.u16   q1, d7, d31
  1196                                             vrsra.u16   q10, q10, #8
  1197                                             vrsra.u16   q11, q11, #8
  1198     vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
  1199     vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
  1200                                             vrshrn.u16  d16, q10, #8
  1201                                             vrshrn.u16  d17, q11, #8
  1202     vld1.32     {d2}, [TMP3], STRIDE
  1203                                             vdup.32     d22, d17[1]
  1204     vld1.32     {d3}, [TMP3]
  1205                                             vmvn.8      d22, d22
  1206     pld         [TMP4, PF_OFFS]
  1207     vld1.32     {d4}, [TMP4], STRIDE
  1208                                             vmull.u8    q10, d18, d22
  1209                                             vmull.u8    q11, d19, d22
  1210     vld1.32     {d5}, [TMP4]
  1211     pld         [TMP4, PF_OFFS]
  1212     vmull.u8    q3, d2, d28
  1213                                             vrshr.u16   q9, q10, #8
  1214                                             vrshr.u16   q15, q11, #8
  1215     vmlal.u8    q3, d3, d29
  1216     vmull.u8    q1, d4, d28
  1217                                             vraddhn.u16 d18, q9, q10
  1218                                             vraddhn.u16 d19, q15, q11
  1219     vmlal.u8    q1, d5, d29
  1220     vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
  1221                                             vqadd.u8    q9, q8, q9
  1222     vld1.32     {d22[0]}, [MASK]!
  1223                                             vuzp.8      d18, d19
  1224     vadd.u16    q12, q12, q13
  1225                                             vuzp.8      d18, d19
  1226     vmovn.u16   d16, q0
  1227                                             vst1.32     {d18, d19}, [OUT, :128]!
  1228 .endm
  1230 /* add_8888_8888 */
  1231 .macro bilinear_add_8888_8888_process_last_pixel
  1232     bilinear_interpolate_last_pixel 8888, x, 8888, add
  1233 .endm
  1235 .macro bilinear_add_8888_8888_process_two_pixels
  1236     bilinear_interpolate_two_pixels 8888, x, 8888, add
  1237 .endm
  1239 .macro bilinear_add_8888_8888_process_four_pixels
  1240     bilinear_interpolate_four_pixels 8888, x, 8888, add
  1241 .endm
  1243 .macro bilinear_add_8888_8888_process_pixblock_head
  1244     bilinear_add_8888_8888_process_four_pixels
  1245 .endm
  1247 .macro bilinear_add_8888_8888_process_pixblock_tail
  1248 .endm
  1250 .macro bilinear_add_8888_8888_process_pixblock_tail_head
  1251     bilinear_add_8888_8888_process_pixblock_tail
  1252     bilinear_add_8888_8888_process_pixblock_head
  1253 .endm
  1255 /* add_8888_8_8888 */
  1256 .macro bilinear_add_8888_8_8888_process_last_pixel
  1257     bilinear_interpolate_last_pixel 8888, 8, 8888, add
  1258 .endm
  1260 .macro bilinear_add_8888_8_8888_process_two_pixels
  1261     bilinear_interpolate_two_pixels 8888, 8, 8888, add
  1262 .endm
  1264 .macro bilinear_add_8888_8_8888_process_four_pixels
  1265     bilinear_interpolate_four_pixels 8888, 8, 8888, add
  1266 .endm
  1268 .macro bilinear_add_8888_8_8888_process_pixblock_head
  1269     bilinear_add_8888_8_8888_process_four_pixels
  1270 .endm
  1272 .macro bilinear_add_8888_8_8888_process_pixblock_tail
  1273 .endm
  1275 .macro bilinear_add_8888_8_8888_process_pixblock_tail_head
  1276     bilinear_add_8888_8_8888_process_pixblock_tail
  1277     bilinear_add_8888_8_8888_process_pixblock_head
  1278 .endm
  1281 /* Bilinear scanline functions */
  1282 generate_bilinear_scanline_func \
  1283     pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
  1284     8888, 8888, 2, 2, \
  1285     bilinear_src_8888_8_8888_process_last_pixel, \
  1286     bilinear_src_8888_8_8888_process_two_pixels, \
  1287     bilinear_src_8888_8_8888_process_four_pixels, \
  1288     bilinear_src_8888_8_8888_process_pixblock_head, \
  1289     bilinear_src_8888_8_8888_process_pixblock_tail, \
  1290     bilinear_src_8888_8_8888_process_pixblock_tail_head, \
  1291     4, 28, BILINEAR_FLAG_USE_MASK
  1293 generate_bilinear_scanline_func \
  1294     pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
  1295     8888, 0565, 2, 1, \
  1296     bilinear_src_8888_8_0565_process_last_pixel, \
  1297     bilinear_src_8888_8_0565_process_two_pixels, \
  1298     bilinear_src_8888_8_0565_process_four_pixels, \
  1299     bilinear_src_8888_8_0565_process_pixblock_head, \
  1300     bilinear_src_8888_8_0565_process_pixblock_tail, \
  1301     bilinear_src_8888_8_0565_process_pixblock_tail_head, \
  1302     4, 28, BILINEAR_FLAG_USE_MASK
  1304 generate_bilinear_scanline_func \
  1305     pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
  1306     0565, 8888, 1, 2, \
  1307     bilinear_src_0565_8_x888_process_last_pixel, \
  1308     bilinear_src_0565_8_x888_process_two_pixels, \
  1309     bilinear_src_0565_8_x888_process_four_pixels, \
  1310     bilinear_src_0565_8_x888_process_pixblock_head, \
  1311     bilinear_src_0565_8_x888_process_pixblock_tail, \
  1312     bilinear_src_0565_8_x888_process_pixblock_tail_head, \
  1313     4, 28, BILINEAR_FLAG_USE_MASK
  1315 generate_bilinear_scanline_func \
  1316     pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
  1317     0565, 0565, 1, 1, \
  1318     bilinear_src_0565_8_0565_process_last_pixel, \
  1319     bilinear_src_0565_8_0565_process_two_pixels, \
  1320     bilinear_src_0565_8_0565_process_four_pixels, \
  1321     bilinear_src_0565_8_0565_process_pixblock_head, \
  1322     bilinear_src_0565_8_0565_process_pixblock_tail, \
  1323     bilinear_src_0565_8_0565_process_pixblock_tail_head, \
  1324     4, 28, BILINEAR_FLAG_USE_MASK
  1326 generate_bilinear_scanline_func \
  1327     pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
  1328     8888, 8888, 2, 2, \
  1329     bilinear_over_8888_8888_process_last_pixel, \
  1330     bilinear_over_8888_8888_process_two_pixels, \
  1331     bilinear_over_8888_8888_process_four_pixels, \
  1332     bilinear_over_8888_8888_process_pixblock_head, \
  1333     bilinear_over_8888_8888_process_pixblock_tail, \
  1334     bilinear_over_8888_8888_process_pixblock_tail_head, \
  1335     4, 28, 0
  1337 generate_bilinear_scanline_func \
  1338     pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
  1339     8888, 8888, 2, 2, \
  1340     bilinear_over_8888_8_8888_process_last_pixel, \
  1341     bilinear_over_8888_8_8888_process_two_pixels, \
  1342     bilinear_over_8888_8_8888_process_four_pixels, \
  1343     bilinear_over_8888_8_8888_process_pixblock_head, \
  1344     bilinear_over_8888_8_8888_process_pixblock_tail, \
  1345     bilinear_over_8888_8_8888_process_pixblock_tail_head, \
  1346     4, 28, BILINEAR_FLAG_USE_MASK
  1348 generate_bilinear_scanline_func \
  1349     pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
  1350     8888, 8888, 2, 2, \
  1351     bilinear_add_8888_8888_process_last_pixel, \
  1352     bilinear_add_8888_8888_process_two_pixels, \
  1353     bilinear_add_8888_8888_process_four_pixels, \
  1354     bilinear_add_8888_8888_process_pixblock_head, \
  1355     bilinear_add_8888_8888_process_pixblock_tail, \
  1356     bilinear_add_8888_8888_process_pixblock_tail_head, \
  1357     4, 28, 0
  1359 generate_bilinear_scanline_func \
  1360     pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
  1361     8888, 8888, 2, 2, \
  1362     bilinear_add_8888_8_8888_process_last_pixel, \
  1363     bilinear_add_8888_8_8888_process_two_pixels, \
  1364     bilinear_add_8888_8_8888_process_four_pixels, \
  1365     bilinear_add_8888_8_8888_process_pixblock_head, \
  1366     bilinear_add_8888_8_8888_process_pixblock_tail, \
  1367     bilinear_add_8888_8_8888_process_pixblock_tail_head, \
  1368     4, 28, BILINEAR_FLAG_USE_MASK
