media/libjpeg/simd/jsimd_arm_neon.S

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libjpeg/simd/jsimd_arm_neon.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2397 @@
     1.4 +/*
     1.5 + * ARM NEON optimizations for libjpeg-turbo
     1.6 + *
     1.7 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
     1.8 + * All rights reserved.
     1.9 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
    1.10 + *
    1.11 + * This software is provided 'as-is', without any express or implied
    1.12 + * warranty.  In no event will the authors be held liable for any damages
    1.13 + * arising from the use of this software.
    1.14 + *
    1.15 + * Permission is granted to anyone to use this software for any purpose,
    1.16 + * including commercial applications, and to alter it and redistribute it
    1.17 + * freely, subject to the following restrictions:
    1.18 + *
    1.19 + * 1. The origin of this software must not be misrepresented; you must not
    1.20 + *    claim that you wrote the original software. If you use this software
    1.21 + *    in a product, an acknowledgment in the product documentation would be
    1.22 + *    appreciated but is not required.
    1.23 + * 2. Altered source versions must be plainly marked as such, and must not be
    1.24 + *    misrepresented as being the original software.
    1.25 + * 3. This notice may not be removed or altered from any source distribution.
    1.26 + */
    1.27 +
    1.28 +#if defined(__linux__) && defined(__ELF__)
    1.29 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
    1.30 +#endif
    1.31 +
    1.32 +.text
    1.33 +.fpu neon
    1.34 +.arch armv7a
    1.35 +.object_arch armv4
    1.36 +.arm
    1.37 +
    1.38 +
    1.39 +#define RESPECT_STRICT_ALIGNMENT 1
    1.40 +
    1.41 +/*****************************************************************************/
    1.42 +
    1.43 +/* Supplementary macro for setting function attributes */
    1.44 +.macro asm_function fname
    1.45 +#ifdef __APPLE__
    1.46 +    .func _\fname
    1.47 +    .globl _\fname
    1.48 +_\fname:
    1.49 +#else
    1.50 +    .func \fname
    1.51 +    .global \fname
    1.52 +#ifdef __ELF__
    1.53 +    .hidden \fname
    1.54 +    .type \fname, %function
    1.55 +#endif
    1.56 +\fname:
    1.57 +#endif
    1.58 +.endm
    1.59 +
    1.60 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */
    1.61 +.macro transpose_4x4 x0, x1, x2, x3
    1.62 +    vtrn.16 \x0, \x1
    1.63 +    vtrn.16 \x2, \x3
    1.64 +    vtrn.32 \x0, \x2
    1.65 +    vtrn.32 \x1, \x3
    1.66 +.endm
    1.67 +
    1.68 +#define CENTERJSAMPLE 128
    1.69 +
    1.70 +/*****************************************************************************/
    1.71 +
    1.72 +/*
    1.73 + * Perform dequantization and inverse DCT on one block of coefficients.
    1.74 + *
    1.75 + * GLOBAL(void)
    1.76 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
    1.77 + *                        JSAMPARRAY output_buf, JDIMENSION output_col)
    1.78 + */
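          +
          + /* The descale shifts used below presumably mirror the reference
          +  * jpeg_idct_islow in jidctint.c (CONST_BITS = 13, PASS1_BITS = 2):
          +  * pass 1 descales by CONST_BITS - PASS1_BITS = 11 (the 'vrshrn #11'
          +  * narrowing shifts), while pass 2 descales by
          +  * CONST_BITS + PASS1_BITS + 3 = 18, split into a 'vshrn #16' narrowing
          +  * shift plus the final 'vqrshrn #2' in the descale/range-limit epilogue.
          +  */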
    1.79 +
    1.80 +#define FIX_0_298631336  (2446)
    1.81 +#define FIX_0_390180644  (3196)
    1.82 +#define FIX_0_541196100  (4433)
    1.83 +#define FIX_0_765366865  (6270)
    1.84 +#define FIX_0_899976223  (7373)
    1.85 +#define FIX_1_175875602  (9633)
    1.86 +#define FIX_1_501321110  (12299)
    1.87 +#define FIX_1_847759065  (15137)
    1.88 +#define FIX_1_961570560  (16069)
    1.89 +#define FIX_2_053119869  (16819)
    1.90 +#define FIX_2_562915447  (20995)
    1.91 +#define FIX_3_072711026  (25172)
    1.92 +
    1.93 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
    1.94 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
    1.95 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
    1.96 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
    1.97 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
    1.98 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
    1.99 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
   1.100 +#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
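          +
          + /* A sketch of why the combined constants above exist (presumably the same
          +  * 'z5' rearrangement used by the reference jpeg_idct_islow in jidctint.c):
          +  * instead of materializing the shared term
          +  *     z5 = (z3 + z4) * 1.175875602
          +  * and then computing z3 * (-1.961570560) + z5, the identity
          +  *     z3 * (-1.961570560) + (z3 + z4) * 1.175875602
          +  *       == z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602
          +  * folds z5 into per-operand constants, so each odd-part term can be built
          +  * as a plain vmull/vmlal multiply-accumulate chain (see how q6/q7 are
          +  * formed in REF_1D_IDCT below).
          +  */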
   1.101 +
   1.102 +/*
   1.103 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
   1.104 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
   1.105 + */
   1.106 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
   1.107 +{                                                                             \
   1.108 +    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
   1.109 +    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
   1.110 +    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
   1.111 +                                                                              \
   1.112 +    /* 1-D iDCT input data */                                                 \
   1.113 +    row0 = xrow0;                                                             \
   1.114 +    row1 = xrow1;                                                             \
   1.115 +    row2 = xrow2;                                                             \
   1.116 +    row3 = xrow3;                                                             \
   1.117 +    row4 = xrow4;                                                             \
   1.118 +    row5 = xrow5;                                                             \
   1.119 +    row6 = xrow6;                                                             \
   1.120 +    row7 = xrow7;                                                             \
   1.121 +                                                                              \
   1.122 +    q5 = row7 + row3;                                                         \
   1.123 +    q4 = row5 + row1;                                                         \
   1.124 +    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
   1.125 +         MULTIPLY(q4, FIX_1_175875602);                                       \
   1.126 +    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
   1.127 +         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
   1.128 +    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
   1.129 +         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
   1.130 +    q4 = q6;                                                                  \
   1.131 +    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
   1.132 +    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
   1.133 +          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
   1.134 +    /* now we can use q1 (reloadable constants have been used up) */          \
   1.135 +    q1 = q3 + q2;                                                             \
   1.136 +    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
   1.137 +          MULTIPLY(row1, -FIX_0_899976223);                                   \
   1.138 +    q5 = q7;                                                                  \
   1.139 +    q1 = q1 + q6;                                                             \
   1.140 +    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
   1.141 +          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
   1.142 +                                                                              \
   1.143 +    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
   1.144 +    tmp11_plus_tmp2 = q1;                                                     \
   1.145 +    row1 = 0;                                                                 \
   1.146 +                                                                              \
   1.147 +    q1 = q1 - q6;                                                             \
   1.148 +    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
   1.149 +          MULTIPLY(row3, -FIX_2_562915447);                                   \
   1.150 +    q1 = q1 - q6;                                                             \
   1.151 +    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
   1.152 +         MULTIPLY(row6, FIX_0_541196100);                                     \
   1.153 +    q3 = q3 - q2;                                                             \
   1.154 +                                                                              \
   1.155 +    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
   1.156 +    tmp11_minus_tmp2 = q1;                                                    \
   1.157 +                                                                              \
   1.158 +    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
   1.159 +    q2 = q1 + q6;                                                             \
   1.160 +    q1 = q1 - q6;                                                             \
   1.161 +                                                                              \
   1.162 +    /* pick up the results */                                                 \
   1.163 +    tmp0  = q4;                                                               \
   1.164 +    tmp1  = q5;                                                               \
   1.165 +    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
   1.166 +    tmp3  = q7;                                                               \
   1.167 +    tmp10 = q2;                                                               \
   1.168 +    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
   1.169 +    tmp12 = q3;                                                               \
   1.170 +    tmp13 = q1;                                                               \
   1.171 +}
   1.172 +
   1.173 +#define XFIX_0_899976223                    d0[0]
   1.174 +#define XFIX_0_541196100                    d0[1]
   1.175 +#define XFIX_2_562915447                    d0[2]
   1.176 +#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
   1.177 +#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
   1.178 +#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
   1.179 +#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
   1.180 +#define XFIX_1_175875602                    d1[3]
   1.181 +#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
   1.182 +#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
   1.183 +#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
   1.184 +#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
   1.185 +
   1.186 +.balign 16
   1.187 +jsimd_idct_islow_neon_consts:
   1.188 +    .short FIX_0_899976223                    /* d0[0] */
   1.189 +    .short FIX_0_541196100                    /* d0[1] */
   1.190 +    .short FIX_2_562915447                    /* d0[2] */
   1.191 +    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
   1.192 +    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
   1.193 +    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
   1.194 +    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
   1.195 +    .short FIX_1_175875602                    /* d1[3] */
   1.196 +    /* reloadable constants */
   1.197 +    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
   1.198 +    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
   1.199 +    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
   1.200 +    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
   1.201 +
   1.202 +asm_function jsimd_idct_islow_neon
   1.203 +
   1.204 +    DCT_TABLE       .req r0
   1.205 +    COEF_BLOCK      .req r1
   1.206 +    OUTPUT_BUF      .req r2
   1.207 +    OUTPUT_COL      .req r3
   1.208 +    TMP1            .req r0
   1.209 +    TMP2            .req r1
   1.210 +    TMP3            .req r2
   1.211 +    TMP4            .req ip
   1.212 +
   1.213 +    ROW0L           .req d16
   1.214 +    ROW0R           .req d17
   1.215 +    ROW1L           .req d18
   1.216 +    ROW1R           .req d19
   1.217 +    ROW2L           .req d20
   1.218 +    ROW2R           .req d21
   1.219 +    ROW3L           .req d22
   1.220 +    ROW3R           .req d23
   1.221 +    ROW4L           .req d24
   1.222 +    ROW4R           .req d25
   1.223 +    ROW5L           .req d26
   1.224 +    ROW5R           .req d27
   1.225 +    ROW6L           .req d28
   1.226 +    ROW6R           .req d29
   1.227 +    ROW7L           .req d30
   1.228 +    ROW7R           .req d31
   1.229 +
   1.230 +    /* Load and dequantize coefficients into NEON registers
   1.231 +     * with the following allocation:
   1.232 +     *       0 1 2 3 | 4 5 6 7
   1.233 +     *      ---------+--------
   1.234 +     *   0 | d16     | d17     ( q8  )
   1.235 +     *   1 | d18     | d19     ( q9  )
   1.236 +     *   2 | d20     | d21     ( q10 )
   1.237 +     *   3 | d22     | d23     ( q11 )
   1.238 +     *   4 | d24     | d25     ( q12 )
   1.239 +     *   5 | d26     | d27     ( q13 )
   1.240 +     *   6 | d28     | d29     ( q14 )
   1.241 +     *   7 | d30     | d31     ( q15 )
   1.242 +     */
   1.243 +    adr             ip, jsimd_idct_islow_neon_consts
   1.244 +    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
   1.245 +    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
   1.246 +    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
   1.247 +    vmul.s16        q8, q8, q0
   1.248 +    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
   1.249 +    vmul.s16        q9, q9, q1
   1.250 +    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
   1.251 +    vmul.s16        q10, q10, q2
   1.252 +    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
   1.253 +    vmul.s16        q11, q11, q3
   1.254 +    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
   1.255 +    vmul.s16        q12, q12, q0
   1.256 +    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
   1.257 +    vmul.s16        q14, q14, q2
   1.258 +    vmul.s16        q13, q13, q1
   1.259 +    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
   1.260 +    add             ip, ip, #16
   1.261 +    vmul.s16        q15, q15, q3
   1.262 +    vpush           {d8-d15} /* save NEON registers */
   1.263 +    /* 1-D IDCT, pass 1, left 4x8 half */
   1.264 +    vadd.s16        d4,    ROW7L, ROW3L
   1.265 +    vadd.s16        d5,    ROW5L, ROW1L
   1.266 +    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
   1.267 +    vmlal.s16       q6,    d5,    XFIX_1_175875602
   1.268 +    vmull.s16       q7,    d4,    XFIX_1_175875602
   1.269 +      /* Check for the zero coefficients in the right 4x8 half */
   1.270 +      push            {r4, r5}
   1.271 +    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
   1.272 +    vsubl.s16       q3,    ROW0L, ROW4L
   1.273 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
   1.274 +    vmull.s16       q2,    ROW2L, XFIX_0_541196100
   1.275 +    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
   1.276 +      orr             r0,    r4,    r5
   1.277 +    vmov            q4,    q6
   1.278 +    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
   1.279 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
   1.280 +    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
   1.281 +    vshl.s32        q3,    q3,    #13
   1.282 +      orr             r0,    r0,    r4
   1.283 +    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
   1.284 +      orr             r0,    r0,    r5
   1.285 +    vadd.s32        q1,    q3,    q2
   1.286 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
   1.287 +    vmov            q5,    q7
   1.288 +    vadd.s32        q1,    q1,    q6
   1.289 +      orr             r0,    r0,    r4
   1.290 +    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
   1.291 +      orr             r0,    r0,    r5
   1.292 +    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
   1.293 +    vrshrn.s32      ROW1L, q1,    #11
   1.294 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
   1.295 +    vsub.s32        q1,    q1,    q6
   1.296 +    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
   1.297 +      orr             r0,    r0,    r4
   1.298 +    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
   1.299 +      orr             r0,    r0,    r5
   1.300 +    vsub.s32        q1,    q1,    q6
   1.301 +    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
   1.302 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
   1.303 +    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
   1.304 +    vsub.s32        q3,    q3,    q2
   1.305 +      orr             r0,    r0,    r4
   1.306 +    vrshrn.s32      ROW6L, q1,    #11
   1.307 +      orr             r0,    r0,    r5
   1.308 +    vadd.s32        q1,    q3,    q5
   1.309 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
   1.310 +    vsub.s32        q3,    q3,    q5
   1.311 +    vaddl.s16       q5,    ROW0L, ROW4L
   1.312 +      orr             r0,    r0,    r4
   1.313 +    vrshrn.s32      ROW2L, q1,    #11
   1.314 +      orr             r0,    r0,    r5
   1.315 +    vrshrn.s32      ROW5L, q3,    #11
   1.316 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
   1.317 +    vshl.s32        q5,    q5,    #13
   1.318 +    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
   1.319 +      orr             r0,    r0,    r4
   1.320 +    vadd.s32        q2,    q5,    q6
   1.321 +      orrs            r0,    r0,    r5
   1.322 +    vsub.s32        q1,    q5,    q6
   1.323 +    vadd.s32        q6,    q2,    q7
   1.324 +      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
   1.325 +    vsub.s32        q2,    q2,    q7
   1.326 +    vadd.s32        q5,    q1,    q4
   1.327 +      orr             r0,    r4,    r5
   1.328 +    vsub.s32        q3,    q1,    q4
   1.329 +      pop             {r4, r5}
   1.330 +    vrshrn.s32      ROW7L, q2,    #11
   1.331 +    vrshrn.s32      ROW3L, q5,    #11
   1.332 +    vrshrn.s32      ROW0L, q6,    #11
   1.333 +    vrshrn.s32      ROW4L, q3,    #11
   1.334 +
    1.335 +      beq             3f /* Go to special handling for the sparse right 4x8 half */
   1.336 +
   1.337 +    /* 1-D IDCT, pass 1, right 4x8 half */
   1.338 +    vld1.s16        {d2},  [ip, :64]    /* reload constants */
   1.339 +    vadd.s16        d10,   ROW7R, ROW3R
   1.340 +    vadd.s16        d8,    ROW5R, ROW1R
   1.341 +      /* Transpose left 4x8 half */
   1.342 +      vtrn.16         ROW6L, ROW7L
   1.343 +    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
   1.344 +    vmlal.s16       q6,    d8,    XFIX_1_175875602
   1.345 +      vtrn.16         ROW2L, ROW3L
   1.346 +    vmull.s16       q7,    d10,   XFIX_1_175875602
   1.347 +    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
   1.348 +      vtrn.16         ROW0L, ROW1L
   1.349 +    vsubl.s16       q3,    ROW0R, ROW4R
   1.350 +    vmull.s16       q2,    ROW2R, XFIX_0_541196100
   1.351 +    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
   1.352 +      vtrn.16         ROW4L, ROW5L
   1.353 +    vmov            q4,    q6
   1.354 +    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
   1.355 +    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
   1.356 +      vtrn.32         ROW1L, ROW3L
   1.357 +    vshl.s32        q3,    q3,    #13
   1.358 +    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
   1.359 +      vtrn.32         ROW4L, ROW6L
   1.360 +    vadd.s32        q1,    q3,    q2
   1.361 +    vmov            q5,    q7
   1.362 +    vadd.s32        q1,    q1,    q6
   1.363 +      vtrn.32         ROW0L, ROW2L
   1.364 +    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
   1.365 +    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
   1.366 +    vrshrn.s32      ROW1R, q1,    #11
   1.367 +      vtrn.32         ROW5L, ROW7L
   1.368 +    vsub.s32        q1,    q1,    q6
   1.369 +    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
   1.370 +    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
   1.371 +    vsub.s32        q1,    q1,    q6
   1.372 +    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
   1.373 +    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
   1.374 +    vsub.s32        q3,    q3,    q2
   1.375 +    vrshrn.s32      ROW6R, q1,    #11
   1.376 +    vadd.s32        q1,    q3,    q5
   1.377 +    vsub.s32        q3,    q3,    q5
   1.378 +    vaddl.s16       q5,    ROW0R, ROW4R
   1.379 +    vrshrn.s32      ROW2R, q1,    #11
   1.380 +    vrshrn.s32      ROW5R, q3,    #11
   1.381 +    vshl.s32        q5,    q5,    #13
   1.382 +    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
   1.383 +    vadd.s32        q2,    q5,    q6
   1.384 +    vsub.s32        q1,    q5,    q6
   1.385 +    vadd.s32        q6,    q2,    q7
   1.386 +    vsub.s32        q2,    q2,    q7
   1.387 +    vadd.s32        q5,    q1,    q4
   1.388 +    vsub.s32        q3,    q1,    q4
   1.389 +    vrshrn.s32      ROW7R, q2,    #11
   1.390 +    vrshrn.s32      ROW3R, q5,    #11
   1.391 +    vrshrn.s32      ROW0R, q6,    #11
   1.392 +    vrshrn.s32      ROW4R, q3,    #11
   1.393 +    /* Transpose right 4x8 half */
   1.394 +    vtrn.16         ROW6R, ROW7R
   1.395 +    vtrn.16         ROW2R, ROW3R
   1.396 +    vtrn.16         ROW0R, ROW1R
   1.397 +    vtrn.16         ROW4R, ROW5R
   1.398 +    vtrn.32         ROW1R, ROW3R
   1.399 +    vtrn.32         ROW4R, ROW6R
   1.400 +    vtrn.32         ROW0R, ROW2R
   1.401 +    vtrn.32         ROW5R, ROW7R
   1.402 +
   1.403 +1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
   1.404 +    vld1.s16        {d2},  [ip, :64]    /* reload constants */
   1.405 +    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
   1.406 +    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
   1.407 +    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
   1.408 +    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
   1.409 +    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
   1.410 +    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
   1.411 +    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
   1.412 +    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
   1.413 +    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
   1.414 +    vmull.s16       q2,    ROW2L, XFIX_0_541196100
   1.415 +    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
   1.416 +    vmov            q4,    q6
   1.417 +    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
   1.418 +    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
   1.419 +    vshl.s32        q3,    q3,    #13
   1.420 +    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
   1.421 +    vadd.s32        q1,    q3,    q2
   1.422 +    vmov            q5,    q7
   1.423 +    vadd.s32        q1,    q1,    q6
   1.424 +    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
   1.425 +    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
   1.426 +    vshrn.s32       ROW1L, q1,    #16
   1.427 +    vsub.s32        q1,    q1,    q6
   1.428 +    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
   1.429 +    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
   1.430 +    vsub.s32        q1,    q1,    q6
   1.431 +    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
   1.432 +    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
   1.433 +    vsub.s32        q3,    q3,    q2
   1.434 +    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
   1.435 +    vadd.s32        q1,    q3,    q5
   1.436 +    vsub.s32        q3,    q3,    q5
   1.437 +    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
   1.438 +    vshrn.s32       ROW2L, q1,    #16
   1.439 +    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
   1.440 +    vshl.s32        q5,    q5,    #13
   1.441 +    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
   1.442 +    vadd.s32        q2,    q5,    q6
   1.443 +    vsub.s32        q1,    q5,    q6
   1.444 +    vadd.s32        q6,    q2,    q7
   1.445 +    vsub.s32        q2,    q2,    q7
   1.446 +    vadd.s32        q5,    q1,    q4
   1.447 +    vsub.s32        q3,    q1,    q4
   1.448 +    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
   1.449 +    vshrn.s32       ROW3L, q5,    #16
   1.450 +    vshrn.s32       ROW0L, q6,    #16
   1.451 +    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
   1.452 +    /* 1-D IDCT, pass 2, right 4x8 half */
   1.453 +    vld1.s16        {d2},  [ip, :64]    /* reload constants */
   1.454 +    vmull.s16       q6,    ROW5R, XFIX_1_175875602
   1.455 +    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
   1.456 +    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
   1.457 +    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
   1.458 +    vmull.s16       q7,    ROW7R, XFIX_1_175875602
   1.459 +    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
   1.460 +    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
   1.461 +    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
   1.462 +    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
   1.463 +    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
   1.464 +    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
   1.465 +    vmov            q4,    q6
   1.466 +    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
   1.467 +    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
   1.468 +    vshl.s32        q3,    q3,    #13
   1.469 +    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
   1.470 +    vadd.s32        q1,    q3,    q2
   1.471 +    vmov            q5,    q7
   1.472 +    vadd.s32        q1,    q1,    q6
   1.473 +    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
   1.474 +    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
   1.475 +    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
   1.476 +    vsub.s32        q1,    q1,    q6
   1.477 +    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
   1.478 +    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
   1.479 +    vsub.s32        q1,    q1,    q6
   1.480 +    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
   1.481 +    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
   1.482 +    vsub.s32        q3,    q3,    q2
   1.483 +    vshrn.s32       ROW6R, q1,    #16
   1.484 +    vadd.s32        q1,    q3,    q5
   1.485 +    vsub.s32        q3,    q3,    q5
   1.486 +    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
   1.487 +    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
   1.488 +    vshrn.s32       ROW5R, q3,    #16
   1.489 +    vshl.s32        q5,    q5,    #13
   1.490 +    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
   1.491 +    vadd.s32        q2,    q5,    q6
   1.492 +    vsub.s32        q1,    q5,    q6
   1.493 +    vadd.s32        q6,    q2,    q7
   1.494 +    vsub.s32        q2,    q2,    q7
   1.495 +    vadd.s32        q5,    q1,    q4
   1.496 +    vsub.s32        q3,    q1,    q4
   1.497 +    vshrn.s32       ROW7R, q2,    #16
   1.498 +    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
   1.499 +    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
   1.500 +    vshrn.s32       ROW4R, q3,    #16
   1.501 +
   1.502 +2:  /* Descale to 8-bit and range limit */
   1.503 +    vqrshrn.s16     d16,   q8,    #2
   1.504 +    vqrshrn.s16     d17,   q9,    #2
   1.505 +    vqrshrn.s16     d18,   q10,   #2
   1.506 +    vqrshrn.s16     d19,   q11,   #2
   1.507 +    vpop            {d8-d15} /* restore NEON registers */
   1.508 +    vqrshrn.s16     d20,   q12,   #2
   1.509 +      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
   1.510 +      vtrn.16         q8,    q9
   1.511 +    vqrshrn.s16     d21,   q13,   #2
   1.512 +    vqrshrn.s16     d22,   q14,   #2
   1.513 +      vmov.u8         q0,    #(CENTERJSAMPLE)
   1.514 +    vqrshrn.s16     d23,   q15,   #2
   1.515 +      vtrn.8          d16,   d17
   1.516 +      vtrn.8          d18,   d19
   1.517 +      vadd.u8         q8,    q8,    q0
   1.518 +      vadd.u8         q9,    q9,    q0
   1.519 +      vtrn.16         q10,   q11
   1.520 +        /* Store results to the output buffer */
   1.521 +        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
   1.522 +        add             TMP1, TMP1, OUTPUT_COL
   1.523 +        add             TMP2, TMP2, OUTPUT_COL
   1.524 +        vst1.8          {d16}, [TMP1]
   1.525 +      vtrn.8          d20, d21
   1.526 +        vst1.8          {d17}, [TMP2]
   1.527 +        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
   1.528 +        add             TMP1, TMP1, OUTPUT_COL
   1.529 +        add             TMP2, TMP2, OUTPUT_COL
   1.530 +        vst1.8          {d18}, [TMP1]
   1.531 +      vadd.u8         q10,   q10,   q0
   1.532 +        vst1.8          {d19}, [TMP2]
   1.533 +        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
   1.534 +        add             TMP1, TMP1, OUTPUT_COL
   1.535 +        add             TMP2, TMP2, OUTPUT_COL
   1.536 +        add             TMP3, TMP3, OUTPUT_COL
   1.537 +        add             TMP4, TMP4, OUTPUT_COL
   1.538 +      vtrn.8          d22, d23
   1.539 +        vst1.8          {d20}, [TMP1]
   1.540 +      vadd.u8         q11,   q11,   q0
   1.541 +        vst1.8          {d21}, [TMP2]
   1.542 +        vst1.8          {d22}, [TMP3]
   1.543 +        vst1.8          {d23}, [TMP4]
   1.544 +    bx              lr
   1.545 +
   1.546 +3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
   1.547 +
   1.548 +    /* Transpose left 4x8 half */
   1.549 +    vtrn.16         ROW6L, ROW7L
   1.550 +    vtrn.16         ROW2L, ROW3L
   1.551 +    vtrn.16         ROW0L, ROW1L
   1.552 +    vtrn.16         ROW4L, ROW5L
   1.553 +    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
   1.554 +    vtrn.32         ROW1L, ROW3L
   1.555 +    vtrn.32         ROW4L, ROW6L
   1.556 +    vtrn.32         ROW0L, ROW2L
   1.557 +    vtrn.32         ROW5L, ROW7L
   1.558 +
   1.559 +    cmp             r0, #0
   1.560 +    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
   1.561 +
   1.562 +    /* Only row 0 is non-zero for the right 4x8 half  */
   1.563 +    vdup.s16        ROW1R, ROW0R[1]
   1.564 +    vdup.s16        ROW2R, ROW0R[2]
   1.565 +    vdup.s16        ROW3R, ROW0R[3]
   1.566 +    vdup.s16        ROW4R, ROW0R[0]
   1.567 +    vdup.s16        ROW5R, ROW0R[1]
   1.568 +    vdup.s16        ROW6R, ROW0R[2]
   1.569 +    vdup.s16        ROW7R, ROW0R[3]
   1.570 +    vdup.s16        ROW0R, ROW0R[0]
   1.571 +    b               1b /* Go to 'normal' second pass */
   1.572 +
   1.573 +4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
   1.574 +    vld1.s16        {d2},  [ip, :64]    /* reload constants */
   1.575 +    vmull.s16       q6,    ROW1L, XFIX_1_175875602
   1.576 +    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
   1.577 +    vmull.s16       q7,    ROW3L, XFIX_1_175875602
   1.578 +    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
   1.579 +    vmull.s16       q2,    ROW2L, XFIX_0_541196100
   1.580 +    vshll.s16       q3,    ROW0L, #13
   1.581 +    vmov            q4,    q6
   1.582 +    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
   1.583 +    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
   1.584 +    vadd.s32        q1,    q3,    q2
   1.585 +    vmov            q5,    q7
   1.586 +    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
   1.587 +    vadd.s32        q1,    q1,    q6
   1.588 +    vadd.s32        q6,    q6,    q6
   1.589 +    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
   1.590 +    vshrn.s32       ROW1L, q1,    #16
   1.591 +    vsub.s32        q1,    q1,    q6
   1.592 +    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
   1.593 +    vsub.s32        q3,    q3,    q2
   1.594 +    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
   1.595 +    vadd.s32        q1,    q3,    q5
   1.596 +    vsub.s32        q3,    q3,    q5
   1.597 +    vshll.s16       q5,    ROW0L, #13
   1.598 +    vshrn.s32       ROW2L, q1,    #16
   1.599 +    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
   1.600 +    vadd.s32        q2,    q5,    q6
   1.601 +    vsub.s32        q1,    q5,    q6
   1.602 +    vadd.s32        q6,    q2,    q7
   1.603 +    vsub.s32        q2,    q2,    q7
   1.604 +    vadd.s32        q5,    q1,    q4
   1.605 +    vsub.s32        q3,    q1,    q4
   1.606 +    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
   1.607 +    vshrn.s32       ROW3L, q5,    #16
   1.608 +    vshrn.s32       ROW0L, q6,    #16
   1.609 +    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
   1.610 +    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
   1.611 +    vld1.s16        {d2},  [ip, :64]    /* reload constants */
   1.612 +    vmull.s16       q6,    ROW5L, XFIX_1_175875602
   1.613 +    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
   1.614 +    vmull.s16       q7,    ROW7L, XFIX_1_175875602
   1.615 +    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
   1.616 +    vmull.s16       q2,    ROW6L, XFIX_0_541196100
   1.617 +    vshll.s16       q3,    ROW4L, #13
   1.618 +    vmov            q4,    q6
   1.619 +    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
   1.620 +    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
   1.621 +    vadd.s32        q1,    q3,    q2
   1.622 +    vmov            q5,    q7
   1.623 +    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
   1.624 +    vadd.s32        q1,    q1,    q6
   1.625 +    vadd.s32        q6,    q6,    q6
   1.626 +    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
   1.627 +    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
   1.628 +    vsub.s32        q1,    q1,    q6
   1.629 +    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
   1.630 +    vsub.s32        q3,    q3,    q2
   1.631 +    vshrn.s32       ROW6R, q1,    #16
   1.632 +    vadd.s32        q1,    q3,    q5
   1.633 +    vsub.s32        q3,    q3,    q5
   1.634 +    vshll.s16       q5,    ROW4L, #13
   1.635 +    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
   1.636 +    vshrn.s32       ROW5R, q3,    #16
   1.637 +    vadd.s32        q2,    q5,    q6
   1.638 +    vsub.s32        q1,    q5,    q6
   1.639 +    vadd.s32        q6,    q2,    q7
   1.640 +    vsub.s32        q2,    q2,    q7
   1.641 +    vadd.s32        q5,    q1,    q4
   1.642 +    vsub.s32        q3,    q1,    q4
   1.643 +    vshrn.s32       ROW7R, q2,    #16
   1.644 +    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
   1.645 +    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
   1.646 +    vshrn.s32       ROW4R, q3,    #16
   1.647 +    b               2b /* Go to epilogue */
   1.648 +
   1.649 +    .unreq          DCT_TABLE
   1.650 +    .unreq          COEF_BLOCK
   1.651 +    .unreq          OUTPUT_BUF
   1.652 +    .unreq          OUTPUT_COL
   1.653 +    .unreq          TMP1
   1.654 +    .unreq          TMP2
   1.655 +    .unreq          TMP3
   1.656 +    .unreq          TMP4
   1.657 +
   1.658 +    .unreq          ROW0L
   1.659 +    .unreq          ROW0R
   1.660 +    .unreq          ROW1L
   1.661 +    .unreq          ROW1R
   1.662 +    .unreq          ROW2L
   1.663 +    .unreq          ROW2R
   1.664 +    .unreq          ROW3L
   1.665 +    .unreq          ROW3R
   1.666 +    .unreq          ROW4L
   1.667 +    .unreq          ROW4R
   1.668 +    .unreq          ROW5L
   1.669 +    .unreq          ROW5R
   1.670 +    .unreq          ROW6L
   1.671 +    .unreq          ROW6R
   1.672 +    .unreq          ROW7L
   1.673 +    .unreq          ROW7R
   1.674 +.endfunc
   1.675 +
   1.676 +/*****************************************************************************/
   1.677 +
   1.678 +/*
   1.679 + * jsimd_idct_ifast_neon
   1.680 + *
   1.681 + * This function contains a fast, not so accurate integer implementation of
   1.682 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
   1.683 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
   1.684 + * function from jidctfst.c
   1.685 + *
    1.686 + * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
    1.687 + * But in the ARM NEON case some extra additions are required because the
    1.688 + * VQDMULH instruction can't handle constants larger than 1. So expressions
    1.689 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
    1.690 + * which introduces an extra addition. Overall, there are 6 extra additions
    1.691 + * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
   1.692 + */
   1.693 +
   1.694 +#define XFIX_1_082392200 d0[0]
   1.695 +#define XFIX_1_414213562 d0[1]
   1.696 +#define XFIX_1_847759065 d0[2]
   1.697 +#define XFIX_2_613125930 d0[3]
   1.698 +
   1.699 +.balign 16
   1.700 +jsimd_idct_ifast_neon_consts:
   1.701 +    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
   1.702 +    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
   1.703 +    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
   1.704 +    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
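          +
          + /* A worked example of the encoding above (a sketch; based on the standard
          +  * VQDMULH semantics, result ~= (a * b * 2) >> 16, i.e. a multiply by
          +  * b / 32768): XFIX_1_082392200 = 277 * 128 - 256 * 128 = 2688, and
          +  * 2688 / 32768 ~= 0.082, so a VQDMULH by this constant followed by adding
          +  * x back yields x * (1 + 0.082...) ~= x * 1.082392200, which is exactly
          +  * the "x * 0.082392200 + x" rewrite described in the comment above.
          +  */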
   1.705 +
   1.706 +asm_function jsimd_idct_ifast_neon
   1.707 +
   1.708 +    DCT_TABLE       .req r0
   1.709 +    COEF_BLOCK      .req r1
   1.710 +    OUTPUT_BUF      .req r2
   1.711 +    OUTPUT_COL      .req r3
   1.712 +    TMP1            .req r0
   1.713 +    TMP2            .req r1
   1.714 +    TMP3            .req r2
   1.715 +    TMP4            .req ip
   1.716 +
   1.717 +    /* Load and dequantize coefficients into NEON registers
   1.718 +     * with the following allocation:
   1.719 +     *       0 1 2 3 | 4 5 6 7
   1.720 +     *      ---------+--------
   1.721 +     *   0 | d16     | d17     ( q8  )
   1.722 +     *   1 | d18     | d19     ( q9  )
   1.723 +     *   2 | d20     | d21     ( q10 )
   1.724 +     *   3 | d22     | d23     ( q11 )
   1.725 +     *   4 | d24     | d25     ( q12 )
   1.726 +     *   5 | d26     | d27     ( q13 )
   1.727 +     *   6 | d28     | d29     ( q14 )
   1.728 +     *   7 | d30     | d31     ( q15 )
   1.729 +     */
   1.730 +    adr             ip, jsimd_idct_ifast_neon_consts
   1.731 +    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
   1.732 +    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
   1.733 +    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
   1.734 +    vmul.s16        q8,  q8,  q0
   1.735 +    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
   1.736 +    vmul.s16        q9,  q9,  q1
   1.737 +    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
   1.738 +    vmul.s16        q10, q10, q2
   1.739 +    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
   1.740 +    vmul.s16        q11, q11, q3
   1.741 +    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
   1.742 +    vmul.s16        q12, q12, q0
   1.743 +    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
   1.744 +    vmul.s16        q14, q14, q2
   1.745 +    vmul.s16        q13, q13, q1
   1.746 +    vld1.16         {d0}, [ip, :64] /* load constants */
   1.747 +    vmul.s16        q15, q15, q3
   1.748 +    vpush           {d8-d13}        /* save NEON registers */
   1.749 +    /* 1-D IDCT, pass 1 */
   1.750 +    vsub.s16        q2,  q10, q14
   1.751 +    vadd.s16        q14, q10, q14
   1.752 +    vsub.s16        q1,  q11, q13
   1.753 +    vadd.s16        q13, q11, q13
   1.754 +    vsub.s16        q5,  q9,  q15
   1.755 +    vadd.s16        q15, q9,  q15
   1.756 +    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
   1.757 +    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
   1.758 +    vadd.s16        q3,  q1,  q1
   1.759 +    vsub.s16        q1,  q5,  q1
   1.760 +    vadd.s16        q10, q2,  q4
   1.761 +    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
   1.762 +    vsub.s16        q2,  q15, q13
   1.763 +    vadd.s16        q3,  q3,  q6
   1.764 +    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
   1.765 +    vadd.s16        q1,  q1,  q4
   1.766 +    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
   1.767 +    vsub.s16        q10, q10, q14
   1.768 +    vadd.s16        q2,  q2,  q6
   1.769 +    vsub.s16        q6,  q8,  q12
   1.770 +    vadd.s16        q12, q8,  q12
   1.771 +    vadd.s16        q9,  q5,  q4
   1.772 +    vadd.s16        q5,  q6,  q10
   1.773 +    vsub.s16        q10, q6,  q10
   1.774 +    vadd.s16        q6,  q15, q13
   1.775 +    vadd.s16        q8,  q12, q14
   1.776 +    vsub.s16        q3,  q6,  q3
   1.777 +    vsub.s16        q12, q12, q14
   1.778 +    vsub.s16        q3,  q3,  q1
   1.779 +    vsub.s16        q1,  q9,  q1
   1.780 +    vadd.s16        q2,  q3,  q2
   1.781 +    vsub.s16        q15, q8,  q6
   1.782 +    vadd.s16        q1,  q1,  q2
   1.783 +    vadd.s16        q8,  q8,  q6
   1.784 +    vadd.s16        q14, q5,  q3
   1.785 +    vsub.s16        q9,  q5,  q3
   1.786 +    vsub.s16        q13, q10, q2
   1.787 +    vadd.s16        q10, q10, q2
   1.788 +      /* Transpose */
   1.789 +      vtrn.16         q8,  q9
   1.790 +    vsub.s16        q11, q12, q1
   1.791 +      vtrn.16         q14, q15
   1.792 +    vadd.s16        q12, q12, q1
   1.793 +      vtrn.16         q10, q11
   1.794 +      vtrn.16         q12, q13
   1.795 +      vtrn.32         q9,  q11
   1.796 +      vtrn.32         q12, q14
   1.797 +      vtrn.32         q8,  q10
   1.798 +      vtrn.32         q13, q15
   1.799 +      vswp            d28, d21
   1.800 +      vswp            d26, d19
   1.801 +    /* 1-D IDCT, pass 2 */
   1.802 +    vsub.s16        q2,  q10, q14
   1.803 +      vswp            d30, d23
   1.804 +    vadd.s16        q14, q10, q14
   1.805 +      vswp            d24, d17
   1.806 +    vsub.s16        q1,  q11, q13
   1.807 +    vadd.s16        q13, q11, q13
   1.808 +    vsub.s16        q5,  q9,  q15
   1.809 +    vadd.s16        q15, q9,  q15
   1.810 +    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
   1.811 +    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
   1.812 +    vadd.s16        q3,  q1,  q1
   1.813 +    vsub.s16        q1,  q5,  q1
   1.814 +    vadd.s16        q10, q2,  q4
   1.815 +    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
   1.816 +    vsub.s16        q2,  q15, q13
   1.817 +    vadd.s16        q3,  q3,  q6
   1.818 +    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
   1.819 +    vadd.s16        q1,  q1,  q4
   1.820 +    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
   1.821 +    vsub.s16        q10, q10, q14
   1.822 +    vadd.s16        q2,  q2,  q6
   1.823 +    vsub.s16        q6,  q8,  q12
   1.824 +    vadd.s16        q12, q8,  q12
   1.825 +    vadd.s16        q9,  q5,  q4
   1.826 +    vadd.s16        q5,  q6,  q10
   1.827 +    vsub.s16        q10, q6,  q10
   1.828 +    vadd.s16        q6,  q15, q13
   1.829 +    vadd.s16        q8,  q12, q14
   1.830 +    vsub.s16        q3,  q6,  q3
   1.831 +    vsub.s16        q12, q12, q14
   1.832 +    vsub.s16        q3,  q3,  q1
   1.833 +    vsub.s16        q1,  q9,  q1
   1.834 +    vadd.s16        q2,  q3,  q2
   1.835 +    vsub.s16        q15, q8,  q6
   1.836 +    vadd.s16        q1,  q1,  q2
   1.837 +    vadd.s16        q8,  q8,  q6
   1.838 +    vadd.s16        q14, q5,  q3
   1.839 +    vsub.s16        q9,  q5,  q3
   1.840 +    vsub.s16        q13, q10, q2
   1.841 +    vpop            {d8-d13}        /* restore NEON registers */
   1.842 +    vadd.s16        q10, q10, q2
   1.843 +    vsub.s16        q11, q12, q1
   1.844 +    vadd.s16        q12, q12, q1
   1.845 +    /* Descale to 8-bit and range limit */
   1.846 +    vmov.u8         q0,  #0x80
   1.847 +    vqshrn.s16      d16, q8,  #5
   1.848 +    vqshrn.s16      d17, q9,  #5
   1.849 +    vqshrn.s16      d18, q10, #5
   1.850 +    vqshrn.s16      d19, q11, #5
   1.851 +    vqshrn.s16      d20, q12, #5
   1.852 +    vqshrn.s16      d21, q13, #5
   1.853 +    vqshrn.s16      d22, q14, #5
   1.854 +    vqshrn.s16      d23, q15, #5
   1.855 +    vadd.u8         q8,  q8,  q0
   1.856 +    vadd.u8         q9,  q9,  q0
   1.857 +    vadd.u8         q10, q10, q0
   1.858 +    vadd.u8         q11, q11, q0
   1.859 +    /* Transpose the final 8-bit samples */
   1.860 +    vtrn.16         q8,  q9
   1.861 +    vtrn.16         q10, q11
   1.862 +    vtrn.32         q8,  q10
   1.863 +    vtrn.32         q9,  q11
   1.864 +    vtrn.8          d16, d17
   1.865 +    vtrn.8          d18, d19
   1.866 +      /* Store results to the output buffer */
   1.867 +      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
   1.868 +      add             TMP1, TMP1, OUTPUT_COL
   1.869 +      add             TMP2, TMP2, OUTPUT_COL
   1.870 +      vst1.8          {d16}, [TMP1]
   1.871 +      vst1.8          {d17}, [TMP2]
   1.872 +      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
   1.873 +      add             TMP1, TMP1, OUTPUT_COL
   1.874 +      add             TMP2, TMP2, OUTPUT_COL
   1.875 +      vst1.8          {d18}, [TMP1]
   1.876 +    vtrn.8          d20, d21
   1.877 +      vst1.8          {d19}, [TMP2]
   1.878 +      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
   1.879 +      add             TMP1, TMP1, OUTPUT_COL
   1.880 +      add             TMP2, TMP2, OUTPUT_COL
   1.881 +      add             TMP3, TMP3, OUTPUT_COL
   1.882 +      add             TMP4, TMP4, OUTPUT_COL
   1.883 +      vst1.8          {d20}, [TMP1]
   1.884 +    vtrn.8          d22, d23
   1.885 +      vst1.8          {d21}, [TMP2]
   1.886 +      vst1.8          {d22}, [TMP3]
   1.887 +      vst1.8          {d23}, [TMP4]
   1.888 +    bx              lr
   1.889 +
   1.890 +    .unreq          DCT_TABLE
   1.891 +    .unreq          COEF_BLOCK
   1.892 +    .unreq          OUTPUT_BUF
   1.893 +    .unreq          OUTPUT_COL
   1.894 +    .unreq          TMP1
   1.895 +    .unreq          TMP2
   1.896 +    .unreq          TMP3
   1.897 +    .unreq          TMP4
   1.898 +.endfunc
   1.899 +
   1.900 +/*****************************************************************************/
   1.901 +
   1.902 +/*
   1.903 + * jsimd_idct_4x4_neon
   1.904 + *
    1.905 + * This function contains inverse-DCT code for producing reduced-size
    1.906 + * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
    1.907 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
    1.908 + * function from jpeg-6b (jidctred.c).
    1.909 + *
    1.910 + * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
    1.911 + *       requires far fewer arithmetic operations and hence should be faster.
    1.912 + *       The primary purpose of this particular NEON-optimized function is
    1.913 + *       bit-exact compatibility with jpeg-6b.
    1.914 + *
    1.915 + * TODO: slightly better instruction scheduling could be achieved by expanding
    1.916 + *       the idct_helper/transpose_4x4 macros and reordering instructions,
    1.917 + *       but readability would suffer somewhat.
   1.918 + */
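          +
          + /* The shift arguments passed to idct_helper below (12 for pass 1, 19 for
          +  * pass 2) presumably mirror the reference jpeg_idct_4x4 in jidctred.c:
          +  * with CONST_BITS = 13 and PASS1_BITS = 2, pass 1 descales by
          +  * CONST_BITS - PASS1_BITS + 1 = 12 and pass 2 by
          +  * CONST_BITS + PASS1_BITS + 3 + 1 = 19.
          +  */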
   1.919 +
   1.920 +#define CONST_BITS  13
   1.921 +
   1.922 +#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
   1.923 +#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
   1.924 +#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
   1.925 +#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
   1.926 +#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
   1.927 +#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
   1.928 +#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
   1.929 +#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
   1.930 +#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
   1.931 +#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
   1.932 +#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
   1.933 +#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
   1.934 +#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
   1.935 +#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
   1.936 +
   1.937 +.balign 16
   1.938 +jsimd_idct_4x4_neon_consts:
   1.939 +    .short     FIX_1_847759065     /* d0[0] */
   1.940 +    .short     -FIX_0_765366865    /* d0[1] */
   1.941 +    .short     -FIX_0_211164243    /* d0[2] */
   1.942 +    .short     FIX_1_451774981     /* d0[3] */
   1.943 +    .short     -FIX_2_172734803    /* d1[0] */
   1.944 +    .short     FIX_1_061594337     /* d1[1] */
   1.945 +    .short     -FIX_0_509795579    /* d1[2] */
   1.946 +    .short     -FIX_0_601344887    /* d1[3] */
   1.947 +    .short     FIX_0_899976223     /* d2[0] */
   1.948 +    .short     FIX_2_562915447     /* d2[1] */
   1.949 +    .short     1 << (CONST_BITS+1) /* d2[2] */
   1.950 +    .short     0                   /* d2[3] */
   1.951 +
   1.952 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
   1.953 +    vmull.s16       q14, \x4,  d2[2]
   1.954 +    vmlal.s16       q14, \x8,  d0[0]
   1.955 +    vmlal.s16       q14, \x14, d0[1]
   1.956 +
   1.957 +    vmull.s16       q13, \x16, d1[2]
   1.958 +    vmlal.s16       q13, \x12, d1[3]
   1.959 +    vmlal.s16       q13, \x10, d2[0]
   1.960 +    vmlal.s16       q13, \x6,  d2[1]
   1.961 +
   1.962 +    vmull.s16       q15, \x4,  d2[2]
   1.963 +    vmlsl.s16       q15, \x8,  d0[0]
   1.964 +    vmlsl.s16       q15, \x14, d0[1]
   1.965 +
   1.966 +    vmull.s16       q12, \x16, d0[2]
   1.967 +    vmlal.s16       q12, \x12, d0[3]
   1.968 +    vmlal.s16       q12, \x10, d1[0]
   1.969 +    vmlal.s16       q12, \x6,  d1[1]
   1.970 +
   1.971 +    vadd.s32        q10, q14, q13
   1.972 +    vsub.s32        q14, q14, q13
   1.973 +
   1.974 +.if \shift > 16
   1.975 +    vrshr.s32       q10,  q10, #\shift
   1.976 +    vrshr.s32       q14,  q14, #\shift
   1.977 +    vmovn.s32       \y26, q10
   1.978 +    vmovn.s32       \y29, q14
   1.979 +.else
   1.980 +    vrshrn.s32      \y26, q10, #\shift
   1.981 +    vrshrn.s32      \y29, q14, #\shift
   1.982 +.endif
   1.983 +
   1.984 +    vadd.s32        q10, q15, q12
   1.985 +    vsub.s32        q15, q15, q12
   1.986 +
   1.987 +.if \shift > 16
   1.988 +    vrshr.s32       q10,  q10, #\shift
   1.989 +    vrshr.s32       q15,  q15, #\shift
   1.990 +    vmovn.s32       \y27, q10
   1.991 +    vmovn.s32       \y28, q15
   1.992 +.else
   1.993 +    vrshrn.s32      \y27, q10, #\shift
   1.994 +    vrshrn.s32      \y28, q15, #\shift
   1.995 +.endif
   1.996 +
   1.997 +.endm
   1.998 +
   1.999 +asm_function jsimd_idct_4x4_neon
  1.1000 +
  1.1001 +    DCT_TABLE       .req r0
  1.1002 +    COEF_BLOCK      .req r1
  1.1003 +    OUTPUT_BUF      .req r2
  1.1004 +    OUTPUT_COL      .req r3
  1.1005 +    TMP1            .req r0
  1.1006 +    TMP2            .req r1
  1.1007 +    TMP3            .req r2
  1.1008 +    TMP4            .req ip
  1.1009 +
  1.1010 +    vpush           {d8-d15}
  1.1011 +
  1.1012 +    /* Load constants (d3 is just used for padding) */
  1.1013 +    adr             TMP4, jsimd_idct_4x4_neon_consts
  1.1014 +    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
  1.1015 +
  1.1016 +    /* Load all COEF_BLOCK into NEON registers with the following allocation:
  1.1017 +     *       0 1 2 3 | 4 5 6 7
  1.1018 +     *      ---------+--------
  1.1019 +     *   0 | d4      | d5
  1.1020 +     *   1 | d6      | d7
  1.1021 +     *   2 | d8      | d9
  1.1022 +     *   3 | d10     | d11
  1.1023 +     *   4 | -       | -
  1.1024 +     *   5 | d12     | d13
  1.1025 +     *   6 | d14     | d15
  1.1026 +     *   7 | d16     | d17
  1.1027 +     */
  1.1028 +    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
  1.1029 +    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
   1.1030 +    add             COEF_BLOCK, COEF_BLOCK, #16
  1.1031 +    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
  1.1032 +    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
  1.1033 +    /* dequantize */
  1.1034 +    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
  1.1035 +    vmul.s16        q2, q2, q9
  1.1036 +    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
  1.1037 +    vmul.s16        q3, q3, q10
  1.1038 +    vmul.s16        q4, q4, q11
  1.1039 +    add             DCT_TABLE, DCT_TABLE, #16
  1.1040 +    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
  1.1041 +    vmul.s16        q5, q5, q12
  1.1042 +    vmul.s16        q6, q6, q13
  1.1043 +    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
  1.1044 +    vmul.s16        q7, q7, q14
  1.1045 +    vmul.s16        q8, q8, q15
  1.1046 +
  1.1047 +    /* Pass 1 */
  1.1048 +    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
  1.1049 +    transpose_4x4   d4, d6, d8, d10
  1.1050 +    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
  1.1051 +    transpose_4x4   d5, d7, d9, d11
  1.1052 +
  1.1053 +    /* Pass 2 */
  1.1054 +    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
  1.1055 +    transpose_4x4   d26, d27, d28, d29
  1.1056 +
  1.1057 +    /* Range limit */
  1.1058 +    vmov.u16        q15, #0x80
  1.1059 +    vadd.s16        q13, q13, q15
  1.1060 +    vadd.s16        q14, q14, q15
  1.1061 +    vqmovun.s16     d26, q13
  1.1062 +    vqmovun.s16     d27, q14
  1.1063 +
  1.1064 +    /* Store results to the output buffer */
  1.1065 +    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  1.1066 +    add             TMP1, TMP1, OUTPUT_COL
  1.1067 +    add             TMP2, TMP2, OUTPUT_COL
  1.1068 +    add             TMP3, TMP3, OUTPUT_COL
  1.1069 +    add             TMP4, TMP4, OUTPUT_COL
  1.1070 +
  1.1071 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
   1.1072 +    /* We can use far fewer instructions on little-endian systems if the
   1.1073 +     * OS kernel is not configured to trap unaligned memory accesses.
   1.1074 +     */
  1.1075 +    vst1.32         {d26[0]}, [TMP1]!
  1.1076 +    vst1.32         {d27[0]}, [TMP3]!
  1.1077 +    vst1.32         {d26[1]}, [TMP2]!
  1.1078 +    vst1.32         {d27[1]}, [TMP4]!
  1.1079 +#else
  1.1080 +    vst1.8          {d26[0]}, [TMP1]!
  1.1081 +    vst1.8          {d27[0]}, [TMP3]!
  1.1082 +    vst1.8          {d26[1]}, [TMP1]!
  1.1083 +    vst1.8          {d27[1]}, [TMP3]!
  1.1084 +    vst1.8          {d26[2]}, [TMP1]!
  1.1085 +    vst1.8          {d27[2]}, [TMP3]!
  1.1086 +    vst1.8          {d26[3]}, [TMP1]!
  1.1087 +    vst1.8          {d27[3]}, [TMP3]!
  1.1088 +
  1.1089 +    vst1.8          {d26[4]}, [TMP2]!
  1.1090 +    vst1.8          {d27[4]}, [TMP4]!
  1.1091 +    vst1.8          {d26[5]}, [TMP2]!
  1.1092 +    vst1.8          {d27[5]}, [TMP4]!
  1.1093 +    vst1.8          {d26[6]}, [TMP2]!
  1.1094 +    vst1.8          {d27[6]}, [TMP4]!
  1.1095 +    vst1.8          {d26[7]}, [TMP2]!
  1.1096 +    vst1.8          {d27[7]}, [TMP4]!
  1.1097 +#endif
  1.1098 +
  1.1099 +    vpop            {d8-d15}
  1.1100 +    bx              lr
  1.1101 +
  1.1102 +    .unreq          DCT_TABLE
  1.1103 +    .unreq          COEF_BLOCK
  1.1104 +    .unreq          OUTPUT_BUF
  1.1105 +    .unreq          OUTPUT_COL
  1.1106 +    .unreq          TMP1
  1.1107 +    .unreq          TMP2
  1.1108 +    .unreq          TMP3
  1.1109 +    .unreq          TMP4
  1.1110 +.endfunc
  1.1111 +
  1.1112 +.purgem idct_helper
  1.1113 +
  1.1114 +/*****************************************************************************/
  1.1115 +
  1.1116 +/*
  1.1117 + * jsimd_idct_2x2_neon
  1.1118 + *
   1.1119 + * This function contains inverse-DCT code that produces a reduced-size
   1.1120 + * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
   1.1121 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
   1.1122 + * function from jpeg-6b (jidctred.c).
   1.1123 + *
   1.1124 + * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
   1.1125 + *       requires far fewer arithmetic operations and hence should be faster.
   1.1126 + *       The primary purpose of this particular NEON-optimized function is
   1.1127 + *       bit-exact compatibility with jpeg-6b.
  1.1128 + */
  1.1129 +
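/*
 * For reference, a rough per-column C sketch of what the 'idct_helper' macro
 * below computes in each pass (an illustration only, not the actual
 * jidctred.c code; the function and parameter names are made up). Only rows
 * 0, 1, 3, 5 and 7 of the dequantized 8x8 block contribute to the 2x2 output:
 *
 *     // c0, c1, c3, c5, c7: dequantized coefficients of one column
 *     // shift: 13 for pass 1, 20 for pass 2 (as used in the passes below)
 *     static void idct_2x2_column(int c0, int c1, int c3, int c5, int c7,
 *                                 int shift, int *out_top, int *out_bottom)
 *     {
 *         int even = c0 << 15;
 *         int odd  = c1 * FIX_3_624509785
 *                  - c3 * FIX_1_272758580
 *                  + c5 * FIX_0_850430095
 *                  - c7 * FIX_0_720959822;
 *         int bias = 1 << (shift - 1);              // rounding, like VRSHRN
 *         *out_top    = (even + odd + bias) >> shift;
 *         *out_bottom = (even - odd + bias) >> shift;
 *     }
 */
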
  1.1130 +.balign 8
  1.1131 +jsimd_idct_2x2_neon_consts:
  1.1132 +    .short     -FIX_0_720959822    /* d0[0] */
  1.1133 +    .short     FIX_0_850430095     /* d0[1] */
  1.1134 +    .short     -FIX_1_272758580    /* d0[2] */
  1.1135 +    .short     FIX_3_624509785     /* d0[3] */
  1.1136 +
  1.1137 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
  1.1138 +    vshll.s16  q14,  \x4,  #15
  1.1139 +    vmull.s16  q13,  \x6,  d0[3]
  1.1140 +    vmlal.s16  q13,  \x10, d0[2]
  1.1141 +    vmlal.s16  q13,  \x12, d0[1]
  1.1142 +    vmlal.s16  q13,  \x16, d0[0]
  1.1143 +
  1.1144 +    vadd.s32   q10,  q14,  q13
  1.1145 +    vsub.s32   q14,  q14,  q13
  1.1146 +
  1.1147 +.if \shift > 16
  1.1148 +    vrshr.s32  q10,  q10,  #\shift
  1.1149 +    vrshr.s32  q14,  q14,  #\shift
  1.1150 +    vmovn.s32  \y26, q10
  1.1151 +    vmovn.s32  \y27, q14
  1.1152 +.else
  1.1153 +    vrshrn.s32 \y26, q10,  #\shift
  1.1154 +    vrshrn.s32 \y27, q14,  #\shift
  1.1155 +.endif
  1.1156 +
  1.1157 +.endm
  1.1158 +
  1.1159 +asm_function jsimd_idct_2x2_neon
  1.1160 +
  1.1161 +    DCT_TABLE       .req r0
  1.1162 +    COEF_BLOCK      .req r1
  1.1163 +    OUTPUT_BUF      .req r2
  1.1164 +    OUTPUT_COL      .req r3
  1.1165 +    TMP1            .req r0
  1.1166 +    TMP2            .req ip
  1.1167 +
  1.1168 +    vpush           {d8-d15}
  1.1169 +
  1.1170 +    /* Load constants */
  1.1171 +    adr             TMP2, jsimd_idct_2x2_neon_consts
  1.1172 +    vld1.16         {d0}, [TMP2, :64]
  1.1173 +
  1.1174 +    /* Load all COEF_BLOCK into NEON registers with the following allocation:
  1.1175 +     *       0 1 2 3 | 4 5 6 7
  1.1176 +     *      ---------+--------
  1.1177 +     *   0 | d4      | d5
  1.1178 +     *   1 | d6      | d7
  1.1179 +     *   2 | -       | -
  1.1180 +     *   3 | d10     | d11
  1.1181 +     *   4 | -       | -
  1.1182 +     *   5 | d12     | d13
  1.1183 +     *   6 | -       | -
  1.1184 +     *   7 | d16     | d17
  1.1185 +     */
  1.1186 +    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
  1.1187 +    add             COEF_BLOCK, COEF_BLOCK, #16
  1.1188 +    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
  1.1189 +    add             COEF_BLOCK, COEF_BLOCK, #16
  1.1190 +    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
  1.1191 +    add             COEF_BLOCK, COEF_BLOCK, #16
  1.1192 +    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
  1.1193 +    /* Dequantize */
  1.1194 +    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
  1.1195 +    vmul.s16        q2, q2, q9
  1.1196 +    vmul.s16        q3, q3, q10
  1.1197 +    add             DCT_TABLE, DCT_TABLE, #16
  1.1198 +    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
  1.1199 +    vmul.s16        q5, q5, q12
  1.1200 +    add             DCT_TABLE, DCT_TABLE, #16
  1.1201 +    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
  1.1202 +    vmul.s16        q6, q6, q13
  1.1203 +    add             DCT_TABLE, DCT_TABLE, #16
  1.1204 +    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
  1.1205 +    vmul.s16        q8, q8, q15
  1.1206 +
  1.1207 +    /* Pass 1 */
  1.1208 +#if 0
  1.1209 +    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
  1.1210 +    transpose_4x4   d4, d6, d8,  d10
  1.1211 +    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
  1.1212 +    transpose_4x4   d5, d7, d9,  d11
  1.1213 +#else
  1.1214 +    vmull.s16       q13, d6,  d0[3]
  1.1215 +    vmlal.s16       q13, d10, d0[2]
  1.1216 +    vmlal.s16       q13, d12, d0[1]
  1.1217 +    vmlal.s16       q13, d16, d0[0]
  1.1218 +    vmull.s16       q12, d7,  d0[3]
  1.1219 +    vmlal.s16       q12, d11, d0[2]
  1.1220 +    vmlal.s16       q12, d13, d0[1]
  1.1221 +    vmlal.s16       q12, d17, d0[0]
  1.1222 +    vshll.s16       q14, d4,  #15
  1.1223 +    vshll.s16       q15, d5,  #15
  1.1224 +    vadd.s32        q10, q14, q13
  1.1225 +    vsub.s32        q14, q14, q13
  1.1226 +    vrshrn.s32      d4,  q10, #13
  1.1227 +    vrshrn.s32      d6,  q14, #13
  1.1228 +    vadd.s32        q10, q15, q12
  1.1229 +    vsub.s32        q14, q15, q12
  1.1230 +    vrshrn.s32      d5,  q10, #13
  1.1231 +    vrshrn.s32      d7,  q14, #13
  1.1232 +    vtrn.16         q2,  q3
  1.1233 +    vtrn.32         q3,  q5
  1.1234 +#endif
  1.1235 +
  1.1236 +    /* Pass 2 */
  1.1237 +    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
  1.1238 +
  1.1239 +    /* Range limit */
  1.1240 +    vmov.u16        q15, #0x80
  1.1241 +    vadd.s16        q13, q13, q15
  1.1242 +    vqmovun.s16     d26, q13
  1.1243 +    vqmovun.s16     d27, q13
  1.1244 +
  1.1245 +    /* Store results to the output buffer */
  1.1246 +    ldmia           OUTPUT_BUF, {TMP1, TMP2}
  1.1247 +    add             TMP1, TMP1, OUTPUT_COL
  1.1248 +    add             TMP2, TMP2, OUTPUT_COL
  1.1249 +
  1.1250 +    vst1.8          {d26[0]}, [TMP1]!
  1.1251 +    vst1.8          {d27[4]}, [TMP1]!
  1.1252 +    vst1.8          {d26[1]}, [TMP2]!
  1.1253 +    vst1.8          {d27[5]}, [TMP2]!
  1.1254 +
  1.1255 +    vpop            {d8-d15}
  1.1256 +    bx              lr
  1.1257 +
  1.1258 +    .unreq          DCT_TABLE
  1.1259 +    .unreq          COEF_BLOCK
  1.1260 +    .unreq          OUTPUT_BUF
  1.1261 +    .unreq          OUTPUT_COL
  1.1262 +    .unreq          TMP1
  1.1263 +    .unreq          TMP2
  1.1264 +.endfunc
  1.1265 +
  1.1266 +.purgem idct_helper
  1.1267 +
  1.1268 +/*****************************************************************************/
  1.1269 +
  1.1270 +/*
  1.1271 + * jsimd_ycc_extrgb_convert_neon
  1.1272 + * jsimd_ycc_extbgr_convert_neon
  1.1273 + * jsimd_ycc_extrgbx_convert_neon
  1.1274 + * jsimd_ycc_extbgrx_convert_neon
  1.1275 + * jsimd_ycc_extxbgr_convert_neon
  1.1276 + * jsimd_ycc_extxrgb_convert_neon
  1.1277 + *
  1.1278 + * Colorspace conversion YCbCr -> RGB
  1.1279 + */
  1.1280 +
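/*
 * For reference, a rough per-pixel C sketch of the fixed-point arithmetic
 * implemented by the stage1/stage2 macros below (an illustration only, not
 * the actual libjpeg-turbo C code; 'clamp_255' and the function name are
 * made up). The constants come from the per-function constant table:
 * 22971/2^14 ~= 1.40200, 11277/2^15 ~= 0.34414, 23401/2^15 ~= 0.71414 and
 * 29033/2^14 ~= 1.77200.
 *
 *     static unsigned char clamp_255(int x)           // like VQMOVUN.S16
 *     {
 *         return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
 *     }
 *
 *     static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char *r,
 *                                  unsigned char *g, unsigned char *b)
 *     {
 *         cb -= 128;
 *         cr -= 128;
 *         *r = clamp_255(y + ((22971 * cr + 8192) >> 14));
 *         *g = clamp_255(y + ((-11277 * cb - 23401 * cr + 16384) >> 15));
 *         *b = clamp_255(y + ((29033 * cb + 8192) >> 14));
 *     }
 */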
  1.1281 +
  1.1282 +.macro do_load size
  1.1283 +    .if \size == 8
  1.1284 +        vld1.8  {d4}, [U, :64]!
  1.1285 +        vld1.8  {d5}, [V, :64]!
  1.1286 +        vld1.8  {d0}, [Y, :64]!
  1.1287 +        pld     [U, #64]
  1.1288 +        pld     [V, #64]
  1.1289 +        pld     [Y, #64]
  1.1290 +    .elseif \size == 4
  1.1291 +        vld1.8  {d4[0]}, [U]!
  1.1292 +        vld1.8  {d4[1]}, [U]!
  1.1293 +        vld1.8  {d4[2]}, [U]!
  1.1294 +        vld1.8  {d4[3]}, [U]!
  1.1295 +        vld1.8  {d5[0]}, [V]!
  1.1296 +        vld1.8  {d5[1]}, [V]!
  1.1297 +        vld1.8  {d5[2]}, [V]!
  1.1298 +        vld1.8  {d5[3]}, [V]!
  1.1299 +        vld1.8  {d0[0]}, [Y]!
  1.1300 +        vld1.8  {d0[1]}, [Y]!
  1.1301 +        vld1.8  {d0[2]}, [Y]!
  1.1302 +        vld1.8  {d0[3]}, [Y]!
  1.1303 +    .elseif \size == 2
  1.1304 +        vld1.8  {d4[4]}, [U]!
  1.1305 +        vld1.8  {d4[5]}, [U]!
  1.1306 +        vld1.8  {d5[4]}, [V]!
  1.1307 +        vld1.8  {d5[5]}, [V]!
  1.1308 +        vld1.8  {d0[4]}, [Y]!
  1.1309 +        vld1.8  {d0[5]}, [Y]!
  1.1310 +    .elseif \size == 1
  1.1311 +        vld1.8  {d4[6]}, [U]!
  1.1312 +        vld1.8  {d5[6]}, [V]!
  1.1313 +        vld1.8  {d0[6]}, [Y]!
  1.1314 +    .else
  1.1315 +        .error unsupported macroblock size
  1.1316 +    .endif
  1.1317 +.endm
  1.1318 +
  1.1319 +.macro do_store bpp, size
  1.1320 +    .if \bpp == 24
  1.1321 +        .if \size == 8
  1.1322 +            vst3.8  {d10, d11, d12}, [RGB]!
  1.1323 +        .elseif \size == 4
  1.1324 +            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
  1.1325 +            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
  1.1326 +            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
  1.1327 +            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
  1.1328 +        .elseif \size == 2
  1.1329 +            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
  1.1330 +            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
  1.1331 +        .elseif \size == 1
  1.1332 +            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
  1.1333 +        .else
  1.1334 +            .error unsupported macroblock size
  1.1335 +        .endif
  1.1336 +    .elseif \bpp == 32
  1.1337 +        .if \size == 8
  1.1338 +            vst4.8  {d10, d11, d12, d13}, [RGB]!
  1.1339 +        .elseif \size == 4
  1.1340 +            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
  1.1341 +            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
  1.1342 +            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
  1.1343 +            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
  1.1344 +        .elseif \size == 2
  1.1345 +            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
  1.1346 +            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
  1.1347 +        .elseif \size == 1
  1.1348 +            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
  1.1349 +        .else
  1.1350 +            .error unsupported macroblock size
  1.1351 +        .endif
  1.1352 +    .else
  1.1353 +        .error unsupported bpp
  1.1354 +    .endif
  1.1355 +.endm
  1.1356 +
  1.1357 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
  1.1358 +
  1.1359 +/*
   1.1360 + * 2-stage pipelined YCbCr->RGB conversion
  1.1361 + */
  1.1362 +
  1.1363 +.macro do_yuv_to_rgb_stage1
  1.1364 +    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
   1.1365 +    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
  1.1366 +    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
  1.1367 +    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
  1.1368 +    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
  1.1369 +    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
  1.1370 +    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
  1.1371 +    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
  1.1372 +    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
  1.1373 +    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
  1.1374 +.endm
  1.1375 +
  1.1376 +.macro do_yuv_to_rgb_stage2
  1.1377 +    vrshrn.s32      d20, q10, #15
  1.1378 +    vrshrn.s32      d21, q11, #15
  1.1379 +    vrshrn.s32      d24, q12, #14
  1.1380 +    vrshrn.s32      d25, q13, #14
  1.1381 +    vrshrn.s32      d28, q14, #14
  1.1382 +    vrshrn.s32      d29, q15, #14
  1.1383 +    vaddw.u8        q10, q10, d0
  1.1384 +    vaddw.u8        q12, q12, d0
  1.1385 +    vaddw.u8        q14, q14, d0
  1.1386 +    vqmovun.s16     d1\g_offs, q10
  1.1387 +    vqmovun.s16     d1\r_offs, q12
  1.1388 +    vqmovun.s16     d1\b_offs, q14
  1.1389 +.endm
  1.1390 +
  1.1391 +.macro do_yuv_to_rgb_stage2_store_load_stage1
  1.1392 +    vld1.8          {d4}, [U, :64]!
  1.1393 +      vrshrn.s32      d20, q10, #15
  1.1394 +      vrshrn.s32      d21, q11, #15
  1.1395 +      vrshrn.s32      d24, q12, #14
  1.1396 +      vrshrn.s32      d25, q13, #14
  1.1397 +      vrshrn.s32      d28, q14, #14
  1.1398 +    vld1.8          {d5}, [V, :64]!
  1.1399 +      vrshrn.s32      d29, q15, #14
  1.1400 +      vaddw.u8        q10, q10, d0
  1.1401 +      vaddw.u8        q12, q12, d0
  1.1402 +      vaddw.u8        q14, q14, d0
  1.1403 +      vqmovun.s16     d1\g_offs, q10
  1.1404 +    vld1.8          {d0}, [Y, :64]!
  1.1405 +      vqmovun.s16     d1\r_offs, q12
  1.1406 +    pld             [U, #64]
  1.1407 +    pld             [V, #64]
  1.1408 +    pld             [Y, #64]
  1.1409 +      vqmovun.s16     d1\b_offs, q14
  1.1410 +    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
   1.1411 +    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
  1.1412 +      do_store        \bpp, 8
  1.1413 +    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
  1.1414 +    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
  1.1415 +    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
  1.1416 +    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
  1.1417 +    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
  1.1418 +    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
  1.1419 +    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
  1.1420 +    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
  1.1421 +.endm
  1.1422 +
  1.1423 +.macro do_yuv_to_rgb
  1.1424 +    do_yuv_to_rgb_stage1
  1.1425 +    do_yuv_to_rgb_stage2
  1.1426 +.endm
  1.1427 +
   1.1428 +/* Apple's gas crashes on 'adrl', so work around that by using 'adr'.
   1.1429 + * This requires a copy of these constants for each function.
   1.1430 + */
  1.1431 +
  1.1432 +.balign 16
  1.1433 +jsimd_ycc_\colorid\()_neon_consts:
  1.1434 +    .short          0,      0,     0,      0
  1.1435 +    .short          22971, -11277, -23401, 29033
  1.1436 +    .short          -128,  -128,   -128,   -128
  1.1437 +    .short          -128,  -128,   -128,   -128
  1.1438 +
  1.1439 +asm_function jsimd_ycc_\colorid\()_convert_neon
  1.1440 +    OUTPUT_WIDTH    .req r0
  1.1441 +    INPUT_BUF       .req r1
  1.1442 +    INPUT_ROW       .req r2
  1.1443 +    OUTPUT_BUF      .req r3
  1.1444 +    NUM_ROWS        .req r4
  1.1445 +
  1.1446 +    INPUT_BUF0      .req r5
  1.1447 +    INPUT_BUF1      .req r6
  1.1448 +    INPUT_BUF2      .req INPUT_BUF
  1.1449 +
  1.1450 +    RGB             .req r7
  1.1451 +    Y               .req r8
  1.1452 +    U               .req r9
  1.1453 +    V               .req r10
  1.1454 +    N               .req ip
  1.1455 +
  1.1456 +    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
  1.1457 +    adr             ip, jsimd_ycc_\colorid\()_neon_consts
  1.1458 +    vld1.16         {d0, d1, d2, d3}, [ip, :128]
  1.1459 +
  1.1460 +    /* Save ARM registers and handle input arguments */
  1.1461 +    push            {r4, r5, r6, r7, r8, r9, r10, lr}
  1.1462 +    ldr             NUM_ROWS, [sp, #(4 * 8)]
  1.1463 +    ldr             INPUT_BUF0, [INPUT_BUF]
  1.1464 +    ldr             INPUT_BUF1, [INPUT_BUF, #4]
  1.1465 +    ldr             INPUT_BUF2, [INPUT_BUF, #8]
  1.1466 +    .unreq          INPUT_BUF
  1.1467 +
  1.1468 +    /* Save NEON registers */
  1.1469 +    vpush           {d8-d15}
  1.1470 +
  1.1471 +    /* Initially set d10, d11, d12, d13 to 0xFF */
  1.1472 +    vmov.u8         q5, #255
  1.1473 +    vmov.u8         q6, #255
  1.1474 +
  1.1475 +    /* Outer loop over scanlines */
  1.1476 +    cmp             NUM_ROWS, #1
  1.1477 +    blt             9f
  1.1478 +0:
  1.1479 +    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
  1.1480 +    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
  1.1481 +    mov             N, OUTPUT_WIDTH
  1.1482 +    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
  1.1483 +    add             INPUT_ROW, INPUT_ROW, #1
  1.1484 +    ldr             RGB, [OUTPUT_BUF], #4
  1.1485 +
  1.1486 +    /* Inner loop over pixels */
  1.1487 +    subs            N, N, #8
  1.1488 +    blt             3f
  1.1489 +    do_load         8
  1.1490 +    do_yuv_to_rgb_stage1
  1.1491 +    subs            N, N, #8
  1.1492 +    blt             2f
  1.1493 +1:
  1.1494 +    do_yuv_to_rgb_stage2_store_load_stage1
  1.1495 +    subs            N, N, #8
  1.1496 +    bge             1b
  1.1497 +2:
  1.1498 +    do_yuv_to_rgb_stage2
  1.1499 +    do_store        \bpp, 8
  1.1500 +    tst             N, #7
  1.1501 +    beq             8f
  1.1502 +3:
  1.1503 +    tst             N, #4
  1.1504 +    beq             3f
  1.1505 +    do_load         4
  1.1506 +3:
  1.1507 +    tst             N, #2
  1.1508 +    beq             4f
  1.1509 +    do_load         2
  1.1510 +4:
  1.1511 +    tst             N, #1
  1.1512 +    beq             5f
  1.1513 +    do_load         1
  1.1514 +5:
  1.1515 +    do_yuv_to_rgb
  1.1516 +    tst             N, #4
  1.1517 +    beq             6f
  1.1518 +    do_store        \bpp, 4
  1.1519 +6:
  1.1520 +    tst             N, #2
  1.1521 +    beq             7f
  1.1522 +    do_store        \bpp, 2
  1.1523 +7:
  1.1524 +    tst             N, #1
  1.1525 +    beq             8f
  1.1526 +    do_store        \bpp, 1
  1.1527 +8:
  1.1528 +    subs            NUM_ROWS, NUM_ROWS, #1
  1.1529 +    bgt             0b
  1.1530 +9:
  1.1531 +    /* Restore all registers and return */
  1.1532 +    vpop            {d8-d15}
  1.1533 +    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
  1.1534 +
  1.1535 +    .unreq          OUTPUT_WIDTH
  1.1536 +    .unreq          INPUT_ROW
  1.1537 +    .unreq          OUTPUT_BUF
  1.1538 +    .unreq          NUM_ROWS
  1.1539 +    .unreq          INPUT_BUF0
  1.1540 +    .unreq          INPUT_BUF1
  1.1541 +    .unreq          INPUT_BUF2
  1.1542 +    .unreq          RGB
  1.1543 +    .unreq          Y
  1.1544 +    .unreq          U
  1.1545 +    .unreq          V
  1.1546 +    .unreq          N
  1.1547 +.endfunc
  1.1548 +
  1.1549 +.purgem do_yuv_to_rgb
  1.1550 +.purgem do_yuv_to_rgb_stage1
  1.1551 +.purgem do_yuv_to_rgb_stage2
  1.1552 +.purgem do_yuv_to_rgb_stage2_store_load_stage1
  1.1553 +
  1.1554 +.endm
  1.1555 +
  1.1556 +/*--------------------------------- id ----- bpp R  G  B */
  1.1557 +generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
  1.1558 +generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
  1.1559 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
  1.1560 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
  1.1561 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
  1.1562 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
  1.1563 +
  1.1564 +.purgem do_load
  1.1565 +.purgem do_store
  1.1566 +
  1.1567 +/*****************************************************************************/
  1.1568 +
  1.1569 +/*
  1.1570 + * jsimd_extrgb_ycc_convert_neon
  1.1571 + * jsimd_extbgr_ycc_convert_neon
  1.1572 + * jsimd_extrgbx_ycc_convert_neon
  1.1573 + * jsimd_extbgrx_ycc_convert_neon
  1.1574 + * jsimd_extxbgr_ycc_convert_neon
  1.1575 + * jsimd_extxrgb_ycc_convert_neon
  1.1576 + *
  1.1577 + * Colorspace conversion RGB -> YCbCr
  1.1578 + */
  1.1579 +
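/*
 * For reference, a rough per-pixel C sketch of the fixed-point arithmetic
 * implemented by the stage1/stage2 macros below (an illustration only, not
 * the actual libjpeg-turbo C code; the function name is made up). The
 * constants come from the per-function table: 19595/2^16 ~= 0.29900,
 * 38470/2^16 ~= 0.58700, 7471/2^16 ~= 0.11400, 11059/2^16 ~= 0.16874,
 * 21709/2^16 ~= 0.33126, 27439/2^16 ~= 0.41869 and 5329/2^16 ~= 0.08131.
 *
 *     static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char *y,
 *                                  unsigned char *cb, unsigned char *cr)
 *     {
 *         // Y uses a rounding shift (VRSHRN); Cb/Cr start from the
 *         // (128 << 16) + 32767 bias preloaded into the accumulators.
 *         *y  = (unsigned char)((19595 * r + 38470 * g + 7471 * b
 *                                + 32768) >> 16);
 *         *cb = (unsigned char)(((128 << 16) + 32767
 *                                - 11059 * r - 21709 * g + 32768 * b) >> 16);
 *         *cr = (unsigned char)(((128 << 16) + 32767
 *                                + 32768 * r - 27439 * g - 5329 * b) >> 16);
 *     }
 */
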
  1.1580 +.macro do_store size
  1.1581 +    .if \size == 8
  1.1582 +        vst1.8  {d20}, [Y]!
  1.1583 +        vst1.8  {d21}, [U]!
  1.1584 +        vst1.8  {d22}, [V]!
  1.1585 +    .elseif \size == 4
  1.1586 +        vst1.8  {d20[0]}, [Y]!
  1.1587 +        vst1.8  {d20[1]}, [Y]!
  1.1588 +        vst1.8  {d20[2]}, [Y]!
  1.1589 +        vst1.8  {d20[3]}, [Y]!
  1.1590 +        vst1.8  {d21[0]}, [U]!
  1.1591 +        vst1.8  {d21[1]}, [U]!
  1.1592 +        vst1.8  {d21[2]}, [U]!
  1.1593 +        vst1.8  {d21[3]}, [U]!
  1.1594 +        vst1.8  {d22[0]}, [V]!
  1.1595 +        vst1.8  {d22[1]}, [V]!
  1.1596 +        vst1.8  {d22[2]}, [V]!
  1.1597 +        vst1.8  {d22[3]}, [V]!
  1.1598 +    .elseif \size == 2
  1.1599 +        vst1.8  {d20[4]}, [Y]!
  1.1600 +        vst1.8  {d20[5]}, [Y]!
  1.1601 +        vst1.8  {d21[4]}, [U]!
  1.1602 +        vst1.8  {d21[5]}, [U]!
  1.1603 +        vst1.8  {d22[4]}, [V]!
  1.1604 +        vst1.8  {d22[5]}, [V]!
  1.1605 +    .elseif \size == 1
  1.1606 +        vst1.8  {d20[6]}, [Y]!
  1.1607 +        vst1.8  {d21[6]}, [U]!
  1.1608 +        vst1.8  {d22[6]}, [V]!
  1.1609 +    .else
  1.1610 +        .error unsupported macroblock size
  1.1611 +    .endif
  1.1612 +.endm
  1.1613 +
  1.1614 +.macro do_load bpp, size
  1.1615 +    .if \bpp == 24
  1.1616 +        .if \size == 8
  1.1617 +            vld3.8  {d10, d11, d12}, [RGB]!
  1.1618 +            pld     [RGB, #128]
  1.1619 +        .elseif \size == 4
  1.1620 +            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
  1.1621 +            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
  1.1622 +            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
  1.1623 +            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
  1.1624 +        .elseif \size == 2
  1.1625 +            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
  1.1626 +            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
  1.1627 +        .elseif \size == 1
  1.1628 +            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
  1.1629 +        .else
  1.1630 +            .error unsupported macroblock size
  1.1631 +        .endif
  1.1632 +    .elseif \bpp == 32
  1.1633 +        .if \size == 8
  1.1634 +            vld4.8  {d10, d11, d12, d13}, [RGB]!
  1.1635 +            pld     [RGB, #128]
  1.1636 +        .elseif \size == 4
  1.1637 +            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
  1.1638 +            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
  1.1639 +            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
  1.1640 +            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
  1.1641 +        .elseif \size == 2
  1.1642 +            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
  1.1643 +            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
  1.1644 +        .elseif \size == 1
  1.1645 +            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
  1.1646 +        .else
  1.1647 +            .error unsupported macroblock size
  1.1648 +        .endif
  1.1649 +    .else
  1.1650 +        .error unsupported bpp
  1.1651 +    .endif
  1.1652 +.endm
  1.1653 +
  1.1654 +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
  1.1655 +
  1.1656 +/*
   1.1657 + * 2-stage pipelined RGB->YCbCr conversion
  1.1658 + */
  1.1659 +
  1.1660 +.macro do_rgb_to_yuv_stage1
  1.1661 +    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
  1.1662 +    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
  1.1663 +    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
  1.1664 +    vmull.u16   q7, d4, d0[0]
  1.1665 +    vmlal.u16   q7, d6, d0[1]
  1.1666 +    vmlal.u16   q7, d8, d0[2]
  1.1667 +    vmull.u16   q8, d5, d0[0]
  1.1668 +    vmlal.u16   q8, d7, d0[1]
  1.1669 +    vmlal.u16   q8, d9, d0[2]
  1.1670 +    vrev64.32   q9,  q1
  1.1671 +    vrev64.32   q13, q1
  1.1672 +    vmlsl.u16   q9,  d4, d0[3]
  1.1673 +    vmlsl.u16   q9,  d6, d1[0]
  1.1674 +    vmlal.u16   q9,  d8, d1[1]
  1.1675 +    vmlsl.u16   q13, d5, d0[3]
  1.1676 +    vmlsl.u16   q13, d7, d1[0]
  1.1677 +    vmlal.u16   q13, d9, d1[1]
  1.1678 +    vrev64.32   q14, q1
  1.1679 +    vrev64.32   q15, q1
  1.1680 +    vmlal.u16   q14, d4, d1[1]
  1.1681 +    vmlsl.u16   q14, d6, d1[2]
  1.1682 +    vmlsl.u16   q14, d8, d1[3]
  1.1683 +    vmlal.u16   q15, d5, d1[1]
  1.1684 +    vmlsl.u16   q15, d7, d1[2]
  1.1685 +    vmlsl.u16   q15, d9, d1[3]
  1.1686 +.endm
  1.1687 +
  1.1688 +.macro do_rgb_to_yuv_stage2
  1.1689 +    vrshrn.u32  d20, q7,  #16
  1.1690 +    vrshrn.u32  d21, q8,  #16
  1.1691 +    vshrn.u32   d22, q9,  #16
  1.1692 +    vshrn.u32   d23, q13, #16
  1.1693 +    vshrn.u32   d24, q14, #16
  1.1694 +    vshrn.u32   d25, q15, #16
  1.1695 +    vmovn.u16   d20, q10      /* d20 = y */
  1.1696 +    vmovn.u16   d21, q11      /* d21 = u */
  1.1697 +    vmovn.u16   d22, q12      /* d22 = v */
  1.1698 +.endm
  1.1699 +
  1.1700 +.macro do_rgb_to_yuv
  1.1701 +    do_rgb_to_yuv_stage1
  1.1702 +    do_rgb_to_yuv_stage2
  1.1703 +.endm
  1.1704 +
  1.1705 +.macro do_rgb_to_yuv_stage2_store_load_stage1
  1.1706 +      vrshrn.u32  d20, q7,  #16
  1.1707 +      vrshrn.u32  d21, q8,  #16
  1.1708 +      vshrn.u32   d22, q9,  #16
  1.1709 +    vrev64.32   q9,  q1
  1.1710 +      vshrn.u32   d23, q13, #16
  1.1711 +    vrev64.32   q13, q1
  1.1712 +      vshrn.u32   d24, q14, #16
  1.1713 +      vshrn.u32   d25, q15, #16
  1.1714 +    do_load     \bpp, 8
  1.1715 +      vmovn.u16   d20, q10      /* d20 = y */
  1.1716 +    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
  1.1717 +      vmovn.u16   d21, q11      /* d21 = u */
  1.1718 +    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
  1.1719 +      vmovn.u16   d22, q12      /* d22 = v */
  1.1720 +    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
  1.1721 +    vmull.u16   q7, d4, d0[0]
  1.1722 +    vmlal.u16   q7, d6, d0[1]
  1.1723 +    vmlal.u16   q7, d8, d0[2]
  1.1724 +      vst1.8      {d20}, [Y]!
  1.1725 +    vmull.u16   q8, d5, d0[0]
  1.1726 +    vmlal.u16   q8, d7, d0[1]
  1.1727 +    vmlal.u16   q8, d9, d0[2]
  1.1728 +    vmlsl.u16   q9,  d4, d0[3]
  1.1729 +    vmlsl.u16   q9,  d6, d1[0]
  1.1730 +    vmlal.u16   q9,  d8, d1[1]
  1.1731 +      vst1.8      {d21}, [U]!
  1.1732 +    vmlsl.u16   q13, d5, d0[3]
  1.1733 +    vmlsl.u16   q13, d7, d1[0]
  1.1734 +    vmlal.u16   q13, d9, d1[1]
  1.1735 +    vrev64.32   q14, q1
  1.1736 +    vrev64.32   q15, q1
  1.1737 +    vmlal.u16   q14, d4, d1[1]
  1.1738 +    vmlsl.u16   q14, d6, d1[2]
  1.1739 +    vmlsl.u16   q14, d8, d1[3]
  1.1740 +      vst1.8      {d22}, [V]!
  1.1741 +    vmlal.u16   q15, d5, d1[1]
  1.1742 +    vmlsl.u16   q15, d7, d1[2]
  1.1743 +    vmlsl.u16   q15, d9, d1[3]
  1.1744 +.endm
  1.1745 +
  1.1746 +.balign 16
  1.1747 +jsimd_\colorid\()_ycc_neon_consts:
  1.1748 +    .short          19595, 38470, 7471,  11059
  1.1749 +    .short          21709, 32768, 27439, 5329
  1.1750 +    .short          32767, 128,   32767, 128
  1.1751 +    .short          32767, 128,   32767, 128
  1.1752 +
  1.1753 +asm_function jsimd_\colorid\()_ycc_convert_neon
  1.1754 +    OUTPUT_WIDTH    .req r0
  1.1755 +    INPUT_BUF       .req r1
  1.1756 +    OUTPUT_BUF      .req r2
  1.1757 +    OUTPUT_ROW      .req r3
  1.1758 +    NUM_ROWS        .req r4
  1.1759 +
  1.1760 +    OUTPUT_BUF0     .req r5
  1.1761 +    OUTPUT_BUF1     .req r6
  1.1762 +    OUTPUT_BUF2     .req OUTPUT_BUF
  1.1763 +
  1.1764 +    RGB             .req r7
  1.1765 +    Y               .req r8
  1.1766 +    U               .req r9
  1.1767 +    V               .req r10
  1.1768 +    N               .req ip
  1.1769 +
  1.1770 +    /* Load constants to d0, d1, d2, d3 */
  1.1771 +    adr             ip, jsimd_\colorid\()_ycc_neon_consts
  1.1772 +    vld1.16         {d0, d1, d2, d3}, [ip, :128]
  1.1773 +
  1.1774 +    /* Save ARM registers and handle input arguments */
  1.1775 +    push            {r4, r5, r6, r7, r8, r9, r10, lr}
  1.1776 +    ldr             NUM_ROWS, [sp, #(4 * 8)]
  1.1777 +    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
  1.1778 +    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
  1.1779 +    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
  1.1780 +    .unreq          OUTPUT_BUF
  1.1781 +
  1.1782 +    /* Save NEON registers */
  1.1783 +    vpush           {d8-d15}
  1.1784 +
  1.1785 +    /* Outer loop over scanlines */
  1.1786 +    cmp             NUM_ROWS, #1
  1.1787 +    blt             9f
  1.1788 +0:
  1.1789 +    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
  1.1790 +    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
  1.1791 +    mov             N, OUTPUT_WIDTH
  1.1792 +    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
  1.1793 +    add             OUTPUT_ROW, OUTPUT_ROW, #1
  1.1794 +    ldr             RGB, [INPUT_BUF], #4
  1.1795 +
  1.1796 +    /* Inner loop over pixels */
  1.1797 +    subs            N, N, #8
  1.1798 +    blt             3f
  1.1799 +    do_load         \bpp, 8
  1.1800 +    do_rgb_to_yuv_stage1
  1.1801 +    subs            N, N, #8
  1.1802 +    blt             2f
  1.1803 +1:
  1.1804 +    do_rgb_to_yuv_stage2_store_load_stage1
  1.1805 +    subs            N, N, #8
  1.1806 +    bge             1b
  1.1807 +2:
  1.1808 +    do_rgb_to_yuv_stage2
  1.1809 +    do_store        8
  1.1810 +    tst             N, #7
  1.1811 +    beq             8f
  1.1812 +3:
  1.1813 +    tst             N, #4
  1.1814 +    beq             3f
  1.1815 +    do_load         \bpp, 4
  1.1816 +3:
  1.1817 +    tst             N, #2
  1.1818 +    beq             4f
  1.1819 +    do_load         \bpp, 2
  1.1820 +4:
  1.1821 +    tst             N, #1
  1.1822 +    beq             5f
  1.1823 +    do_load         \bpp, 1
  1.1824 +5:
  1.1825 +    do_rgb_to_yuv
  1.1826 +    tst             N, #4
  1.1827 +    beq             6f
  1.1828 +    do_store        4
  1.1829 +6:
  1.1830 +    tst             N, #2
  1.1831 +    beq             7f
  1.1832 +    do_store        2
  1.1833 +7:
  1.1834 +    tst             N, #1
  1.1835 +    beq             8f
  1.1836 +    do_store        1
  1.1837 +8:
  1.1838 +    subs            NUM_ROWS, NUM_ROWS, #1
  1.1839 +    bgt             0b
  1.1840 +9:
  1.1841 +    /* Restore all registers and return */
  1.1842 +    vpop            {d8-d15}
  1.1843 +    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
  1.1844 +
  1.1845 +    .unreq          OUTPUT_WIDTH
  1.1846 +    .unreq          OUTPUT_ROW
  1.1847 +    .unreq          INPUT_BUF
  1.1848 +    .unreq          NUM_ROWS
  1.1849 +    .unreq          OUTPUT_BUF0
  1.1850 +    .unreq          OUTPUT_BUF1
  1.1851 +    .unreq          OUTPUT_BUF2
  1.1852 +    .unreq          RGB
  1.1853 +    .unreq          Y
  1.1854 +    .unreq          U
  1.1855 +    .unreq          V
  1.1856 +    .unreq          N
  1.1857 +.endfunc
  1.1858 +
  1.1859 +.purgem do_rgb_to_yuv
  1.1860 +.purgem do_rgb_to_yuv_stage1
  1.1861 +.purgem do_rgb_to_yuv_stage2
  1.1862 +.purgem do_rgb_to_yuv_stage2_store_load_stage1
  1.1863 +
  1.1864 +.endm
  1.1865 +
  1.1866 +/*--------------------------------- id ----- bpp R  G  B */
  1.1867 +generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
  1.1868 +generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
  1.1869 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
  1.1870 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
  1.1871 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
  1.1872 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
  1.1873 +
  1.1874 +.purgem do_load
  1.1875 +.purgem do_store
  1.1876 +
  1.1877 +/*****************************************************************************/
  1.1878 +
  1.1879 +/*
  1.1880 + * Load data into workspace, applying unsigned->signed conversion
  1.1881 + *
  1.1882 + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
  1.1883 + *       rid of VST1.16 instructions
  1.1884 + */
  1.1885 +
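/*
 * In scalar terms this routine simply widens and re-centers the samples (a
 * simplified sketch, not the actual C code; the loop variables are
 * illustrative):
 *
 *     for (row = 0; row < 8; row++)
 *         for (col = 0; col < 8; col++)
 *             workspace[row * 8 + col] =
 *                 (short)(sample_data[row][start_col + col] - CENTERJSAMPLE);
 */
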
  1.1886 +asm_function jsimd_convsamp_neon
  1.1887 +    SAMPLE_DATA     .req r0
  1.1888 +    START_COL       .req r1
  1.1889 +    WORKSPACE       .req r2
  1.1890 +    TMP1            .req r3
  1.1891 +    TMP2            .req r4
  1.1892 +    TMP3            .req r5
  1.1893 +    TMP4            .req ip
  1.1894 +
  1.1895 +    push            {r4, r5}
  1.1896 +    vmov.u8         d0, #128
  1.1897 +
  1.1898 +    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
  1.1899 +    add             TMP1, TMP1, START_COL
  1.1900 +    add             TMP2, TMP2, START_COL
  1.1901 +    add             TMP3, TMP3, START_COL
  1.1902 +    add             TMP4, TMP4, START_COL
  1.1903 +    vld1.8          {d16}, [TMP1]
  1.1904 +    vsubl.u8        q8, d16, d0
  1.1905 +    vld1.8          {d18}, [TMP2]
  1.1906 +    vsubl.u8        q9, d18, d0
  1.1907 +    vld1.8          {d20}, [TMP3]
  1.1908 +    vsubl.u8        q10, d20, d0
  1.1909 +    vld1.8          {d22}, [TMP4]
  1.1910 +    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
  1.1911 +    vsubl.u8        q11, d22, d0
  1.1912 +    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
  1.1913 +    add             TMP1, TMP1, START_COL
  1.1914 +    add             TMP2, TMP2, START_COL
  1.1915 +    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
  1.1916 +    add             TMP3, TMP3, START_COL
  1.1917 +    add             TMP4, TMP4, START_COL
  1.1918 +    vld1.8          {d24}, [TMP1]
  1.1919 +    vsubl.u8        q12, d24, d0
  1.1920 +    vld1.8          {d26}, [TMP2]
  1.1921 +    vsubl.u8        q13, d26, d0
  1.1922 +    vld1.8          {d28}, [TMP3]
  1.1923 +    vsubl.u8        q14, d28, d0
  1.1924 +    vld1.8          {d30}, [TMP4]
  1.1925 +    vsubl.u8        q15, d30, d0
  1.1926 +    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
  1.1927 +    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
  1.1928 +    pop             {r4, r5}
  1.1929 +    bx              lr
  1.1930 +
  1.1931 +    .unreq          SAMPLE_DATA
  1.1932 +    .unreq          START_COL
  1.1933 +    .unreq          WORKSPACE
  1.1934 +    .unreq          TMP1
  1.1935 +    .unreq          TMP2
  1.1936 +    .unreq          TMP3
  1.1937 +    .unreq          TMP4
  1.1938 +.endfunc
  1.1939 +
  1.1940 +/*****************************************************************************/
  1.1941 +
  1.1942 +/*
  1.1943 + * jsimd_fdct_ifast_neon
  1.1944 + *
   1.1945 + * This function contains a fast, not-so-accurate integer implementation of
   1.1946 + * the forward DCT (Discrete Cosine Transform). It uses the same calculations
   1.1947 + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
   1.1948 + * function from jfdctfst.c.
  1.1949 + *
  1.1950 + * TODO: can be combined with 'jsimd_convsamp_neon' to get
  1.1951 + *       rid of a bunch of VLD1.16 instructions
  1.1952 + */
  1.1953 +
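/*
 * The constants below correspond to the CONST_BITS=8 fixed-point factors
 * used by jfdctfst.c (98, 139, 181 and 334), rescaled by 128 into Q15 form
 * so that VQDMULH.S16, which computes roughly (a * b) >> 15 with saturation,
 * can stand in for the MULTIPLY() macro. A rough C model of one such
 * multiply (illustrative only; the helper name is made up):
 *
 *     static short q15_mulh(short a, short q15_const)
 *     {
 *         return (short)(((int)a * q15_const) >> 15);   // ~ VQDMULH.S16
 *     }
 *
 * Since a Q15 constant cannot represent 1.306562965 directly, that factor is
 * applied as x + x * 0.306562965, which is why the last constant is written
 * as (334 * 128 - 256 * 128).
 */
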
  1.1954 +#define XFIX_0_382683433 d0[0]
  1.1955 +#define XFIX_0_541196100 d0[1]
  1.1956 +#define XFIX_0_707106781 d0[2]
  1.1957 +#define XFIX_1_306562965 d0[3]
  1.1958 +
  1.1959 +.balign 16
  1.1960 +jsimd_fdct_ifast_neon_consts:
  1.1961 +    .short (98 * 128)              /* XFIX_0_382683433 */
  1.1962 +    .short (139 * 128)             /* XFIX_0_541196100 */
  1.1963 +    .short (181 * 128)             /* XFIX_0_707106781 */
  1.1964 +    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
  1.1965 +
  1.1966 +asm_function jsimd_fdct_ifast_neon
  1.1967 +
  1.1968 +    DATA            .req r0
  1.1969 +    TMP             .req ip
  1.1970 +
  1.1971 +    vpush           {d8-d15}
  1.1972 +
  1.1973 +    /* Load constants */
  1.1974 +    adr             TMP, jsimd_fdct_ifast_neon_consts
  1.1975 +    vld1.16         {d0}, [TMP, :64]
  1.1976 +
  1.1977 +    /* Load all DATA into NEON registers with the following allocation:
  1.1978 +     *       0 1 2 3 | 4 5 6 7
  1.1979 +     *      ---------+--------
  1.1980 +     *   0 | d16     | d17    | q8
  1.1981 +     *   1 | d18     | d19    | q9
  1.1982 +     *   2 | d20     | d21    | q10
  1.1983 +     *   3 | d22     | d23    | q11
  1.1984 +     *   4 | d24     | d25    | q12
  1.1985 +     *   5 | d26     | d27    | q13
  1.1986 +     *   6 | d28     | d29    | q14
  1.1987 +     *   7 | d30     | d31    | q15
  1.1988 +     */
  1.1989 +
  1.1990 +    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
  1.1991 +    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
  1.1992 +    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
  1.1993 +    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
  1.1994 +    sub             DATA, DATA, #(128 - 32)
  1.1995 +
  1.1996 +    mov             TMP, #2
  1.1997 +1:
  1.1998 +    /* Transpose */
  1.1999 +    vtrn.16         q12, q13
  1.2000 +    vtrn.16         q10, q11
  1.2001 +    vtrn.16         q8,  q9
  1.2002 +    vtrn.16         q14, q15
  1.2003 +    vtrn.32         q9,  q11
  1.2004 +    vtrn.32         q13, q15
  1.2005 +    vtrn.32         q8,  q10
  1.2006 +    vtrn.32         q12, q14
  1.2007 +    vswp            d30, d23
  1.2008 +    vswp            d24, d17
  1.2009 +    vswp            d26, d19
  1.2010 +      /* 1-D FDCT */
  1.2011 +      vadd.s16        q2,  q11, q12
  1.2012 +    vswp            d28, d21
  1.2013 +      vsub.s16        q12, q11, q12
  1.2014 +      vsub.s16        q6,  q10, q13
  1.2015 +      vadd.s16        q10, q10, q13
  1.2016 +      vsub.s16        q7,  q9,  q14
  1.2017 +      vadd.s16        q9,  q9,  q14
  1.2018 +      vsub.s16        q1,  q8,  q15
  1.2019 +      vadd.s16        q8,  q8,  q15
  1.2020 +      vsub.s16        q4,  q9,  q10
  1.2021 +      vsub.s16        q5,  q8,  q2
  1.2022 +      vadd.s16        q3,  q9,  q10
  1.2023 +      vadd.s16        q4,  q4,  q5
  1.2024 +      vadd.s16        q2,  q8,  q2
  1.2025 +      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
  1.2026 +      vadd.s16        q11, q12, q6
  1.2027 +      vadd.s16        q8,  q2,  q3
  1.2028 +      vsub.s16        q12, q2,  q3
  1.2029 +      vadd.s16        q3,  q6,  q7
  1.2030 +      vadd.s16        q7,  q7,  q1
  1.2031 +      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
  1.2032 +      vsub.s16        q6,  q11, q7
  1.2033 +      vadd.s16        q10, q5,  q4
  1.2034 +      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
  1.2035 +      vsub.s16        q14, q5,  q4
  1.2036 +      vqdmulh.s16     q11, q11, XFIX_0_541196100
  1.2037 +      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
  1.2038 +      vadd.s16        q4,  q1,  q3
  1.2039 +      vsub.s16        q3,  q1,  q3
  1.2040 +      vadd.s16        q7,  q7,  q6
  1.2041 +      vadd.s16        q11, q11, q6
  1.2042 +      vadd.s16        q7,  q7,  q5
  1.2043 +      vadd.s16        q13, q3,  q11
  1.2044 +      vsub.s16        q11, q3,  q11
  1.2045 +      vadd.s16        q9,  q4,  q7
  1.2046 +      vsub.s16        q15, q4,  q7
  1.2047 +    subs            TMP, TMP, #1
  1.2048 +    bne             1b
  1.2049 +
  1.2050 +    /* store results */
  1.2051 +    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
  1.2052 +    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
  1.2053 +    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
  1.2054 +    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
  1.2055 +
  1.2056 +    vpop            {d8-d15}
  1.2057 +    bx              lr
  1.2058 +
  1.2059 +    .unreq          DATA
  1.2060 +    .unreq          TMP
  1.2061 +.endfunc
  1.2062 +
  1.2063 +/*****************************************************************************/
  1.2064 +
  1.2065 +/*
  1.2066 + * GLOBAL(void)
  1.2067 + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
  1.2068 + *                      DCTELEM * workspace);
  1.2069 + *
   1.2070 + * Note: the code uses 2-stage pipelining in order to improve instruction
   1.2071 + *       scheduling and eliminate stalls (this provides ~15% better
   1.2072 + *       performance for this function on both ARM Cortex-A8 and
   1.2073 + *       ARM Cortex-A9 when compared to the non-pipelined variant).
   1.2074 + *       The instructions that belong to the second stage use different
   1.2075 + *       indentation for better readability.
  1.2076 + */
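/*
 * For reference, a rough scalar C sketch of what each iteration below does
 * (an illustration only, not the actual jcdctmgr.c code; names are made up).
 * The pointer setup implies that 'divisors' holds the reciprocals at offset
 * 0, the corrections 64 elements later, and the shift counts 192 elements in:
 *
 *     static void quantize_block_sketch(short *coef_block,
 *                                       const unsigned short *divisors,
 *                                       const short *workspace)
 *     {
 *         const unsigned short *recip = divisors;
 *         const unsigned short *corr  = divisors + 64;
 *         const unsigned short *shift = divisors + 192;
 *         int i;
 *         for (i = 0; i < 64; i++) {
 *             int v    = workspace[i];
 *             int sign = v >> 15;                        // 0 or -1
 *             unsigned int t = (unsigned int)((v < 0 ? -v : v) + corr[i]);
 *             t = (t * recip[i]) >> 16;                  // multiply by reciprocal
 *             t >>= shift[i];
 *             coef_block[i] = (short)(((int)t ^ sign) - sign);  // restore sign
 *         }
 *     }
 */
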
  1.2077 +asm_function jsimd_quantize_neon
  1.2078 +
  1.2079 +    COEF_BLOCK      .req r0
  1.2080 +    DIVISORS        .req r1
  1.2081 +    WORKSPACE       .req r2
  1.2082 +
  1.2083 +    RECIPROCAL      .req DIVISORS
  1.2084 +    CORRECTION      .req r3
  1.2085 +    SHIFT           .req ip
  1.2086 +    LOOP_COUNT      .req r4
  1.2087 +
  1.2088 +    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
  1.2089 +    vabs.s16        q12, q0
  1.2090 +    add             CORRECTION, DIVISORS, #(64 * 2)
  1.2091 +    add             SHIFT, DIVISORS, #(64 * 6)
  1.2092 +    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
  1.2093 +    vabs.s16        q13, q1
  1.2094 +    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
  1.2095 +    vadd.u16        q12, q12, q10 /* add correction */
  1.2096 +    vadd.u16        q13, q13, q11
  1.2097 +    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
  1.2098 +    vmull.u16       q11, d25, d17
  1.2099 +    vmull.u16       q8,  d26, d18
  1.2100 +    vmull.u16       q9,  d27, d19
  1.2101 +    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
  1.2102 +    vshrn.u32       d20, q10, #16
  1.2103 +    vshrn.u32       d21, q11, #16
  1.2104 +    vshrn.u32       d22, q8,  #16
  1.2105 +    vshrn.u32       d23, q9,  #16
  1.2106 +    vneg.s16        q12, q12
  1.2107 +    vneg.s16        q13, q13
  1.2108 +    vshr.s16        q2,  q0,  #15 /* extract sign */
  1.2109 +    vshr.s16        q3,  q1,  #15
  1.2110 +    vshl.u16        q14, q10, q12 /* shift */
  1.2111 +    vshl.u16        q15, q11, q13
  1.2112 +
  1.2113 +    push            {r4, r5}
  1.2114 +    mov             LOOP_COUNT, #3
  1.2115 +1:
  1.2116 +    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
  1.2117 +      veor.u16        q14, q14, q2  /* restore sign */
  1.2118 +    vabs.s16        q12, q0
  1.2119 +    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
  1.2120 +    vabs.s16        q13, q1
  1.2121 +      veor.u16        q15, q15, q3
  1.2122 +    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
  1.2123 +    vadd.u16        q12, q12, q10 /* add correction */
  1.2124 +    vadd.u16        q13, q13, q11
  1.2125 +    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
  1.2126 +    vmull.u16       q11, d25, d17
  1.2127 +    vmull.u16       q8,  d26, d18
  1.2128 +    vmull.u16       q9,  d27, d19
  1.2129 +      vsub.u16        q14, q14, q2
  1.2130 +    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
  1.2131 +      vsub.u16        q15, q15, q3
  1.2132 +    vshrn.u32       d20, q10, #16
  1.2133 +    vshrn.u32       d21, q11, #16
  1.2134 +      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
  1.2135 +    vshrn.u32       d22, q8,  #16
  1.2136 +    vshrn.u32       d23, q9,  #16
  1.2137 +    vneg.s16        q12, q12
  1.2138 +    vneg.s16        q13, q13
  1.2139 +    vshr.s16        q2,  q0,  #15 /* extract sign */
  1.2140 +    vshr.s16        q3,  q1,  #15
  1.2141 +    vshl.u16        q14, q10, q12 /* shift */
  1.2142 +    vshl.u16        q15, q11, q13
  1.2143 +    subs            LOOP_COUNT, LOOP_COUNT, #1
  1.2144 +    bne             1b
  1.2145 +    pop             {r4, r5}
  1.2146 +
  1.2147 +      veor.u16        q14, q14, q2  /* restore sign */
  1.2148 +      veor.u16        q15, q15, q3
  1.2149 +      vsub.u16        q14, q14, q2
  1.2150 +      vsub.u16        q15, q15, q3
  1.2151 +      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
  1.2152 +
  1.2153 +    bx              lr /* return */
  1.2154 +
  1.2155 +    .unreq          COEF_BLOCK
  1.2156 +    .unreq          DIVISORS
  1.2157 +    .unreq          WORKSPACE
  1.2158 +    .unreq          RECIPROCAL
  1.2159 +    .unreq          CORRECTION
  1.2160 +    .unreq          SHIFT
  1.2161 +    .unreq          LOOP_COUNT
  1.2162 +.endfunc
  1.2163 +
  1.2164 +/*****************************************************************************/
  1.2165 +
  1.2166 +/*
  1.2167 + * GLOBAL(void)
  1.2168 + * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
  1.2169 + *                                 JDIMENSION   downsampled_width,
  1.2170 + *                                 JSAMPARRAY   input_data,
  1.2171 + *                                 JSAMPARRAY * output_data_ptr);
  1.2172 + *
   1.2173 + * Note: the use of unaligned writes is the main remaining bottleneck in
   1.2174 + *       this code; addressing it could potentially yield up to tens of
   1.2175 + *       percent better performance on Cortex-A8/Cortex-A9.
  1.2176 + */
  1.2177 +
  1.2178 +/*
   1.2179 + * Upsample 16 source pixels to 32 destination pixels. The new 16 source
   1.2180 + * pixels are loaded into q0. The previous 16 source pixels are kept in q1.
   1.2181 + * The shifted-by-one source pixels are constructed in q2 from q0 and q1.
   1.2182 + * Register d28 is used for the multiplication by 3, and register q15 holds
   1.2183 + * the +1 rounding bias.
  1.2184 + */
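/*
 * For each pair of output pixels the "fancy" filter computes 3/4 of the
 * nearer source pixel plus 1/4 of the further one. A rough C sketch of the
 * arithmetic (an illustration only, not the actual jdsample.c code):
 *
 *     out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
 *     out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
 *
 * In the NEON code the +1 bias comes from q15 together with the plain
 * narrowing shift (VSHRN), while the +2 bias is produced by the rounding
 * narrowing shift (VRSHRN); the multiplication by 3 uses d28. The first and
 * last pixels of a row are handled separately by the 'upsample_row' macro.
 */
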
  1.2185 +.macro upsample16   OUTPTR, INPTR
  1.2186 +    vld1.8          {q0}, [\INPTR]!
  1.2187 +    vmovl.u8        q8,  d0
  1.2188 +    vext.8          q2,  q1,  q0, #15
  1.2189 +    vmovl.u8        q9,  d1
  1.2190 +    vaddw.u8        q10, q15, d4
  1.2191 +    vaddw.u8        q11, q15, d5
  1.2192 +    vmlal.u8        q8,  d4,  d28
  1.2193 +    vmlal.u8        q9,  d5,  d28
  1.2194 +    vmlal.u8        q10, d0,  d28
  1.2195 +    vmlal.u8        q11, d1,  d28
  1.2196 +    vmov            q1,  q0       /* backup source pixels to q1 */
  1.2197 +    vrshrn.u16      d6,  q8,  #2
  1.2198 +    vrshrn.u16      d7,  q9,  #2
  1.2199 +    vshrn.u16       d8,  q10, #2
  1.2200 +    vshrn.u16       d9,  q11, #2
  1.2201 +    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
  1.2202 +.endm
  1.2203 +
  1.2204 +/*
   1.2205 + * Upsample 32 source pixels to 64 destination pixels. Compared to the
   1.2206 + * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
   1.2207 + * the even and odd groups of 16 pixels, so no "vmov q1, q0" instruction is
   1.2208 + * needed. This unrolling also allows loads and stores to be reordered to
   1.2209 + * hide multiplication latency and reduce stalls.
  1.2210 + */
  1.2211 +.macro upsample32   OUTPTR, INPTR
  1.2212 +    /* even 16 pixels group */
  1.2213 +    vld1.8          {q0}, [\INPTR]!
  1.2214 +    vmovl.u8        q8,  d0
  1.2215 +    vext.8          q2,  q1,  q0, #15
  1.2216 +    vmovl.u8        q9,  d1
  1.2217 +    vaddw.u8        q10, q15, d4
  1.2218 +    vaddw.u8        q11, q15, d5
  1.2219 +    vmlal.u8        q8,  d4,  d28
  1.2220 +    vmlal.u8        q9,  d5,  d28
  1.2221 +    vmlal.u8        q10, d0,  d28
  1.2222 +    vmlal.u8        q11, d1,  d28
  1.2223 +        /* odd 16 pixels group */
  1.2224 +        vld1.8          {q1}, [\INPTR]!
  1.2225 +    vrshrn.u16      d6,  q8,  #2
  1.2226 +    vrshrn.u16      d7,  q9,  #2
  1.2227 +    vshrn.u16       d8,  q10, #2
  1.2228 +    vshrn.u16       d9,  q11, #2
  1.2229 +        vmovl.u8        q8,  d2
  1.2230 +        vext.8          q2,  q0,  q1, #15
  1.2231 +        vmovl.u8        q9,  d3
  1.2232 +        vaddw.u8        q10, q15, d4
  1.2233 +        vaddw.u8        q11, q15, d5
  1.2234 +        vmlal.u8        q8,  d4,  d28
  1.2235 +        vmlal.u8        q9,  d5,  d28
  1.2236 +        vmlal.u8        q10, d2,  d28
  1.2237 +        vmlal.u8        q11, d3,  d28
  1.2238 +    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
  1.2239 +        vrshrn.u16      d6,  q8,  #2
  1.2240 +        vrshrn.u16      d7,  q9,  #2
  1.2241 +        vshrn.u16       d8,  q10, #2
  1.2242 +        vshrn.u16       d9,  q11, #2
  1.2243 +        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
  1.2244 +.endm
  1.2245 +
  1.2246 +/*
  1.2247 + * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
  1.2248 + */
  1.2249 +.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
  1.2250 +    /* special case for the first and last pixels */
  1.2251 +    sub             \WIDTH, \WIDTH, #1
  1.2252 +    add             \OUTPTR, \OUTPTR, #1
  1.2253 +    ldrb            \TMP1, [\INPTR, \WIDTH]
  1.2254 +    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
  1.2255 +    ldrb            \TMP1, [\INPTR], #1
  1.2256 +    strb            \TMP1, [\OUTPTR, #-1]
  1.2257 +    vmov.8          d3[7], \TMP1
  1.2258 +
  1.2259 +    subs            \WIDTH, \WIDTH, #32
  1.2260 +    blt             5f
  1.2261 +0:  /* process 32 pixels per iteration */
  1.2262 +    upsample32      \OUTPTR, \INPTR
  1.2263 +    subs            \WIDTH, \WIDTH, #32
  1.2264 +    bge             0b
  1.2265 +5:
  1.2266 +    adds            \WIDTH, \WIDTH, #16
  1.2267 +    blt             1f
  1.2268 +0:  /* process 16 pixels if needed */
  1.2269 +    upsample16      \OUTPTR, \INPTR
  1.2270 +    subs            \WIDTH, \WIDTH, #16
  1.2271 +1:
  1.2272 +    adds            \WIDTH, \WIDTH, #16
  1.2273 +    beq             9f
  1.2274 +
  1.2275 +    /* load the remaining 1-15 pixels */
  1.2276 +    add             \INPTR, \INPTR, \WIDTH
  1.2277 +    tst             \WIDTH, #1
  1.2278 +    beq             2f
  1.2279 +    sub             \INPTR, \INPTR, #1
  1.2280 +    vld1.8          {d0[0]}, [\INPTR]
  1.2281 +2:
  1.2282 +    tst             \WIDTH, #2
  1.2283 +    beq             2f
  1.2284 +    vext.8          d0, d0, d0, #6
  1.2285 +    sub             \INPTR, \INPTR, #1
  1.2286 +    vld1.8          {d0[1]}, [\INPTR]
  1.2287 +    sub             \INPTR, \INPTR, #1
  1.2288 +    vld1.8          {d0[0]}, [\INPTR]
  1.2289 +2:
  1.2290 +    tst             \WIDTH, #4
  1.2291 +    beq             2f
  1.2292 +    vrev64.32       d0, d0
  1.2293 +    sub             \INPTR, \INPTR, #1
  1.2294 +    vld1.8          {d0[3]}, [\INPTR]
  1.2295 +    sub             \INPTR, \INPTR, #1
  1.2296 +    vld1.8          {d0[2]}, [\INPTR]
  1.2297 +    sub             \INPTR, \INPTR, #1
  1.2298 +    vld1.8          {d0[1]}, [\INPTR]
  1.2299 +    sub             \INPTR, \INPTR, #1
  1.2300 +    vld1.8          {d0[0]}, [\INPTR]
  1.2301 +2:
  1.2302 +    tst             \WIDTH, #8
  1.2303 +    beq             2f
  1.2304 +    vmov            d1,  d0
  1.2305 +    sub             \INPTR, \INPTR, #8
  1.2306 +    vld1.8          {d0}, [\INPTR]
  1.2307 +2:  /* upsample the remaining pixels */
  1.2308 +    vmovl.u8        q8,  d0
  1.2309 +    vext.8          q2,  q1,  q0, #15
  1.2310 +    vmovl.u8        q9,  d1
  1.2311 +    vaddw.u8        q10, q15, d4
  1.2312 +    vaddw.u8        q11, q15, d5
  1.2313 +    vmlal.u8        q8,  d4,  d28
  1.2314 +    vmlal.u8        q9,  d5,  d28
  1.2315 +    vmlal.u8        q10, d0,  d28
  1.2316 +    vmlal.u8        q11, d1,  d28
  1.2317 +    vrshrn.u16      d10, q8,  #2
  1.2318 +    vrshrn.u16      d12, q9,  #2
  1.2319 +    vshrn.u16       d11, q10, #2
  1.2320 +    vshrn.u16       d13, q11, #2
  1.2321 +    vzip.8          d10, d11
  1.2322 +    vzip.8          d12, d13
  1.2323 +    /* store the remaining pixels */
  1.2324 +    tst             \WIDTH, #8
  1.2325 +    beq             2f
  1.2326 +    vst1.8          {d10, d11}, [\OUTPTR]!
  1.2327 +    vmov            q5,  q6
  1.2328 +2:
  1.2329 +    tst             \WIDTH, #4
  1.2330 +    beq             2f
  1.2331 +    vst1.8          {d10}, [\OUTPTR]!
  1.2332 +    vmov            d10,  d11
  1.2333 +2:
  1.2334 +    tst             \WIDTH, #2
  1.2335 +    beq             2f
  1.2336 +    vst1.8          {d10[0]}, [\OUTPTR]!
  1.2337 +    vst1.8          {d10[1]}, [\OUTPTR]!
  1.2338 +    vst1.8          {d10[2]}, [\OUTPTR]!
  1.2339 +    vst1.8          {d10[3]}, [\OUTPTR]!
  1.2340 +    vext.8          d10, d10, d10, #4
  1.2341 +2:
  1.2342 +    tst             \WIDTH, #1
  1.2343 +    beq             2f
  1.2344 +    vst1.8          {d10[0]}, [\OUTPTR]!
  1.2345 +    vst1.8          {d10[1]}, [\OUTPTR]!
  1.2346 +2:
  1.2347 +9:
  1.2348 +.endm
  1.2349 +
  1.2350 +asm_function jsimd_h2v1_fancy_upsample_neon
  1.2351 +
  1.2352 +    MAX_V_SAMP_FACTOR .req r0
  1.2353 +    DOWNSAMPLED_WIDTH .req r1
  1.2354 +    INPUT_DATA        .req r2
  1.2355 +    OUTPUT_DATA_PTR   .req r3
  1.2356 +    OUTPUT_DATA       .req OUTPUT_DATA_PTR
  1.2357 +
  1.2358 +    OUTPTR            .req r4
  1.2359 +    INPTR             .req r5
  1.2360 +    WIDTH             .req ip
  1.2361 +    TMP               .req lr
  1.2362 +
  1.2363 +    push            {r4, r5, r6, lr}
  1.2364 +    vpush           {d8-d15}
  1.2365 +
  1.2366 +    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
  1.2367 +    cmp             MAX_V_SAMP_FACTOR, #0
  1.2368 +    ble             99f
  1.2369 +
  1.2370 +    /* initialize constants */
  1.2371 +    vmov.u8         d28, #3
  1.2372 +    vmov.u16        q15, #1
  1.2373 +11:
  1.2374 +    ldr             INPTR, [INPUT_DATA], #4
  1.2375 +    ldr             OUTPTR, [OUTPUT_DATA], #4
  1.2376 +    mov             WIDTH, DOWNSAMPLED_WIDTH
  1.2377 +    upsample_row    OUTPTR, INPTR, WIDTH, TMP
  1.2378 +    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
  1.2379 +    bgt             11b
  1.2380 +
  1.2381 +99:
  1.2382 +    vpop            {d8-d15}
  1.2383 +    pop             {r4, r5, r6, pc}
  1.2384 +
  1.2385 +    .unreq          MAX_V_SAMP_FACTOR
  1.2386 +    .unreq          DOWNSAMPLED_WIDTH
  1.2387 +    .unreq          INPUT_DATA
  1.2388 +    .unreq          OUTPUT_DATA_PTR
  1.2389 +    .unreq          OUTPUT_DATA
  1.2390 +
  1.2391 +    .unreq          OUTPTR
  1.2392 +    .unreq          INPTR
  1.2393 +    .unreq          WIDTH
  1.2394 +    .unreq          TMP
  1.2395 +
  1.2396 +.endfunc
  1.2397 +
  1.2398 +.purgem upsample16
  1.2399 +.purgem upsample32
  1.2400 +.purgem upsample_row
