--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libjpeg/simd/jsimd_arm_neon.S	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,2397 @@

/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
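
/*
 * The vtrn.16/vtrn.32 pair above first transposes within 2x2 sub-blocks,
 * then swaps the off-diagonal 2x2 blocks, which amounts to a full 4x4
 * transpose. A minimal scalar model of the net effect (illustrative
 * sketch only, not part of the build):
 */
#if 0
static void transpose_4x4_model(short m[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < i; j++) {
            short t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}
#endif
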
#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
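
/*
 * Each FIX_ value above is the usual libjpeg fixed-point encoding of a
 * real constant, round(x * 2^13) for CONST_BITS = 13. A sketch of the
 * encoding (illustrative only, not part of the build):
 */
#if 0
static long FIX(double x)
{
    return (long) (x * 8192.0 + 0.5);  /* 8192 = 2^13 */
}
/* e.g. FIX(0.541196100) == 4433, FIX(1.175875602) == 9633 */
#endif
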
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0 = q4; \
    tmp1 = q5; \
    tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3 = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

#define XFIX_0_899976223 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_2_562915447 d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
#define XFIX_1_175875602 d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    ROW0L       .req d16
    ROW0R       .req d17
    ROW1L       .req d18
    ROW1R       .req d19
    ROW2L       .req d20
    ROW2R       .req d21
    ROW3L       .req d22
    ROW3R       .req d23
    ROW4L       .req d24
    ROW4R       .req d25
    ROW5L       .req d26
    ROW5R       .req d27
    ROW6L       .req d28
    ROW6R       .req d29
    ROW7L       .req d30
    ROW7R       .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr ip, jsimd_idct_islow_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add ip, ip, #16
    vmul.s16 q15, q15, q3
    vpush {d8-d15}  /* save NEON registers */
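
/*
 * Pass 1 below is interleaved with a scalar check of the right 4x8 half:
 * columns 4-7 of rows 1-7 are OR-ed together in r0 while the left half is
 * being transformed, and a zero result selects the sparse pass-2 path at
 * label 3. A scalar model of that test (illustrative sketch only, not
 * part of the build):
 */
#if 0
static int right_half_rows_1_to_7_are_zero(const short coef[8][8])
{
    int acc = 0, row, col;
    for (row = 1; row < 8; row++)
        for (col = 4; col < 8; col++)
            acc |= coef[row][col];
    return acc == 0;
}
#endif
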
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16 d4, ROW7L, ROW3L
    vadd.s16 d5, ROW5L, ROW1L
    vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d5, XFIX_1_175875602
    vmull.s16 q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push {r4, r5}
    vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW4L
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr r0, r4, r5
    vmov q4, q6
    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    orr r0, r0, r4
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    orr r0, r0, r5
    vadd.s32 q1, q3, q2
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov q5, q7
    vadd.s32 q1, q1, q6
    orr r0, r0, r4
    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
    orr r0, r0, r5
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1L, q1, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr r0, r0, r4
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    orr r0, r0, r5
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16 q6, ROW6L, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    orr r0, r0, r4
    vrshrn.s32 ROW6L, q1, #11
    orr r0, r0, r5
    vadd.s32 q1, q3, q5
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW4L
    orr r0, r0, r4
    vrshrn.s32 ROW2L, q1, #11
    orr r0, r0, r5
    vrshrn.s32 ROW5L, q3, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr r0, r0, r4
    vadd.s32 q2, q5, q6
    orrs r0, r0, r5
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    orr r0, r4, r5
    vsub.s32 q3, q1, q4
    pop {r4, r5}
    vrshrn.s32 ROW7L, q2, #11
    vrshrn.s32 ROW3L, q5, #11
    vrshrn.s32 ROW0L, q6, #11
    vrshrn.s32 ROW4L, q3, #11

    beq 3f  /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vadd.s16 d10, ROW7R, ROW3R
    vadd.s16 d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d8, XFIX_1_175875602
    vtrn.16 ROW2L, ROW3L
    vmull.s16 q7, d10, XFIX_1_175875602
    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16 ROW0L, ROW1L
    vsubl.s16 q3, ROW0R, ROW4R
    vmull.s16 q2, ROW2R, XFIX_0_541196100
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16 ROW4L, ROW5L
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32 ROW1L, ROW3L
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
    vtrn.32 ROW4L, ROW6L
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vtrn.32 ROW0L, ROW2L
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1R, q1, #11
    vtrn.32 ROW5L, ROW7L
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vrshrn.s32 ROW6R, q1, #11
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0R, ROW4R
    vrshrn.s32 ROW2R, q1, #11
    vrshrn.s32 ROW5R, q3, #11
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vrshrn.s32 ROW7R, q2, #11
    vrshrn.s32 ROW3R, q5, #11
    vrshrn.s32 ROW0R, q6, #11
    vrshrn.s32 ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16 ROW6R, ROW7R
    vtrn.16 ROW2R, ROW3R
    vtrn.16 ROW0R, ROW1R
    vtrn.16 ROW4R, ROW5R
    vtrn.32 ROW1R, ROW3R
    vtrn.32 ROW4R, ROW6R
    vtrn.32 ROW0R, ROW2R
    vtrn.32 ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov q4, q6
    vmlsl.s16 q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5R, XFIX_1_175875602
    vmlal.s16 q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16 q7, ROW7R, XFIX_1_175875602
    vmlal.s16 q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16 q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop {d8-d15}  /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16 q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8 q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vtrn.16 q10, q11
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vadd.u8 q10, q10, q0
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vtrn.8 d22, d23
    vst1.8 {d20}, [TMP1]
    vadd.u8 q11, q11, q0
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vtrn.16 ROW2L, ROW3L
    vtrn.16 ROW0L, ROW1L
    vtrn.16 ROW4L, ROW5L
    vshl.s16 ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32 ROW1L, ROW3L
    vtrn.32 ROW4L, ROW6L
    vtrn.32 ROW0L, ROW2L
    vtrn.32 ROW5L, ROW7L

    cmp r0, #0
    beq 4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16 ROW1R, ROW0R[1]
    vdup.s16 ROW2R, ROW0R[2]
    vdup.s16 ROW3R, ROW0R[3]
    vdup.s16 ROW4R, ROW0R[0]
    vdup.s16 ROW5R, ROW0R[1]
    vdup.s16 ROW6R, ROW0R[2]
    vdup.s16 ROW7R, ROW0R[3]
    vdup.s16 ROW0R, ROW0R[0]
    b 1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vshll.s16 q3, ROW0L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW0L, #13
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5L, XFIX_1_175875602
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW7L, XFIX_1_175875602
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW6L, XFIX_0_541196100
    vshll.s16 q3, ROW4L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW4L, #13
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16
    b 2b  /* Go to epilogue */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

    .unreq ROW0L
    .unreq ROW0R
    .unreq ROW1L
    .unreq ROW1R
    .unreq ROW2L
    .unreq ROW2R
    .unreq ROW3L
    .unreq ROW3R
    .unreq ROW4L
    .unreq ROW4R
    .unreq ROW5L
    .unreq ROW5R
    .unreq ROW6L
    .unreq ROW6R
    .unreq ROW7L
    .unreq ROW7R
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case, some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
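
/*
 * VQDMULH.S16 computes roughly (2 * a * b) >> 16 with saturation, i.e. it
 * multiplies by a Q15 fraction in (-1, 1); that is why each constant above
 * 1.0 is split into "x + x * (c - 1)". A scalar model (illustrative sketch
 * only, not part of the build):
 */
#if 0
#include <stdint.h>
static int16_t vqdmulh_s16(int16_t a, int16_t b)
{
    int32_t p = ((int32_t) a * b) >> 15;       /* same as (2*a*b) >> 16 */
    return (int16_t) (p > 32767 ? 32767 : p);  /* saturate -32768 * -32768 */
}
/* x * 1.082392200 ~= x + vqdmulh_s16(x, 2688), where
 * 2688 = 277 * 128 - 256 * 128 encodes (277/256 - 1) in Q15. */
#endif
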
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr ip, jsimd_idct_ifast_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0}, [ip, :64]  /* load constants */
    vmul.s16 q15, q15, q3
    vpush {d8-d13}  /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16 q2, q10, q14
    vadd.s16 q14, q10, q14
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vadd.s16 q10, q10, q2
    /* Transpose */
    vtrn.16 q8, q9
    vsub.s16 q11, q12, q1
    vtrn.16 q14, q15
    vadd.s16 q12, q12, q1
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q8, q10
    vtrn.32 q13, q15
    vswp d28, d21
    vswp d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16 q2, q10, q14
    vswp d30, d23
    vadd.s16 q14, q10, q14
    vswp d24, d17
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vpop {d8-d13}  /* restore NEON registers */
    vadd.s16 q10, q10, q2
    vsub.s16 q11, q12, q1
    vadd.s16 q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8 q0, #0x80
    vqshrn.s16 d16, q8, #5
    vqshrn.s16 d17, q9, #5
    vqshrn.s16 d18, q10, #5
    vqshrn.s16 d19, q11, #5
    vqshrn.s16 d20, q12, #5
    vqshrn.s16 d21, q13, #5
    vqshrn.s16 d22, q14, #5
    vqshrn.s16 d23, q15, #5
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vadd.u8 q10, q10, q0
    vadd.u8 q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vst1.8 {d20}, [TMP1]
    vtrn.8 d22, d23
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */
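
/*
 * Both idct_helper variants below descale with round-to-nearest, i.e.
 * jpeg's DESCALE(). The ".if \shift > 16" split exists because the
 * narrowing shift VRSHRN accepts an immediate of at most 16 when producing
 * 16-bit lanes, so larger descales are done as VRSHR followed by VMOVN.
 * A scalar model of the rounding (illustrative sketch only):
 */
#if 0
static long descale(long x, int n)
{
    return (x + (1L << (n - 1))) >> n;  /* what vrshr/vrshrn compute */
}
#endif
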
#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y29, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y29, q14, #\shift
.endif

    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q15, q15, #\shift
    vmovn.s32 \y27, q10
    vmovn.s32 \y28, q15
.else
    vrshrn.s32 \y27, q10, #\shift
    vrshrn.s32 \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15

    /* Pass 1 */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2 */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
    vst1.8 {d27[1]}, [TMP3]!
    vst1.8 {d26[2]}, [TMP1]!
    vst1.8 {d27[2]}, [TMP3]!
    vst1.8 {d26[3]}, [TMP1]!
    vst1.8 {d27[3]}, [TMP3]!

    vst1.8 {d26[4]}, [TMP2]!
    vst1.8 {d27[4]}, [TMP4]!
    vst1.8 {d26[5]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP4]!
    vst1.8 {d26[6]}, [TMP2]!
    vst1.8 {d27[6]}, [TMP4]!
    vst1.8 {d26[7]}, [TMP2]!
    vst1.8 {d27[7]}, [TMP4]!
#endif

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
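
/*
 * In the 2x2 case only the DC term and the odd rows/columns 1, 3, 5 and 7
 * contribute, so the idct_helper macro below takes a single DC input plus
 * four odd inputs per output pair. A scalar model using the FIX_ constants
 * defined above (illustrative sketch only, not part of the build):
 */
#if 0
static void idct_2x2_helper_model(short c0, short c1, short c3, short c5,
                                  short c7, int shift, short *y0, short *y1)
{
    long dc  = (long) c0 << 15;
    long odd = 29692L * c1 - 10426L * c3 + 6967L * c5 - 5906L * c7;
    *y0 = (short) ((dc + odd + (1L << (shift - 1))) >> shift);
    *y1 = (short) ((dc - odd + (1L << (shift - 1))) >> shift);
}
#endif
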
.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16 q14, \x4, #15
    vmull.s16 q13, \x6, d0[3]
    vmlal.s16 q13, \x10, d0[2]
    vmlal.s16 q13, \x12, d0[1]
    vmlal.s16 q13, \x16, d0[0]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y27, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y27, q14, #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    vld1.16 {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vmul.s16 q3, q3, q10
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16 q6, q6, q13
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4 d5, d7, d9, d11
#else
    vmull.s16 q13, d6, d0[3]
    vmlal.s16 q13, d10, d0[2]
    vmlal.s16 q13, d12, d0[1]
    vmlal.s16 q13, d16, d0[0]
    vmull.s16 q12, d7, d0[3]
    vmlal.s16 q12, d11, d0[2]
    vmlal.s16 q12, d13, d0[1]
    vmlal.s16 q12, d17, d0[0]
    vshll.s16 q14, d4, #15
    vshll.s16 q15, d5, #15
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13
    vrshrn.s32 d4, q10, #13
    vrshrn.s32 d6, q14, #13
    vadd.s32 q10, q15, q12
    vsub.s32 q14, q15, q12
    vrshrn.s32 d5, q10, #13
    vrshrn.s32 d7, q14, #13
    vtrn.16 q2, q3
    vtrn.32 q3, q5
#endif

    /* Pass 2 */
    idct_helper d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[4]}, [TMP1]!
    vst1.8 {d26[1]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP2]!

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
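
/*
 * The conversion below implements the JFIF equations
 *   R = Y + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 * with the factors stored as Q14/Q15 fixed-point words (22971, -11277,
 * -23401, 29033) in the per-function constant tables. A scalar model of
 * one pixel, assuming arithmetic right shift of negative values
 * (illustrative sketch only, not part of the build):
 */
#if 0
static unsigned char clamp255(int x)
{
    return (unsigned char) (x < 0 ? 0 : (x > 255 ? 255 : x));
}
static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char rgb[3])
{
    cb -= 128;
    cr -= 128;
    rgb[0] = clamp255(y + ((22971 * cr + 8192) >> 14));                /* R */
    rgb[1] = clamp255(y + ((-11277 * cb - 23401 * cr + 16384) >> 15)); /* G */
    rgb[2] = clamp255(y + ((29033 * cb + 8192) >> 14));                /* B */
}
#endif
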

.macro do_load size
    .if \size == 8
        vld1.8 {d4}, [U, :64]!
        vld1.8 {d5}, [V, :64]!
        vld1.8 {d0}, [Y, :64]!
        pld [U, #64]
        pld [V, #64]
        pld [Y, #64]
    .elseif \size == 4
        vld1.8 {d4[0]}, [U]!
        vld1.8 {d4[1]}, [U]!
        vld1.8 {d4[2]}, [U]!
        vld1.8 {d4[3]}, [U]!
        vld1.8 {d5[0]}, [V]!
        vld1.8 {d5[1]}, [V]!
        vld1.8 {d5[2]}, [V]!
        vld1.8 {d5[3]}, [V]!
        vld1.8 {d0[0]}, [Y]!
        vld1.8 {d0[1]}, [Y]!
        vld1.8 {d0[2]}, [Y]!
        vld1.8 {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8 {d4[4]}, [U]!
        vld1.8 {d4[5]}, [U]!
        vld1.8 {d5[4]}, [V]!
        vld1.8 {d5[5]}, [V]!
        vld1.8 {d0[4]}, [Y]!
        vld1.8 {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8 {d4[6]}, [U]!
        vld1.8 {d5[6]}, [V]!
        vld1.8 {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8 {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8 {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8 q3, q1, d4       /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5       /* q4 = v - 128 */
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vqmovun.s16 d1\r_offs, q12
    vqmovun.s16 d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8 {d4}, [U, :64]!
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vld1.8 {d5}, [V, :64]!
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vld1.8 {d0}, [Y, :64]!
    vqmovun.s16 d1\r_offs, q12
    pld [U, #64]
    pld [V, #64]
    pld [Y, #64]
    vqmovun.s16 d1\b_offs, q14
    vaddw.u8 q3, q1, d4  /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5  /* q4 = v - 128 */
    do_store \bpp, 8
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH  .req r0
    INPUT_BUF     .req r1
    INPUT_ROW     .req r2
    OUTPUT_BUF    .req r3
    NUM_ROWS      .req r4

    INPUT_BUF0    .req r5
    INPUT_BUF1    .req r6
    INPUT_BUF2    .req INPUT_BUF

    RGB           .req r7
    Y             .req r8
    U             .req r9
    V             .req r10
    N             .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #4]
    ldr INPUT_BUF2, [INPUT_BUF, #8]
    .unreq INPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8 q5, #255
    vmov.u8 q6, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    blt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
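
/*
 * The conversion below implements the JFIF equations
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 * with the weights stored scaled by 2^16 (19595, 38470, 7471, ...) in the
 * per-function constant tables; the (128 << 16) + 32767 bias words provide
 * the +128 offset plus rounding for the truncating vshrn #16. A scalar
 * model of one pixel (illustrative sketch only, not part of the build):
 */
#if 0
static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char ycc[3])
{
    ycc[0] = (unsigned char)
        ((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
    ycc[1] = (unsigned char)
        ((-11059 * r - 21709 * g + 32768 * b + (128 << 16) + 32767) >> 16);
    ycc[2] = (unsigned char)
        ((32768 * r - 27439 * g - 5329 * b + (128 << 16) + 32767) >> 16);
}
#endif
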
+
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+    vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovn.u16       d22, q12       /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
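+
+/* The inner loop of the generated functions is software-pipelined using
+ * the fused macro above. In C-like pseudocode (hypothetical helper names),
+ * the resulting schedule for n groups of 8 pixels is roughly:
+ *
+ *   load(0); stage1(0);
+ *   for (i = 0; i + 1 < n; i++) {
+ *       stage2(i); store(i); load(i + 1); stage1(i + 1);  (the fused macro)
+ *   }
+ *   stage2(n - 1); store(n - 1);
+ */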
1.1741 + vmlal.u16 q15, d5, d1[1] 1.1742 + vmlsl.u16 q15, d7, d1[2] 1.1743 + vmlsl.u16 q15, d9, d1[3] 1.1744 +.endm 1.1745 + 1.1746 +.balign 16 1.1747 +jsimd_\colorid\()_ycc_neon_consts: 1.1748 + .short 19595, 38470, 7471, 11059 1.1749 + .short 21709, 32768, 27439, 5329 1.1750 + .short 32767, 128, 32767, 128 1.1751 + .short 32767, 128, 32767, 128 1.1752 + 1.1753 +asm_function jsimd_\colorid\()_ycc_convert_neon 1.1754 + OUTPUT_WIDTH .req r0 1.1755 + INPUT_BUF .req r1 1.1756 + OUTPUT_BUF .req r2 1.1757 + OUTPUT_ROW .req r3 1.1758 + NUM_ROWS .req r4 1.1759 + 1.1760 + OUTPUT_BUF0 .req r5 1.1761 + OUTPUT_BUF1 .req r6 1.1762 + OUTPUT_BUF2 .req OUTPUT_BUF 1.1763 + 1.1764 + RGB .req r7 1.1765 + Y .req r8 1.1766 + U .req r9 1.1767 + V .req r10 1.1768 + N .req ip 1.1769 + 1.1770 + /* Load constants to d0, d1, d2, d3 */ 1.1771 + adr ip, jsimd_\colorid\()_ycc_neon_consts 1.1772 + vld1.16 {d0, d1, d2, d3}, [ip, :128] 1.1773 + 1.1774 + /* Save ARM registers and handle input arguments */ 1.1775 + push {r4, r5, r6, r7, r8, r9, r10, lr} 1.1776 + ldr NUM_ROWS, [sp, #(4 * 8)] 1.1777 + ldr OUTPUT_BUF0, [OUTPUT_BUF] 1.1778 + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] 1.1779 + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] 1.1780 + .unreq OUTPUT_BUF 1.1781 + 1.1782 + /* Save NEON registers */ 1.1783 + vpush {d8-d15} 1.1784 + 1.1785 + /* Outer loop over scanlines */ 1.1786 + cmp NUM_ROWS, #1 1.1787 + blt 9f 1.1788 +0: 1.1789 + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] 1.1790 + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] 1.1791 + mov N, OUTPUT_WIDTH 1.1792 + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] 1.1793 + add OUTPUT_ROW, OUTPUT_ROW, #1 1.1794 + ldr RGB, [INPUT_BUF], #4 1.1795 + 1.1796 + /* Inner loop over pixels */ 1.1797 + subs N, N, #8 1.1798 + blt 3f 1.1799 + do_load \bpp, 8 1.1800 + do_rgb_to_yuv_stage1 1.1801 + subs N, N, #8 1.1802 + blt 2f 1.1803 +1: 1.1804 + do_rgb_to_yuv_stage2_store_load_stage1 1.1805 + subs N, N, #8 1.1806 + bge 1b 1.1807 +2: 1.1808 + do_rgb_to_yuv_stage2 1.1809 + do_store 8 1.1810 + tst N, #7 1.1811 + beq 8f 1.1812 +3: 1.1813 + tst N, #4 1.1814 + beq 3f 1.1815 + do_load \bpp, 4 1.1816 +3: 1.1817 + tst N, #2 1.1818 + beq 4f 1.1819 + do_load \bpp, 2 1.1820 +4: 1.1821 + tst N, #1 1.1822 + beq 5f 1.1823 + do_load \bpp, 1 1.1824 +5: 1.1825 + do_rgb_to_yuv 1.1826 + tst N, #4 1.1827 + beq 6f 1.1828 + do_store 4 1.1829 +6: 1.1830 + tst N, #2 1.1831 + beq 7f 1.1832 + do_store 2 1.1833 +7: 1.1834 + tst N, #1 1.1835 + beq 8f 1.1836 + do_store 1 1.1837 +8: 1.1838 + subs NUM_ROWS, NUM_ROWS, #1 1.1839 + bgt 0b 1.1840 +9: 1.1841 + /* Restore all registers and return */ 1.1842 + vpop {d8-d15} 1.1843 + pop {r4, r5, r6, r7, r8, r9, r10, pc} 1.1844 + 1.1845 + .unreq OUTPUT_WIDTH 1.1846 + .unreq OUTPUT_ROW 1.1847 + .unreq INPUT_BUF 1.1848 + .unreq NUM_ROWS 1.1849 + .unreq OUTPUT_BUF0 1.1850 + .unreq OUTPUT_BUF1 1.1851 + .unreq OUTPUT_BUF2 1.1852 + .unreq RGB 1.1853 + .unreq Y 1.1854 + .unreq U 1.1855 + .unreq V 1.1856 + .unreq N 1.1857 +.endfunc 1.1858 + 1.1859 +.purgem do_rgb_to_yuv 1.1860 +.purgem do_rgb_to_yuv_stage1 1.1861 +.purgem do_rgb_to_yuv_stage2 1.1862 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 1.1863 + 1.1864 +.endm 1.1865 + 1.1866 +/*--------------------------------- id ----- bpp R G B */ 1.1867 +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 1.1868 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 1.1869 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 1.1870 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 1.1871 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 
+
+asm_function jsimd_convsamp_neon
+    SAMPLE_DATA     .req r0
+    START_COL       .req r1
+    WORKSPACE       .req r2
+    TMP1            .req r3
+    TMP2            .req r4
+    TMP3            .req r5
+    TMP4            .req ip
+
+    push            {r4, r5}
+    vmov.u8         d0, #128
+
+    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
+    add             TMP1, TMP1, START_COL
+    add             TMP2, TMP2, START_COL
+    add             TMP3, TMP3, START_COL
+    add             TMP4, TMP4, START_COL
+    vld1.8          {d16}, [TMP1]
+    vsubl.u8        q8, d16, d0
+    vld1.8          {d18}, [TMP2]
+    vsubl.u8        q9, d18, d0
+    vld1.8          {d20}, [TMP3]
+    vsubl.u8        q10, d20, d0
+    vld1.8          {d22}, [TMP4]
+    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
+    vsubl.u8        q11, d22, d0
+    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
+    add             TMP1, TMP1, START_COL
+    add             TMP2, TMP2, START_COL
+    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
+    add             TMP3, TMP3, START_COL
+    add             TMP4, TMP4, START_COL
+    vld1.8          {d24}, [TMP1]
+    vsubl.u8        q12, d24, d0
+    vld1.8          {d26}, [TMP2]
+    vsubl.u8        q13, d26, d0
+    vld1.8          {d28}, [TMP3]
+    vsubl.u8        q14, d28, d0
+    vld1.8          {d30}, [TMP4]
+    vsubl.u8        q15, d30, d0
+    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
+    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
+    pop             {r4, r5}
+    bx              lr
+
+    .unreq          SAMPLE_DATA
+    .unreq          START_COL
+    .unreq          WORKSPACE
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_ifast_neon
+ *
+ * This function contains a fast but less accurate integer implementation
+ * of the forward DCT (Discrete Cosine Transform). It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_fdct_ifast' function from jfdctfst.c.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
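+
+/* Overall structure of the implementation below, as a sketch (the actual
+ * instruction stream is scheduled more aggressively): the whole 8x8 block
+ * is kept in q8-q15 and the same 1-D transform pass is executed twice,
+ * with an in-register 8x8 transpose in between, so the row and column
+ * passes share identical code:
+ *
+ *   load 64 coefficients into q8..q15
+ *   repeat 2 times {
+ *       transpose 8x8 (vtrn.16, vtrn.32, vswp)
+ *       1-D ifast FDCT on all 8 rows at once
+ *   }
+ *   store q8..q15 back
+ */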
+
+#define XFIX_0_382683433 d0[0]
+#define XFIX_0_541196100 d0[1]
+#define XFIX_0_707106781 d0[2]
+#define XFIX_1_306562965 d0[3]
+
+.balign 16
+jsimd_fdct_ifast_neon_consts:
+    .short (98 * 128)              /* XFIX_0_382683433 */
+    .short (139 * 128)             /* XFIX_0_541196100 */
+    .short (181 * 128)             /* XFIX_0_707106781 */
+    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
+
+asm_function jsimd_fdct_ifast_neon
+
+    DATA            .req r0
+    TMP             .req ip
+
+    vpush           {d8-d15}
+
+    /* Load constants */
+    adr             TMP, jsimd_fdct_ifast_neon_consts
+    vld1.16         {d0}, [TMP, :64]
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0  | d16    | d17    | q8
+     *   1  | d18    | d19    | q9
+     *   2  | d20    | d21    | q10
+     *   3  | d22    | d23    | q11
+     *   4  | d24    | d25    | q12
+     *   5  | d26    | d27    | q13
+     *   6  | d28    | d29    | q14
+     *   7  | d30    | d31    | q15
+     */
+
+    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
+    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
+    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
+    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
+    sub             DATA, DATA, #(128 - 32)
+
+    mov             TMP, #2
+1:
+    /* Transpose */
+    vtrn.16         q12, q13
+    vtrn.16         q10, q11
+    vtrn.16         q8, q9
+    vtrn.16         q14, q15
+    vtrn.32         q9, q11
+    vtrn.32         q13, q15
+    vtrn.32         q8, q10
+    vtrn.32         q12, q14
+    vswp            d30, d23
+    vswp            d24, d17
+    vswp            d26, d19
+    /* 1-D FDCT */
+    vadd.s16        q2, q11, q12
+    vswp            d28, d21
+    vsub.s16        q12, q11, q12
+    vsub.s16        q6, q10, q13
+    vadd.s16        q10, q10, q13
+    vsub.s16        q7, q9, q14
+    vadd.s16        q9, q9, q14
+    vsub.s16        q1, q8, q15
+    vadd.s16        q8, q8, q15
+    vsub.s16        q4, q9, q10
+    vsub.s16        q5, q8, q2
+    vadd.s16        q3, q9, q10
+    vadd.s16        q4, q4, q5
+    vadd.s16        q2, q8, q2
+    vqdmulh.s16     q4, q4, XFIX_0_707106781
+    vadd.s16        q11, q12, q6
+    vadd.s16        q8, q2, q3
+    vsub.s16        q12, q2, q3
+    vadd.s16        q3, q6, q7
+    vadd.s16        q7, q7, q1
+    vqdmulh.s16     q3, q3, XFIX_0_707106781
+    vsub.s16        q6, q11, q7
+    vadd.s16        q10, q5, q4
+    vqdmulh.s16     q6, q6, XFIX_0_382683433
+    vsub.s16        q14, q5, q4
+    vqdmulh.s16     q11, q11, XFIX_0_541196100
+    vqdmulh.s16     q5, q7, XFIX_1_306562965
+    vadd.s16        q4, q1, q3
+    vsub.s16        q3, q1, q3
+    vadd.s16        q7, q7, q6
+    vadd.s16        q11, q11, q6
+    vadd.s16        q7, q7, q5
+    vadd.s16        q13, q3, q11
+    vsub.s16        q11, q3, q11
+    vadd.s16        q9, q4, q7
+    vsub.s16        q15, q4, q7
+    subs            TMP, TMP, #1
+    bne             1b
+
+    /* store results */
+    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
+    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
+    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
+    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DATA
+    .unreq          TMP
+.endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
+ *                      DCTELEM * workspace);
+ *
+ * Note: the code uses 2-stage pipelining in order to improve instruction
+ *       scheduling and eliminate stalls (this provides ~15% better
+ *       performance for this function on both ARM Cortex-A8 and
+ *       ARM Cortex-A9 when compared to the non-pipelined variant).
+ *       The instructions which belong to the second stage use different
+ *       indentation for better readability.
+ */
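+
+/* Scalar sketch of the per-coefficient computation (the names here are
+ * descriptive, not from the original source; as the address arithmetic
+ * below shows, the divisor table holds 64-element segments, with the
+ * corrections at byte offset 64*2 and the shift counts at byte offset
+ * 64*6 relative to the reciprocals):
+ *
+ *   int16_t  x = workspace[i];
+ *   uint16_t a = (uint16_t) abs(x) + correction[i];
+ *   uint16_t q = ((uint32_t) a * reciprocal[i]) >> 16;
+ *   q >>= shift[i];
+ *   coef_block[i] = (x < 0) ? -(int16_t) q : (int16_t) q;
+ */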
+asm_function jsimd_quantize_neon
+
+    COEF_BLOCK      .req r0
+    DIVISORS        .req r1
+    WORKSPACE       .req r2
+
+    RECIPROCAL      .req DIVISORS
+    CORRECTION      .req r3
+    SHIFT           .req ip
+    LOOP_COUNT      .req r4
+
+    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
+    vabs.s16        q12, q0
+    add             CORRECTION, DIVISORS, #(64 * 2)
+    add             SHIFT, DIVISORS, #(64 * 6)
+    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
+    vabs.s16        q13, q1
+    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
+    vadd.u16        q12, q12, q10  /* add correction */
+    vadd.u16        q13, q13, q11
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
+    vmull.u16       q11, d25, d17
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
+    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
+    vshrn.u32       d20, q10, #16
+    vshrn.u32       d21, q11, #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
+    vneg.s16        q12, q12
+    vneg.s16        q13, q13
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
+    vshl.u16        q15, q11, q13
+
+    push            {r4, r5}
+    mov             LOOP_COUNT, #3
+1:
+    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
+      veor.u16      q14, q14, q2   /* restore sign */
+    vabs.s16        q12, q0
+    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
+    vabs.s16        q13, q1
+      veor.u16      q15, q15, q3
+    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
+    vadd.u16        q12, q12, q10  /* add correction */
+    vadd.u16        q13, q13, q11
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
+    vmull.u16       q11, d25, d17
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
+      vsub.u16      q14, q14, q2
+    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
+      vsub.u16      q15, q15, q3
+    vshrn.u32       d20, q10, #16
+    vshrn.u32       d21, q11, #16
+      vst1.16       {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
+    vneg.s16        q12, q12
+    vneg.s16        q13, q13
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
+    vshl.u16        q15, q11, q13
+    subs            LOOP_COUNT, LOOP_COUNT, #1
+    bne             1b
+    pop             {r4, r5}
+
+      veor.u16      q14, q14, q2   /* restore sign */
+      veor.u16      q15, q15, q3
+      vsub.u16      q14, q14, q2
+      vsub.u16      q15, q15, q3
+      vst1.16       {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
+
+    bx              lr             /* return */
+
+    .unreq          COEF_BLOCK
+    .unreq          DIVISORS
+    .unreq          WORKSPACE
+    .unreq          RECIPROCAL
+    .unreq          CORRECTION
+    .unreq          SHIFT
+    .unreq          LOOP_COUNT
+.endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ *       this code; solving it could potentially yield up to tens of
+ *       percent of additional performance on Cortex-A8/Cortex-A9.
+ */
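+
+/* Scalar reference for the upsampling arithmetic (this is the formula of
+ * h2v1_fancy_upsample() in jdsample.c): every input pixel expands to two
+ * output pixels, each a 3:1 weighted average with one of its neighbours,
+ *
+ *   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;
+ *   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;
+ *
+ * while the very first and the very last output pixels are plain copies
+ * of the edge input pixels (handled specially in 'upsample_row' below).
+ */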
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0         /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers are swapped
+ * between the even and odd groups of 16 pixels, which is why the
+ * "vmov q1, q0" instruction is not needed. This unrolling also makes it
+ * possible to reorder loads and stores to hide multiplication latency
+ * and reduce stalls.
+ */
+.macro upsample32 OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    /* odd 16 pixels group */
+    vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vmovl.u8        q8, d2
+    vext.8          q2, q0, q1, #15
+    vmovl.u8        q9, d3
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d2, d28
+    vmlal.u8        q11, d3, d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
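+
+/* Control-flow sketch of the macro below (pseudocode, not part of the
+ * original source). After the two edge pixels have been handled:
+ *
+ *   while (width >= 32) upsample32();    (main unrolled loop)
+ *   if    (width >= 16) upsample16();
+ *   if    (width & 15) {
+ *       reload the last 1..15 input pixels backwards into d0/d1 with
+ *       lane loads (testing width bits 1/2/4/8), upsample them in one
+ *       step, then store the produced output piecewise (bits 8/4/2/1);
+ *   }
+ */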
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1
+    add             \OUTPTR, \OUTPTR, #1
+    ldrb            \TMP1, [\INPTR, \WIDTH]
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
+    ldrb            \TMP1, [\INPTR], #1
+    strb            \TMP1, [\OUTPTR, #-1]
+    vmov.8          d3[7], \TMP1
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16
+    blt             1f
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16
+    beq             9f
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1, d0
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11
+    vzip.8          d12, d13
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!
+    vmov            q5, q6
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!
+    vmov            d10, d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0
+    DOWNSAMPLED_WIDTH .req r1
+    INPUT_DATA        .req r2
+    OUTPUT_DATA_PTR   .req r3
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR
+
+    OUTPTR            .req r4
+    INPTR             .req r5
+    WIDTH             .req ip
+    TMP               .req lr
+
+    push            {r4, r5, r6, lr}
+    vpush           {d8-d15}
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f
+
+    /* initialize constants */
+    vmov.u8         d28, #3
+    vmov.u16        q15, #1
+11:
+    ldr             INPTR, [INPUT_DATA], #4
+    ldr             OUTPTR, [OUTPUT_DATA], #4
+    mov             WIDTH, DOWNSAMPLED_WIDTH
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row