media/libjpeg/simd/jsimd_arm_neon.S

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * ARM NEON optimizations for libjpeg-turbo
michael@0 3 *
michael@0 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
michael@0 5 * All rights reserved.
michael@0 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
michael@0 7 *
michael@0 8 * This software is provided 'as-is', without any express or implied
michael@0 9 * warranty. In no event will the authors be held liable for any damages
michael@0 10 * arising from the use of this software.
michael@0 11 *
michael@0 12 * Permission is granted to anyone to use this software for any purpose,
michael@0 13 * including commercial applications, and to alter it and redistribute it
michael@0 14 * freely, subject to the following restrictions:
michael@0 15 *
michael@0 16 * 1. The origin of this software must not be misrepresented; you must not
michael@0 17 * claim that you wrote the original software. If you use this software
michael@0 18 * in a product, an acknowledgment in the product documentation would be
michael@0 19 * appreciated but is not required.
michael@0 20 * 2. Altered source versions must be plainly marked as such, and must not be
michael@0 21 * misrepresented as being the original software.
michael@0 22 * 3. This notice may not be removed or altered from any source distribution.
michael@0 23 */
michael@0 24
michael@0 25 #if defined(__linux__) && defined(__ELF__)
michael@0 26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
michael@0 27 #endif
michael@0 28
michael@0 29 .text
michael@0 30 .fpu neon
michael@0 31 .arch armv7a
michael@0 32 .object_arch armv4
michael@0 33 .arm
michael@0 34
michael@0 35
michael@0 36 #define RESPECT_STRICT_ALIGNMENT 1
michael@0 37
michael@0 38 /*****************************************************************************/
michael@0 39
michael@0 40 /* Supplementary macro for setting function attributes */
michael@0 41 .macro asm_function fname
michael@0 42 #ifdef __APPLE__
michael@0 43 .func _\fname
michael@0 44 .globl _\fname
michael@0 45 _\fname:
michael@0 46 #else
michael@0 47 .func \fname
michael@0 48 .global \fname
michael@0 49 #ifdef __ELF__
michael@0 50 .hidden \fname
michael@0 51 .type \fname, %function
michael@0 52 #endif
michael@0 53 \fname:
michael@0 54 #endif
michael@0 55 .endm
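/* Note: on Mach-O (__APPLE__) targets C symbols carry a leading underscore,
 * so the macro emits the label with that prefix; on ELF targets the symbol is
 * additionally marked .hidden so it is not exported from shared objects.
 * For illustration, "asm_function jsimd_idct_ifast_neon" expands on an ELF
 * target to roughly:
 *
 *     .func   jsimd_idct_ifast_neon
 *     .global jsimd_idct_ifast_neon
 *     .hidden jsimd_idct_ifast_neon
 *     .type   jsimd_idct_ifast_neon, %function
 *   jsimd_idct_ifast_neon:
 */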
michael@0 56
michael@0 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
michael@0 58 .macro transpose_4x4 x0, x1, x2, x3
michael@0 59 vtrn.16 \x0, \x1
michael@0 60 vtrn.16 \x2, \x3
michael@0 61 vtrn.32 \x0, \x2
michael@0 62 vtrn.32 \x1, \x3
michael@0 63 .endm
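/* A worked example of the transpose above: if the four registers start out
 * holding the rows
 *
 *     x0 = [ a0 a1 a2 a3 ]      after the vtrn.16 pair:  x0 = [ a0 b0 a2 b2 ]
 *     x1 = [ b0 b1 b2 b3 ]                               x1 = [ a1 b1 a3 b3 ]
 *     x2 = [ c0 c1 c2 c3 ]                               x2 = [ c0 d0 c2 d2 ]
 *     x3 = [ d0 d1 d2 d3 ]                               x3 = [ c1 d1 c3 d3 ]
 *
 * then after the vtrn.32 pair each register holds a column of the original
 * block:
 *
 *     x0 = [ a0 b0 c0 d0 ]   x1 = [ a1 b1 c1 d1 ]
 *     x2 = [ a2 b2 c2 d2 ]   x3 = [ a3 b3 c3 d3 ]
 */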
michael@0 64
michael@0 65 #define CENTERJSAMPLE 128
michael@0 66
michael@0 67 /*****************************************************************************/
michael@0 68
michael@0 69 /*
michael@0 70 * Perform dequantization and inverse DCT on one block of coefficients.
michael@0 71 *
michael@0 72 * GLOBAL(void)
michael@0 73 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
michael@0 74 * JSAMPARRAY output_buf, JDIMENSION output_col)
michael@0 75 */
michael@0 76
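/* For reference, this is roughly how the SIMD dispatch layer of libjpeg-turbo
 * (jsimd_arm.c) hands its arguments to this routine; the wrapper below is not
 * part of this file and is shown only as an illustrative sketch:
 *
 *   GLOBAL(void)
 *   jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 *                     JCOEFPTR coef_block,
 *                     JSAMPARRAY output_buf, JDIMENSION output_col)
 *   {
 *     jsimd_idct_islow_neon(compptr->dct_table, coef_block,
 *                           output_buf, output_col);
 *   }
 */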
michael@0 77 #define FIX_0_298631336 (2446)
michael@0 78 #define FIX_0_390180644 (3196)
michael@0 79 #define FIX_0_541196100 (4433)
michael@0 80 #define FIX_0_765366865 (6270)
michael@0 81 #define FIX_0_899976223 (7373)
michael@0 82 #define FIX_1_175875602 (9633)
michael@0 83 #define FIX_1_501321110 (12299)
michael@0 84 #define FIX_1_847759065 (15137)
michael@0 85 #define FIX_1_961570560 (16069)
michael@0 86 #define FIX_2_053119869 (16819)
michael@0 87 #define FIX_2_562915447 (20995)
michael@0 88 #define FIX_3_072711026 (25172)
michael@0 89
michael@0 90 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
michael@0 91 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
michael@0 92 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
michael@0 93 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
michael@0 94 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
michael@0 95 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
michael@0 96 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
michael@0 97 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
michael@0 98
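/* These are the standard jidctint.c fixed-point constants, i.e.
 * FIX(x) = round(x * 2^13) with CONST_BITS = 13.  For example:
 *
 *   FIX(0.541196100) = round(0.541196100 * 8192) = round(4433.48) = 4433
 *   FIX(1.175875602) = round(1.175875602 * 8192) = round(9632.77) = 9633
 *
 * The *_MINUS_* / *_PLUS_* combinations above pre-fold pairs of constants so
 * that each term in the ISLOW flowgraph needs only one multiply-accumulate.
 */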
michael@0 99 /*
michael@0 100 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
michael@0 101 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
michael@0 102 */
michael@0 103 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
michael@0 104 { \
michael@0 105 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
michael@0 106 INT32 q1, q2, q3, q4, q5, q6, q7; \
michael@0 107 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
michael@0 108 \
michael@0 109 /* 1-D iDCT input data */ \
michael@0 110 row0 = xrow0; \
michael@0 111 row1 = xrow1; \
michael@0 112 row2 = xrow2; \
michael@0 113 row3 = xrow3; \
michael@0 114 row4 = xrow4; \
michael@0 115 row5 = xrow5; \
michael@0 116 row6 = xrow6; \
michael@0 117 row7 = xrow7; \
michael@0 118 \
michael@0 119 q5 = row7 + row3; \
michael@0 120 q4 = row5 + row1; \
michael@0 121 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
michael@0 122 MULTIPLY(q4, FIX_1_175875602); \
michael@0 123 q7 = MULTIPLY(q5, FIX_1_175875602) + \
michael@0 124 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
michael@0 125 q2 = MULTIPLY(row2, FIX_0_541196100) + \
michael@0 126 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
michael@0 127 q4 = q6; \
michael@0 128 q3 = ((INT32) row0 - (INT32) row4) << 13; \
michael@0 129 q6 += MULTIPLY(row5, -FIX_2_562915447) + \
michael@0 130 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
michael@0 131 /* now we can use q1 (reloadable constants have been used up) */ \
michael@0 132 q1 = q3 + q2; \
michael@0 133 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
michael@0 134 MULTIPLY(row1, -FIX_0_899976223); \
michael@0 135 q5 = q7; \
michael@0 136 q1 = q1 + q6; \
michael@0 137 q7 += MULTIPLY(row7, -FIX_0_899976223) + \
michael@0 138 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
michael@0 139 \
michael@0 140 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
michael@0 141 tmp11_plus_tmp2 = q1; \
michael@0 142 row1 = 0; \
michael@0 143 \
michael@0 144 q1 = q1 - q6; \
michael@0 145 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
michael@0 146 MULTIPLY(row3, -FIX_2_562915447); \
michael@0 147 q1 = q1 - q6; \
michael@0 148 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
michael@0 149 MULTIPLY(row6, FIX_0_541196100); \
michael@0 150 q3 = q3 - q2; \
michael@0 151 \
michael@0 152 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
michael@0 153 tmp11_minus_tmp2 = q1; \
michael@0 154 \
michael@0 155 q1 = ((INT32) row0 + (INT32) row4) << 13; \
michael@0 156 q2 = q1 + q6; \
michael@0 157 q1 = q1 - q6; \
michael@0 158 \
michael@0 159 /* pick up the results */ \
michael@0 160 tmp0 = q4; \
michael@0 161 tmp1 = q5; \
michael@0 162 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
michael@0 163 tmp3 = q7; \
michael@0 164 tmp10 = q2; \
michael@0 165 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
michael@0 166 tmp12 = q3; \
michael@0 167 tmp13 = q1; \
michael@0 168 }
michael@0 169
michael@0 170 #define XFIX_0_899976223 d0[0]
michael@0 171 #define XFIX_0_541196100 d0[1]
michael@0 172 #define XFIX_2_562915447 d0[2]
michael@0 173 #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
michael@0 174 #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
michael@0 175 #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
michael@0 176 #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
michael@0 177 #define XFIX_1_175875602 d1[3]
michael@0 178 #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
michael@0 179 #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
michael@0 180 #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
michael@0 181 #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
michael@0 182
michael@0 183 .balign 16
michael@0 184 jsimd_idct_islow_neon_consts:
michael@0 185 .short FIX_0_899976223 /* d0[0] */
michael@0 186 .short FIX_0_541196100 /* d0[1] */
michael@0 187 .short FIX_2_562915447 /* d0[2] */
michael@0 188 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
michael@0 189 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
michael@0 190 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
michael@0 191 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
michael@0 192 .short FIX_1_175875602 /* d1[3] */
michael@0 193 /* reloadable constants */
michael@0 194 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
michael@0 195 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
michael@0 196 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
michael@0 197 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
michael@0 198
michael@0 199 asm_function jsimd_idct_islow_neon
michael@0 200
michael@0 201 DCT_TABLE .req r0
michael@0 202 COEF_BLOCK .req r1
michael@0 203 OUTPUT_BUF .req r2
michael@0 204 OUTPUT_COL .req r3
michael@0 205 TMP1 .req r0
michael@0 206 TMP2 .req r1
michael@0 207 TMP3 .req r2
michael@0 208 TMP4 .req ip
michael@0 209
michael@0 210 ROW0L .req d16
michael@0 211 ROW0R .req d17
michael@0 212 ROW1L .req d18
michael@0 213 ROW1R .req d19
michael@0 214 ROW2L .req d20
michael@0 215 ROW2R .req d21
michael@0 216 ROW3L .req d22
michael@0 217 ROW3R .req d23
michael@0 218 ROW4L .req d24
michael@0 219 ROW4R .req d25
michael@0 220 ROW5L .req d26
michael@0 221 ROW5R .req d27
michael@0 222 ROW6L .req d28
michael@0 223 ROW6R .req d29
michael@0 224 ROW7L .req d30
michael@0 225 ROW7R .req d31
michael@0 226
michael@0 227 /* Load and dequantize coefficients into NEON registers
michael@0 228 * with the following allocation:
michael@0 229 * 0 1 2 3 | 4 5 6 7
michael@0 230 * ---------+--------
michael@0 231 * 0 | d16 | d17 ( q8 )
michael@0 232 * 1 | d18 | d19 ( q9 )
michael@0 233 * 2 | d20 | d21 ( q10 )
michael@0 234 * 3 | d22 | d23 ( q11 )
michael@0 235 * 4 | d24 | d25 ( q12 )
michael@0 236 * 5 | d26 | d27 ( q13 )
michael@0 237 * 6 | d28 | d29 ( q14 )
michael@0 238 * 7 | d30 | d31 ( q15 )
michael@0 239 */
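/* The interleaved vld1.16/vmul.s16 sequence below performs the dequantization
 * step; as an illustrative C equivalent (assuming the usual 16-bit
 * ISLOW_MULT_TYPE layout of dct_table), it computes roughly:
 *
 *   for (i = 0; i < DCTSIZE2; i++)
 *     workspace[i] = (DCTELEM) (coef_block[i] * dct_table[i]);
 */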
michael@0 240 adr ip, jsimd_idct_islow_neon_consts
michael@0 241 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
michael@0 242 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
michael@0 243 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
michael@0 244 vmul.s16 q8, q8, q0
michael@0 245 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
michael@0 246 vmul.s16 q9, q9, q1
michael@0 247 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
michael@0 248 vmul.s16 q10, q10, q2
michael@0 249 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
michael@0 250 vmul.s16 q11, q11, q3
michael@0 251 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
michael@0 252 vmul.s16 q12, q12, q0
michael@0 253 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
michael@0 254 vmul.s16 q14, q14, q2
michael@0 255 vmul.s16 q13, q13, q1
michael@0 256 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
michael@0 257 add ip, ip, #16
michael@0 258 vmul.s16 q15, q15, q3
michael@0 259 vpush {d8-d15} /* save NEON registers */
michael@0 260 /* 1-D IDCT, pass 1, left 4x8 half */
michael@0 261 vadd.s16 d4, ROW7L, ROW3L
michael@0 262 vadd.s16 d5, ROW5L, ROW1L
michael@0 263 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
michael@0 264 vmlal.s16 q6, d5, XFIX_1_175875602
michael@0 265 vmull.s16 q7, d4, XFIX_1_175875602
michael@0 266 /* Check for the zero coefficients in the right 4x8 half */
michael@0 267 push {r4, r5}
michael@0 268 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
michael@0 269 vsubl.s16 q3, ROW0L, ROW4L
michael@0 270 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
michael@0 271 vmull.s16 q2, ROW2L, XFIX_0_541196100
michael@0 272 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
michael@0 273 orr r0, r4, r5
michael@0 274 vmov q4, q6
michael@0 275 vmlsl.s16 q6, ROW5L, XFIX_2_562915447
michael@0 276 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
michael@0 277 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
michael@0 278 vshl.s32 q3, q3, #13
michael@0 279 orr r0, r0, r4
michael@0 280 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
michael@0 281 orr r0, r0, r5
michael@0 282 vadd.s32 q1, q3, q2
michael@0 283 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
michael@0 284 vmov q5, q7
michael@0 285 vadd.s32 q1, q1, q6
michael@0 286 orr r0, r0, r4
michael@0 287 vmlsl.s16 q7, ROW7L, XFIX_0_899976223
michael@0 288 orr r0, r0, r5
michael@0 289 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
michael@0 290 vrshrn.s32 ROW1L, q1, #11
michael@0 291 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
michael@0 292 vsub.s32 q1, q1, q6
michael@0 293 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
michael@0 294 orr r0, r0, r4
michael@0 295 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
michael@0 296 orr r0, r0, r5
michael@0 297 vsub.s32 q1, q1, q6
michael@0 298 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
michael@0 299 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
michael@0 300 vmlal.s16 q6, ROW6L, XFIX_0_541196100
michael@0 301 vsub.s32 q3, q3, q2
michael@0 302 orr r0, r0, r4
michael@0 303 vrshrn.s32 ROW6L, q1, #11
michael@0 304 orr r0, r0, r5
michael@0 305 vadd.s32 q1, q3, q5
michael@0 306 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
michael@0 307 vsub.s32 q3, q3, q5
michael@0 308 vaddl.s16 q5, ROW0L, ROW4L
michael@0 309 orr r0, r0, r4
michael@0 310 vrshrn.s32 ROW2L, q1, #11
michael@0 311 orr r0, r0, r5
michael@0 312 vrshrn.s32 ROW5L, q3, #11
michael@0 313 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
michael@0 314 vshl.s32 q5, q5, #13
michael@0 315 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
michael@0 316 orr r0, r0, r4
michael@0 317 vadd.s32 q2, q5, q6
michael@0 318 orrs r0, r0, r5
michael@0 319 vsub.s32 q1, q5, q6
michael@0 320 vadd.s32 q6, q2, q7
michael@0 321 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
michael@0 322 vsub.s32 q2, q2, q7
michael@0 323 vadd.s32 q5, q1, q4
michael@0 324 orr r0, r4, r5
michael@0 325 vsub.s32 q3, q1, q4
michael@0 326 pop {r4, r5}
michael@0 327 vrshrn.s32 ROW7L, q2, #11
michael@0 328 vrshrn.s32 ROW3L, q5, #11
michael@0 329 vrshrn.s32 ROW0L, q6, #11
michael@0 330 vrshrn.s32 ROW4L, q3, #11
michael@0 331
michael@0 332 beq 3f /* Go to do some special handling for the sparse right 4x8 half */
michael@0 333
michael@0 334 /* 1-D IDCT, pass 1, right 4x8 half */
michael@0 335 vld1.s16 {d2}, [ip, :64] /* reload constants */
michael@0 336 vadd.s16 d10, ROW7R, ROW3R
michael@0 337 vadd.s16 d8, ROW5R, ROW1R
michael@0 338 /* Transpose left 4x8 half */
michael@0 339 vtrn.16 ROW6L, ROW7L
michael@0 340 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
michael@0 341 vmlal.s16 q6, d8, XFIX_1_175875602
michael@0 342 vtrn.16 ROW2L, ROW3L
michael@0 343 vmull.s16 q7, d10, XFIX_1_175875602
michael@0 344 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
michael@0 345 vtrn.16 ROW0L, ROW1L
michael@0 346 vsubl.s16 q3, ROW0R, ROW4R
michael@0 347 vmull.s16 q2, ROW2R, XFIX_0_541196100
michael@0 348 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
michael@0 349 vtrn.16 ROW4L, ROW5L
michael@0 350 vmov q4, q6
michael@0 351 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
michael@0 352 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
michael@0 353 vtrn.32 ROW1L, ROW3L
michael@0 354 vshl.s32 q3, q3, #13
michael@0 355 vmlsl.s16 q4, ROW1R, XFIX_0_899976223
michael@0 356 vtrn.32 ROW4L, ROW6L
michael@0 357 vadd.s32 q1, q3, q2
michael@0 358 vmov q5, q7
michael@0 359 vadd.s32 q1, q1, q6
michael@0 360 vtrn.32 ROW0L, ROW2L
michael@0 361 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
michael@0 362 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
michael@0 363 vrshrn.s32 ROW1R, q1, #11
michael@0 364 vtrn.32 ROW5L, ROW7L
michael@0 365 vsub.s32 q1, q1, q6
michael@0 366 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
michael@0 367 vmlsl.s16 q5, ROW3R, XFIX_2_562915447
michael@0 368 vsub.s32 q1, q1, q6
michael@0 369 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
michael@0 370 vmlal.s16 q6, ROW6R, XFIX_0_541196100
michael@0 371 vsub.s32 q3, q3, q2
michael@0 372 vrshrn.s32 ROW6R, q1, #11
michael@0 373 vadd.s32 q1, q3, q5
michael@0 374 vsub.s32 q3, q3, q5
michael@0 375 vaddl.s16 q5, ROW0R, ROW4R
michael@0 376 vrshrn.s32 ROW2R, q1, #11
michael@0 377 vrshrn.s32 ROW5R, q3, #11
michael@0 378 vshl.s32 q5, q5, #13
michael@0 379 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
michael@0 380 vadd.s32 q2, q5, q6
michael@0 381 vsub.s32 q1, q5, q6
michael@0 382 vadd.s32 q6, q2, q7
michael@0 383 vsub.s32 q2, q2, q7
michael@0 384 vadd.s32 q5, q1, q4
michael@0 385 vsub.s32 q3, q1, q4
michael@0 386 vrshrn.s32 ROW7R, q2, #11
michael@0 387 vrshrn.s32 ROW3R, q5, #11
michael@0 388 vrshrn.s32 ROW0R, q6, #11
michael@0 389 vrshrn.s32 ROW4R, q3, #11
michael@0 390 /* Transpose right 4x8 half */
michael@0 391 vtrn.16 ROW6R, ROW7R
michael@0 392 vtrn.16 ROW2R, ROW3R
michael@0 393 vtrn.16 ROW0R, ROW1R
michael@0 394 vtrn.16 ROW4R, ROW5R
michael@0 395 vtrn.32 ROW1R, ROW3R
michael@0 396 vtrn.32 ROW4R, ROW6R
michael@0 397 vtrn.32 ROW0R, ROW2R
michael@0 398 vtrn.32 ROW5R, ROW7R
michael@0 399
michael@0 400 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
michael@0 401 vld1.s16 {d2}, [ip, :64] /* reload constants */
michael@0 402 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
michael@0 403 vmlal.s16 q6, ROW1L, XFIX_1_175875602
michael@0 404 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
michael@0 405 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
michael@0 406 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
michael@0 407 vmlal.s16 q7, ROW3L, XFIX_1_175875602
michael@0 408 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
michael@0 409 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
michael@0 410 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
michael@0 411 vmull.s16 q2, ROW2L, XFIX_0_541196100
michael@0 412 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
michael@0 413 vmov q4, q6
michael@0 414 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
michael@0 415 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
michael@0 416 vshl.s32 q3, q3, #13
michael@0 417 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
michael@0 418 vadd.s32 q1, q3, q2
michael@0 419 vmov q5, q7
michael@0 420 vadd.s32 q1, q1, q6
michael@0 421 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
michael@0 422 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
michael@0 423 vshrn.s32 ROW1L, q1, #16
michael@0 424 vsub.s32 q1, q1, q6
michael@0 425 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
michael@0 426 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
michael@0 427 vsub.s32 q1, q1, q6
michael@0 428 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
michael@0 429 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
michael@0 430 vsub.s32 q3, q3, q2
michael@0 431 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
michael@0 432 vadd.s32 q1, q3, q5
michael@0 433 vsub.s32 q3, q3, q5
michael@0 434 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
michael@0 435 vshrn.s32 ROW2L, q1, #16
michael@0 436 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
michael@0 437 vshl.s32 q5, q5, #13
michael@0 438 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
michael@0 439 vadd.s32 q2, q5, q6
michael@0 440 vsub.s32 q1, q5, q6
michael@0 441 vadd.s32 q6, q2, q7
michael@0 442 vsub.s32 q2, q2, q7
michael@0 443 vadd.s32 q5, q1, q4
michael@0 444 vsub.s32 q3, q1, q4
michael@0 445 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
michael@0 446 vshrn.s32 ROW3L, q5, #16
michael@0 447 vshrn.s32 ROW0L, q6, #16
michael@0 448 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
michael@0 449 /* 1-D IDCT, pass 2, right 4x8 half */
michael@0 450 vld1.s16 {d2}, [ip, :64] /* reload constants */
michael@0 451 vmull.s16 q6, ROW5R, XFIX_1_175875602
michael@0 452 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
michael@0 453 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
michael@0 454 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
michael@0 455 vmull.s16 q7, ROW7R, XFIX_1_175875602
michael@0 456 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
michael@0 457 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
michael@0 458 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
michael@0 459 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
michael@0 460 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
michael@0 461 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
michael@0 462 vmov q4, q6
michael@0 463 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
michael@0 464 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
michael@0 465 vshl.s32 q3, q3, #13
michael@0 466 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
michael@0 467 vadd.s32 q1, q3, q2
michael@0 468 vmov q5, q7
michael@0 469 vadd.s32 q1, q1, q6
michael@0 470 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
michael@0 471 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
michael@0 472 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
michael@0 473 vsub.s32 q1, q1, q6
michael@0 474 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
michael@0 475 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
michael@0 476 vsub.s32 q1, q1, q6
michael@0 477 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
michael@0 478 vmlal.s16 q6, ROW6R, XFIX_0_541196100
michael@0 479 vsub.s32 q3, q3, q2
michael@0 480 vshrn.s32 ROW6R, q1, #16
michael@0 481 vadd.s32 q1, q3, q5
michael@0 482 vsub.s32 q3, q3, q5
michael@0 483 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
michael@0 484 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
michael@0 485 vshrn.s32 ROW5R, q3, #16
michael@0 486 vshl.s32 q5, q5, #13
michael@0 487 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
michael@0 488 vadd.s32 q2, q5, q6
michael@0 489 vsub.s32 q1, q5, q6
michael@0 490 vadd.s32 q6, q2, q7
michael@0 491 vsub.s32 q2, q2, q7
michael@0 492 vadd.s32 q5, q1, q4
michael@0 493 vsub.s32 q3, q1, q4
michael@0 494 vshrn.s32 ROW7R, q2, #16
michael@0 495 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
michael@0 496 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
michael@0 497 vshrn.s32 ROW4R, q3, #16
michael@0 498
michael@0 499 2: /* Descale to 8-bit and range limit */
michael@0 500 vqrshrn.s16 d16, q8, #2
michael@0 501 vqrshrn.s16 d17, q9, #2
michael@0 502 vqrshrn.s16 d18, q10, #2
michael@0 503 vqrshrn.s16 d19, q11, #2
michael@0 504 vpop {d8-d15} /* restore NEON registers */
michael@0 505 vqrshrn.s16 d20, q12, #2
michael@0 506 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
michael@0 507 vtrn.16 q8, q9
michael@0 508 vqrshrn.s16 d21, q13, #2
michael@0 509 vqrshrn.s16 d22, q14, #2
michael@0 510 vmov.u8 q0, #(CENTERJSAMPLE)
michael@0 511 vqrshrn.s16 d23, q15, #2
michael@0 512 vtrn.8 d16, d17
michael@0 513 vtrn.8 d18, d19
michael@0 514 vadd.u8 q8, q8, q0
michael@0 515 vadd.u8 q9, q9, q0
michael@0 516 vtrn.16 q10, q11
michael@0 517 /* Store results to the output buffer */
michael@0 518 ldmia OUTPUT_BUF!, {TMP1, TMP2}
michael@0 519 add TMP1, TMP1, OUTPUT_COL
michael@0 520 add TMP2, TMP2, OUTPUT_COL
michael@0 521 vst1.8 {d16}, [TMP1]
michael@0 522 vtrn.8 d20, d21
michael@0 523 vst1.8 {d17}, [TMP2]
michael@0 524 ldmia OUTPUT_BUF!, {TMP1, TMP2}
michael@0 525 add TMP1, TMP1, OUTPUT_COL
michael@0 526 add TMP2, TMP2, OUTPUT_COL
michael@0 527 vst1.8 {d18}, [TMP1]
michael@0 528 vadd.u8 q10, q10, q0
michael@0 529 vst1.8 {d19}, [TMP2]
michael@0 530 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
michael@0 531 add TMP1, TMP1, OUTPUT_COL
michael@0 532 add TMP2, TMP2, OUTPUT_COL
michael@0 533 add TMP3, TMP3, OUTPUT_COL
michael@0 534 add TMP4, TMP4, OUTPUT_COL
michael@0 535 vtrn.8 d22, d23
michael@0 536 vst1.8 {d20}, [TMP1]
michael@0 537 vadd.u8 q11, q11, q0
michael@0 538 vst1.8 {d21}, [TMP2]
michael@0 539 vst1.8 {d22}, [TMP3]
michael@0 540 vst1.8 {d23}, [TMP4]
michael@0 541 bx lr
michael@0 542
michael@0 543 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
michael@0 544
michael@0 545 /* Transpose left 4x8 half */
michael@0 546 vtrn.16 ROW6L, ROW7L
michael@0 547 vtrn.16 ROW2L, ROW3L
michael@0 548 vtrn.16 ROW0L, ROW1L
michael@0 549 vtrn.16 ROW4L, ROW5L
michael@0 550 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
michael@0 551 vtrn.32 ROW1L, ROW3L
michael@0 552 vtrn.32 ROW4L, ROW6L
michael@0 553 vtrn.32 ROW0L, ROW2L
michael@0 554 vtrn.32 ROW5L, ROW7L
michael@0 555
michael@0 556 cmp r0, #0
michael@0 557 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
michael@0 558
michael@0 559 /* Only row 0 is non-zero for the right 4x8 half */
michael@0 560 vdup.s16 ROW1R, ROW0R[1]
michael@0 561 vdup.s16 ROW2R, ROW0R[2]
michael@0 562 vdup.s16 ROW3R, ROW0R[3]
michael@0 563 vdup.s16 ROW4R, ROW0R[0]
michael@0 564 vdup.s16 ROW5R, ROW0R[1]
michael@0 565 vdup.s16 ROW6R, ROW0R[2]
michael@0 566 vdup.s16 ROW7R, ROW0R[3]
michael@0 567 vdup.s16 ROW0R, ROW0R[0]
michael@0 568 b 1b /* Go to 'normal' second pass */
michael@0 569
michael@0 570 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
michael@0 571 vld1.s16 {d2}, [ip, :64] /* reload constants */
michael@0 572 vmull.s16 q6, ROW1L, XFIX_1_175875602
michael@0 573 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
michael@0 574 vmull.s16 q7, ROW3L, XFIX_1_175875602
michael@0 575 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
michael@0 576 vmull.s16 q2, ROW2L, XFIX_0_541196100
michael@0 577 vshll.s16 q3, ROW0L, #13
michael@0 578 vmov q4, q6
michael@0 579 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
michael@0 580 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
michael@0 581 vadd.s32 q1, q3, q2
michael@0 582 vmov q5, q7
michael@0 583 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
michael@0 584 vadd.s32 q1, q1, q6
michael@0 585 vadd.s32 q6, q6, q6
michael@0 586 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
michael@0 587 vshrn.s32 ROW1L, q1, #16
michael@0 588 vsub.s32 q1, q1, q6
michael@0 589 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
michael@0 590 vsub.s32 q3, q3, q2
michael@0 591 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
michael@0 592 vadd.s32 q1, q3, q5
michael@0 593 vsub.s32 q3, q3, q5
michael@0 594 vshll.s16 q5, ROW0L, #13
michael@0 595 vshrn.s32 ROW2L, q1, #16
michael@0 596 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
michael@0 597 vadd.s32 q2, q5, q6
michael@0 598 vsub.s32 q1, q5, q6
michael@0 599 vadd.s32 q6, q2, q7
michael@0 600 vsub.s32 q2, q2, q7
michael@0 601 vadd.s32 q5, q1, q4
michael@0 602 vsub.s32 q3, q1, q4
michael@0 603 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
michael@0 604 vshrn.s32 ROW3L, q5, #16
michael@0 605 vshrn.s32 ROW0L, q6, #16
michael@0 606 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
michael@0 607 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
michael@0 608 vld1.s16 {d2}, [ip, :64] /* reload constants */
michael@0 609 vmull.s16 q6, ROW5L, XFIX_1_175875602
michael@0 610 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
michael@0 611 vmull.s16 q7, ROW7L, XFIX_1_175875602
michael@0 612 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
michael@0 613 vmull.s16 q2, ROW6L, XFIX_0_541196100
michael@0 614 vshll.s16 q3, ROW4L, #13
michael@0 615 vmov q4, q6
michael@0 616 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
michael@0 617 vmlsl.s16 q4, ROW5L, XFIX_0_899976223
michael@0 618 vadd.s32 q1, q3, q2
michael@0 619 vmov q5, q7
michael@0 620 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
michael@0 621 vadd.s32 q1, q1, q6
michael@0 622 vadd.s32 q6, q6, q6
michael@0 623 vmlsl.s16 q5, ROW7L, XFIX_2_562915447
michael@0 624 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
michael@0 625 vsub.s32 q1, q1, q6
michael@0 626 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
michael@0 627 vsub.s32 q3, q3, q2
michael@0 628 vshrn.s32 ROW6R, q1, #16
michael@0 629 vadd.s32 q1, q3, q5
michael@0 630 vsub.s32 q3, q3, q5
michael@0 631 vshll.s16 q5, ROW4L, #13
michael@0 632 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
michael@0 633 vshrn.s32 ROW5R, q3, #16
michael@0 634 vadd.s32 q2, q5, q6
michael@0 635 vsub.s32 q1, q5, q6
michael@0 636 vadd.s32 q6, q2, q7
michael@0 637 vsub.s32 q2, q2, q7
michael@0 638 vadd.s32 q5, q1, q4
michael@0 639 vsub.s32 q3, q1, q4
michael@0 640 vshrn.s32 ROW7R, q2, #16
michael@0 641 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
michael@0 642 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
michael@0 643 vshrn.s32 ROW4R, q3, #16
michael@0 644 b 2b /* Go to epilogue */
michael@0 645
michael@0 646 .unreq DCT_TABLE
michael@0 647 .unreq COEF_BLOCK
michael@0 648 .unreq OUTPUT_BUF
michael@0 649 .unreq OUTPUT_COL
michael@0 650 .unreq TMP1
michael@0 651 .unreq TMP2
michael@0 652 .unreq TMP3
michael@0 653 .unreq TMP4
michael@0 654
michael@0 655 .unreq ROW0L
michael@0 656 .unreq ROW0R
michael@0 657 .unreq ROW1L
michael@0 658 .unreq ROW1R
michael@0 659 .unreq ROW2L
michael@0 660 .unreq ROW2R
michael@0 661 .unreq ROW3L
michael@0 662 .unreq ROW3R
michael@0 663 .unreq ROW4L
michael@0 664 .unreq ROW4R
michael@0 665 .unreq ROW5L
michael@0 666 .unreq ROW5R
michael@0 667 .unreq ROW6L
michael@0 668 .unreq ROW6R
michael@0 669 .unreq ROW7L
michael@0 670 .unreq ROW7R
michael@0 671 .endfunc
michael@0 672
michael@0 673 /*****************************************************************************/
michael@0 674
michael@0 675 /*
michael@0 676 * jsimd_idct_ifast_neon
michael@0 677 *
michael@0 678 * This function contains a fast, but less accurate, integer implementation of
michael@0 679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
michael@0 680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
michael@0 681 * function from jidctfst.c.
michael@0 682 *
michael@0 683 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
michael@0 684 * In the ARM NEON case, however, some extra additions are required because the
michael@0 685 * VQDMULH instruction can't handle constants larger than 1. So expressions
michael@0 686 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
michael@0 687 * which introduces an extra addition. Overall, there are 6 extra additions
michael@0 688 * per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
michael@0 689 */
michael@0 690
michael@0 691 #define XFIX_1_082392200 d0[0]
michael@0 692 #define XFIX_1_414213562 d0[1]
michael@0 693 #define XFIX_1_847759065 d0[2]
michael@0 694 #define XFIX_2_613125930 d0[3]
michael@0 695
michael@0 696 .balign 16
michael@0 697 jsimd_idct_ifast_neon_consts:
michael@0 698 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
michael@0 699 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
michael@0 700 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
michael@0 701 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
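/* Encoding of the table above: jidctfst.c keeps its AAN scale factors with
 * 8 fractional bits (e.g. FIX(1.082392200) = 277), and VQDMULH.S16 computes
 * (2 * a * b) >> 16, i.e. it multiplies by a Q15 fraction.  Storing
 * (277 * 128 - 256 * 128) therefore puts just the fractional part of the
 * constant into Q15 form, so that, for instance,
 *
 *   x * 1.082392200  ~=  x + vqdmulh(x, XFIX_1_082392200)
 *
 * For the one constant larger than 2 the table subtracts 512 * 128 instead,
 * which is why the code below adds "x + x" to the vqdmulh result for
 * XFIX_2_613125930.
 */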
michael@0 702
michael@0 703 asm_function jsimd_idct_ifast_neon
michael@0 704
michael@0 705 DCT_TABLE .req r0
michael@0 706 COEF_BLOCK .req r1
michael@0 707 OUTPUT_BUF .req r2
michael@0 708 OUTPUT_COL .req r3
michael@0 709 TMP1 .req r0
michael@0 710 TMP2 .req r1
michael@0 711 TMP3 .req r2
michael@0 712 TMP4 .req ip
michael@0 713
michael@0 714 /* Load and dequantize coefficients into NEON registers
michael@0 715 * with the following allocation:
michael@0 716 * 0 1 2 3 | 4 5 6 7
michael@0 717 * ---------+--------
michael@0 718 * 0 | d16 | d17 ( q8 )
michael@0 719 * 1 | d18 | d19 ( q9 )
michael@0 720 * 2 | d20 | d21 ( q10 )
michael@0 721 * 3 | d22 | d23 ( q11 )
michael@0 722 * 4 | d24 | d25 ( q12 )
michael@0 723 * 5 | d26 | d27 ( q13 )
michael@0 724 * 6 | d28 | d29 ( q14 )
michael@0 725 * 7 | d30 | d31 ( q15 )
michael@0 726 */
michael@0 727 adr ip, jsimd_idct_ifast_neon_consts
michael@0 728 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
michael@0 729 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
michael@0 730 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
michael@0 731 vmul.s16 q8, q8, q0
michael@0 732 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
michael@0 733 vmul.s16 q9, q9, q1
michael@0 734 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
michael@0 735 vmul.s16 q10, q10, q2
michael@0 736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
michael@0 737 vmul.s16 q11, q11, q3
michael@0 738 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
michael@0 739 vmul.s16 q12, q12, q0
michael@0 740 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
michael@0 741 vmul.s16 q14, q14, q2
michael@0 742 vmul.s16 q13, q13, q1
michael@0 743 vld1.16 {d0}, [ip, :64] /* load constants */
michael@0 744 vmul.s16 q15, q15, q3
michael@0 745 vpush {d8-d13} /* save NEON registers */
michael@0 746 /* 1-D IDCT, pass 1 */
michael@0 747 vsub.s16 q2, q10, q14
michael@0 748 vadd.s16 q14, q10, q14
michael@0 749 vsub.s16 q1, q11, q13
michael@0 750 vadd.s16 q13, q11, q13
michael@0 751 vsub.s16 q5, q9, q15
michael@0 752 vadd.s16 q15, q9, q15
michael@0 753 vqdmulh.s16 q4, q2, XFIX_1_414213562
michael@0 754 vqdmulh.s16 q6, q1, XFIX_2_613125930
michael@0 755 vadd.s16 q3, q1, q1
michael@0 756 vsub.s16 q1, q5, q1
michael@0 757 vadd.s16 q10, q2, q4
michael@0 758 vqdmulh.s16 q4, q1, XFIX_1_847759065
michael@0 759 vsub.s16 q2, q15, q13
michael@0 760 vadd.s16 q3, q3, q6
michael@0 761 vqdmulh.s16 q6, q2, XFIX_1_414213562
michael@0 762 vadd.s16 q1, q1, q4
michael@0 763 vqdmulh.s16 q4, q5, XFIX_1_082392200
michael@0 764 vsub.s16 q10, q10, q14
michael@0 765 vadd.s16 q2, q2, q6
michael@0 766 vsub.s16 q6, q8, q12
michael@0 767 vadd.s16 q12, q8, q12
michael@0 768 vadd.s16 q9, q5, q4
michael@0 769 vadd.s16 q5, q6, q10
michael@0 770 vsub.s16 q10, q6, q10
michael@0 771 vadd.s16 q6, q15, q13
michael@0 772 vadd.s16 q8, q12, q14
michael@0 773 vsub.s16 q3, q6, q3
michael@0 774 vsub.s16 q12, q12, q14
michael@0 775 vsub.s16 q3, q3, q1
michael@0 776 vsub.s16 q1, q9, q1
michael@0 777 vadd.s16 q2, q3, q2
michael@0 778 vsub.s16 q15, q8, q6
michael@0 779 vadd.s16 q1, q1, q2
michael@0 780 vadd.s16 q8, q8, q6
michael@0 781 vadd.s16 q14, q5, q3
michael@0 782 vsub.s16 q9, q5, q3
michael@0 783 vsub.s16 q13, q10, q2
michael@0 784 vadd.s16 q10, q10, q2
michael@0 785 /* Transpose */
michael@0 786 vtrn.16 q8, q9
michael@0 787 vsub.s16 q11, q12, q1
michael@0 788 vtrn.16 q14, q15
michael@0 789 vadd.s16 q12, q12, q1
michael@0 790 vtrn.16 q10, q11
michael@0 791 vtrn.16 q12, q13
michael@0 792 vtrn.32 q9, q11
michael@0 793 vtrn.32 q12, q14
michael@0 794 vtrn.32 q8, q10
michael@0 795 vtrn.32 q13, q15
michael@0 796 vswp d28, d21
michael@0 797 vswp d26, d19
michael@0 798 /* 1-D IDCT, pass 2 */
michael@0 799 vsub.s16 q2, q10, q14
michael@0 800 vswp d30, d23
michael@0 801 vadd.s16 q14, q10, q14
michael@0 802 vswp d24, d17
michael@0 803 vsub.s16 q1, q11, q13
michael@0 804 vadd.s16 q13, q11, q13
michael@0 805 vsub.s16 q5, q9, q15
michael@0 806 vadd.s16 q15, q9, q15
michael@0 807 vqdmulh.s16 q4, q2, XFIX_1_414213562
michael@0 808 vqdmulh.s16 q6, q1, XFIX_2_613125930
michael@0 809 vadd.s16 q3, q1, q1
michael@0 810 vsub.s16 q1, q5, q1
michael@0 811 vadd.s16 q10, q2, q4
michael@0 812 vqdmulh.s16 q4, q1, XFIX_1_847759065
michael@0 813 vsub.s16 q2, q15, q13
michael@0 814 vadd.s16 q3, q3, q6
michael@0 815 vqdmulh.s16 q6, q2, XFIX_1_414213562
michael@0 816 vadd.s16 q1, q1, q4
michael@0 817 vqdmulh.s16 q4, q5, XFIX_1_082392200
michael@0 818 vsub.s16 q10, q10, q14
michael@0 819 vadd.s16 q2, q2, q6
michael@0 820 vsub.s16 q6, q8, q12
michael@0 821 vadd.s16 q12, q8, q12
michael@0 822 vadd.s16 q9, q5, q4
michael@0 823 vadd.s16 q5, q6, q10
michael@0 824 vsub.s16 q10, q6, q10
michael@0 825 vadd.s16 q6, q15, q13
michael@0 826 vadd.s16 q8, q12, q14
michael@0 827 vsub.s16 q3, q6, q3
michael@0 828 vsub.s16 q12, q12, q14
michael@0 829 vsub.s16 q3, q3, q1
michael@0 830 vsub.s16 q1, q9, q1
michael@0 831 vadd.s16 q2, q3, q2
michael@0 832 vsub.s16 q15, q8, q6
michael@0 833 vadd.s16 q1, q1, q2
michael@0 834 vadd.s16 q8, q8, q6
michael@0 835 vadd.s16 q14, q5, q3
michael@0 836 vsub.s16 q9, q5, q3
michael@0 837 vsub.s16 q13, q10, q2
michael@0 838 vpop {d8-d13} /* restore NEON registers */
michael@0 839 vadd.s16 q10, q10, q2
michael@0 840 vsub.s16 q11, q12, q1
michael@0 841 vadd.s16 q12, q12, q1
michael@0 842 /* Descale to 8-bit and range limit */
michael@0 843 vmov.u8 q0, #0x80
michael@0 844 vqshrn.s16 d16, q8, #5
michael@0 845 vqshrn.s16 d17, q9, #5
michael@0 846 vqshrn.s16 d18, q10, #5
michael@0 847 vqshrn.s16 d19, q11, #5
michael@0 848 vqshrn.s16 d20, q12, #5
michael@0 849 vqshrn.s16 d21, q13, #5
michael@0 850 vqshrn.s16 d22, q14, #5
michael@0 851 vqshrn.s16 d23, q15, #5
michael@0 852 vadd.u8 q8, q8, q0
michael@0 853 vadd.u8 q9, q9, q0
michael@0 854 vadd.u8 q10, q10, q0
michael@0 855 vadd.u8 q11, q11, q0
michael@0 856 /* Transpose the final 8-bit samples */
michael@0 857 vtrn.16 q8, q9
michael@0 858 vtrn.16 q10, q11
michael@0 859 vtrn.32 q8, q10
michael@0 860 vtrn.32 q9, q11
michael@0 861 vtrn.8 d16, d17
michael@0 862 vtrn.8 d18, d19
michael@0 863 /* Store results to the output buffer */
michael@0 864 ldmia OUTPUT_BUF!, {TMP1, TMP2}
michael@0 865 add TMP1, TMP1, OUTPUT_COL
michael@0 866 add TMP2, TMP2, OUTPUT_COL
michael@0 867 vst1.8 {d16}, [TMP1]
michael@0 868 vst1.8 {d17}, [TMP2]
michael@0 869 ldmia OUTPUT_BUF!, {TMP1, TMP2}
michael@0 870 add TMP1, TMP1, OUTPUT_COL
michael@0 871 add TMP2, TMP2, OUTPUT_COL
michael@0 872 vst1.8 {d18}, [TMP1]
michael@0 873 vtrn.8 d20, d21
michael@0 874 vst1.8 {d19}, [TMP2]
michael@0 875 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
michael@0 876 add TMP1, TMP1, OUTPUT_COL
michael@0 877 add TMP2, TMP2, OUTPUT_COL
michael@0 878 add TMP3, TMP3, OUTPUT_COL
michael@0 879 add TMP4, TMP4, OUTPUT_COL
michael@0 880 vst1.8 {d20}, [TMP1]
michael@0 881 vtrn.8 d22, d23
michael@0 882 vst1.8 {d21}, [TMP2]
michael@0 883 vst1.8 {d22}, [TMP3]
michael@0 884 vst1.8 {d23}, [TMP4]
michael@0 885 bx lr
michael@0 886
michael@0 887 .unreq DCT_TABLE
michael@0 888 .unreq COEF_BLOCK
michael@0 889 .unreq OUTPUT_BUF
michael@0 890 .unreq OUTPUT_COL
michael@0 891 .unreq TMP1
michael@0 892 .unreq TMP2
michael@0 893 .unreq TMP3
michael@0 894 .unreq TMP4
michael@0 895 .endfunc
michael@0 896
michael@0 897 /*****************************************************************************/
michael@0 898
michael@0 899 /*
michael@0 900 * jsimd_idct_4x4_neon
michael@0 901 *
michael@0 902 * This function contains inverse-DCT code for producing reduced-size
michael@0 903 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
michael@0 904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
michael@0 905 * function from jpeg-6b (jidctred.c).
michael@0 906 *
michael@0 907 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
michael@0 908 * requires far fewer arithmetic operations and hence should be faster.
michael@0 909 * The primary purpose of this particular NEON-optimized function is
michael@0 910 * bit-exact compatibility with jpeg-6b.
michael@0 911 *
michael@0 912 * TODO: slightly better instruction scheduling could be achieved by expanding
michael@0 913 * the idct_helper/transpose_4x4 macros and reordering instructions,
michael@0 914 * but readability would suffer somewhat.
michael@0 915 */
michael@0 916
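/* Note: coefficient row 4 never contributes to the reduced 4x4 output (it is
 * left unloaded in the register allocation below), and the two passes descale
 * by 12 and 19 bits respectively, i.e. CONST_BITS - PASS1_BITS + 1 and
 * CONST_BITS + PASS1_BITS + 3 + 1 with PASS1_BITS = 2, matching the DESCALE
 * amounts used by 'jpeg_idct_4x4' in jidctred.c.
 */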
michael@0 917 #define CONST_BITS 13
michael@0 918
michael@0 919 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
michael@0 920 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
michael@0 921 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
michael@0 922 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
michael@0 923 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
michael@0 924 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
michael@0 925 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
michael@0 926 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
michael@0 927 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
michael@0 928 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
michael@0 929 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
michael@0 930 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
michael@0 931 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
michael@0 932 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
michael@0 933
michael@0 934 .balign 16
michael@0 935 jsimd_idct_4x4_neon_consts:
michael@0 936 .short FIX_1_847759065 /* d0[0] */
michael@0 937 .short -FIX_0_765366865 /* d0[1] */
michael@0 938 .short -FIX_0_211164243 /* d0[2] */
michael@0 939 .short FIX_1_451774981 /* d0[3] */
michael@0 940 .short -FIX_2_172734803 /* d1[0] */
michael@0 941 .short FIX_1_061594337 /* d1[1] */
michael@0 942 .short -FIX_0_509795579 /* d1[2] */
michael@0 943 .short -FIX_0_601344887 /* d1[3] */
michael@0 944 .short FIX_0_899976223 /* d2[0] */
michael@0 945 .short FIX_2_562915447 /* d2[1] */
michael@0 946 .short 1 << (CONST_BITS+1) /* d2[2] */
michael@0 947 .short 0 /* d2[3] */
michael@0 948
michael@0 949 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
michael@0 950 vmull.s16 q14, \x4, d2[2]
michael@0 951 vmlal.s16 q14, \x8, d0[0]
michael@0 952 vmlal.s16 q14, \x14, d0[1]
michael@0 953
michael@0 954 vmull.s16 q13, \x16, d1[2]
michael@0 955 vmlal.s16 q13, \x12, d1[3]
michael@0 956 vmlal.s16 q13, \x10, d2[0]
michael@0 957 vmlal.s16 q13, \x6, d2[1]
michael@0 958
michael@0 959 vmull.s16 q15, \x4, d2[2]
michael@0 960 vmlsl.s16 q15, \x8, d0[0]
michael@0 961 vmlsl.s16 q15, \x14, d0[1]
michael@0 962
michael@0 963 vmull.s16 q12, \x16, d0[2]
michael@0 964 vmlal.s16 q12, \x12, d0[3]
michael@0 965 vmlal.s16 q12, \x10, d1[0]
michael@0 966 vmlal.s16 q12, \x6, d1[1]
michael@0 967
michael@0 968 vadd.s32 q10, q14, q13
michael@0 969 vsub.s32 q14, q14, q13
michael@0 970
michael@0 971 .if \shift > 16
michael@0 972 vrshr.s32 q10, q10, #\shift
michael@0 973 vrshr.s32 q14, q14, #\shift
michael@0 974 vmovn.s32 \y26, q10
michael@0 975 vmovn.s32 \y29, q14
michael@0 976 .else
michael@0 977 vrshrn.s32 \y26, q10, #\shift
michael@0 978 vrshrn.s32 \y29, q14, #\shift
michael@0 979 .endif
michael@0 980
michael@0 981 vadd.s32 q10, q15, q12
michael@0 982 vsub.s32 q15, q15, q12
michael@0 983
michael@0 984 .if \shift > 16
michael@0 985 vrshr.s32 q10, q10, #\shift
michael@0 986 vrshr.s32 q15, q15, #\shift
michael@0 987 vmovn.s32 \y27, q10
michael@0 988 vmovn.s32 \y28, q15
michael@0 989 .else
michael@0 990 vrshrn.s32 \y27, q10, #\shift
michael@0 991 vrshrn.s32 \y28, q15, #\shift
michael@0 992 .endif
michael@0 993
michael@0 994 .endm
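/* The ".if \shift > 16" split in the macro above exists because VRSHRN.I32
 * only accepts shift amounts of 1..16 when narrowing 32-bit lanes to 16 bits:
 * the wider descale used in pass 2 (shift 19) therefore has to be done as a
 * full-width VRSHR followed by a separate VMOVN, while pass 1 (shift 12) can
 * use the fused rounding-narrowing form directly.
 */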
michael@0 995
michael@0 996 asm_function jsimd_idct_4x4_neon
michael@0 997
michael@0 998 DCT_TABLE .req r0
michael@0 999 COEF_BLOCK .req r1
michael@0 1000 OUTPUT_BUF .req r2
michael@0 1001 OUTPUT_COL .req r3
michael@0 1002 TMP1 .req r0
michael@0 1003 TMP2 .req r1
michael@0 1004 TMP3 .req r2
michael@0 1005 TMP4 .req ip
michael@0 1006
michael@0 1007 vpush {d8-d15}
michael@0 1008
michael@0 1009 /* Load constants (d3 is just used for padding) */
michael@0 1010 adr TMP4, jsimd_idct_4x4_neon_consts
michael@0 1011 vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
michael@0 1012
michael@0 1013 /* Load all COEF_BLOCK into NEON registers with the following allocation:
michael@0 1014 * 0 1 2 3 | 4 5 6 7
michael@0 1015 * ---------+--------
michael@0 1016 * 0 | d4 | d5
michael@0 1017 * 1 | d6 | d7
michael@0 1018 * 2 | d8 | d9
michael@0 1019 * 3 | d10 | d11
michael@0 1020 * 4 | - | -
michael@0 1021 * 5 | d12 | d13
michael@0 1022 * 6 | d14 | d15
michael@0 1023 * 7 | d16 | d17
michael@0 1024 */
michael@0 1025 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
michael@0 1026 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
michael@0 1027 add COEF_BLOCK, COEF_BLOCK, #16
michael@0 1028 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
michael@0 1029 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
michael@0 1030 /* dequantize */
michael@0 1031 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
michael@0 1032 vmul.s16 q2, q2, q9
michael@0 1033 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
michael@0 1034 vmul.s16 q3, q3, q10
michael@0 1035 vmul.s16 q4, q4, q11
michael@0 1036 add DCT_TABLE, DCT_TABLE, #16
michael@0 1037 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
michael@0 1038 vmul.s16 q5, q5, q12
michael@0 1039 vmul.s16 q6, q6, q13
michael@0 1040 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
michael@0 1041 vmul.s16 q7, q7, q14
michael@0 1042 vmul.s16 q8, q8, q15
michael@0 1043
michael@0 1044 /* Pass 1 */
michael@0 1045 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
michael@0 1046 transpose_4x4 d4, d6, d8, d10
michael@0 1047 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
michael@0 1048 transpose_4x4 d5, d7, d9, d11
michael@0 1049
michael@0 1050 /* Pass 2 */
michael@0 1051 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
michael@0 1052 transpose_4x4 d26, d27, d28, d29
michael@0 1053
michael@0 1054 /* Range limit */
michael@0 1055 vmov.u16 q15, #0x80
michael@0 1056 vadd.s16 q13, q13, q15
michael@0 1057 vadd.s16 q14, q14, q15
michael@0 1058 vqmovun.s16 d26, q13
michael@0 1059 vqmovun.s16 d27, q14
michael@0 1060
michael@0 1061 /* Store results to the output buffer */
michael@0 1062 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
michael@0 1063 add TMP1, TMP1, OUTPUT_COL
michael@0 1064 add TMP2, TMP2, OUTPUT_COL
michael@0 1065 add TMP3, TMP3, OUTPUT_COL
michael@0 1066 add TMP4, TMP4, OUTPUT_COL
michael@0 1067
michael@0 1068 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
michael@0 1069 /* We can use far fewer instructions on little-endian systems if the
michael@0 1070 * OS kernel is not configured to trap unaligned memory accesses
michael@0 1071 */
michael@0 1072 vst1.32 {d26[0]}, [TMP1]!
michael@0 1073 vst1.32 {d27[0]}, [TMP3]!
michael@0 1074 vst1.32 {d26[1]}, [TMP2]!
michael@0 1075 vst1.32 {d27[1]}, [TMP4]!
michael@0 1076 #else
michael@0 1077 vst1.8 {d26[0]}, [TMP1]!
michael@0 1078 vst1.8 {d27[0]}, [TMP3]!
michael@0 1079 vst1.8 {d26[1]}, [TMP1]!
michael@0 1080 vst1.8 {d27[1]}, [TMP3]!
michael@0 1081 vst1.8 {d26[2]}, [TMP1]!
michael@0 1082 vst1.8 {d27[2]}, [TMP3]!
michael@0 1083 vst1.8 {d26[3]}, [TMP1]!
michael@0 1084 vst1.8 {d27[3]}, [TMP3]!
michael@0 1085
michael@0 1086 vst1.8 {d26[4]}, [TMP2]!
michael@0 1087 vst1.8 {d27[4]}, [TMP4]!
michael@0 1088 vst1.8 {d26[5]}, [TMP2]!
michael@0 1089 vst1.8 {d27[5]}, [TMP4]!
michael@0 1090 vst1.8 {d26[6]}, [TMP2]!
michael@0 1091 vst1.8 {d27[6]}, [TMP4]!
michael@0 1092 vst1.8 {d26[7]}, [TMP2]!
michael@0 1093 vst1.8 {d27[7]}, [TMP4]!
michael@0 1094 #endif
michael@0 1095
michael@0 1096 vpop {d8-d15}
michael@0 1097 bx lr
michael@0 1098
michael@0 1099 .unreq DCT_TABLE
michael@0 1100 .unreq COEF_BLOCK
michael@0 1101 .unreq OUTPUT_BUF
michael@0 1102 .unreq OUTPUT_COL
michael@0 1103 .unreq TMP1
michael@0 1104 .unreq TMP2
michael@0 1105 .unreq TMP3
michael@0 1106 .unreq TMP4
michael@0 1107 .endfunc
michael@0 1108
michael@0 1109 .purgem idct_helper
michael@0 1110
michael@0 1111 /*****************************************************************************/
michael@0 1112
michael@0 1113 /*
michael@0 1114 * jsimd_idct_2x2_neon
michael@0 1115 *
michael@0 1116 * This function contains inverse-DCT code for producing reduced-size
michael@0 1117 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
michael@0 1118 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
michael@0 1119 * function from jpeg-6b (jidctred.c).
michael@0 1120 *
michael@0 1121 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
michael@0 1122 * requires far fewer arithmetic operations and hence should be faster.
michael@0 1123 * The primary purpose of this particular NEON-optimized function is
michael@0 1124 * bit-exact compatibility with jpeg-6b.
michael@0 1125 */
michael@0 1126
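/* Note: only coefficient rows 0, 1, 3, 5 and 7 are used here (rows 2, 4 and 6
 * are left unloaded below), and the two passes descale by 13 and 20 bits,
 * i.e. CONST_BITS - PASS1_BITS + 2 and CONST_BITS + PASS1_BITS + 3 + 2,
 * matching the DESCALE amounts used by 'jpeg_idct_2x2' in jidctred.c.
 */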
michael@0 1127 .balign 8
michael@0 1128 jsimd_idct_2x2_neon_consts:
michael@0 1129 .short -FIX_0_720959822 /* d0[0] */
michael@0 1130 .short FIX_0_850430095 /* d0[1] */
michael@0 1131 .short -FIX_1_272758580 /* d0[2] */
michael@0 1132 .short FIX_3_624509785 /* d0[3] */
michael@0 1133
michael@0 1134 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
michael@0 1135 vshll.s16 q14, \x4, #15
michael@0 1136 vmull.s16 q13, \x6, d0[3]
michael@0 1137 vmlal.s16 q13, \x10, d0[2]
michael@0 1138 vmlal.s16 q13, \x12, d0[1]
michael@0 1139 vmlal.s16 q13, \x16, d0[0]
michael@0 1140
michael@0 1141 vadd.s32 q10, q14, q13
michael@0 1142 vsub.s32 q14, q14, q13
michael@0 1143
michael@0 1144 .if \shift > 16
michael@0 1145 vrshr.s32 q10, q10, #\shift
michael@0 1146 vrshr.s32 q14, q14, #\shift
michael@0 1147 vmovn.s32 \y26, q10
michael@0 1148 vmovn.s32 \y27, q14
michael@0 1149 .else
michael@0 1150 vrshrn.s32 \y26, q10, #\shift
michael@0 1151 vrshrn.s32 \y27, q14, #\shift
michael@0 1152 .endif
michael@0 1153
michael@0 1154 .endm
michael@0 1155
michael@0 1156 asm_function jsimd_idct_2x2_neon
michael@0 1157
michael@0 1158 DCT_TABLE .req r0
michael@0 1159 COEF_BLOCK .req r1
michael@0 1160 OUTPUT_BUF .req r2
michael@0 1161 OUTPUT_COL .req r3
michael@0 1162 TMP1 .req r0
michael@0 1163 TMP2 .req ip
michael@0 1164
michael@0 1165 vpush {d8-d15}
michael@0 1166
michael@0 1167 /* Load constants */
michael@0 1168 adr TMP2, jsimd_idct_2x2_neon_consts
michael@0 1169 vld1.16 {d0}, [TMP2, :64]
michael@0 1170
michael@0 1171 /* Load all COEF_BLOCK into NEON registers with the following allocation:
michael@0 1172 * 0 1 2 3 | 4 5 6 7
michael@0 1173 * ---------+--------
michael@0 1174 * 0 | d4 | d5
michael@0 1175 * 1 | d6 | d7
michael@0 1176 * 2 | - | -
michael@0 1177 * 3 | d10 | d11
michael@0 1178 * 4 | - | -
michael@0 1179 * 5 | d12 | d13
michael@0 1180 * 6 | - | -
michael@0 1181 * 7 | d16 | d17
michael@0 1182 */
michael@0 1183 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
michael@0 1184 add COEF_BLOCK, COEF_BLOCK, #16
michael@0 1185 vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
michael@0 1186 add COEF_BLOCK, COEF_BLOCK, #16
michael@0 1187 vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
michael@0 1188 add COEF_BLOCK, COEF_BLOCK, #16
michael@0 1189 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
michael@0 1190 /* Dequantize */
michael@0 1191 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
michael@0 1192 vmul.s16 q2, q2, q9
michael@0 1193 vmul.s16 q3, q3, q10
michael@0 1194 add DCT_TABLE, DCT_TABLE, #16
michael@0 1195 vld1.16 {d24, d25}, [DCT_TABLE, :128]!
michael@0 1196 vmul.s16 q5, q5, q12
michael@0 1197 add DCT_TABLE, DCT_TABLE, #16
michael@0 1198 vld1.16 {d26, d27}, [DCT_TABLE, :128]!
michael@0 1199 vmul.s16 q6, q6, q13
michael@0 1200 add DCT_TABLE, DCT_TABLE, #16
michael@0 1201 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
michael@0 1202 vmul.s16 q8, q8, q15
michael@0 1203
michael@0 1204 /* Pass 1 */
michael@0 1205 #if 0
michael@0 1206 idct_helper d4, d6, d10, d12, d16, 13, d4, d6
michael@0 1207 transpose_4x4 d4, d6, d8, d10
michael@0 1208 idct_helper d5, d7, d11, d13, d17, 13, d5, d7
michael@0 1209 transpose_4x4 d5, d7, d9, d11
michael@0 1210 #else
michael@0 1211 vmull.s16 q13, d6, d0[3]
michael@0 1212 vmlal.s16 q13, d10, d0[2]
michael@0 1213 vmlal.s16 q13, d12, d0[1]
michael@0 1214 vmlal.s16 q13, d16, d0[0]
michael@0 1215 vmull.s16 q12, d7, d0[3]
michael@0 1216 vmlal.s16 q12, d11, d0[2]
michael@0 1217 vmlal.s16 q12, d13, d0[1]
michael@0 1218 vmlal.s16 q12, d17, d0[0]
michael@0 1219 vshll.s16 q14, d4, #15
michael@0 1220 vshll.s16 q15, d5, #15
michael@0 1221 vadd.s32 q10, q14, q13
michael@0 1222 vsub.s32 q14, q14, q13
michael@0 1223 vrshrn.s32 d4, q10, #13
michael@0 1224 vrshrn.s32 d6, q14, #13
michael@0 1225 vadd.s32 q10, q15, q12
michael@0 1226 vsub.s32 q14, q15, q12
michael@0 1227 vrshrn.s32 d5, q10, #13
michael@0 1228 vrshrn.s32 d7, q14, #13
michael@0 1229 vtrn.16 q2, q3
michael@0 1230 vtrn.32 q3, q5
michael@0 1231 #endif
michael@0 1232
michael@0 1233 /* Pass 2 */
michael@0 1234 idct_helper d4, d6, d10, d7, d11, 20, d26, d27
michael@0 1235
michael@0 1236 /* Range limit */
michael@0 1237 vmov.u16 q15, #0x80
michael@0 1238 vadd.s16 q13, q13, q15
michael@0 1239 vqmovun.s16 d26, q13
michael@0 1240 vqmovun.s16 d27, q13
michael@0 1241
michael@0 1242 /* Store results to the output buffer */
michael@0 1243 ldmia OUTPUT_BUF, {TMP1, TMP2}
michael@0 1244 add TMP1, TMP1, OUTPUT_COL
michael@0 1245 add TMP2, TMP2, OUTPUT_COL
michael@0 1246
michael@0 1247 vst1.8 {d26[0]}, [TMP1]!
michael@0 1248 vst1.8 {d27[4]}, [TMP1]!
michael@0 1249 vst1.8 {d26[1]}, [TMP2]!
michael@0 1250 vst1.8 {d27[5]}, [TMP2]!
michael@0 1251
michael@0 1252 vpop {d8-d15}
michael@0 1253 bx lr
michael@0 1254
michael@0 1255 .unreq DCT_TABLE
michael@0 1256 .unreq COEF_BLOCK
michael@0 1257 .unreq OUTPUT_BUF
michael@0 1258 .unreq OUTPUT_COL
michael@0 1259 .unreq TMP1
michael@0 1260 .unreq TMP2
michael@0 1261 .endfunc
michael@0 1262
michael@0 1263 .purgem idct_helper
michael@0 1264
michael@0 1265 /*****************************************************************************/
michael@0 1266
michael@0 1267 /*
michael@0 1268 * jsimd_ycc_extrgb_convert_neon
michael@0 1269 * jsimd_ycc_extbgr_convert_neon
michael@0 1270 * jsimd_ycc_extrgbx_convert_neon
michael@0 1271 * jsimd_ycc_extbgrx_convert_neon
michael@0 1272 * jsimd_ycc_extxbgr_convert_neon
michael@0 1273 * jsimd_ycc_extxrgb_convert_neon
michael@0 1274 *
michael@0 1275 * Colorspace conversion YCbCr -> RGB
michael@0 1276 */
michael@0 1277
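/* The constant tables emitted for each colour variant below hold fixed-point
 * approximations of the standard JPEG YCbCr -> RGB equations
 *
 *   R = Y                        + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 *
 * with 22971 ~= 1.40200 * 2^14, 29033 ~= 1.77200 * 2^14,
 * -11277 ~= -0.34414 * 2^15 and -23401 ~= -0.71414 * 2^15; this is why the
 * green products are narrowed with ">> 15" and the red/blue products with
 * ">> 14" in stage 2 below.
 */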
michael@0 1278
michael@0 1279 .macro do_load size
michael@0 1280 .if \size == 8
michael@0 1281 vld1.8 {d4}, [U, :64]!
michael@0 1282 vld1.8 {d5}, [V, :64]!
michael@0 1283 vld1.8 {d0}, [Y, :64]!
michael@0 1284 pld [U, #64]
michael@0 1285 pld [V, #64]
michael@0 1286 pld [Y, #64]
michael@0 1287 .elseif \size == 4
michael@0 1288 vld1.8 {d4[0]}, [U]!
michael@0 1289 vld1.8 {d4[1]}, [U]!
michael@0 1290 vld1.8 {d4[2]}, [U]!
michael@0 1291 vld1.8 {d4[3]}, [U]!
michael@0 1292 vld1.8 {d5[0]}, [V]!
michael@0 1293 vld1.8 {d5[1]}, [V]!
michael@0 1294 vld1.8 {d5[2]}, [V]!
michael@0 1295 vld1.8 {d5[3]}, [V]!
michael@0 1296 vld1.8 {d0[0]}, [Y]!
michael@0 1297 vld1.8 {d0[1]}, [Y]!
michael@0 1298 vld1.8 {d0[2]}, [Y]!
michael@0 1299 vld1.8 {d0[3]}, [Y]!
michael@0 1300 .elseif \size == 2
michael@0 1301 vld1.8 {d4[4]}, [U]!
michael@0 1302 vld1.8 {d4[5]}, [U]!
michael@0 1303 vld1.8 {d5[4]}, [V]!
michael@0 1304 vld1.8 {d5[5]}, [V]!
michael@0 1305 vld1.8 {d0[4]}, [Y]!
michael@0 1306 vld1.8 {d0[5]}, [Y]!
michael@0 1307 .elseif \size == 1
michael@0 1308 vld1.8 {d4[6]}, [U]!
michael@0 1309 vld1.8 {d5[6]}, [V]!
michael@0 1310 vld1.8 {d0[6]}, [Y]!
michael@0 1311 .else
michael@0 1312 .error unsupported macroblock size
michael@0 1313 .endif
michael@0 1314 .endm
michael@0 1315
michael@0 1316 .macro do_store bpp, size
michael@0 1317 .if \bpp == 24
michael@0 1318 .if \size == 8
michael@0 1319 vst3.8 {d10, d11, d12}, [RGB]!
michael@0 1320 .elseif \size == 4
michael@0 1321 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
michael@0 1322 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
michael@0 1323 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
michael@0 1324 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
michael@0 1325 .elseif \size == 2
michael@0 1326 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
michael@0 1327 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
michael@0 1328 .elseif \size == 1
michael@0 1329 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
michael@0 1330 .else
michael@0 1331 .error unsupported macroblock size
michael@0 1332 .endif
michael@0 1333 .elseif \bpp == 32
michael@0 1334 .if \size == 8
michael@0 1335 vst4.8 {d10, d11, d12, d13}, [RGB]!
michael@0 1336 .elseif \size == 4
michael@0 1337 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
michael@0 1338 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
michael@0 1339 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
michael@0 1340 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
michael@0 1341 .elseif \size == 2
michael@0 1342 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
michael@0 1343 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
michael@0 1344 .elseif \size == 1
michael@0 1345 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
michael@0 1346 .else
michael@0 1347 .error unsupported macroblock size
michael@0 1348 .endif
michael@0 1349 .else
michael@0 1350 .error unsupported bpp
michael@0 1351 .endif
michael@0 1352 .endm
michael@0 1353
michael@0 1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
michael@0 1355
michael@0 1356 /*
michael@0 1357 * 2 stage pipelined YCbCr->RGB conversion
michael@0 1358 */
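/* Stage 1 performs the widening multiplies; stage 2 rounds and narrows the
 * products, adds Y and saturates to 8 bits.  The fused
 * do_yuv_to_rgb_stage2_store_load_stage1 variant interleaves the stores of one
 * 8-pixel group with the loads and stage 1 of the next group to hide memory
 * and multiplier latency.  Per pixel the arithmetic is roughly equivalent to
 * the following illustrative C (clamp8() denotes saturation to [0, 255]):
 *
 *   cb = u - 128;  cr = v - 128;
 *   r = clamp8(y + (( 22971 * cr              +  8192) >> 14));
 *   g = clamp8(y + ((-11277 * cb - 23401 * cr + 16384) >> 15));
 *   b = clamp8(y + (( 29033 * cb              +  8192) >> 14));
 */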
michael@0 1359
michael@0 1360 .macro do_yuv_to_rgb_stage1
michael@0 1361 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
michael@0 1362 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
michael@0 1363 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
michael@0 1364 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
michael@0 1365 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
michael@0 1366 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
michael@0 1367 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
michael@0 1368 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
michael@0 1369 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
michael@0 1370 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
michael@0 1371 .endm
michael@0 1372
michael@0 1373 .macro do_yuv_to_rgb_stage2
michael@0 1374 vrshrn.s32 d20, q10, #15
michael@0 1375 vrshrn.s32 d21, q11, #15
michael@0 1376 vrshrn.s32 d24, q12, #14
michael@0 1377 vrshrn.s32 d25, q13, #14
michael@0 1378 vrshrn.s32 d28, q14, #14
michael@0 1379 vrshrn.s32 d29, q15, #14
michael@0 1380 vaddw.u8 q10, q10, d0
michael@0 1381 vaddw.u8 q12, q12, d0
michael@0 1382 vaddw.u8 q14, q14, d0
michael@0 1383 vqmovun.s16 d1\g_offs, q10
michael@0 1384 vqmovun.s16 d1\r_offs, q12
michael@0 1385 vqmovun.s16 d1\b_offs, q14
michael@0 1386 .endm
michael@0 1387
michael@0 1388 .macro do_yuv_to_rgb_stage2_store_load_stage1
michael@0 1389 vld1.8 {d4}, [U, :64]!
michael@0 1390 vrshrn.s32 d20, q10, #15
michael@0 1391 vrshrn.s32 d21, q11, #15
michael@0 1392 vrshrn.s32 d24, q12, #14
michael@0 1393 vrshrn.s32 d25, q13, #14
michael@0 1394 vrshrn.s32 d28, q14, #14
michael@0 1395 vld1.8 {d5}, [V, :64]!
michael@0 1396 vrshrn.s32 d29, q15, #14
michael@0 1397 vaddw.u8 q10, q10, d0
michael@0 1398 vaddw.u8 q12, q12, d0
michael@0 1399 vaddw.u8 q14, q14, d0
michael@0 1400 vqmovun.s16 d1\g_offs, q10
michael@0 1401 vld1.8 {d0}, [Y, :64]!
michael@0 1402 vqmovun.s16 d1\r_offs, q12
michael@0 1403 pld [U, #64]
michael@0 1404 pld [V, #64]
michael@0 1405 pld [Y, #64]
michael@0 1406 vqmovun.s16 d1\b_offs, q14
michael@0 1407 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
michael@0 1408 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
michael@0 1409 do_store \bpp, 8
michael@0 1410 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
michael@0 1411 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
michael@0 1412 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
michael@0 1413 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
michael@0 1414 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
michael@0 1415 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
michael@0 1416 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
michael@0 1417 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
michael@0 1418 .endm
michael@0 1419
michael@0 1420 .macro do_yuv_to_rgb
michael@0 1421 do_yuv_to_rgb_stage1
michael@0 1422 do_yuv_to_rgb_stage2
michael@0 1423 .endm
michael@0 1424
michael@0 1425 /* Apple gas crashes on adrl; work around that by using adr.
michael@0 1426 * But this requires a copy of these constants for each function.
michael@0 1427 */
michael@0 1428
michael@0 1429 .balign 16
michael@0 1430 jsimd_ycc_\colorid\()_neon_consts:
michael@0 1431 .short 0, 0, 0, 0
michael@0 1432 .short 22971, -11277, -23401, 29033
michael@0 1433 .short -128, -128, -128, -128
michael@0 1434 .short -128, -128, -128, -128
michael@0 1435
michael@0 1436 asm_function jsimd_ycc_\colorid\()_convert_neon
michael@0 1437 OUTPUT_WIDTH .req r0
michael@0 1438 INPUT_BUF .req r1
michael@0 1439 INPUT_ROW .req r2
michael@0 1440 OUTPUT_BUF .req r3
michael@0 1441 NUM_ROWS .req r4
michael@0 1442
michael@0 1443 INPUT_BUF0 .req r5
michael@0 1444 INPUT_BUF1 .req r6
michael@0 1445 INPUT_BUF2 .req INPUT_BUF
michael@0 1446
michael@0 1447 RGB .req r7
michael@0 1448 Y .req r8
michael@0 1449 U .req r9
michael@0 1450 V .req r10
michael@0 1451 N .req ip
michael@0 1452
michael@0 1453 /* Load constants to d1, d2, d3 (d0 is just used for padding) */
michael@0 1454 adr ip, jsimd_ycc_\colorid\()_neon_consts
michael@0 1455 vld1.16 {d0, d1, d2, d3}, [ip, :128]
michael@0 1456
michael@0 1457 /* Save ARM registers and handle input arguments */
michael@0 1458 push {r4, r5, r6, r7, r8, r9, r10, lr}
michael@0 1459 ldr NUM_ROWS, [sp, #(4 * 8)]
michael@0 1460 ldr INPUT_BUF0, [INPUT_BUF]
michael@0 1461 ldr INPUT_BUF1, [INPUT_BUF, #4]
michael@0 1462 ldr INPUT_BUF2, [INPUT_BUF, #8]
michael@0 1463 .unreq INPUT_BUF
michael@0 1464
michael@0 1465 /* Save NEON registers */
michael@0 1466 vpush {d8-d15}
michael@0 1467
michael@0 1468 /* Initially set d10, d11, d12, d13 to 0xFF */
michael@0 1469 vmov.u8 q5, #255
michael@0 1470 vmov.u8 q6, #255
michael@0 1471
michael@0 1472 /* Outer loop over scanlines */
michael@0 1473 cmp NUM_ROWS, #1
michael@0 1474 blt 9f
michael@0 1475 0:
michael@0 1476 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
michael@0 1477 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
michael@0 1478 mov N, OUTPUT_WIDTH
michael@0 1479 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
michael@0 1480 add INPUT_ROW, INPUT_ROW, #1
michael@0 1481 ldr RGB, [OUTPUT_BUF], #4
michael@0 1482
michael@0 1483 /* Inner loop over pixels */
michael@0 1484 subs N, N, #8
michael@0 1485 blt 3f
michael@0 1486 do_load 8
michael@0 1487 do_yuv_to_rgb_stage1
michael@0 1488 subs N, N, #8
michael@0 1489 blt 2f
michael@0 1490 1:
michael@0 1491 do_yuv_to_rgb_stage2_store_load_stage1
michael@0 1492 subs N, N, #8
michael@0 1493 bge 1b
michael@0 1494 2:
michael@0 1495 do_yuv_to_rgb_stage2
michael@0 1496 do_store \bpp, 8
michael@0 1497 tst N, #7
michael@0 1498 beq 8f
michael@0 1499 3:
michael@0 1500 tst N, #4
michael@0 1501 beq 3f
michael@0 1502 do_load 4
michael@0 1503 3:
michael@0 1504 tst N, #2
michael@0 1505 beq 4f
michael@0 1506 do_load 2
michael@0 1507 4:
michael@0 1508 tst N, #1
michael@0 1509 beq 5f
michael@0 1510 do_load 1
michael@0 1511 5:
michael@0 1512 do_yuv_to_rgb
michael@0 1513 tst N, #4
michael@0 1514 beq 6f
michael@0 1515 do_store \bpp, 4
michael@0 1516 6:
michael@0 1517 tst N, #2
michael@0 1518 beq 7f
michael@0 1519 do_store \bpp, 2
michael@0 1520 7:
michael@0 1521 tst N, #1
michael@0 1522 beq 8f
michael@0 1523 do_store \bpp, 1
michael@0 1524 8:
michael@0 1525 subs NUM_ROWS, NUM_ROWS, #1
michael@0 1526 bgt 0b
michael@0 1527 9:
michael@0 1528 /* Restore all registers and return */
michael@0 1529 vpop {d8-d15}
michael@0 1530 pop {r4, r5, r6, r7, r8, r9, r10, pc}
michael@0 1531
michael@0 1532 .unreq OUTPUT_WIDTH
michael@0 1533 .unreq INPUT_ROW
michael@0 1534 .unreq OUTPUT_BUF
michael@0 1535 .unreq NUM_ROWS
michael@0 1536 .unreq INPUT_BUF0
michael@0 1537 .unreq INPUT_BUF1
michael@0 1538 .unreq INPUT_BUF2
michael@0 1539 .unreq RGB
michael@0 1540 .unreq Y
michael@0 1541 .unreq U
michael@0 1542 .unreq V
michael@0 1543 .unreq N
michael@0 1544 .endfunc
michael@0 1545
michael@0 1546 .purgem do_yuv_to_rgb
michael@0 1547 .purgem do_yuv_to_rgb_stage1
michael@0 1548 .purgem do_yuv_to_rgb_stage2
michael@0 1549 .purgem do_yuv_to_rgb_stage2_store_load_stage1
michael@0 1550
michael@0 1551 .endm
michael@0 1552
michael@0 1553 /*--------------------------------- id ----- bpp R G B */
michael@0 1554 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
michael@0 1555 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
michael@0 1556 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
michael@0 1557 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
michael@0 1558 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
michael@0 1559 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
michael@0 1560
michael@0 1561 .purgem do_load
michael@0 1562 .purgem do_store
michael@0 1563
michael@0 1564 /*****************************************************************************/
michael@0 1565
michael@0 1566 /*
michael@0 1567 * jsimd_extrgb_ycc_convert_neon
michael@0 1568 * jsimd_extbgr_ycc_convert_neon
michael@0 1569 * jsimd_extrgbx_ycc_convert_neon
michael@0 1570 * jsimd_extbgrx_ycc_convert_neon
michael@0 1571 * jsimd_extxbgr_ycc_convert_neon
michael@0 1572 * jsimd_extxrgb_ycc_convert_neon
michael@0 1573 *
michael@0 1574 * Colorspace conversion RGB -> YCbCr
michael@0 1575 */
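
/*
 * For reference, a scalar sketch of the per-pixel math implemented below,
 * assuming the standard libjpeg RGB->YCbCr equations (as in jccolor.c):
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * The constants used by the NEON code are these coefficients scaled by
 * 2^16 (e.g. 19595 ~= 0.29900 * 65536, 38470 ~= 0.58700 * 65536,
 * 32768 = 0.50000 * 65536), with the 32-bit products narrowed back down
 * by a 16-bit shift.
 */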
michael@0 1576
michael@0 1577 .macro do_store size
michael@0 1578 .if \size == 8
michael@0 1579 vst1.8 {d20}, [Y]!
michael@0 1580 vst1.8 {d21}, [U]!
michael@0 1581 vst1.8 {d22}, [V]!
michael@0 1582 .elseif \size == 4
michael@0 1583 vst1.8 {d20[0]}, [Y]!
michael@0 1584 vst1.8 {d20[1]}, [Y]!
michael@0 1585 vst1.8 {d20[2]}, [Y]!
michael@0 1586 vst1.8 {d20[3]}, [Y]!
michael@0 1587 vst1.8 {d21[0]}, [U]!
michael@0 1588 vst1.8 {d21[1]}, [U]!
michael@0 1589 vst1.8 {d21[2]}, [U]!
michael@0 1590 vst1.8 {d21[3]}, [U]!
michael@0 1591 vst1.8 {d22[0]}, [V]!
michael@0 1592 vst1.8 {d22[1]}, [V]!
michael@0 1593 vst1.8 {d22[2]}, [V]!
michael@0 1594 vst1.8 {d22[3]}, [V]!
michael@0 1595 .elseif \size == 2
michael@0 1596 vst1.8 {d20[4]}, [Y]!
michael@0 1597 vst1.8 {d20[5]}, [Y]!
michael@0 1598 vst1.8 {d21[4]}, [U]!
michael@0 1599 vst1.8 {d21[5]}, [U]!
michael@0 1600 vst1.8 {d22[4]}, [V]!
michael@0 1601 vst1.8 {d22[5]}, [V]!
michael@0 1602 .elseif \size == 1
michael@0 1603 vst1.8 {d20[6]}, [Y]!
michael@0 1604 vst1.8 {d21[6]}, [U]!
michael@0 1605 vst1.8 {d22[6]}, [V]!
michael@0 1606 .else
michael@0 1607 .error unsupported macroblock size
michael@0 1608 .endif
michael@0 1609 .endm
michael@0 1610
michael@0 1611 .macro do_load bpp, size
michael@0 1612 .if \bpp == 24
michael@0 1613 .if \size == 8
michael@0 1614 vld3.8 {d10, d11, d12}, [RGB]!
michael@0 1615 pld [RGB, #128]
michael@0 1616 .elseif \size == 4
michael@0 1617 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
michael@0 1618 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
michael@0 1619 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
michael@0 1620 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
michael@0 1621 .elseif \size == 2
michael@0 1622 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
michael@0 1623 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
michael@0 1624 .elseif \size == 1
michael@0 1625 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
michael@0 1626 .else
michael@0 1627 .error unsupported macroblock size
michael@0 1628 .endif
michael@0 1629 .elseif \bpp == 32
michael@0 1630 .if \size == 8
michael@0 1631 vld4.8 {d10, d11, d12, d13}, [RGB]!
michael@0 1632 pld [RGB, #128]
michael@0 1633 .elseif \size == 4
michael@0 1634 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
michael@0 1635 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
michael@0 1636 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
michael@0 1637 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
michael@0 1638 .elseif \size == 2
michael@0 1639 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
michael@0 1640 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
michael@0 1641 .elseif \size == 1
michael@0 1642 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
michael@0 1643 .else
michael@0 1644 .error unsupported macroblock size
michael@0 1645 .endif
michael@0 1646 .else
michael@0 1647 .error unsupported bpp
michael@0 1648 .endif
michael@0 1649 .endm
michael@0 1650
michael@0 1651 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
michael@0 1652
michael@0 1653 /*
michael@0 1654 * 2-stage pipelined RGB->YCbCr conversion
michael@0 1655 */
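
/*
 * In outline (as read from the code below): stage 1 widens the R/G/B bytes
 * and accumulates the 32-bit products for Y, Cb and Cr; the Cb/Cr
 * accumulators are seeded from q1 with (128 << 16) + 32767, which folds the
 * +128 chroma offset and (approximately) the rounding bias into the initial
 * value, so stage 2 only needs a plain VSHRN for the chroma terms while Y
 * uses a rounding VRSHRN.  Stage 2 then narrows everything down to the
 * 8-bit Y/Cb/Cr results in d20-d22.
 */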
michael@0 1656
michael@0 1657 .macro do_rgb_to_yuv_stage1
michael@0 1658 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
michael@0 1659 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
michael@0 1660 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
michael@0 1661 vmull.u16 q7, d4, d0[0]
michael@0 1662 vmlal.u16 q7, d6, d0[1]
michael@0 1663 vmlal.u16 q7, d8, d0[2]
michael@0 1664 vmull.u16 q8, d5, d0[0]
michael@0 1665 vmlal.u16 q8, d7, d0[1]
michael@0 1666 vmlal.u16 q8, d9, d0[2]
michael@0 1667 vrev64.32 q9, q1
michael@0 1668 vrev64.32 q13, q1
michael@0 1669 vmlsl.u16 q9, d4, d0[3]
michael@0 1670 vmlsl.u16 q9, d6, d1[0]
michael@0 1671 vmlal.u16 q9, d8, d1[1]
michael@0 1672 vmlsl.u16 q13, d5, d0[3]
michael@0 1673 vmlsl.u16 q13, d7, d1[0]
michael@0 1674 vmlal.u16 q13, d9, d1[1]
michael@0 1675 vrev64.32 q14, q1
michael@0 1676 vrev64.32 q15, q1
michael@0 1677 vmlal.u16 q14, d4, d1[1]
michael@0 1678 vmlsl.u16 q14, d6, d1[2]
michael@0 1679 vmlsl.u16 q14, d8, d1[3]
michael@0 1680 vmlal.u16 q15, d5, d1[1]
michael@0 1681 vmlsl.u16 q15, d7, d1[2]
michael@0 1682 vmlsl.u16 q15, d9, d1[3]
michael@0 1683 .endm
michael@0 1684
michael@0 1685 .macro do_rgb_to_yuv_stage2
michael@0 1686 vrshrn.u32 d20, q7, #16
michael@0 1687 vrshrn.u32 d21, q8, #16
michael@0 1688 vshrn.u32 d22, q9, #16
michael@0 1689 vshrn.u32 d23, q13, #16
michael@0 1690 vshrn.u32 d24, q14, #16
michael@0 1691 vshrn.u32 d25, q15, #16
michael@0 1692 vmovn.u16 d20, q10 /* d20 = y */
michael@0 1693 vmovn.u16 d21, q11 /* d21 = u */
michael@0 1694 vmovn.u16 d22, q12 /* d22 = v */
michael@0 1695 .endm
michael@0 1696
michael@0 1697 .macro do_rgb_to_yuv
michael@0 1698 do_rgb_to_yuv_stage1
michael@0 1699 do_rgb_to_yuv_stage2
michael@0 1700 .endm
michael@0 1701
michael@0 1702 .macro do_rgb_to_yuv_stage2_store_load_stage1
michael@0 1703 vrshrn.u32 d20, q7, #16
michael@0 1704 vrshrn.u32 d21, q8, #16
michael@0 1705 vshrn.u32 d22, q9, #16
michael@0 1706 vrev64.32 q9, q1
michael@0 1707 vshrn.u32 d23, q13, #16
michael@0 1708 vrev64.32 q13, q1
michael@0 1709 vshrn.u32 d24, q14, #16
michael@0 1710 vshrn.u32 d25, q15, #16
michael@0 1711 do_load \bpp, 8
michael@0 1712 vmovn.u16 d20, q10 /* d20 = y */
michael@0 1713 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
michael@0 1714 vmovn.u16 d21, q11 /* d21 = u */
michael@0 1715 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
michael@0 1716 vmovn.u16 d22, q12 /* d22 = v */
michael@0 1717 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
michael@0 1718 vmull.u16 q7, d4, d0[0]
michael@0 1719 vmlal.u16 q7, d6, d0[1]
michael@0 1720 vmlal.u16 q7, d8, d0[2]
michael@0 1721 vst1.8 {d20}, [Y]!
michael@0 1722 vmull.u16 q8, d5, d0[0]
michael@0 1723 vmlal.u16 q8, d7, d0[1]
michael@0 1724 vmlal.u16 q8, d9, d0[2]
michael@0 1725 vmlsl.u16 q9, d4, d0[3]
michael@0 1726 vmlsl.u16 q9, d6, d1[0]
michael@0 1727 vmlal.u16 q9, d8, d1[1]
michael@0 1728 vst1.8 {d21}, [U]!
michael@0 1729 vmlsl.u16 q13, d5, d0[3]
michael@0 1730 vmlsl.u16 q13, d7, d1[0]
michael@0 1731 vmlal.u16 q13, d9, d1[1]
michael@0 1732 vrev64.32 q14, q1
michael@0 1733 vrev64.32 q15, q1
michael@0 1734 vmlal.u16 q14, d4, d1[1]
michael@0 1735 vmlsl.u16 q14, d6, d1[2]
michael@0 1736 vmlsl.u16 q14, d8, d1[3]
michael@0 1737 vst1.8 {d22}, [V]!
michael@0 1738 vmlal.u16 q15, d5, d1[1]
michael@0 1739 vmlsl.u16 q15, d7, d1[2]
michael@0 1740 vmlsl.u16 q15, d9, d1[3]
michael@0 1741 .endm
michael@0 1742
michael@0 1743 .balign 16
michael@0 1744 jsimd_\colorid\()_ycc_neon_consts:
michael@0 1745 .short 19595, 38470, 7471, 11059
michael@0 1746 .short 21709, 32768, 27439, 5329
michael@0 1747 .short 32767, 128, 32767, 128
michael@0 1748 .short 32767, 128, 32767, 128
michael@0 1749
michael@0 1750 asm_function jsimd_\colorid\()_ycc_convert_neon
michael@0 1751 OUTPUT_WIDTH .req r0
michael@0 1752 INPUT_BUF .req r1
michael@0 1753 OUTPUT_BUF .req r2
michael@0 1754 OUTPUT_ROW .req r3
michael@0 1755 NUM_ROWS .req r4
michael@0 1756
michael@0 1757 OUTPUT_BUF0 .req r5
michael@0 1758 OUTPUT_BUF1 .req r6
michael@0 1759 OUTPUT_BUF2 .req OUTPUT_BUF
michael@0 1760
michael@0 1761 RGB .req r7
michael@0 1762 Y .req r8
michael@0 1763 U .req r9
michael@0 1764 V .req r10
michael@0 1765 N .req ip
michael@0 1766
michael@0 1767 /* Load constants to d0, d1, d2, d3 */
michael@0 1768 adr ip, jsimd_\colorid\()_ycc_neon_consts
michael@0 1769 vld1.16 {d0, d1, d2, d3}, [ip, :128]
michael@0 1770
michael@0 1771 /* Save ARM registers and handle input arguments */
michael@0 1772 push {r4, r5, r6, r7, r8, r9, r10, lr}
michael@0 1773 ldr NUM_ROWS, [sp, #(4 * 8)]
michael@0 1774 ldr OUTPUT_BUF0, [OUTPUT_BUF]
michael@0 1775 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
michael@0 1776 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
michael@0 1777 .unreq OUTPUT_BUF
michael@0 1778
michael@0 1779 /* Save NEON registers */
michael@0 1780 vpush {d8-d15}
michael@0 1781
michael@0 1782 /* Outer loop over scanlines */
michael@0 1783 cmp NUM_ROWS, #1
michael@0 1784 blt 9f
michael@0 1785 0:
michael@0 1786 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
michael@0 1787 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
michael@0 1788 mov N, OUTPUT_WIDTH
michael@0 1789 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
michael@0 1790 add OUTPUT_ROW, OUTPUT_ROW, #1
michael@0 1791 ldr RGB, [INPUT_BUF], #4
michael@0 1792
michael@0 1793 /* Inner loop over pixels */
michael@0 1794 subs N, N, #8
michael@0 1795 blt 3f
michael@0 1796 do_load \bpp, 8
michael@0 1797 do_rgb_to_yuv_stage1
michael@0 1798 subs N, N, #8
michael@0 1799 blt 2f
michael@0 1800 1:
michael@0 1801 do_rgb_to_yuv_stage2_store_load_stage1
michael@0 1802 subs N, N, #8
michael@0 1803 bge 1b
michael@0 1804 2:
michael@0 1805 do_rgb_to_yuv_stage2
michael@0 1806 do_store 8
michael@0 1807 tst N, #7
michael@0 1808 beq 8f
michael@0 1809 3:
michael@0 1810 tst N, #4
michael@0 1811 beq 3f
michael@0 1812 do_load \bpp, 4
michael@0 1813 3:
michael@0 1814 tst N, #2
michael@0 1815 beq 4f
michael@0 1816 do_load \bpp, 2
michael@0 1817 4:
michael@0 1818 tst N, #1
michael@0 1819 beq 5f
michael@0 1820 do_load \bpp, 1
michael@0 1821 5:
michael@0 1822 do_rgb_to_yuv
michael@0 1823 tst N, #4
michael@0 1824 beq 6f
michael@0 1825 do_store 4
michael@0 1826 6:
michael@0 1827 tst N, #2
michael@0 1828 beq 7f
michael@0 1829 do_store 2
michael@0 1830 7:
michael@0 1831 tst N, #1
michael@0 1832 beq 8f
michael@0 1833 do_store 1
michael@0 1834 8:
michael@0 1835 subs NUM_ROWS, NUM_ROWS, #1
michael@0 1836 bgt 0b
michael@0 1837 9:
michael@0 1838 /* Restore all registers and return */
michael@0 1839 vpop {d8-d15}
michael@0 1840 pop {r4, r5, r6, r7, r8, r9, r10, pc}
michael@0 1841
michael@0 1842 .unreq OUTPUT_WIDTH
michael@0 1843 .unreq OUTPUT_ROW
michael@0 1844 .unreq INPUT_BUF
michael@0 1845 .unreq NUM_ROWS
michael@0 1846 .unreq OUTPUT_BUF0
michael@0 1847 .unreq OUTPUT_BUF1
michael@0 1848 .unreq OUTPUT_BUF2
michael@0 1849 .unreq RGB
michael@0 1850 .unreq Y
michael@0 1851 .unreq U
michael@0 1852 .unreq V
michael@0 1853 .unreq N
michael@0 1854 .endfunc
michael@0 1855
michael@0 1856 .purgem do_rgb_to_yuv
michael@0 1857 .purgem do_rgb_to_yuv_stage1
michael@0 1858 .purgem do_rgb_to_yuv_stage2
michael@0 1859 .purgem do_rgb_to_yuv_stage2_store_load_stage1
michael@0 1860
michael@0 1861 .endm
michael@0 1862
michael@0 1863 /*--------------------------------- id ----- bpp R G B */
michael@0 1864 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
michael@0 1865 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
michael@0 1866 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
michael@0 1867 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
michael@0 1868 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
michael@0 1869 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
michael@0 1870
michael@0 1871 .purgem do_load
michael@0 1872 .purgem do_store
michael@0 1873
michael@0 1874 /*****************************************************************************/
michael@0 1875
michael@0 1876 /*
michael@0 1877 * Load data into workspace, applying unsigned->signed conversion
michael@0 1878 *
michael@0 1879 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
michael@0 1880 * rid of VST1.16 instructions
michael@0 1881 */
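
/*
 * A rough scalar equivalent (sketch only, using the usual libjpeg types):
 *
 *   void convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
 *                  DCTELEM * workspace)
 *   {
 *     int row, col;
 *     for (row = 0; row < DCTSIZE; row++) {
 *       JSAMPROW in = sample_data[row] + start_col;
 *       for (col = 0; col < DCTSIZE; col++)
 *         *workspace++ = (DCTELEM) GETJSAMPLE(in[col]) - CENTERJSAMPLE;
 *     }
 *   }
 */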
michael@0 1882
michael@0 1883 asm_function jsimd_convsamp_neon
michael@0 1884 SAMPLE_DATA .req r0
michael@0 1885 START_COL .req r1
michael@0 1886 WORKSPACE .req r2
michael@0 1887 TMP1 .req r3
michael@0 1888 TMP2 .req r4
michael@0 1889 TMP3 .req r5
michael@0 1890 TMP4 .req ip
michael@0 1891
michael@0 1892 push {r4, r5}
michael@0 1893 vmov.u8 d0, #128
michael@0 1894
michael@0 1895 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
michael@0 1896 add TMP1, TMP1, START_COL
michael@0 1897 add TMP2, TMP2, START_COL
michael@0 1898 add TMP3, TMP3, START_COL
michael@0 1899 add TMP4, TMP4, START_COL
michael@0 1900 vld1.8 {d16}, [TMP1]
michael@0 1901 vsubl.u8 q8, d16, d0
michael@0 1902 vld1.8 {d18}, [TMP2]
michael@0 1903 vsubl.u8 q9, d18, d0
michael@0 1904 vld1.8 {d20}, [TMP3]
michael@0 1905 vsubl.u8 q10, d20, d0
michael@0 1906 vld1.8 {d22}, [TMP4]
michael@0 1907 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
michael@0 1908 vsubl.u8 q11, d22, d0
michael@0 1909 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
michael@0 1910 add TMP1, TMP1, START_COL
michael@0 1911 add TMP2, TMP2, START_COL
michael@0 1912 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
michael@0 1913 add TMP3, TMP3, START_COL
michael@0 1914 add TMP4, TMP4, START_COL
michael@0 1915 vld1.8 {d24}, [TMP1]
michael@0 1916 vsubl.u8 q12, d24, d0
michael@0 1917 vld1.8 {d26}, [TMP2]
michael@0 1918 vsubl.u8 q13, d26, d0
michael@0 1919 vld1.8 {d28}, [TMP3]
michael@0 1920 vsubl.u8 q14, d28, d0
michael@0 1921 vld1.8 {d30}, [TMP4]
michael@0 1922 vsubl.u8 q15, d30, d0
michael@0 1923 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
michael@0 1924 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
michael@0 1925 pop {r4, r5}
michael@0 1926 bx lr
michael@0 1927
michael@0 1928 .unreq SAMPLE_DATA
michael@0 1929 .unreq START_COL
michael@0 1930 .unreq WORKSPACE
michael@0 1931 .unreq TMP1
michael@0 1932 .unreq TMP2
michael@0 1933 .unreq TMP3
michael@0 1934 .unreq TMP4
michael@0 1935 .endfunc
michael@0 1936
michael@0 1937 /*****************************************************************************/
michael@0 1938
michael@0 1939 /*
michael@0 1940 * jsimd_fdct_ifast_neon
michael@0 1941 *
michael@0 1942 * This function contains a fast, but less accurate, integer implementation of
michael@0 1943 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
michael@0 1944 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
michael@0 1945 * function from jfdctfst.c
michael@0 1946 *
michael@0 1947 * TODO: can be combined with 'jsimd_convsamp_neon' to get
michael@0 1948 * rid of a bunch of VLD1.16 instructions
michael@0 1949 */
michael@0 1950
michael@0 1951 #define XFIX_0_382683433 d0[0]
michael@0 1952 #define XFIX_0_541196100 d0[1]
michael@0 1953 #define XFIX_0_707106781 d0[2]
michael@0 1954 #define XFIX_1_306562965 d0[3]
michael@0 1955
michael@0 1956 .balign 16
michael@0 1957 jsimd_fdct_ifast_neon_consts:
michael@0 1958 .short (98 * 128) /* XFIX_0_382683433 */
michael@0 1959 .short (139 * 128) /* XFIX_0_541196100 */
michael@0 1960 .short (181 * 128) /* XFIX_0_707106781 */
michael@0 1961 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
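
/*
 * The constants above are encoded for VQDMULH.S16, which returns the high
 * half of the doubled product, i.e. (a * b * 2) >> 16 with saturation.
 * Storing a coefficient k as (k * 128) therefore multiplies by k/256; for
 * example, 181 * 128 gives a multiply by 181/256 ~= 0.70703 ~= 0.707106781.
 * 334 * 128 would overflow a signed 16-bit value, so (334 - 256) * 128 is
 * stored for 1.306562965 and the code adds the operand once more to make up
 * the missing 1.0.
 */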
michael@0 1962
michael@0 1963 asm_function jsimd_fdct_ifast_neon
michael@0 1964
michael@0 1965 DATA .req r0
michael@0 1966 TMP .req ip
michael@0 1967
michael@0 1968 vpush {d8-d15}
michael@0 1969
michael@0 1970 /* Load constants */
michael@0 1971 adr TMP, jsimd_fdct_ifast_neon_consts
michael@0 1972 vld1.16 {d0}, [TMP, :64]
michael@0 1973
michael@0 1974 /* Load all DATA into NEON registers with the following allocation:
michael@0 1975 * 0 1 2 3 | 4 5 6 7
michael@0 1976 * ---------+--------
michael@0 1977 * 0 | d16 | d17 | q8
michael@0 1978 * 1 | d18 | d19 | q9
michael@0 1979 * 2 | d20 | d21 | q10
michael@0 1980 * 3 | d22 | d23 | q11
michael@0 1981 * 4 | d24 | d25 | q12
michael@0 1982 * 5 | d26 | d27 | q13
michael@0 1983 * 6 | d28 | d29 | q14
michael@0 1984 * 7 | d30 | d31 | q15
michael@0 1985 */
michael@0 1986
michael@0 1987 vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
michael@0 1988 vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
michael@0 1989 vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
michael@0 1990 vld1.16 {d28, d29, d30, d31}, [DATA, :128]
michael@0 1991 sub DATA, DATA, #(128 - 32)
michael@0 1992
michael@0 1993 mov TMP, #2
michael@0 1994 1:
michael@0 1995 /* Transpose */
michael@0 1996 vtrn.16 q12, q13
michael@0 1997 vtrn.16 q10, q11
michael@0 1998 vtrn.16 q8, q9
michael@0 1999 vtrn.16 q14, q15
michael@0 2000 vtrn.32 q9, q11
michael@0 2001 vtrn.32 q13, q15
michael@0 2002 vtrn.32 q8, q10
michael@0 2003 vtrn.32 q12, q14
michael@0 2004 vswp d30, d23
michael@0 2005 vswp d24, d17
michael@0 2006 vswp d26, d19
michael@0 2007 /* 1-D FDCT */
michael@0 2008 vadd.s16 q2, q11, q12
michael@0 2009 vswp d28, d21
michael@0 2010 vsub.s16 q12, q11, q12
michael@0 2011 vsub.s16 q6, q10, q13
michael@0 2012 vadd.s16 q10, q10, q13
michael@0 2013 vsub.s16 q7, q9, q14
michael@0 2014 vadd.s16 q9, q9, q14
michael@0 2015 vsub.s16 q1, q8, q15
michael@0 2016 vadd.s16 q8, q8, q15
michael@0 2017 vsub.s16 q4, q9, q10
michael@0 2018 vsub.s16 q5, q8, q2
michael@0 2019 vadd.s16 q3, q9, q10
michael@0 2020 vadd.s16 q4, q4, q5
michael@0 2021 vadd.s16 q2, q8, q2
michael@0 2022 vqdmulh.s16 q4, q4, XFIX_0_707106781
michael@0 2023 vadd.s16 q11, q12, q6
michael@0 2024 vadd.s16 q8, q2, q3
michael@0 2025 vsub.s16 q12, q2, q3
michael@0 2026 vadd.s16 q3, q6, q7
michael@0 2027 vadd.s16 q7, q7, q1
michael@0 2028 vqdmulh.s16 q3, q3, XFIX_0_707106781
michael@0 2029 vsub.s16 q6, q11, q7
michael@0 2030 vadd.s16 q10, q5, q4
michael@0 2031 vqdmulh.s16 q6, q6, XFIX_0_382683433
michael@0 2032 vsub.s16 q14, q5, q4
michael@0 2033 vqdmulh.s16 q11, q11, XFIX_0_541196100
michael@0 2034 vqdmulh.s16 q5, q7, XFIX_1_306562965
michael@0 2035 vadd.s16 q4, q1, q3
michael@0 2036 vsub.s16 q3, q1, q3
michael@0 2037 vadd.s16 q7, q7, q6
michael@0 2038 vadd.s16 q11, q11, q6
michael@0 2039 vadd.s16 q7, q7, q5
michael@0 2040 vadd.s16 q13, q3, q11
michael@0 2041 vsub.s16 q11, q3, q11
michael@0 2042 vadd.s16 q9, q4, q7
michael@0 2043 vsub.s16 q15, q4, q7
michael@0 2044 subs TMP, TMP, #1
michael@0 2045 bne 1b
michael@0 2046
michael@0 2047 /* store results */
michael@0 2048 vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
michael@0 2049 vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
michael@0 2050 vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
michael@0 2051 vst1.16 {d28, d29, d30, d31}, [DATA, :128]
michael@0 2052
michael@0 2053 vpop {d8-d15}
michael@0 2054 bx lr
michael@0 2055
michael@0 2056 .unreq DATA
michael@0 2057 .unreq TMP
michael@0 2058 .endfunc
michael@0 2059
michael@0 2060 /*****************************************************************************/
michael@0 2061
michael@0 2062 /*
michael@0 2063 * GLOBAL(void)
michael@0 2064 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
michael@0 2065 * DCTELEM * workspace);
michael@0 2066 *
michael@0 2067 * Note: the code uses 2-stage pipelining in order to improve instruction
michael@0 2068 * scheduling and eliminate stalls (this provides ~15% better
michael@0 2069 * performance for this function on both ARM Cortex-A8 and
michael@0 2070 * ARM Cortex-A9 when compared to the non-pipelined variant).
michael@0 2071 * The instructions which belong to the second stage use different
michael@0 2072 * indentation for better readability.
michael@0 2073 */
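
/*
 * A rough scalar equivalent for one coefficient (sketch only; the offsets
 * used below place 64 16-bit reciprocals at the start of the divisor table,
 * 64 corrections at +64*2 bytes, and 64 right-shift counts at +64*6 bytes):
 *
 *   sign = workspace[i] >> 15;                        (0 or -1)
 *   temp = abs(workspace[i]) + correction[i];
 *   temp = ((unsigned) temp * reciprocal[i]) >> 16;
 *   temp >>= shift[i];
 *   coef_block[i] = (temp ^ sign) - sign;             (restore the sign)
 */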
michael@0 2074 asm_function jsimd_quantize_neon
michael@0 2075
michael@0 2076 COEF_BLOCK .req r0
michael@0 2077 DIVISORS .req r1
michael@0 2078 WORKSPACE .req r2
michael@0 2079
michael@0 2080 RECIPROCAL .req DIVISORS
michael@0 2081 CORRECTION .req r3
michael@0 2082 SHIFT .req ip
michael@0 2083 LOOP_COUNT .req r4
michael@0 2084
michael@0 2085 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
michael@0 2086 vabs.s16 q12, q0
michael@0 2087 add CORRECTION, DIVISORS, #(64 * 2)
michael@0 2088 add SHIFT, DIVISORS, #(64 * 6)
michael@0 2089 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
michael@0 2090 vabs.s16 q13, q1
michael@0 2091 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
michael@0 2092 vadd.u16 q12, q12, q10 /* add correction */
michael@0 2093 vadd.u16 q13, q13, q11
michael@0 2094 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
michael@0 2095 vmull.u16 q11, d25, d17
michael@0 2096 vmull.u16 q8, d26, d18
michael@0 2097 vmull.u16 q9, d27, d19
michael@0 2098 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
michael@0 2099 vshrn.u32 d20, q10, #16
michael@0 2100 vshrn.u32 d21, q11, #16
michael@0 2101 vshrn.u32 d22, q8, #16
michael@0 2102 vshrn.u32 d23, q9, #16
michael@0 2103 vneg.s16 q12, q12
michael@0 2104 vneg.s16 q13, q13
michael@0 2105 vshr.s16 q2, q0, #15 /* extract sign */
michael@0 2106 vshr.s16 q3, q1, #15
michael@0 2107 vshl.u16 q14, q10, q12 /* shift */
michael@0 2108 vshl.u16 q15, q11, q13
michael@0 2109
michael@0 2110 push {r4, r5}
michael@0 2111 mov LOOP_COUNT, #3
michael@0 2112 1:
michael@0 2113 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
michael@0 2114 veor.u16 q14, q14, q2 /* restore sign */
michael@0 2115 vabs.s16 q12, q0
michael@0 2116 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
michael@0 2117 vabs.s16 q13, q1
michael@0 2118 veor.u16 q15, q15, q3
michael@0 2119 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
michael@0 2120 vadd.u16 q12, q12, q10 /* add correction */
michael@0 2121 vadd.u16 q13, q13, q11
michael@0 2122 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
michael@0 2123 vmull.u16 q11, d25, d17
michael@0 2124 vmull.u16 q8, d26, d18
michael@0 2125 vmull.u16 q9, d27, d19
michael@0 2126 vsub.u16 q14, q14, q2
michael@0 2127 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
michael@0 2128 vsub.u16 q15, q15, q3
michael@0 2129 vshrn.u32 d20, q10, #16
michael@0 2130 vshrn.u32 d21, q11, #16
michael@0 2131 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
michael@0 2132 vshrn.u32 d22, q8, #16
michael@0 2133 vshrn.u32 d23, q9, #16
michael@0 2134 vneg.s16 q12, q12
michael@0 2135 vneg.s16 q13, q13
michael@0 2136 vshr.s16 q2, q0, #15 /* extract sign */
michael@0 2137 vshr.s16 q3, q1, #15
michael@0 2138 vshl.u16 q14, q10, q12 /* shift */
michael@0 2139 vshl.u16 q15, q11, q13
michael@0 2140 subs LOOP_COUNT, LOOP_COUNT, #1
michael@0 2141 bne 1b
michael@0 2142 pop {r4, r5}
michael@0 2143
michael@0 2144 veor.u16 q14, q14, q2 /* restore sign */
michael@0 2145 veor.u16 q15, q15, q3
michael@0 2146 vsub.u16 q14, q14, q2
michael@0 2147 vsub.u16 q15, q15, q3
michael@0 2148 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
michael@0 2149
michael@0 2150 bx lr /* return */
michael@0 2151
michael@0 2152 .unreq COEF_BLOCK
michael@0 2153 .unreq DIVISORS
michael@0 2154 .unreq WORKSPACE
michael@0 2155 .unreq RECIPROCAL
michael@0 2156 .unreq CORRECTION
michael@0 2157 .unreq SHIFT
michael@0 2158 .unreq LOOP_COUNT
michael@0 2159 .endfunc
michael@0 2160
michael@0 2161 /*****************************************************************************/
michael@0 2162
michael@0 2163 /*
michael@0 2164 * GLOBAL(void)
michael@0 2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
michael@0 2166 * JDIMENSION downsampled_width,
michael@0 2167 * JSAMPARRAY input_data,
michael@0 2168 * JSAMPARRAY * output_data_ptr);
michael@0 2169 *
michael@0 2170 * Note: the use of unaligned writes is the main remaining bottleneck in
michael@0 2171 * this code; addressing it could potentially yield up to tens of percent
michael@0 2172 * better performance on Cortex-A8/Cortex-A9.
michael@0 2173 */
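
/*
 * Scalar reference for the interior pixels (a sketch of what the plain C
 * h2v1 fancy upsampler in jdsample.c computes; the first and last output
 * pixels are special-cased in 'upsample_row' below):
 *
 *   out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
 *   out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
 *
 * i.e. each output pixel is 3/4 of the nearer source pixel plus 1/4 of the
 * further one, with alternating rounding bias.
 */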
michael@0 2174
michael@0 2175 /*
michael@0 2176 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
michael@0 2177 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
michael@0 2178 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
michael@0 2179 * Register d28 is used for multiplication by 3. Register q15 is used
michael@0 2180 * for adding +1 bias.
michael@0 2181 */
michael@0 2182 .macro upsample16 OUTPTR, INPTR
michael@0 2183 vld1.8 {q0}, [\INPTR]!
michael@0 2184 vmovl.u8 q8, d0
michael@0 2185 vext.8 q2, q1, q0, #15
michael@0 2186 vmovl.u8 q9, d1
michael@0 2187 vaddw.u8 q10, q15, d4
michael@0 2188 vaddw.u8 q11, q15, d5
michael@0 2189 vmlal.u8 q8, d4, d28
michael@0 2190 vmlal.u8 q9, d5, d28
michael@0 2191 vmlal.u8 q10, d0, d28
michael@0 2192 vmlal.u8 q11, d1, d28
michael@0 2193 vmov q1, q0 /* backup source pixels to q1 */
michael@0 2194 vrshrn.u16 d6, q8, #2
michael@0 2195 vrshrn.u16 d7, q9, #2
michael@0 2196 vshrn.u16 d8, q10, #2
michael@0 2197 vshrn.u16 d9, q11, #2
michael@0 2198 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
michael@0 2199 .endm
michael@0 2200
michael@0 2201 /*
michael@0 2202 * Upsample 32 source pixels to 64 destination pixels. Compared to the
michael@0 2203 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
michael@0 2204 * even and odd groups of 16 pixels, which is why no "vmov q1, q0"
michael@0 2205 * instruction is needed. This unrolling also allows loads and stores to be
michael@0 2206 * reordered to hide the multiplication latency and reduce stalls.
michael@0 2207 */
michael@0 2208 .macro upsample32 OUTPTR, INPTR
michael@0 2209 /* even 16 pixels group */
michael@0 2210 vld1.8 {q0}, [\INPTR]!
michael@0 2211 vmovl.u8 q8, d0
michael@0 2212 vext.8 q2, q1, q0, #15
michael@0 2213 vmovl.u8 q9, d1
michael@0 2214 vaddw.u8 q10, q15, d4
michael@0 2215 vaddw.u8 q11, q15, d5
michael@0 2216 vmlal.u8 q8, d4, d28
michael@0 2217 vmlal.u8 q9, d5, d28
michael@0 2218 vmlal.u8 q10, d0, d28
michael@0 2219 vmlal.u8 q11, d1, d28
michael@0 2220 /* odd 16 pixels group */
michael@0 2221 vld1.8 {q1}, [\INPTR]!
michael@0 2222 vrshrn.u16 d6, q8, #2
michael@0 2223 vrshrn.u16 d7, q9, #2
michael@0 2224 vshrn.u16 d8, q10, #2
michael@0 2225 vshrn.u16 d9, q11, #2
michael@0 2226 vmovl.u8 q8, d2
michael@0 2227 vext.8 q2, q0, q1, #15
michael@0 2228 vmovl.u8 q9, d3
michael@0 2229 vaddw.u8 q10, q15, d4
michael@0 2230 vaddw.u8 q11, q15, d5
michael@0 2231 vmlal.u8 q8, d4, d28
michael@0 2232 vmlal.u8 q9, d5, d28
michael@0 2233 vmlal.u8 q10, d2, d28
michael@0 2234 vmlal.u8 q11, d3, d28
michael@0 2235 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
michael@0 2236 vrshrn.u16 d6, q8, #2
michael@0 2237 vrshrn.u16 d7, q9, #2
michael@0 2238 vshrn.u16 d8, q10, #2
michael@0 2239 vshrn.u16 d9, q11, #2
michael@0 2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
michael@0 2241 .endm
michael@0 2242
michael@0 2243 /*
michael@0 2244 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
michael@0 2245 */
michael@0 2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
michael@0 2247 /* special case for the first and last pixels */
michael@0 2248 sub \WIDTH, \WIDTH, #1
michael@0 2249 add \OUTPTR, \OUTPTR, #1
michael@0 2250 ldrb \TMP1, [\INPTR, \WIDTH]
michael@0 2251 strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
michael@0 2252 ldrb \TMP1, [\INPTR], #1
michael@0 2253 strb \TMP1, [\OUTPTR, #-1]
michael@0 2254 vmov.8 d3[7], \TMP1
michael@0 2255
michael@0 2256 subs \WIDTH, \WIDTH, #32
michael@0 2257 blt 5f
michael@0 2258 0: /* process 32 pixels per iteration */
michael@0 2259 upsample32 \OUTPTR, \INPTR
michael@0 2260 subs \WIDTH, \WIDTH, #32
michael@0 2261 bge 0b
michael@0 2262 5:
michael@0 2263 adds \WIDTH, \WIDTH, #16
michael@0 2264 blt 1f
michael@0 2265 0: /* process 16 pixels if needed */
michael@0 2266 upsample16 \OUTPTR, \INPTR
michael@0 2267 subs \WIDTH, \WIDTH, #16
michael@0 2268 1:
michael@0 2269 adds \WIDTH, \WIDTH, #16
michael@0 2270 beq 9f
michael@0 2271
michael@0 2272 /* load the remaining 1-15 pixels */
michael@0 2273 add \INPTR, \INPTR, \WIDTH
michael@0 2274 tst \WIDTH, #1
michael@0 2275 beq 2f
michael@0 2276 sub \INPTR, \INPTR, #1
michael@0 2277 vld1.8 {d0[0]}, [\INPTR]
michael@0 2278 2:
michael@0 2279 tst \WIDTH, #2
michael@0 2280 beq 2f
michael@0 2281 vext.8 d0, d0, d0, #6
michael@0 2282 sub \INPTR, \INPTR, #1
michael@0 2283 vld1.8 {d0[1]}, [\INPTR]
michael@0 2284 sub \INPTR, \INPTR, #1
michael@0 2285 vld1.8 {d0[0]}, [\INPTR]
michael@0 2286 2:
michael@0 2287 tst \WIDTH, #4
michael@0 2288 beq 2f
michael@0 2289 vrev64.32 d0, d0
michael@0 2290 sub \INPTR, \INPTR, #1
michael@0 2291 vld1.8 {d0[3]}, [\INPTR]
michael@0 2292 sub \INPTR, \INPTR, #1
michael@0 2293 vld1.8 {d0[2]}, [\INPTR]
michael@0 2294 sub \INPTR, \INPTR, #1
michael@0 2295 vld1.8 {d0[1]}, [\INPTR]
michael@0 2296 sub \INPTR, \INPTR, #1
michael@0 2297 vld1.8 {d0[0]}, [\INPTR]
michael@0 2298 2:
michael@0 2299 tst \WIDTH, #8
michael@0 2300 beq 2f
michael@0 2301 vmov d1, d0
michael@0 2302 sub \INPTR, \INPTR, #8
michael@0 2303 vld1.8 {d0}, [\INPTR]
michael@0 2304 2: /* upsample the remaining pixels */
michael@0 2305 vmovl.u8 q8, d0
michael@0 2306 vext.8 q2, q1, q0, #15
michael@0 2307 vmovl.u8 q9, d1
michael@0 2308 vaddw.u8 q10, q15, d4
michael@0 2309 vaddw.u8 q11, q15, d5
michael@0 2310 vmlal.u8 q8, d4, d28
michael@0 2311 vmlal.u8 q9, d5, d28
michael@0 2312 vmlal.u8 q10, d0, d28
michael@0 2313 vmlal.u8 q11, d1, d28
michael@0 2314 vrshrn.u16 d10, q8, #2
michael@0 2315 vrshrn.u16 d12, q9, #2
michael@0 2316 vshrn.u16 d11, q10, #2
michael@0 2317 vshrn.u16 d13, q11, #2
michael@0 2318 vzip.8 d10, d11
michael@0 2319 vzip.8 d12, d13
michael@0 2320 /* store the remaining pixels */
michael@0 2321 tst \WIDTH, #8
michael@0 2322 beq 2f
michael@0 2323 vst1.8 {d10, d11}, [\OUTPTR]!
michael@0 2324 vmov q5, q6
michael@0 2325 2:
michael@0 2326 tst \WIDTH, #4
michael@0 2327 beq 2f
michael@0 2328 vst1.8 {d10}, [\OUTPTR]!
michael@0 2329 vmov d10, d11
michael@0 2330 2:
michael@0 2331 tst \WIDTH, #2
michael@0 2332 beq 2f
michael@0 2333 vst1.8 {d10[0]}, [\OUTPTR]!
michael@0 2334 vst1.8 {d10[1]}, [\OUTPTR]!
michael@0 2335 vst1.8 {d10[2]}, [\OUTPTR]!
michael@0 2336 vst1.8 {d10[3]}, [\OUTPTR]!
michael@0 2337 vext.8 d10, d10, d10, #4
michael@0 2338 2:
michael@0 2339 tst \WIDTH, #1
michael@0 2340 beq 2f
michael@0 2341 vst1.8 {d10[0]}, [\OUTPTR]!
michael@0 2342 vst1.8 {d10[1]}, [\OUTPTR]!
michael@0 2343 2:
michael@0 2344 9:
michael@0 2345 .endm
michael@0 2346
michael@0 2347 asm_function jsimd_h2v1_fancy_upsample_neon
michael@0 2348
michael@0 2349 MAX_V_SAMP_FACTOR .req r0
michael@0 2350 DOWNSAMPLED_WIDTH .req r1
michael@0 2351 INPUT_DATA .req r2
michael@0 2352 OUTPUT_DATA_PTR .req r3
michael@0 2353 OUTPUT_DATA .req OUTPUT_DATA_PTR
michael@0 2354
michael@0 2355 OUTPTR .req r4
michael@0 2356 INPTR .req r5
michael@0 2357 WIDTH .req ip
michael@0 2358 TMP .req lr
michael@0 2359
michael@0 2360 push {r4, r5, r6, lr}
michael@0 2361 vpush {d8-d15}
michael@0 2362
michael@0 2363 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
michael@0 2364 cmp MAX_V_SAMP_FACTOR, #0
michael@0 2365 ble 99f
michael@0 2366
michael@0 2367 /* initialize constants */
michael@0 2368 vmov.u8 d28, #3
michael@0 2369 vmov.u16 q15, #1
michael@0 2370 11:
michael@0 2371 ldr INPTR, [INPUT_DATA], #4
michael@0 2372 ldr OUTPTR, [OUTPUT_DATA], #4
michael@0 2373 mov WIDTH, DOWNSAMPLED_WIDTH
michael@0 2374 upsample_row OUTPTR, INPTR, WIDTH, TMP
michael@0 2375 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
michael@0 2376 bgt 11b
michael@0 2377
michael@0 2378 99:
michael@0 2379 vpop {d8-d15}
michael@0 2380 pop {r4, r5, r6, pc}
michael@0 2381
michael@0 2382 .unreq MAX_V_SAMP_FACTOR
michael@0 2383 .unreq DOWNSAMPLED_WIDTH
michael@0 2384 .unreq INPUT_DATA
michael@0 2385 .unreq OUTPUT_DATA_PTR
michael@0 2386 .unreq OUTPUT_DATA
michael@0 2387
michael@0 2388 .unreq OUTPTR
michael@0 2389 .unreq INPTR
michael@0 2390 .unreq WIDTH
michael@0 2391 .unreq TMP
michael@0 2392
michael@0 2393 .endfunc
michael@0 2394
michael@0 2395 .purgem upsample16
michael@0 2396 .purgem upsample32
michael@0 2397 .purgem upsample_row
