Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | * ARM NEON optimizations for libjpeg-turbo |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
michael@0 | 5 | * All rights reserved. |
michael@0 | 6 | * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
michael@0 | 7 | * |
michael@0 | 8 | * This software is provided 'as-is', without any express or implied |
michael@0 | 9 | * warranty. In no event will the authors be held liable for any damages |
michael@0 | 10 | * arising from the use of this software. |
michael@0 | 11 | * |
michael@0 | 12 | * Permission is granted to anyone to use this software for any purpose, |
michael@0 | 13 | * including commercial applications, and to alter it and redistribute it |
michael@0 | 14 | * freely, subject to the following restrictions: |
michael@0 | 15 | * |
michael@0 | 16 | * 1. The origin of this software must not be misrepresented; you must not |
michael@0 | 17 | * claim that you wrote the original software. If you use this software |
michael@0 | 18 | * in a product, an acknowledgment in the product documentation would be |
michael@0 | 19 | * appreciated but is not required. |
michael@0 | 20 | * 2. Altered source versions must be plainly marked as such, and must not be |
michael@0 | 21 | * misrepresented as being the original software. |
michael@0 | 22 | * 3. This notice may not be removed or altered from any source distribution. |
michael@0 | 23 | */ |
michael@0 | 24 | |
michael@0 | 25 | #if defined(__linux__) && defined(__ELF__) |
michael@0 | 26 | .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
michael@0 | 27 | #endif |
michael@0 | 28 | |
michael@0 | 29 | .text |
michael@0 | 30 | .fpu neon |
michael@0 | 31 | .arch armv7a |
michael@0 | 32 | .object_arch armv4 |
michael@0 | 33 | .arm |
michael@0 | 34 | |
michael@0 | 35 | |
michael@0 | 36 | #define RESPECT_STRICT_ALIGNMENT 1 |
michael@0 | 37 | |
michael@0 | 38 | /*****************************************************************************/ |
michael@0 | 39 | |
michael@0 | 40 | /* Supplementary macro for setting function attributes */ |
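/* On Mach-O (__APPLE__) targets the symbol gets the leading underscore that
 * the platform ABI expects; on ELF targets it is additionally marked
 * .hidden (so it is not exported from the shared library) and typed as
 * %function. */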
michael@0 | 41 | .macro asm_function fname |
michael@0 | 42 | #ifdef __APPLE__ |
michael@0 | 43 | .func _\fname |
michael@0 | 44 | .globl _\fname |
michael@0 | 45 | _\fname: |
michael@0 | 46 | #else |
michael@0 | 47 | .func \fname |
michael@0 | 48 | .global \fname |
michael@0 | 49 | #ifdef __ELF__ |
michael@0 | 50 | .hidden \fname |
michael@0 | 51 | .type \fname, %function |
michael@0 | 52 | #endif |
michael@0 | 53 | \fname: |
michael@0 | 54 | #endif |
michael@0 | 55 | .endm |
michael@0 | 56 | |
michael@0 | 57 | /* Transpose a 4x4 block of 16-bit coefficients held in four 64-bit registers */ |
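/* vtrn.16 exchanges the odd 16-bit lanes of its first operand with the even
 * lanes of its second (a 2x2 transpose of adjacent element pairs); the
 * following vtrn.32 does the same at 32-bit granularity across the register
 * pairs, and the combined effect is a full 4x4 transpose. */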
michael@0 | 58 | .macro transpose_4x4 x0, x1, x2, x3 |
michael@0 | 59 | vtrn.16 \x0, \x1 |
michael@0 | 60 | vtrn.16 \x2, \x3 |
michael@0 | 61 | vtrn.32 \x0, \x2 |
michael@0 | 62 | vtrn.32 \x1, \x3 |
michael@0 | 63 | .endm |
michael@0 | 64 | |
michael@0 | 65 | #define CENTERJSAMPLE 128 |
michael@0 | 66 | |
michael@0 | 67 | /*****************************************************************************/ |
michael@0 | 68 | |
michael@0 | 69 | /* |
michael@0 | 70 | * Perform dequantization and inverse DCT on one block of coefficients. |
michael@0 | 71 | * |
michael@0 | 72 | * GLOBAL(void) |
michael@0 | 73 | * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, |
michael@0 | 74 | * JSAMPARRAY output_buf, JDIMENSION output_col) |
michael@0 | 75 | */ |
michael@0 | 76 | |
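/* The FIX_* constants below are the jpeg_idct_islow() multipliers in 13-bit
 * fixed point, i.e. FIX(x) = round(x * 2^13); for example,
 * FIX_0_541196100 = round(0.541196100 * 8192) = 4433. Pass 1 descales its
 * results by CONST_BITS - PASS1_BITS = 11 bits (the vrshrn #11 below), and
 * pass 2 by CONST_BITS + PASS1_BITS + 3 = 18 bits (vshrn #16 followed by the
 * final vqrshrn #2). */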
michael@0 | 77 | #define FIX_0_298631336 (2446) |
michael@0 | 78 | #define FIX_0_390180644 (3196) |
michael@0 | 79 | #define FIX_0_541196100 (4433) |
michael@0 | 80 | #define FIX_0_765366865 (6270) |
michael@0 | 81 | #define FIX_0_899976223 (7373) |
michael@0 | 82 | #define FIX_1_175875602 (9633) |
michael@0 | 83 | #define FIX_1_501321110 (12299) |
michael@0 | 84 | #define FIX_1_847759065 (15137) |
michael@0 | 85 | #define FIX_1_961570560 (16069) |
michael@0 | 86 | #define FIX_2_053119869 (16819) |
michael@0 | 87 | #define FIX_2_562915447 (20995) |
michael@0 | 88 | #define FIX_3_072711026 (25172) |
michael@0 | 89 | |
michael@0 | 90 | #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
michael@0 | 91 | #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
michael@0 | 92 | #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
michael@0 | 93 | #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
michael@0 | 94 | #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
michael@0 | 95 | #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
michael@0 | 96 | #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
michael@0 | 97 | #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
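/* The precombined constants above fold the shared-multiplier factorization
 * used by jpeg_idct_islow() into plain per-term multiply-accumulates, which
 * map directly onto vmull/vmlal with a per-lane constant. For example,
 * MULTIPLY(row2, FIX_0_541196100) +
 * MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065)
 * equals the reference form
 * MULTIPLY(row2 + row6, FIX_0_541196100) - MULTIPLY(row6, FIX_1_847759065). */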
michael@0 | 98 | |
michael@0 | 99 | /* |
michael@0 | 100 | * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
michael@0 | 101 | * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
michael@0 | 102 | */ |
michael@0 | 103 | #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
michael@0 | 104 | { \ |
michael@0 | 105 | DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
michael@0 | 106 | INT32 q1, q2, q3, q4, q5, q6, q7; \ |
michael@0 | 107 | INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
michael@0 | 108 | \ |
michael@0 | 109 | /* 1-D iDCT input data */ \ |
michael@0 | 110 | row0 = xrow0; \ |
michael@0 | 111 | row1 = xrow1; \ |
michael@0 | 112 | row2 = xrow2; \ |
michael@0 | 113 | row3 = xrow3; \ |
michael@0 | 114 | row4 = xrow4; \ |
michael@0 | 115 | row5 = xrow5; \ |
michael@0 | 116 | row6 = xrow6; \ |
michael@0 | 117 | row7 = xrow7; \ |
michael@0 | 118 | \ |
michael@0 | 119 | q5 = row7 + row3; \ |
michael@0 | 120 | q4 = row5 + row1; \ |
michael@0 | 121 | q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
michael@0 | 122 | MULTIPLY(q4, FIX_1_175875602); \ |
michael@0 | 123 | q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
michael@0 | 124 | MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
michael@0 | 125 | q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
michael@0 | 126 | MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
michael@0 | 127 | q4 = q6; \ |
michael@0 | 128 | q3 = ((INT32) row0 - (INT32) row4) << 13; \ |
michael@0 | 129 | q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
michael@0 | 130 | MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
michael@0 | 131 | /* now we can use q1 (reloadable constants have been used up) */ \ |
michael@0 | 132 | q1 = q3 + q2; \ |
michael@0 | 133 | q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
michael@0 | 134 | MULTIPLY(row1, -FIX_0_899976223); \ |
michael@0 | 135 | q5 = q7; \ |
michael@0 | 136 | q1 = q1 + q6; \ |
michael@0 | 137 | q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
michael@0 | 138 | MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
michael@0 | 139 | \ |
michael@0 | 140 | /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
michael@0 | 141 | tmp11_plus_tmp2 = q1; \ |
michael@0 | 142 | row1 = 0; \ |
michael@0 | 143 | \ |
michael@0 | 144 | q1 = q1 - q6; \ |
michael@0 | 145 | q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
michael@0 | 146 | MULTIPLY(row3, -FIX_2_562915447); \ |
michael@0 | 147 | q1 = q1 - q6; \ |
michael@0 | 148 | q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
michael@0 | 149 | MULTIPLY(row6, FIX_0_541196100); \ |
michael@0 | 150 | q3 = q3 - q2; \ |
michael@0 | 151 | \ |
michael@0 | 152 | /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
michael@0 | 153 | tmp11_minus_tmp2 = q1; \ |
michael@0 | 154 | \ |
michael@0 | 155 | q1 = ((INT32) row0 + (INT32) row4) << 13; \ |
michael@0 | 156 | q2 = q1 + q6; \ |
michael@0 | 157 | q1 = q1 - q6; \ |
michael@0 | 158 | \ |
michael@0 | 159 | /* pick up the results */ \ |
michael@0 | 160 | tmp0 = q4; \ |
michael@0 | 161 | tmp1 = q5; \ |
michael@0 | 162 | tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
michael@0 | 163 | tmp3 = q7; \ |
michael@0 | 164 | tmp10 = q2; \ |
michael@0 | 165 | tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
michael@0 | 166 | tmp12 = q3; \ |
michael@0 | 167 | tmp13 = q1; \ |
michael@0 | 168 | } |
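/* Note: REF_1D_IDCT is never expanded anywhere in this file. It is kept
 * purely as readable documentation of the operation order that the NEON
 * code below follows, with the q1-q7 temporaries mirroring the NEON
 * q-register allocation. */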
michael@0 | 169 | |
michael@0 | 170 | #define XFIX_0_899976223 d0[0] |
michael@0 | 171 | #define XFIX_0_541196100 d0[1] |
michael@0 | 172 | #define XFIX_2_562915447 d0[2] |
michael@0 | 173 | #define XFIX_0_298631336_MINUS_0_899976223 d0[3] |
michael@0 | 174 | #define XFIX_1_501321110_MINUS_0_899976223 d1[0] |
michael@0 | 175 | #define XFIX_2_053119869_MINUS_2_562915447 d1[1] |
michael@0 | 176 | #define XFIX_0_541196100_PLUS_0_765366865 d1[2] |
michael@0 | 177 | #define XFIX_1_175875602 d1[3] |
michael@0 | 178 | #define XFIX_1_175875602_MINUS_0_390180644 d2[0] |
michael@0 | 179 | #define XFIX_0_541196100_MINUS_1_847759065 d2[1] |
michael@0 | 180 | #define XFIX_3_072711026_MINUS_2_562915447 d2[2] |
michael@0 | 181 | #define XFIX_1_175875602_MINUS_1_961570560 d2[3] |
michael@0 | 182 | |
michael@0 | 183 | .balign 16 |
michael@0 | 184 | jsimd_idct_islow_neon_consts: |
michael@0 | 185 | .short FIX_0_899976223 /* d0[0] */ |
michael@0 | 186 | .short FIX_0_541196100 /* d0[1] */ |
michael@0 | 187 | .short FIX_2_562915447 /* d0[2] */ |
michael@0 | 188 | .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
michael@0 | 189 | .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
michael@0 | 190 | .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
michael@0 | 191 | .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
michael@0 | 192 | .short FIX_1_175875602 /* d1[3] */ |
michael@0 | 193 | /* reloadable constants */ |
michael@0 | 194 | .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
michael@0 | 195 | .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
michael@0 | 196 | .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
michael@0 | 197 | .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
michael@0 | 198 | |
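/* Only d0 and d1 stay resident for the whole function; d2 (the "reloadable
 * constants" above) is clobbered whenever q1 is used as a scratch register,
 * which is why it is reloaded from [ip] before each subsequent 4x8 half is
 * processed. */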
michael@0 | 199 | asm_function jsimd_idct_islow_neon |
michael@0 | 200 | |
michael@0 | 201 | DCT_TABLE .req r0 |
michael@0 | 202 | COEF_BLOCK .req r1 |
michael@0 | 203 | OUTPUT_BUF .req r2 |
michael@0 | 204 | OUTPUT_COL .req r3 |
michael@0 | 205 | TMP1 .req r0 |
michael@0 | 206 | TMP2 .req r1 |
michael@0 | 207 | TMP3 .req r2 |
michael@0 | 208 | TMP4 .req ip |
michael@0 | 209 | |
michael@0 | 210 | ROW0L .req d16 |
michael@0 | 211 | ROW0R .req d17 |
michael@0 | 212 | ROW1L .req d18 |
michael@0 | 213 | ROW1R .req d19 |
michael@0 | 214 | ROW2L .req d20 |
michael@0 | 215 | ROW2R .req d21 |
michael@0 | 216 | ROW3L .req d22 |
michael@0 | 217 | ROW3R .req d23 |
michael@0 | 218 | ROW4L .req d24 |
michael@0 | 219 | ROW4R .req d25 |
michael@0 | 220 | ROW5L .req d26 |
michael@0 | 221 | ROW5R .req d27 |
michael@0 | 222 | ROW6L .req d28 |
michael@0 | 223 | ROW6R .req d29 |
michael@0 | 224 | ROW7L .req d30 |
michael@0 | 225 | ROW7R .req d31 |
michael@0 | 226 | |
michael@0 | 227 | /* Load and dequantize coefficients into NEON registers |
michael@0 | 228 | * with the following allocation: |
michael@0 | 229 | * 0 1 2 3 | 4 5 6 7 |
michael@0 | 230 | * ---------+-------- |
michael@0 | 231 | * 0 | d16 | d17 ( q8 ) |
michael@0 | 232 | * 1 | d18 | d19 ( q9 ) |
michael@0 | 233 | * 2 | d20 | d21 ( q10 ) |
michael@0 | 234 | * 3 | d22 | d23 ( q11 ) |
michael@0 | 235 | * 4 | d24 | d25 ( q12 ) |
michael@0 | 236 | * 5 | d26 | d27 ( q13 ) |
michael@0 | 237 | * 6 | d28 | d29 ( q14 ) |
michael@0 | 238 | * 7 | d30 | d31 ( q15 ) |
michael@0 | 239 | */ |
michael@0 | 240 | adr ip, jsimd_idct_islow_neon_consts |
michael@0 | 241 | vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
michael@0 | 242 | vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
michael@0 | 243 | vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
michael@0 | 244 | vmul.s16 q8, q8, q0 |
michael@0 | 245 | vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
michael@0 | 246 | vmul.s16 q9, q9, q1 |
michael@0 | 247 | vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
michael@0 | 248 | vmul.s16 q10, q10, q2 |
michael@0 | 249 | vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
michael@0 | 250 | vmul.s16 q11, q11, q3 |
michael@0 | 251 | vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
michael@0 | 252 | vmul.s16 q12, q12, q0 |
michael@0 | 253 | vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
michael@0 | 254 | vmul.s16 q14, q14, q2 |
michael@0 | 255 | vmul.s16 q13, q13, q1 |
michael@0 | 256 | vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ |
michael@0 | 257 | add ip, ip, #16 |
michael@0 | 258 | vmul.s16 q15, q15, q3 |
michael@0 | 259 | vpush {d8-d15} /* save NEON registers */ |
michael@0 | 260 | /* 1-D IDCT, pass 1, left 4x8 half */ |
michael@0 | 261 | vadd.s16 d4, ROW7L, ROW3L |
michael@0 | 262 | vadd.s16 d5, ROW5L, ROW1L |
michael@0 | 263 | vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 264 | vmlal.s16 q6, d5, XFIX_1_175875602 |
michael@0 | 265 | vmull.s16 q7, d4, XFIX_1_175875602 |
michael@0 | 266 | /* Check for the zero coefficients in the right 4x8 half */ |
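/* After the three post-incremented vld1 loads from COEF_BLOCK above,
 * COEF_BLOCK points 96 bytes into the coefficient block, so the offset
 * -96 + 2 * (4 + n * 8) addresses column 4 of row n; each ldrd below
 * therefore fetches columns 4-7 of one row, and the results are OR-ed
 * together to detect an all-zero right half. */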
michael@0 | 267 | push {r4, r5} |
michael@0 | 268 | vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 269 | vsubl.s16 q3, ROW0L, ROW4L |
michael@0 | 270 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
michael@0 | 271 | vmull.s16 q2, ROW2L, XFIX_0_541196100 |
michael@0 | 272 | vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 |
michael@0 | 273 | orr r0, r4, r5 |
michael@0 | 274 | vmov q4, q6 |
michael@0 | 275 | vmlsl.s16 q6, ROW5L, XFIX_2_562915447 |
michael@0 | 276 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
michael@0 | 277 | vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
michael@0 | 278 | vshl.s32 q3, q3, #13 |
michael@0 | 279 | orr r0, r0, r4 |
michael@0 | 280 | vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
michael@0 | 281 | orr r0, r0, r5 |
michael@0 | 282 | vadd.s32 q1, q3, q2 |
michael@0 | 283 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
michael@0 | 284 | vmov q5, q7 |
michael@0 | 285 | vadd.s32 q1, q1, q6 |
michael@0 | 286 | orr r0, r0, r4 |
michael@0 | 287 | vmlsl.s16 q7, ROW7L, XFIX_0_899976223 |
michael@0 | 288 | orr r0, r0, r5 |
michael@0 | 289 | vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
michael@0 | 290 | vrshrn.s32 ROW1L, q1, #11 |
michael@0 | 291 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
michael@0 | 292 | vsub.s32 q1, q1, q6 |
michael@0 | 293 | vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 |
michael@0 | 294 | orr r0, r0, r4 |
michael@0 | 295 | vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
michael@0 | 296 | orr r0, r0, r5 |
michael@0 | 297 | vsub.s32 q1, q1, q6 |
michael@0 | 298 | vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
michael@0 | 299 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
michael@0 | 300 | vmlal.s16 q6, ROW6L, XFIX_0_541196100 |
michael@0 | 301 | vsub.s32 q3, q3, q2 |
michael@0 | 302 | orr r0, r0, r4 |
michael@0 | 303 | vrshrn.s32 ROW6L, q1, #11 |
michael@0 | 304 | orr r0, r0, r5 |
michael@0 | 305 | vadd.s32 q1, q3, q5 |
michael@0 | 306 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
michael@0 | 307 | vsub.s32 q3, q3, q5 |
michael@0 | 308 | vaddl.s16 q5, ROW0L, ROW4L |
michael@0 | 309 | orr r0, r0, r4 |
michael@0 | 310 | vrshrn.s32 ROW2L, q1, #11 |
michael@0 | 311 | orr r0, r0, r5 |
michael@0 | 312 | vrshrn.s32 ROW5L, q3, #11 |
michael@0 | 313 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
michael@0 | 314 | vshl.s32 q5, q5, #13 |
michael@0 | 315 | vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 |
michael@0 | 316 | orr r0, r0, r4 |
michael@0 | 317 | vadd.s32 q2, q5, q6 |
michael@0 | 318 | orrs r0, r0, r5 |
michael@0 | 319 | vsub.s32 q1, q5, q6 |
michael@0 | 320 | vadd.s32 q6, q2, q7 |
michael@0 | 321 | ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
michael@0 | 322 | vsub.s32 q2, q2, q7 |
michael@0 | 323 | vadd.s32 q5, q1, q4 |
michael@0 | 324 | orr r0, r4, r5 |
michael@0 | 325 | vsub.s32 q3, q1, q4 |
michael@0 | 326 | pop {r4, r5} |
michael@0 | 327 | vrshrn.s32 ROW7L, q2, #11 |
michael@0 | 328 | vrshrn.s32 ROW3L, q5, #11 |
michael@0 | 329 | vrshrn.s32 ROW0L, q6, #11 |
michael@0 | 330 | vrshrn.s32 ROW4L, q3, #11 |
michael@0 | 331 | |
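/* The beq below tests the flags set by the "orrs" above; none of the
 * intervening instructions (the ldrd, the flag-less "orr", the pop and the
 * NEON arithmetic) modify the condition flags, so the branch is taken
 * exactly when rows 1-7 of the right 4x8 half are all zero. */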
michael@0 | 332 | beq 3f /* Go to special handling of the sparse right 4x8 half */ |
michael@0 | 333 | |
michael@0 | 334 | /* 1-D IDCT, pass 1, right 4x8 half */ |
michael@0 | 335 | vld1.s16 {d2}, [ip, :64] /* reload constants */ |
michael@0 | 336 | vadd.s16 d10, ROW7R, ROW3R |
michael@0 | 337 | vadd.s16 d8, ROW5R, ROW1R |
michael@0 | 338 | /* Transpose left 4x8 half */ |
michael@0 | 339 | vtrn.16 ROW6L, ROW7L |
michael@0 | 340 | vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 341 | vmlal.s16 q6, d8, XFIX_1_175875602 |
michael@0 | 342 | vtrn.16 ROW2L, ROW3L |
michael@0 | 343 | vmull.s16 q7, d10, XFIX_1_175875602 |
michael@0 | 344 | vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 345 | vtrn.16 ROW0L, ROW1L |
michael@0 | 346 | vsubl.s16 q3, ROW0R, ROW4R |
michael@0 | 347 | vmull.s16 q2, ROW2R, XFIX_0_541196100 |
michael@0 | 348 | vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
michael@0 | 349 | vtrn.16 ROW4L, ROW5L |
michael@0 | 350 | vmov q4, q6 |
michael@0 | 351 | vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
michael@0 | 352 | vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 |
michael@0 | 353 | vtrn.32 ROW1L, ROW3L |
michael@0 | 354 | vshl.s32 q3, q3, #13 |
michael@0 | 355 | vmlsl.s16 q4, ROW1R, XFIX_0_899976223 |
michael@0 | 356 | vtrn.32 ROW4L, ROW6L |
michael@0 | 357 | vadd.s32 q1, q3, q2 |
michael@0 | 358 | vmov q5, q7 |
michael@0 | 359 | vadd.s32 q1, q1, q6 |
michael@0 | 360 | vtrn.32 ROW0L, ROW2L |
michael@0 | 361 | vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
michael@0 | 362 | vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 |
michael@0 | 363 | vrshrn.s32 ROW1R, q1, #11 |
michael@0 | 364 | vtrn.32 ROW5L, ROW7L |
michael@0 | 365 | vsub.s32 q1, q1, q6 |
michael@0 | 366 | vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
michael@0 | 367 | vmlsl.s16 q5, ROW3R, XFIX_2_562915447 |
michael@0 | 368 | vsub.s32 q1, q1, q6 |
michael@0 | 369 | vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 |
michael@0 | 370 | vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
michael@0 | 371 | vsub.s32 q3, q3, q2 |
michael@0 | 372 | vrshrn.s32 ROW6R, q1, #11 |
michael@0 | 373 | vadd.s32 q1, q3, q5 |
michael@0 | 374 | vsub.s32 q3, q3, q5 |
michael@0 | 375 | vaddl.s16 q5, ROW0R, ROW4R |
michael@0 | 376 | vrshrn.s32 ROW2R, q1, #11 |
michael@0 | 377 | vrshrn.s32 ROW5R, q3, #11 |
michael@0 | 378 | vshl.s32 q5, q5, #13 |
michael@0 | 379 | vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
michael@0 | 380 | vadd.s32 q2, q5, q6 |
michael@0 | 381 | vsub.s32 q1, q5, q6 |
michael@0 | 382 | vadd.s32 q6, q2, q7 |
michael@0 | 383 | vsub.s32 q2, q2, q7 |
michael@0 | 384 | vadd.s32 q5, q1, q4 |
michael@0 | 385 | vsub.s32 q3, q1, q4 |
michael@0 | 386 | vrshrn.s32 ROW7R, q2, #11 |
michael@0 | 387 | vrshrn.s32 ROW3R, q5, #11 |
michael@0 | 388 | vrshrn.s32 ROW0R, q6, #11 |
michael@0 | 389 | vrshrn.s32 ROW4R, q3, #11 |
michael@0 | 390 | /* Transpose right 4x8 half */ |
michael@0 | 391 | vtrn.16 ROW6R, ROW7R |
michael@0 | 392 | vtrn.16 ROW2R, ROW3R |
michael@0 | 393 | vtrn.16 ROW0R, ROW1R |
michael@0 | 394 | vtrn.16 ROW4R, ROW5R |
michael@0 | 395 | vtrn.32 ROW1R, ROW3R |
michael@0 | 396 | vtrn.32 ROW4R, ROW6R |
michael@0 | 397 | vtrn.32 ROW0R, ROW2R |
michael@0 | 398 | vtrn.32 ROW5R, ROW7R |
michael@0 | 399 | |
michael@0 | 400 | 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
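/* Note: the ROWnL/ROWnR aliases keep naming the same physical registers as
 * in pass 1. The two 4x8 halves were transposed in place, so the quadrant
 * swap that a full 8x8 transpose would perform is folded into the operand
 * choice below instead; the inline "ROWxL <-> ROWyR" comments record which
 * physical register actually holds the logical row that pass 1 called ROWxL
 * (e.g. logical row 5 of the left half now lives in ROW1R). */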
michael@0 | 401 | vld1.s16 {d2}, [ip, :64] /* reload constants */ |
michael@0 | 402 | vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
michael@0 | 403 | vmlal.s16 q6, ROW1L, XFIX_1_175875602 |
michael@0 | 404 | vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
michael@0 | 405 | vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 406 | vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
michael@0 | 407 | vmlal.s16 q7, ROW3L, XFIX_1_175875602 |
michael@0 | 408 | vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
michael@0 | 409 | vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 410 | vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
michael@0 | 411 | vmull.s16 q2, ROW2L, XFIX_0_541196100 |
michael@0 | 412 | vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ |
michael@0 | 413 | vmov q4, q6 |
michael@0 | 414 | vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ |
michael@0 | 415 | vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
michael@0 | 416 | vshl.s32 q3, q3, #13 |
michael@0 | 417 | vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
michael@0 | 418 | vadd.s32 q1, q3, q2 |
michael@0 | 419 | vmov q5, q7 |
michael@0 | 420 | vadd.s32 q1, q1, q6 |
michael@0 | 421 | vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ |
michael@0 | 422 | vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
michael@0 | 423 | vshrn.s32 ROW1L, q1, #16 |
michael@0 | 424 | vsub.s32 q1, q1, q6 |
michael@0 | 425 | vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ |
michael@0 | 426 | vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
michael@0 | 427 | vsub.s32 q1, q1, q6 |
michael@0 | 428 | vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
michael@0 | 429 | vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
michael@0 | 430 | vsub.s32 q3, q3, q2 |
michael@0 | 431 | vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
michael@0 | 432 | vadd.s32 q1, q3, q5 |
michael@0 | 433 | vsub.s32 q3, q3, q5 |
michael@0 | 434 | vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
michael@0 | 435 | vshrn.s32 ROW2L, q1, #16 |
michael@0 | 436 | vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
michael@0 | 437 | vshl.s32 q5, q5, #13 |
michael@0 | 438 | vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ |
michael@0 | 439 | vadd.s32 q2, q5, q6 |
michael@0 | 440 | vsub.s32 q1, q5, q6 |
michael@0 | 441 | vadd.s32 q6, q2, q7 |
michael@0 | 442 | vsub.s32 q2, q2, q7 |
michael@0 | 443 | vadd.s32 q5, q1, q4 |
michael@0 | 444 | vsub.s32 q3, q1, q4 |
michael@0 | 445 | vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
michael@0 | 446 | vshrn.s32 ROW3L, q5, #16 |
michael@0 | 447 | vshrn.s32 ROW0L, q6, #16 |
michael@0 | 448 | vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
michael@0 | 449 | /* 1-D IDCT, pass 2, right 4x8 half */ |
michael@0 | 450 | vld1.s16 {d2}, [ip, :64] /* reload constants */ |
michael@0 | 451 | vmull.s16 q6, ROW5R, XFIX_1_175875602 |
michael@0 | 452 | vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
michael@0 | 453 | vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 454 | vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
michael@0 | 455 | vmull.s16 q7, ROW7R, XFIX_1_175875602 |
michael@0 | 456 | vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
michael@0 | 457 | vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 458 | vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
michael@0 | 459 | vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
michael@0 | 460 | vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
michael@0 | 461 | vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
michael@0 | 462 | vmov q4, q6 |
michael@0 | 463 | vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
michael@0 | 464 | vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ |
michael@0 | 465 | vshl.s32 q3, q3, #13 |
michael@0 | 466 | vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ |
michael@0 | 467 | vadd.s32 q1, q3, q2 |
michael@0 | 468 | vmov q5, q7 |
michael@0 | 469 | vadd.s32 q1, q1, q6 |
michael@0 | 470 | vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
michael@0 | 471 | vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ |
michael@0 | 472 | vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
michael@0 | 473 | vsub.s32 q1, q1, q6 |
michael@0 | 474 | vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
michael@0 | 475 | vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ |
michael@0 | 476 | vsub.s32 q1, q1, q6 |
michael@0 | 477 | vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ |
michael@0 | 478 | vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
michael@0 | 479 | vsub.s32 q3, q3, q2 |
michael@0 | 480 | vshrn.s32 ROW6R, q1, #16 |
michael@0 | 481 | vadd.s32 q1, q3, q5 |
michael@0 | 482 | vsub.s32 q3, q3, q5 |
michael@0 | 483 | vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
michael@0 | 484 | vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
michael@0 | 485 | vshrn.s32 ROW5R, q3, #16 |
michael@0 | 486 | vshl.s32 q5, q5, #13 |
michael@0 | 487 | vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
michael@0 | 488 | vadd.s32 q2, q5, q6 |
michael@0 | 489 | vsub.s32 q1, q5, q6 |
michael@0 | 490 | vadd.s32 q6, q2, q7 |
michael@0 | 491 | vsub.s32 q2, q2, q7 |
michael@0 | 492 | vadd.s32 q5, q1, q4 |
michael@0 | 493 | vsub.s32 q3, q1, q4 |
michael@0 | 494 | vshrn.s32 ROW7R, q2, #16 |
michael@0 | 495 | vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
michael@0 | 496 | vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
michael@0 | 497 | vshrn.s32 ROW4R, q3, #16 |
michael@0 | 498 | |
michael@0 | 499 | 2: /* Descale to 8-bit and range limit */ |
michael@0 | 500 | vqrshrn.s16 d16, q8, #2 |
michael@0 | 501 | vqrshrn.s16 d17, q9, #2 |
michael@0 | 502 | vqrshrn.s16 d18, q10, #2 |
michael@0 | 503 | vqrshrn.s16 d19, q11, #2 |
michael@0 | 504 | vpop {d8-d15} /* restore NEON registers */ |
michael@0 | 505 | vqrshrn.s16 d20, q12, #2 |
michael@0 | 506 | /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
michael@0 | 507 | vtrn.16 q8, q9 |
michael@0 | 508 | vqrshrn.s16 d21, q13, #2 |
michael@0 | 509 | vqrshrn.s16 d22, q14, #2 |
michael@0 | 510 | vmov.u8 q0, #(CENTERJSAMPLE) |
michael@0 | 511 | vqrshrn.s16 d23, q15, #2 |
michael@0 | 512 | vtrn.8 d16, d17 |
michael@0 | 513 | vtrn.8 d18, d19 |
michael@0 | 514 | vadd.u8 q8, q8, q0 |
michael@0 | 515 | vadd.u8 q9, q9, q0 |
michael@0 | 516 | vtrn.16 q10, q11 |
michael@0 | 517 | /* Store results to the output buffer */ |
michael@0 | 518 | ldmia OUTPUT_BUF!, {TMP1, TMP2} |
michael@0 | 519 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 520 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 521 | vst1.8 {d16}, [TMP1] |
michael@0 | 522 | vtrn.8 d20, d21 |
michael@0 | 523 | vst1.8 {d17}, [TMP2] |
michael@0 | 524 | ldmia OUTPUT_BUF!, {TMP1, TMP2} |
michael@0 | 525 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 526 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 527 | vst1.8 {d18}, [TMP1] |
michael@0 | 528 | vadd.u8 q10, q10, q0 |
michael@0 | 529 | vst1.8 {d19}, [TMP2] |
michael@0 | 530 | ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
michael@0 | 531 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 532 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 533 | add TMP3, TMP3, OUTPUT_COL |
michael@0 | 534 | add TMP4, TMP4, OUTPUT_COL |
michael@0 | 535 | vtrn.8 d22, d23 |
michael@0 | 536 | vst1.8 {d20}, [TMP1] |
michael@0 | 537 | vadd.u8 q11, q11, q0 |
michael@0 | 538 | vst1.8 {d21}, [TMP2] |
michael@0 | 539 | vst1.8 {d22}, [TMP3] |
michael@0 | 540 | vst1.8 {d23}, [TMP4] |
michael@0 | 541 | bx lr |
michael@0 | 542 | |
michael@0 | 543 | 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
michael@0 | 544 | |
michael@0 | 545 | /* Transpose left 4x8 half */ |
michael@0 | 546 | vtrn.16 ROW6L, ROW7L |
michael@0 | 547 | vtrn.16 ROW2L, ROW3L |
michael@0 | 548 | vtrn.16 ROW0L, ROW1L |
michael@0 | 549 | vtrn.16 ROW4L, ROW5L |
michael@0 | 550 | vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ |
michael@0 | 551 | vtrn.32 ROW1L, ROW3L |
michael@0 | 552 | vtrn.32 ROW4L, ROW6L |
michael@0 | 553 | vtrn.32 ROW0L, ROW2L |
michael@0 | 554 | vtrn.32 ROW5L, ROW7L |
michael@0 | 555 | |
michael@0 | 556 | cmp r0, #0 |
michael@0 | 557 | beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
michael@0 | 558 | |
michael@0 | 559 | /* Only row 0 is non-zero for the right 4x8 half */ |
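/* Columns 4-7 contain only a DC term (their row 0 coefficient), so their
 * pass-1 output is simply that coefficient shifted left by PASS1_BITS
 * (already done above), replicated down the whole column. The vdup
 * instructions below materialize this replicated result directly in the
 * registers where the normal second pass expects the transposed right half,
 * instead of running pass 1 on it. */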
michael@0 | 560 | vdup.s16 ROW1R, ROW0R[1] |
michael@0 | 561 | vdup.s16 ROW2R, ROW0R[2] |
michael@0 | 562 | vdup.s16 ROW3R, ROW0R[3] |
michael@0 | 563 | vdup.s16 ROW4R, ROW0R[0] |
michael@0 | 564 | vdup.s16 ROW5R, ROW0R[1] |
michael@0 | 565 | vdup.s16 ROW6R, ROW0R[2] |
michael@0 | 566 | vdup.s16 ROW7R, ROW0R[3] |
michael@0 | 567 | vdup.s16 ROW0R, ROW0R[0] |
michael@0 | 568 | b 1b /* Go to 'normal' second pass */ |
michael@0 | 569 | |
michael@0 | 570 | 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
michael@0 | 571 | vld1.s16 {d2}, [ip, :64] /* reload constants */ |
michael@0 | 572 | vmull.s16 q6, ROW1L, XFIX_1_175875602 |
michael@0 | 573 | vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 574 | vmull.s16 q7, ROW3L, XFIX_1_175875602 |
michael@0 | 575 | vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 576 | vmull.s16 q2, ROW2L, XFIX_0_541196100 |
michael@0 | 577 | vshll.s16 q3, ROW0L, #13 |
michael@0 | 578 | vmov q4, q6 |
michael@0 | 579 | vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
michael@0 | 580 | vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
michael@0 | 581 | vadd.s32 q1, q3, q2 |
michael@0 | 582 | vmov q5, q7 |
michael@0 | 583 | vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
michael@0 | 584 | vadd.s32 q1, q1, q6 |
michael@0 | 585 | vadd.s32 q6, q6, q6 |
michael@0 | 586 | vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
michael@0 | 587 | vshrn.s32 ROW1L, q1, #16 |
michael@0 | 588 | vsub.s32 q1, q1, q6 |
michael@0 | 589 | vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
michael@0 | 590 | vsub.s32 q3, q3, q2 |
michael@0 | 591 | vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
michael@0 | 592 | vadd.s32 q1, q3, q5 |
michael@0 | 593 | vsub.s32 q3, q3, q5 |
michael@0 | 594 | vshll.s16 q5, ROW0L, #13 |
michael@0 | 595 | vshrn.s32 ROW2L, q1, #16 |
michael@0 | 596 | vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
michael@0 | 597 | vadd.s32 q2, q5, q6 |
michael@0 | 598 | vsub.s32 q1, q5, q6 |
michael@0 | 599 | vadd.s32 q6, q2, q7 |
michael@0 | 600 | vsub.s32 q2, q2, q7 |
michael@0 | 601 | vadd.s32 q5, q1, q4 |
michael@0 | 602 | vsub.s32 q3, q1, q4 |
michael@0 | 603 | vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
michael@0 | 604 | vshrn.s32 ROW3L, q5, #16 |
michael@0 | 605 | vshrn.s32 ROW0L, q6, #16 |
michael@0 | 606 | vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
michael@0 | 607 | /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
michael@0 | 608 | vld1.s16 {d2}, [ip, :64] /* reload constants */ |
michael@0 | 609 | vmull.s16 q6, ROW5L, XFIX_1_175875602 |
michael@0 | 610 | vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 |
michael@0 | 611 | vmull.s16 q7, ROW7L, XFIX_1_175875602 |
michael@0 | 612 | vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 |
michael@0 | 613 | vmull.s16 q2, ROW6L, XFIX_0_541196100 |
michael@0 | 614 | vshll.s16 q3, ROW4L, #13 |
michael@0 | 615 | vmov q4, q6 |
michael@0 | 616 | vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 |
michael@0 | 617 | vmlsl.s16 q4, ROW5L, XFIX_0_899976223 |
michael@0 | 618 | vadd.s32 q1, q3, q2 |
michael@0 | 619 | vmov q5, q7 |
michael@0 | 620 | vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 |
michael@0 | 621 | vadd.s32 q1, q1, q6 |
michael@0 | 622 | vadd.s32 q6, q6, q6 |
michael@0 | 623 | vmlsl.s16 q5, ROW7L, XFIX_2_562915447 |
michael@0 | 624 | vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
michael@0 | 625 | vsub.s32 q1, q1, q6 |
michael@0 | 626 | vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 |
michael@0 | 627 | vsub.s32 q3, q3, q2 |
michael@0 | 628 | vshrn.s32 ROW6R, q1, #16 |
michael@0 | 629 | vadd.s32 q1, q3, q5 |
michael@0 | 630 | vsub.s32 q3, q3, q5 |
michael@0 | 631 | vshll.s16 q5, ROW4L, #13 |
michael@0 | 632 | vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
michael@0 | 633 | vshrn.s32 ROW5R, q3, #16 |
michael@0 | 634 | vadd.s32 q2, q5, q6 |
michael@0 | 635 | vsub.s32 q1, q5, q6 |
michael@0 | 636 | vadd.s32 q6, q2, q7 |
michael@0 | 637 | vsub.s32 q2, q2, q7 |
michael@0 | 638 | vadd.s32 q5, q1, q4 |
michael@0 | 639 | vsub.s32 q3, q1, q4 |
michael@0 | 640 | vshrn.s32 ROW7R, q2, #16 |
michael@0 | 641 | vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
michael@0 | 642 | vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
michael@0 | 643 | vshrn.s32 ROW4R, q3, #16 |
michael@0 | 644 | b 2b /* Go to epilogue */ |
michael@0 | 645 | |
michael@0 | 646 | .unreq DCT_TABLE |
michael@0 | 647 | .unreq COEF_BLOCK |
michael@0 | 648 | .unreq OUTPUT_BUF |
michael@0 | 649 | .unreq OUTPUT_COL |
michael@0 | 650 | .unreq TMP1 |
michael@0 | 651 | .unreq TMP2 |
michael@0 | 652 | .unreq TMP3 |
michael@0 | 653 | .unreq TMP4 |
michael@0 | 654 | |
michael@0 | 655 | .unreq ROW0L |
michael@0 | 656 | .unreq ROW0R |
michael@0 | 657 | .unreq ROW1L |
michael@0 | 658 | .unreq ROW1R |
michael@0 | 659 | .unreq ROW2L |
michael@0 | 660 | .unreq ROW2R |
michael@0 | 661 | .unreq ROW3L |
michael@0 | 662 | .unreq ROW3R |
michael@0 | 663 | .unreq ROW4L |
michael@0 | 664 | .unreq ROW4R |
michael@0 | 665 | .unreq ROW5L |
michael@0 | 666 | .unreq ROW5R |
michael@0 | 667 | .unreq ROW6L |
michael@0 | 668 | .unreq ROW6R |
michael@0 | 669 | .unreq ROW7L |
michael@0 | 670 | .unreq ROW7R |
michael@0 | 671 | .endfunc |
michael@0 | 672 | |
michael@0 | 673 | /*****************************************************************************/ |
michael@0 | 674 | |
michael@0 | 675 | /* |
michael@0 | 676 | * jsimd_idct_ifast_neon |
michael@0 | 677 | * |
michael@0 | 678 | * This function contains a fast but less accurate integer implementation of |
michael@0 | 679 | * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
michael@0 | 680 | * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
michael@0 | 681 | * function from jidctfst.c. |
michael@0 | 682 | * |
michael@0 | 683 | * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions. |
michael@0 | 684 | * In the ARM NEON case, however, some extra additions are required because the |
michael@0 | 685 | * VQDMULH instruction cannot handle constants larger than 1. So expressions |
michael@0 | 686 | * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
michael@0 | 687 | * which introduces an extra addition. Overall, there are 6 extra additions |
michael@0 | 688 | * per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions. |
michael@0 | 689 | */ |
michael@0 | 690 | |
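/* A minimal scalar C sketch of the conversion described above (an editorial
 * illustration, not part of the original sources; wrapped in "#if 0" so the
 * file would still assemble unchanged). VQDMULH performs a saturating Q15
 * multiply, so a multiplier >= 1.0 is split into an integer part, added with
 * plain VADDs, and a fractional part, fed to VQDMULH. */
#if 0
#include <stdint.h>

static int16_t vqdmulh_s16(int16_t a, int16_t b)
{
  int32_t p = ((int32_t) a * b) >> 15;      /* same as (2*a*b) >> 16 */
  return (int16_t) (p > 32767 ? 32767 : p); /* saturate */
}

static int16_t mul_1_414213562(int16_t x)
{
  /* (362 - 256) * 128 is the Q15 fractional part used in the table below */
  return (int16_t) (x + vqdmulh_s16(x, (362 - 256) * 128));
}
#endif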
michael@0 | 691 | #define XFIX_1_082392200 d0[0] |
michael@0 | 692 | #define XFIX_1_414213562 d0[1] |
michael@0 | 693 | #define XFIX_1_847759065 d0[2] |
michael@0 | 694 | #define XFIX_2_613125930 d0[3] |
michael@0 | 695 | |
michael@0 | 696 | .balign 16 |
michael@0 | 697 | jsimd_idct_ifast_neon_consts: |
michael@0 | 698 | .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
michael@0 | 699 | .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
michael@0 | 700 | .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
michael@0 | 701 | .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
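/* Each entry above is the Q15 fractional part of an n/256 approximation of
 * the constant named in the comment; the integer part is added back with
 * explicit vadd instructions in the code below. */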
michael@0 | 702 | |
michael@0 | 703 | asm_function jsimd_idct_ifast_neon |
michael@0 | 704 | |
michael@0 | 705 | DCT_TABLE .req r0 |
michael@0 | 706 | COEF_BLOCK .req r1 |
michael@0 | 707 | OUTPUT_BUF .req r2 |
michael@0 | 708 | OUTPUT_COL .req r3 |
michael@0 | 709 | TMP1 .req r0 |
michael@0 | 710 | TMP2 .req r1 |
michael@0 | 711 | TMP3 .req r2 |
michael@0 | 712 | TMP4 .req ip |
michael@0 | 713 | |
michael@0 | 714 | /* Load and dequantize coefficients into NEON registers |
michael@0 | 715 | * with the following allocation: |
michael@0 | 716 | * 0 1 2 3 | 4 5 6 7 |
michael@0 | 717 | * ---------+-------- |
michael@0 | 718 | * 0 | d16 | d17 ( q8 ) |
michael@0 | 719 | * 1 | d18 | d19 ( q9 ) |
michael@0 | 720 | * 2 | d20 | d21 ( q10 ) |
michael@0 | 721 | * 3 | d22 | d23 ( q11 ) |
michael@0 | 722 | * 4 | d24 | d25 ( q12 ) |
michael@0 | 723 | * 5 | d26 | d27 ( q13 ) |
michael@0 | 724 | * 6 | d28 | d29 ( q14 ) |
michael@0 | 725 | * 7 | d30 | d31 ( q15 ) |
michael@0 | 726 | */ |
michael@0 | 727 | adr ip, jsimd_idct_ifast_neon_consts |
michael@0 | 728 | vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
michael@0 | 729 | vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
michael@0 | 730 | vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
michael@0 | 731 | vmul.s16 q8, q8, q0 |
michael@0 | 732 | vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
michael@0 | 733 | vmul.s16 q9, q9, q1 |
michael@0 | 734 | vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
michael@0 | 735 | vmul.s16 q10, q10, q2 |
michael@0 | 736 | vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
michael@0 | 737 | vmul.s16 q11, q11, q3 |
michael@0 | 738 | vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
michael@0 | 739 | vmul.s16 q12, q12, q0 |
michael@0 | 740 | vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
michael@0 | 741 | vmul.s16 q14, q14, q2 |
michael@0 | 742 | vmul.s16 q13, q13, q1 |
michael@0 | 743 | vld1.16 {d0}, [ip, :64] /* load constants */ |
michael@0 | 744 | vmul.s16 q15, q15, q3 |
michael@0 | 745 | vpush {d8-d13} /* save NEON registers */ |
michael@0 | 746 | /* 1-D IDCT, pass 1 */ |
michael@0 | 747 | vsub.s16 q2, q10, q14 |
michael@0 | 748 | vadd.s16 q14, q10, q14 |
michael@0 | 749 | vsub.s16 q1, q11, q13 |
michael@0 | 750 | vadd.s16 q13, q11, q13 |
michael@0 | 751 | vsub.s16 q5, q9, q15 |
michael@0 | 752 | vadd.s16 q15, q9, q15 |
michael@0 | 753 | vqdmulh.s16 q4, q2, XFIX_1_414213562 |
michael@0 | 754 | vqdmulh.s16 q6, q1, XFIX_2_613125930 |
michael@0 | 755 | vadd.s16 q3, q1, q1 |
michael@0 | 756 | vsub.s16 q1, q5, q1 |
michael@0 | 757 | vadd.s16 q10, q2, q4 |
michael@0 | 758 | vqdmulh.s16 q4, q1, XFIX_1_847759065 |
michael@0 | 759 | vsub.s16 q2, q15, q13 |
michael@0 | 760 | vadd.s16 q3, q3, q6 |
michael@0 | 761 | vqdmulh.s16 q6, q2, XFIX_1_414213562 |
michael@0 | 762 | vadd.s16 q1, q1, q4 |
michael@0 | 763 | vqdmulh.s16 q4, q5, XFIX_1_082392200 |
michael@0 | 764 | vsub.s16 q10, q10, q14 |
michael@0 | 765 | vadd.s16 q2, q2, q6 |
michael@0 | 766 | vsub.s16 q6, q8, q12 |
michael@0 | 767 | vadd.s16 q12, q8, q12 |
michael@0 | 768 | vadd.s16 q9, q5, q4 |
michael@0 | 769 | vadd.s16 q5, q6, q10 |
michael@0 | 770 | vsub.s16 q10, q6, q10 |
michael@0 | 771 | vadd.s16 q6, q15, q13 |
michael@0 | 772 | vadd.s16 q8, q12, q14 |
michael@0 | 773 | vsub.s16 q3, q6, q3 |
michael@0 | 774 | vsub.s16 q12, q12, q14 |
michael@0 | 775 | vsub.s16 q3, q3, q1 |
michael@0 | 776 | vsub.s16 q1, q9, q1 |
michael@0 | 777 | vadd.s16 q2, q3, q2 |
michael@0 | 778 | vsub.s16 q15, q8, q6 |
michael@0 | 779 | vadd.s16 q1, q1, q2 |
michael@0 | 780 | vadd.s16 q8, q8, q6 |
michael@0 | 781 | vadd.s16 q14, q5, q3 |
michael@0 | 782 | vsub.s16 q9, q5, q3 |
michael@0 | 783 | vsub.s16 q13, q10, q2 |
michael@0 | 784 | vadd.s16 q10, q10, q2 |
michael@0 | 785 | /* Transpose */ |
michael@0 | 786 | vtrn.16 q8, q9 |
michael@0 | 787 | vsub.s16 q11, q12, q1 |
michael@0 | 788 | vtrn.16 q14, q15 |
michael@0 | 789 | vadd.s16 q12, q12, q1 |
michael@0 | 790 | vtrn.16 q10, q11 |
michael@0 | 791 | vtrn.16 q12, q13 |
michael@0 | 792 | vtrn.32 q9, q11 |
michael@0 | 793 | vtrn.32 q12, q14 |
michael@0 | 794 | vtrn.32 q8, q10 |
michael@0 | 795 | vtrn.32 q13, q15 |
michael@0 | 796 | vswp d28, d21 |
michael@0 | 797 | vswp d26, d19 |
michael@0 | 798 | /* 1-D IDCT, pass 2 */ |
michael@0 | 799 | vsub.s16 q2, q10, q14 |
michael@0 | 800 | vswp d30, d23 |
michael@0 | 801 | vadd.s16 q14, q10, q14 |
michael@0 | 802 | vswp d24, d17 |
michael@0 | 803 | vsub.s16 q1, q11, q13 |
michael@0 | 804 | vadd.s16 q13, q11, q13 |
michael@0 | 805 | vsub.s16 q5, q9, q15 |
michael@0 | 806 | vadd.s16 q15, q9, q15 |
michael@0 | 807 | vqdmulh.s16 q4, q2, XFIX_1_414213562 |
michael@0 | 808 | vqdmulh.s16 q6, q1, XFIX_2_613125930 |
michael@0 | 809 | vadd.s16 q3, q1, q1 |
michael@0 | 810 | vsub.s16 q1, q5, q1 |
michael@0 | 811 | vadd.s16 q10, q2, q4 |
michael@0 | 812 | vqdmulh.s16 q4, q1, XFIX_1_847759065 |
michael@0 | 813 | vsub.s16 q2, q15, q13 |
michael@0 | 814 | vadd.s16 q3, q3, q6 |
michael@0 | 815 | vqdmulh.s16 q6, q2, XFIX_1_414213562 |
michael@0 | 816 | vadd.s16 q1, q1, q4 |
michael@0 | 817 | vqdmulh.s16 q4, q5, XFIX_1_082392200 |
michael@0 | 818 | vsub.s16 q10, q10, q14 |
michael@0 | 819 | vadd.s16 q2, q2, q6 |
michael@0 | 820 | vsub.s16 q6, q8, q12 |
michael@0 | 821 | vadd.s16 q12, q8, q12 |
michael@0 | 822 | vadd.s16 q9, q5, q4 |
michael@0 | 823 | vadd.s16 q5, q6, q10 |
michael@0 | 824 | vsub.s16 q10, q6, q10 |
michael@0 | 825 | vadd.s16 q6, q15, q13 |
michael@0 | 826 | vadd.s16 q8, q12, q14 |
michael@0 | 827 | vsub.s16 q3, q6, q3 |
michael@0 | 828 | vsub.s16 q12, q12, q14 |
michael@0 | 829 | vsub.s16 q3, q3, q1 |
michael@0 | 830 | vsub.s16 q1, q9, q1 |
michael@0 | 831 | vadd.s16 q2, q3, q2 |
michael@0 | 832 | vsub.s16 q15, q8, q6 |
michael@0 | 833 | vadd.s16 q1, q1, q2 |
michael@0 | 834 | vadd.s16 q8, q8, q6 |
michael@0 | 835 | vadd.s16 q14, q5, q3 |
michael@0 | 836 | vsub.s16 q9, q5, q3 |
michael@0 | 837 | vsub.s16 q13, q10, q2 |
michael@0 | 838 | vpop {d8-d13} /* restore NEON registers */ |
michael@0 | 839 | vadd.s16 q10, q10, q2 |
michael@0 | 840 | vsub.s16 q11, q12, q1 |
michael@0 | 841 | vadd.s16 q12, q12, q1 |
michael@0 | 842 | /* Descale to 8-bit and range limit */ |
michael@0 | 843 | vmov.u8 q0, #0x80 |
michael@0 | 844 | vqshrn.s16 d16, q8, #5 |
michael@0 | 845 | vqshrn.s16 d17, q9, #5 |
michael@0 | 846 | vqshrn.s16 d18, q10, #5 |
michael@0 | 847 | vqshrn.s16 d19, q11, #5 |
michael@0 | 848 | vqshrn.s16 d20, q12, #5 |
michael@0 | 849 | vqshrn.s16 d21, q13, #5 |
michael@0 | 850 | vqshrn.s16 d22, q14, #5 |
michael@0 | 851 | vqshrn.s16 d23, q15, #5 |
michael@0 | 852 | vadd.u8 q8, q8, q0 |
michael@0 | 853 | vadd.u8 q9, q9, q0 |
michael@0 | 854 | vadd.u8 q10, q10, q0 |
michael@0 | 855 | vadd.u8 q11, q11, q0 |
michael@0 | 856 | /* Transpose the final 8-bit samples */ |
michael@0 | 857 | vtrn.16 q8, q9 |
michael@0 | 858 | vtrn.16 q10, q11 |
michael@0 | 859 | vtrn.32 q8, q10 |
michael@0 | 860 | vtrn.32 q9, q11 |
michael@0 | 861 | vtrn.8 d16, d17 |
michael@0 | 862 | vtrn.8 d18, d19 |
michael@0 | 863 | /* Store results to the output buffer */ |
michael@0 | 864 | ldmia OUTPUT_BUF!, {TMP1, TMP2} |
michael@0 | 865 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 866 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 867 | vst1.8 {d16}, [TMP1] |
michael@0 | 868 | vst1.8 {d17}, [TMP2] |
michael@0 | 869 | ldmia OUTPUT_BUF!, {TMP1, TMP2} |
michael@0 | 870 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 871 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 872 | vst1.8 {d18}, [TMP1] |
michael@0 | 873 | vtrn.8 d20, d21 |
michael@0 | 874 | vst1.8 {d19}, [TMP2] |
michael@0 | 875 | ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
michael@0 | 876 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 877 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 878 | add TMP3, TMP3, OUTPUT_COL |
michael@0 | 879 | add TMP4, TMP4, OUTPUT_COL |
michael@0 | 880 | vst1.8 {d20}, [TMP1] |
michael@0 | 881 | vtrn.8 d22, d23 |
michael@0 | 882 | vst1.8 {d21}, [TMP2] |
michael@0 | 883 | vst1.8 {d22}, [TMP3] |
michael@0 | 884 | vst1.8 {d23}, [TMP4] |
michael@0 | 885 | bx lr |
michael@0 | 886 | |
michael@0 | 887 | .unreq DCT_TABLE |
michael@0 | 888 | .unreq COEF_BLOCK |
michael@0 | 889 | .unreq OUTPUT_BUF |
michael@0 | 890 | .unreq OUTPUT_COL |
michael@0 | 891 | .unreq TMP1 |
michael@0 | 892 | .unreq TMP2 |
michael@0 | 893 | .unreq TMP3 |
michael@0 | 894 | .unreq TMP4 |
michael@0 | 895 | .endfunc |
michael@0 | 896 | |
michael@0 | 897 | /*****************************************************************************/ |
michael@0 | 898 | |
michael@0 | 899 | /* |
michael@0 | 900 | * jsimd_idct_4x4_neon |
michael@0 | 901 | * |
michael@0 | 902 | * This function contains inverse-DCT code for getting a reduced-size |
michael@0 | 903 | * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations |
michael@0 | 904 | * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
michael@0 | 905 | * function from jpeg-6b (jidctred.c). |
michael@0 | 906 | * |
michael@0 | 907 | * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which |
michael@0 | 908 | * requires far fewer arithmetic operations and hence should be faster. |
michael@0 | 909 | * The primary purpose of this particular NEON-optimized function is |
michael@0 | 910 | * bit-exact compatibility with jpeg-6b. |
michael@0 | 911 | * |
michael@0 | 912 | * TODO: slightly better instruction scheduling can be achieved by expanding |
michael@0 | 913 | * the idct_helper/transpose_4x4 macros and reordering instructions, |
michael@0 | 914 | * but readability will suffer somewhat. |
michael@0 | 915 | */ |
michael@0 | 916 | |
michael@0 | 917 | #define CONST_BITS 13 |
michael@0 | 918 | |
michael@0 | 919 | #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
michael@0 | 920 | #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
michael@0 | 921 | #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
michael@0 | 922 | #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
michael@0 | 923 | #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
michael@0 | 924 | #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
michael@0 | 925 | #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
michael@0 | 926 | #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
michael@0 | 927 | #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
michael@0 | 928 | #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
michael@0 | 929 | #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
michael@0 | 930 | #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
michael@0 | 931 | #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
michael@0 | 932 | #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
michael@0 | 933 | |
michael@0 | 934 | .balign 16 |
michael@0 | 935 | jsimd_idct_4x4_neon_consts: |
michael@0 | 936 | .short FIX_1_847759065 /* d0[0] */ |
michael@0 | 937 | .short -FIX_0_765366865 /* d0[1] */ |
michael@0 | 938 | .short -FIX_0_211164243 /* d0[2] */ |
michael@0 | 939 | .short FIX_1_451774981 /* d0[3] */ |
michael@0 | 940 | .short -FIX_2_172734803 /* d1[0] */ |
michael@0 | 941 | .short FIX_1_061594337 /* d1[1] */ |
michael@0 | 942 | .short -FIX_0_509795579 /* d1[2] */ |
michael@0 | 943 | .short -FIX_0_601344887 /* d1[3] */ |
michael@0 | 944 | .short FIX_0_899976223 /* d2[0] */ |
michael@0 | 945 | .short FIX_2_562915447 /* d2[1] */ |
michael@0 | 946 | .short 1 << (CONST_BITS+1) /* d2[2] */ |
michael@0 | 947 | .short 0 /* d2[3] */ |
michael@0 | 948 | |
michael@0 | 949 | .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
michael@0 | 950 | vmull.s16 q14, \x4, d2[2] |
michael@0 | 951 | vmlal.s16 q14, \x8, d0[0] |
michael@0 | 952 | vmlal.s16 q14, \x14, d0[1] |
michael@0 | 953 | |
michael@0 | 954 | vmull.s16 q13, \x16, d1[2] |
michael@0 | 955 | vmlal.s16 q13, \x12, d1[3] |
michael@0 | 956 | vmlal.s16 q13, \x10, d2[0] |
michael@0 | 957 | vmlal.s16 q13, \x6, d2[1] |
michael@0 | 958 | |
michael@0 | 959 | vmull.s16 q15, \x4, d2[2] |
michael@0 | 960 | vmlsl.s16 q15, \x8, d0[0] |
michael@0 | 961 | vmlsl.s16 q15, \x14, d0[1] |
michael@0 | 962 | |
michael@0 | 963 | vmull.s16 q12, \x16, d0[2] |
michael@0 | 964 | vmlal.s16 q12, \x12, d0[3] |
michael@0 | 965 | vmlal.s16 q12, \x10, d1[0] |
michael@0 | 966 | vmlal.s16 q12, \x6, d1[1] |
michael@0 | 967 | |
michael@0 | 968 | vadd.s32 q10, q14, q13 |
michael@0 | 969 | vsub.s32 q14, q14, q13 |
michael@0 | 970 | |
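/* vrshrn can narrow with a shift of at most 16 bits, so the larger descale
 * shifts are done as a full-width vrshr followed by a narrowing vmovn. */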
michael@0 | 971 | .if \shift > 16 |
michael@0 | 972 | vrshr.s32 q10, q10, #\shift |
michael@0 | 973 | vrshr.s32 q14, q14, #\shift |
michael@0 | 974 | vmovn.s32 \y26, q10 |
michael@0 | 975 | vmovn.s32 \y29, q14 |
michael@0 | 976 | .else |
michael@0 | 977 | vrshrn.s32 \y26, q10, #\shift |
michael@0 | 978 | vrshrn.s32 \y29, q14, #\shift |
michael@0 | 979 | .endif |
michael@0 | 980 | |
michael@0 | 981 | vadd.s32 q10, q15, q12 |
michael@0 | 982 | vsub.s32 q15, q15, q12 |
michael@0 | 983 | |
michael@0 | 984 | .if \shift > 16 |
michael@0 | 985 | vrshr.s32 q10, q10, #\shift |
michael@0 | 986 | vrshr.s32 q15, q15, #\shift |
michael@0 | 987 | vmovn.s32 \y27, q10 |
michael@0 | 988 | vmovn.s32 \y28, q15 |
michael@0 | 989 | .else |
michael@0 | 990 | vrshrn.s32 \y27, q10, #\shift |
michael@0 | 991 | vrshrn.s32 \y28, q15, #\shift |
michael@0 | 992 | .endif |
michael@0 | 993 | |
michael@0 | 994 | .endm |
michael@0 | 995 | |
michael@0 | 996 | asm_function jsimd_idct_4x4_neon |
michael@0 | 997 | |
michael@0 | 998 | DCT_TABLE .req r0 |
michael@0 | 999 | COEF_BLOCK .req r1 |
michael@0 | 1000 | OUTPUT_BUF .req r2 |
michael@0 | 1001 | OUTPUT_COL .req r3 |
michael@0 | 1002 | TMP1 .req r0 |
michael@0 | 1003 | TMP2 .req r1 |
michael@0 | 1004 | TMP3 .req r2 |
michael@0 | 1005 | TMP4 .req ip |
michael@0 | 1006 | |
michael@0 | 1007 | vpush {d8-d15} |
michael@0 | 1008 | |
michael@0 | 1009 | /* Load constants (d3 is just used for padding) */ |
michael@0 | 1010 | adr TMP4, jsimd_idct_4x4_neon_consts |
michael@0 | 1011 | vld1.16 {d0, d1, d2, d3}, [TMP4, :128] |
michael@0 | 1012 | |
michael@0 | 1013 | /* Load all COEF_BLOCK into NEON registers with the following allocation: |
michael@0 | 1014 | * 0 1 2 3 | 4 5 6 7 |
michael@0 | 1015 | * ---------+-------- |
michael@0 | 1016 | * 0 | d4 | d5 |
michael@0 | 1017 | * 1 | d6 | d7 |
michael@0 | 1018 | * 2 | d8 | d9 |
michael@0 | 1019 | * 3 | d10 | d11 |
michael@0 | 1020 | * 4 | - | - |
michael@0 | 1021 | * 5 | d12 | d13 |
michael@0 | 1022 | * 6 | d14 | d15 |
michael@0 | 1023 | * 7 | d16 | d17 |
michael@0 | 1024 | */ |
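/* Row 4 of the coefficient block is never used by the 4-point IDCT, which is
 * why it gets no registers in the table above and is skipped below with
 * "add COEF_BLOCK, COEF_BLOCK, #16" (and likewise for DCT_TABLE). */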
michael@0 | 1025 | vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! |
michael@0 | 1026 | vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! |
michael@0 | 1027 | add COEF_BLOCK, COEF_BLOCK, #16 |
michael@0 | 1028 | vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! |
michael@0 | 1029 | vld1.16 {d16, d17}, [COEF_BLOCK, :128]! |
michael@0 | 1030 | /* dequantize */ |
michael@0 | 1031 | vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! |
michael@0 | 1032 | vmul.s16 q2, q2, q9 |
michael@0 | 1033 | vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! |
michael@0 | 1034 | vmul.s16 q3, q3, q10 |
michael@0 | 1035 | vmul.s16 q4, q4, q11 |
michael@0 | 1036 | add DCT_TABLE, DCT_TABLE, #16 |
michael@0 | 1037 | vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! |
michael@0 | 1038 | vmul.s16 q5, q5, q12 |
michael@0 | 1039 | vmul.s16 q6, q6, q13 |
michael@0 | 1040 | vld1.16 {d30, d31}, [DCT_TABLE, :128]! |
michael@0 | 1041 | vmul.s16 q7, q7, q14 |
michael@0 | 1042 | vmul.s16 q8, q8, q15 |
michael@0 | 1043 | |
michael@0 | 1044 | /* Pass 1 */ |
michael@0 | 1045 | idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 |
michael@0 | 1046 | transpose_4x4 d4, d6, d8, d10 |
michael@0 | 1047 | idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 |
michael@0 | 1048 | transpose_4x4 d5, d7, d9, d11 |
michael@0 | 1049 | |
michael@0 | 1050 | /* Pass 2 */ |
michael@0 | 1051 | idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 |
michael@0 | 1052 | transpose_4x4 d26, d27, d28, d29 |
michael@0 | 1053 | |
michael@0 | 1054 | /* Range limit */ |
michael@0 | 1055 | vmov.u16 q15, #0x80 |
michael@0 | 1056 | vadd.s16 q13, q13, q15 |
michael@0 | 1057 | vadd.s16 q14, q14, q15 |
michael@0 | 1058 | vqmovun.s16 d26, q13 |
michael@0 | 1059 | vqmovun.s16 d27, q14 |
michael@0 | 1060 | |
michael@0 | 1061 | /* Store results to the output buffer */ |
michael@0 | 1062 | ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
michael@0 | 1063 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 1064 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 1065 | add TMP3, TMP3, OUTPUT_COL |
michael@0 | 1066 | add TMP4, TMP4, OUTPUT_COL |
michael@0 | 1067 | |
michael@0 | 1068 | #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT |
michael@0 | 1069 | /* We can use far fewer instructions on little-endian systems if the |
michael@0 | 1070 | * OS kernel is not configured to trap unaligned memory accesses. |
michael@0 | 1071 | */ |
michael@0 | 1072 | vst1.32 {d26[0]}, [TMP1]! |
michael@0 | 1073 | vst1.32 {d27[0]}, [TMP3]! |
michael@0 | 1074 | vst1.32 {d26[1]}, [TMP2]! |
michael@0 | 1075 | vst1.32 {d27[1]}, [TMP4]! |
michael@0 | 1076 | #else |
michael@0 | 1077 | vst1.8 {d26[0]}, [TMP1]! |
michael@0 | 1078 | vst1.8 {d27[0]}, [TMP3]! |
michael@0 | 1079 | vst1.8 {d26[1]}, [TMP1]! |
michael@0 | 1080 | vst1.8 {d27[1]}, [TMP3]! |
michael@0 | 1081 | vst1.8 {d26[2]}, [TMP1]! |
michael@0 | 1082 | vst1.8 {d27[2]}, [TMP3]! |
michael@0 | 1083 | vst1.8 {d26[3]}, [TMP1]! |
michael@0 | 1084 | vst1.8 {d27[3]}, [TMP3]! |
michael@0 | 1085 | |
michael@0 | 1086 | vst1.8 {d26[4]}, [TMP2]! |
michael@0 | 1087 | vst1.8 {d27[4]}, [TMP4]! |
michael@0 | 1088 | vst1.8 {d26[5]}, [TMP2]! |
michael@0 | 1089 | vst1.8 {d27[5]}, [TMP4]! |
michael@0 | 1090 | vst1.8 {d26[6]}, [TMP2]! |
michael@0 | 1091 | vst1.8 {d27[6]}, [TMP4]! |
michael@0 | 1092 | vst1.8 {d26[7]}, [TMP2]! |
michael@0 | 1093 | vst1.8 {d27[7]}, [TMP4]! |
michael@0 | 1094 | #endif |
michael@0 | 1095 | |
michael@0 | 1096 | vpop {d8-d15} |
michael@0 | 1097 | bx lr |
michael@0 | 1098 | |
michael@0 | 1099 | .unreq DCT_TABLE |
michael@0 | 1100 | .unreq COEF_BLOCK |
michael@0 | 1101 | .unreq OUTPUT_BUF |
michael@0 | 1102 | .unreq OUTPUT_COL |
michael@0 | 1103 | .unreq TMP1 |
michael@0 | 1104 | .unreq TMP2 |
michael@0 | 1105 | .unreq TMP3 |
michael@0 | 1106 | .unreq TMP4 |
michael@0 | 1107 | .endfunc |
michael@0 | 1108 | |
michael@0 | 1109 | .purgem idct_helper |
michael@0 | 1110 | |
michael@0 | 1111 | /*****************************************************************************/ |
michael@0 | 1112 | |
michael@0 | 1113 | /* |
michael@0 | 1114 | * jsimd_idct_2x2_neon |
michael@0 | 1115 | * |
michael@0 | 1116 | * This function contains inverse-DCT code for getting a reduced-size |
michael@0 | 1117 | * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations |
michael@0 | 1118 | * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
michael@0 | 1119 | * function from jpeg-6b (jidctred.c). |
michael@0 | 1120 | * |
michael@0 | 1121 | * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which |
michael@0 | 1122 | * requires far fewer arithmetic operations and hence should be faster. |
michael@0 | 1123 | * The primary purpose of this particular NEON-optimized function is |
michael@0 | 1124 | * bit-exact compatibility with jpeg-6b. |
michael@0 | 1125 | */ |
michael@0 | 1126 | |
michael@0 | 1127 | .balign 8 |
michael@0 | 1128 | jsimd_idct_2x2_neon_consts: |
michael@0 | 1129 | .short -FIX_0_720959822 /* d0[0] */ |
michael@0 | 1130 | .short FIX_0_850430095 /* d0[1] */ |
michael@0 | 1131 | .short -FIX_1_272758580 /* d0[2] */ |
michael@0 | 1132 | .short FIX_3_624509785 /* d0[3] */ |
michael@0 | 1133 | |
michael@0 | 1134 | .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
michael@0 | 1135 | vshll.s16 q14, \x4, #15 |
michael@0 | 1136 | vmull.s16 q13, \x6, d0[3] |
michael@0 | 1137 | vmlal.s16 q13, \x10, d0[2] |
michael@0 | 1138 | vmlal.s16 q13, \x12, d0[1] |
michael@0 | 1139 | vmlal.s16 q13, \x16, d0[0] |
michael@0 | 1140 | |
michael@0 | 1141 | vadd.s32 q10, q14, q13 |
michael@0 | 1142 | vsub.s32 q14, q14, q13 |
michael@0 | 1143 | |
michael@0 | 1144 | .if \shift > 16 |
michael@0 | 1145 | vrshr.s32 q10, q10, #\shift |
michael@0 | 1146 | vrshr.s32 q14, q14, #\shift |
michael@0 | 1147 | vmovn.s32 \y26, q10 |
michael@0 | 1148 | vmovn.s32 \y27, q14 |
michael@0 | 1149 | .else |
michael@0 | 1150 | vrshrn.s32 \y26, q10, #\shift |
michael@0 | 1151 | vrshrn.s32 \y27, q14, #\shift |
michael@0 | 1152 | .endif |
michael@0 | 1153 | |
michael@0 | 1154 | .endm |
michael@0 | 1155 | |
michael@0 | 1156 | asm_function jsimd_idct_2x2_neon |
michael@0 | 1157 | |
michael@0 | 1158 | DCT_TABLE .req r0 |
michael@0 | 1159 | COEF_BLOCK .req r1 |
michael@0 | 1160 | OUTPUT_BUF .req r2 |
michael@0 | 1161 | OUTPUT_COL .req r3 |
michael@0 | 1162 | TMP1 .req r0 |
michael@0 | 1163 | TMP2 .req ip |
michael@0 | 1164 | |
michael@0 | 1165 | vpush {d8-d15} |
michael@0 | 1166 | |
michael@0 | 1167 | /* Load constants */ |
michael@0 | 1168 | adr TMP2, jsimd_idct_2x2_neon_consts |
michael@0 | 1169 | vld1.16 {d0}, [TMP2, :64] |
michael@0 | 1170 | |
michael@0 | 1171 | /* Load all COEF_BLOCK into NEON registers with the following allocation: |
michael@0 | 1172 | * 0 1 2 3 | 4 5 6 7 |
michael@0 | 1173 | * ---------+-------- |
michael@0 | 1174 | * 0 | d4 | d5 |
michael@0 | 1175 | * 1 | d6 | d7 |
michael@0 | 1176 | * 2 | - | - |
michael@0 | 1177 | * 3 | d10 | d11 |
michael@0 | 1178 | * 4 | - | - |
michael@0 | 1179 | * 5 | d12 | d13 |
michael@0 | 1180 | * 6 | - | - |
michael@0 | 1181 | * 7 | d16 | d17 |
michael@0 | 1182 | */ |
michael@0 | 1183 | vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! |
michael@0 | 1184 | add COEF_BLOCK, COEF_BLOCK, #16 |
michael@0 | 1185 | vld1.16 {d10, d11}, [COEF_BLOCK, :128]! |
michael@0 | 1186 | add COEF_BLOCK, COEF_BLOCK, #16 |
michael@0 | 1187 | vld1.16 {d12, d13}, [COEF_BLOCK, :128]! |
michael@0 | 1188 | add COEF_BLOCK, COEF_BLOCK, #16 |
michael@0 | 1189 | vld1.16 {d16, d17}, [COEF_BLOCK, :128]! |
michael@0 | 1190 | /* Dequantize */ |
michael@0 | 1191 | vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! |
michael@0 | 1192 | vmul.s16 q2, q2, q9 |
michael@0 | 1193 | vmul.s16 q3, q3, q10 |
michael@0 | 1194 | add DCT_TABLE, DCT_TABLE, #16 |
michael@0 | 1195 | vld1.16 {d24, d25}, [DCT_TABLE, :128]! |
michael@0 | 1196 | vmul.s16 q5, q5, q12 |
michael@0 | 1197 | add DCT_TABLE, DCT_TABLE, #16 |
michael@0 | 1198 | vld1.16 {d26, d27}, [DCT_TABLE, :128]! |
michael@0 | 1199 | vmul.s16 q6, q6, q13 |
michael@0 | 1200 | add DCT_TABLE, DCT_TABLE, #16 |
michael@0 | 1201 | vld1.16 {d30, d31}, [DCT_TABLE, :128]! |
michael@0 | 1202 | vmul.s16 q8, q8, q15 |
michael@0 | 1203 | |
michael@0 | 1204 | /* Pass 1 */ |
michael@0 | 1205 | #if 0 |
michael@0 | 1206 | idct_helper d4, d6, d10, d12, d16, 13, d4, d6 |
michael@0 | 1207 | transpose_4x4 d4, d6, d8, d10 |
michael@0 | 1208 | idct_helper d5, d7, d11, d13, d17, 13, d5, d7 |
michael@0 | 1209 | transpose_4x4 d5, d7, d9, d11 |
michael@0 | 1210 | #else |
michael@0 | 1211 | vmull.s16 q13, d6, d0[3] |
michael@0 | 1212 | vmlal.s16 q13, d10, d0[2] |
michael@0 | 1213 | vmlal.s16 q13, d12, d0[1] |
michael@0 | 1214 | vmlal.s16 q13, d16, d0[0] |
michael@0 | 1215 | vmull.s16 q12, d7, d0[3] |
michael@0 | 1216 | vmlal.s16 q12, d11, d0[2] |
michael@0 | 1217 | vmlal.s16 q12, d13, d0[1] |
michael@0 | 1218 | vmlal.s16 q12, d17, d0[0] |
michael@0 | 1219 | vshll.s16 q14, d4, #15 |
michael@0 | 1220 | vshll.s16 q15, d5, #15 |
michael@0 | 1221 | vadd.s32 q10, q14, q13 |
michael@0 | 1222 | vsub.s32 q14, q14, q13 |
michael@0 | 1223 | vrshrn.s32 d4, q10, #13 |
michael@0 | 1224 | vrshrn.s32 d6, q14, #13 |
michael@0 | 1225 | vadd.s32 q10, q15, q12 |
michael@0 | 1226 | vsub.s32 q14, q15, q12 |
michael@0 | 1227 | vrshrn.s32 d5, q10, #13 |
michael@0 | 1228 | vrshrn.s32 d7, q14, #13 |
michael@0 | 1229 | vtrn.16 q2, q3 |
michael@0 | 1230 | vtrn.32 q3, q5 |
michael@0 | 1231 | #endif |
michael@0 | 1232 | |
michael@0 | 1233 | /* Pass 2 */ |
michael@0 | 1234 | idct_helper d4, d6, d10, d7, d11, 20, d26, d27 |
michael@0 | 1235 | |
michael@0 | 1236 | /* Range limit */ |
michael@0 | 1237 | vmov.u16 q15, #0x80 |
michael@0 | 1238 | vadd.s16 q13, q13, q15 |
michael@0 | 1239 | vqmovun.s16 d26, q13 |
michael@0 | 1240 | vqmovun.s16 d27, q13 |
michael@0 | 1241 | |
michael@0 | 1242 | /* Store results to the output buffer */ |
michael@0 | 1243 | ldmia OUTPUT_BUF, {TMP1, TMP2} |
michael@0 | 1244 | add TMP1, TMP1, OUTPUT_COL |
michael@0 | 1245 | add TMP2, TMP2, OUTPUT_COL |
michael@0 | 1246 | |
michael@0 | 1247 | vst1.8 {d26[0]}, [TMP1]! |
michael@0 | 1248 | vst1.8 {d27[4]}, [TMP1]! |
michael@0 | 1249 | vst1.8 {d26[1]}, [TMP2]! |
michael@0 | 1250 | vst1.8 {d27[5]}, [TMP2]! |
michael@0 | 1251 | |
michael@0 | 1252 | vpop {d8-d15} |
michael@0 | 1253 | bx lr |
michael@0 | 1254 | |
michael@0 | 1255 | .unreq DCT_TABLE |
michael@0 | 1256 | .unreq COEF_BLOCK |
michael@0 | 1257 | .unreq OUTPUT_BUF |
michael@0 | 1258 | .unreq OUTPUT_COL |
michael@0 | 1259 | .unreq TMP1 |
michael@0 | 1260 | .unreq TMP2 |
michael@0 | 1261 | .endfunc |
michael@0 | 1262 | |
michael@0 | 1263 | .purgem idct_helper |
michael@0 | 1264 | |
michael@0 | 1265 | /*****************************************************************************/ |
michael@0 | 1266 | |
michael@0 | 1267 | /* |
michael@0 | 1268 | * jsimd_ycc_extrgb_convert_neon |
michael@0 | 1269 | * jsimd_ycc_extbgr_convert_neon |
michael@0 | 1270 | * jsimd_ycc_extrgbx_convert_neon |
michael@0 | 1271 | * jsimd_ycc_extbgrx_convert_neon |
michael@0 | 1272 | * jsimd_ycc_extxbgr_convert_neon |
michael@0 | 1273 | * jsimd_ycc_extxrgb_convert_neon |
michael@0 | 1274 | * |
michael@0 | 1275 | * Colorspace conversion YCbCr -> RGB |
michael@0 | 1276 | */ |
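/*
 * A rough scalar sketch of the conversion performed below (the standard
 * JPEG YCbCr->RGB equations, as in jdcolor.c):
 *
 *   R = Y + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 *
 * The per-function constant table stores these factors in fixed point:
 * 22971 and 29033 are approximately 1.40200 and 1.77200 scaled by 2^14
 * (narrowed with 'vrshrn #14'), while -11277 and -23401 are approximately
 * -0.34414 and -0.71414 scaled by 2^15 (narrowed with 'vrshrn #15').
 * The rows of -128 values are used to center Cb and Cr around zero.
 */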
michael@0 | 1277 | |
michael@0 | 1278 | |
michael@0 | 1279 | .macro do_load size |
michael@0 | 1280 | .if \size == 8 |
michael@0 | 1281 | vld1.8 {d4}, [U, :64]! |
michael@0 | 1282 | vld1.8 {d5}, [V, :64]! |
michael@0 | 1283 | vld1.8 {d0}, [Y, :64]! |
michael@0 | 1284 | pld [U, #64] |
michael@0 | 1285 | pld [V, #64] |
michael@0 | 1286 | pld [Y, #64] |
michael@0 | 1287 | .elseif \size == 4 |
michael@0 | 1288 | vld1.8 {d4[0]}, [U]! |
michael@0 | 1289 | vld1.8 {d4[1]}, [U]! |
michael@0 | 1290 | vld1.8 {d4[2]}, [U]! |
michael@0 | 1291 | vld1.8 {d4[3]}, [U]! |
michael@0 | 1292 | vld1.8 {d5[0]}, [V]! |
michael@0 | 1293 | vld1.8 {d5[1]}, [V]! |
michael@0 | 1294 | vld1.8 {d5[2]}, [V]! |
michael@0 | 1295 | vld1.8 {d5[3]}, [V]! |
michael@0 | 1296 | vld1.8 {d0[0]}, [Y]! |
michael@0 | 1297 | vld1.8 {d0[1]}, [Y]! |
michael@0 | 1298 | vld1.8 {d0[2]}, [Y]! |
michael@0 | 1299 | vld1.8 {d0[3]}, [Y]! |
michael@0 | 1300 | .elseif \size == 2 |
michael@0 | 1301 | vld1.8 {d4[4]}, [U]! |
michael@0 | 1302 | vld1.8 {d4[5]}, [U]! |
michael@0 | 1303 | vld1.8 {d5[4]}, [V]! |
michael@0 | 1304 | vld1.8 {d5[5]}, [V]! |
michael@0 | 1305 | vld1.8 {d0[4]}, [Y]! |
michael@0 | 1306 | vld1.8 {d0[5]}, [Y]! |
michael@0 | 1307 | .elseif \size == 1 |
michael@0 | 1308 | vld1.8 {d4[6]}, [U]! |
michael@0 | 1309 | vld1.8 {d5[6]}, [V]! |
michael@0 | 1310 | vld1.8 {d0[6]}, [Y]! |
michael@0 | 1311 | .else |
michael@0 | 1312 | .error unsupported macroblock size |
michael@0 | 1313 | .endif |
michael@0 | 1314 | .endm |
michael@0 | 1315 | |
michael@0 | 1316 | .macro do_store bpp, size |
michael@0 | 1317 | .if \bpp == 24 |
michael@0 | 1318 | .if \size == 8 |
michael@0 | 1319 | vst3.8 {d10, d11, d12}, [RGB]! |
michael@0 | 1320 | .elseif \size == 4 |
michael@0 | 1321 | vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
michael@0 | 1322 | vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
michael@0 | 1323 | vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
michael@0 | 1324 | vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
michael@0 | 1325 | .elseif \size == 2 |
michael@0 | 1326 | vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
michael@0 | 1327 | vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
michael@0 | 1328 | .elseif \size == 1 |
michael@0 | 1329 | vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
michael@0 | 1330 | .else |
michael@0 | 1331 | .error unsupported macroblock size |
michael@0 | 1332 | .endif |
michael@0 | 1333 | .elseif \bpp == 32 |
michael@0 | 1334 | .if \size == 8 |
michael@0 | 1335 | vst4.8 {d10, d11, d12, d13}, [RGB]! |
michael@0 | 1336 | .elseif \size == 4 |
michael@0 | 1337 | vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
michael@0 | 1338 | vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
michael@0 | 1339 | vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
michael@0 | 1340 | vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
michael@0 | 1341 | .elseif \size == 2 |
michael@0 | 1342 | vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
michael@0 | 1343 | vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
michael@0 | 1344 | .elseif \size == 1 |
michael@0 | 1345 | vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
michael@0 | 1346 | .else |
michael@0 | 1347 | .error unsupported macroblock size |
michael@0 | 1348 | .endif |
michael@0 | 1349 | .else |
michael@0 | 1350 | .error unsupported bpp |
michael@0 | 1351 | .endif |
michael@0 | 1352 | .endm |
michael@0 | 1353 | |
michael@0 | 1354 | .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
michael@0 | 1355 | |
michael@0 | 1356 | /* |
michael@0 | 1357 | * 2-stage pipelined YCbCr->RGB conversion |
michael@0 | 1358 | */ |
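/*
 * Roughly: stage 1 centers Cb/Cr and starts the widening multiplies, while
 * stage 2 narrows the products, adds Y and saturates to 8 bits. The fused
 * 'do_yuv_to_rgb_stage2_store_load_stage1' macro overlaps storing one group
 * of 8 pixels with loading and multiplying the next one to hide latency.
 */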
michael@0 | 1359 | |
michael@0 | 1360 | .macro do_yuv_to_rgb_stage1 |
michael@0 | 1361 | vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
michael@0 | 1362 | vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
michael@0 | 1363 | vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
michael@0 | 1364 | vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
michael@0 | 1365 | vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
michael@0 | 1366 | vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
michael@0 | 1367 | vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
michael@0 | 1368 | vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
michael@0 | 1369 | vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
michael@0 | 1370 | vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
michael@0 | 1371 | .endm |
michael@0 | 1372 | |
michael@0 | 1373 | .macro do_yuv_to_rgb_stage2 |
michael@0 | 1374 | vrshrn.s32 d20, q10, #15 |
michael@0 | 1375 | vrshrn.s32 d21, q11, #15 |
michael@0 | 1376 | vrshrn.s32 d24, q12, #14 |
michael@0 | 1377 | vrshrn.s32 d25, q13, #14 |
michael@0 | 1378 | vrshrn.s32 d28, q14, #14 |
michael@0 | 1379 | vrshrn.s32 d29, q15, #14 |
michael@0 | 1380 | vaddw.u8 q10, q10, d0 |
michael@0 | 1381 | vaddw.u8 q12, q12, d0 |
michael@0 | 1382 | vaddw.u8 q14, q14, d0 |
michael@0 | 1383 | vqmovun.s16 d1\g_offs, q10 |
michael@0 | 1384 | vqmovun.s16 d1\r_offs, q12 |
michael@0 | 1385 | vqmovun.s16 d1\b_offs, q14 |
michael@0 | 1386 | .endm |
michael@0 | 1387 | |
michael@0 | 1388 | .macro do_yuv_to_rgb_stage2_store_load_stage1 |
michael@0 | 1389 | vld1.8 {d4}, [U, :64]! |
michael@0 | 1390 | vrshrn.s32 d20, q10, #15 |
michael@0 | 1391 | vrshrn.s32 d21, q11, #15 |
michael@0 | 1392 | vrshrn.s32 d24, q12, #14 |
michael@0 | 1393 | vrshrn.s32 d25, q13, #14 |
michael@0 | 1394 | vrshrn.s32 d28, q14, #14 |
michael@0 | 1395 | vld1.8 {d5}, [V, :64]! |
michael@0 | 1396 | vrshrn.s32 d29, q15, #14 |
michael@0 | 1397 | vaddw.u8 q10, q10, d0 |
michael@0 | 1398 | vaddw.u8 q12, q12, d0 |
michael@0 | 1399 | vaddw.u8 q14, q14, d0 |
michael@0 | 1400 | vqmovun.s16 d1\g_offs, q10 |
michael@0 | 1401 | vld1.8 {d0}, [Y, :64]! |
michael@0 | 1402 | vqmovun.s16 d1\r_offs, q12 |
michael@0 | 1403 | pld [U, #64] |
michael@0 | 1404 | pld [V, #64] |
michael@0 | 1405 | pld [Y, #64] |
michael@0 | 1406 | vqmovun.s16 d1\b_offs, q14 |
michael@0 | 1407 | vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
michael@0 | 1408 | vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
michael@0 | 1409 | do_store \bpp, 8 |
michael@0 | 1410 | vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
michael@0 | 1411 | vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
michael@0 | 1412 | vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
michael@0 | 1413 | vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
michael@0 | 1414 | vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
michael@0 | 1415 | vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
michael@0 | 1416 | vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
michael@0 | 1417 | vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
michael@0 | 1418 | .endm |
michael@0 | 1419 | |
michael@0 | 1420 | .macro do_yuv_to_rgb |
michael@0 | 1421 | do_yuv_to_rgb_stage1 |
michael@0 | 1422 | do_yuv_to_rgb_stage2 |
michael@0 | 1423 | .endm |
michael@0 | 1424 | |
michael@0 | 1425 | /* Apple gas crashes on adrl; work around that by using adr. |
michael@0 | 1426 | * But this requires a copy of these constants for each function. |
michael@0 | 1427 | */ |
michael@0 | 1428 | |
michael@0 | 1429 | .balign 16 |
michael@0 | 1430 | jsimd_ycc_\colorid\()_neon_consts: |
michael@0 | 1431 | .short 0, 0, 0, 0 |
michael@0 | 1432 | .short 22971, -11277, -23401, 29033 |
michael@0 | 1433 | .short -128, -128, -128, -128 |
michael@0 | 1434 | .short -128, -128, -128, -128 |
michael@0 | 1435 | |
michael@0 | 1436 | asm_function jsimd_ycc_\colorid\()_convert_neon |
michael@0 | 1437 | OUTPUT_WIDTH .req r0 |
michael@0 | 1438 | INPUT_BUF .req r1 |
michael@0 | 1439 | INPUT_ROW .req r2 |
michael@0 | 1440 | OUTPUT_BUF .req r3 |
michael@0 | 1441 | NUM_ROWS .req r4 |
michael@0 | 1442 | |
michael@0 | 1443 | INPUT_BUF0 .req r5 |
michael@0 | 1444 | INPUT_BUF1 .req r6 |
michael@0 | 1445 | INPUT_BUF2 .req INPUT_BUF |
michael@0 | 1446 | |
michael@0 | 1447 | RGB .req r7 |
michael@0 | 1448 | Y .req r8 |
michael@0 | 1449 | U .req r9 |
michael@0 | 1450 | V .req r10 |
michael@0 | 1451 | N .req ip |
michael@0 | 1452 | |
michael@0 | 1453 | /* Load constants to d1, d2, d3 (d0 is just used for padding) */ |
michael@0 | 1454 | adr ip, jsimd_ycc_\colorid\()_neon_consts |
michael@0 | 1455 | vld1.16 {d0, d1, d2, d3}, [ip, :128] |
michael@0 | 1456 | |
michael@0 | 1457 | /* Save ARM registers and handle input arguments */ |
michael@0 | 1458 | push {r4, r5, r6, r7, r8, r9, r10, lr} |
michael@0 | 1459 | ldr NUM_ROWS, [sp, #(4 * 8)] |
michael@0 | 1460 | ldr INPUT_BUF0, [INPUT_BUF] |
michael@0 | 1461 | ldr INPUT_BUF1, [INPUT_BUF, #4] |
michael@0 | 1462 | ldr INPUT_BUF2, [INPUT_BUF, #8] |
michael@0 | 1463 | .unreq INPUT_BUF |
michael@0 | 1464 | |
michael@0 | 1465 | /* Save NEON registers */ |
michael@0 | 1466 | vpush {d8-d15} |
michael@0 | 1467 | |
michael@0 | 1468 | /* Initially set d10, d11, d12, d13 to 0xFF */ |
michael@0 | 1469 | vmov.u8 q5, #255 |
michael@0 | 1470 | vmov.u8 q6, #255 |
michael@0 | 1471 | |
michael@0 | 1472 | /* Outer loop over scanlines */ |
michael@0 | 1473 | cmp NUM_ROWS, #1 |
michael@0 | 1474 | blt 9f |
michael@0 | 1475 | 0: |
michael@0 | 1476 | ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] |
michael@0 | 1477 | ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] |
michael@0 | 1478 | mov N, OUTPUT_WIDTH |
michael@0 | 1479 | ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] |
michael@0 | 1480 | add INPUT_ROW, INPUT_ROW, #1 |
michael@0 | 1481 | ldr RGB, [OUTPUT_BUF], #4 |
michael@0 | 1482 | |
michael@0 | 1483 | /* Inner loop over pixels */ |
michael@0 | 1484 | subs N, N, #8 |
michael@0 | 1485 | blt 3f |
michael@0 | 1486 | do_load 8 |
michael@0 | 1487 | do_yuv_to_rgb_stage1 |
michael@0 | 1488 | subs N, N, #8 |
michael@0 | 1489 | blt 2f |
michael@0 | 1490 | 1: |
michael@0 | 1491 | do_yuv_to_rgb_stage2_store_load_stage1 |
michael@0 | 1492 | subs N, N, #8 |
michael@0 | 1493 | bge 1b |
michael@0 | 1494 | 2: |
michael@0 | 1495 | do_yuv_to_rgb_stage2 |
michael@0 | 1496 | do_store \bpp, 8 |
michael@0 | 1497 | tst N, #7 |
michael@0 | 1498 | beq 8f |
michael@0 | 1499 | 3: |
michael@0 | 1500 | tst N, #4 |
michael@0 | 1501 | beq 3f |
michael@0 | 1502 | do_load 4 |
michael@0 | 1503 | 3: |
michael@0 | 1504 | tst N, #2 |
michael@0 | 1505 | beq 4f |
michael@0 | 1506 | do_load 2 |
michael@0 | 1507 | 4: |
michael@0 | 1508 | tst N, #1 |
michael@0 | 1509 | beq 5f |
michael@0 | 1510 | do_load 1 |
michael@0 | 1511 | 5: |
michael@0 | 1512 | do_yuv_to_rgb |
michael@0 | 1513 | tst N, #4 |
michael@0 | 1514 | beq 6f |
michael@0 | 1515 | do_store \bpp, 4 |
michael@0 | 1516 | 6: |
michael@0 | 1517 | tst N, #2 |
michael@0 | 1518 | beq 7f |
michael@0 | 1519 | do_store \bpp, 2 |
michael@0 | 1520 | 7: |
michael@0 | 1521 | tst N, #1 |
michael@0 | 1522 | beq 8f |
michael@0 | 1523 | do_store \bpp, 1 |
michael@0 | 1524 | 8: |
michael@0 | 1525 | subs NUM_ROWS, NUM_ROWS, #1 |
michael@0 | 1526 | bgt 0b |
michael@0 | 1527 | 9: |
michael@0 | 1528 | /* Restore all registers and return */ |
michael@0 | 1529 | vpop {d8-d15} |
michael@0 | 1530 | pop {r4, r5, r6, r7, r8, r9, r10, pc} |
michael@0 | 1531 | |
michael@0 | 1532 | .unreq OUTPUT_WIDTH |
michael@0 | 1533 | .unreq INPUT_ROW |
michael@0 | 1534 | .unreq OUTPUT_BUF |
michael@0 | 1535 | .unreq NUM_ROWS |
michael@0 | 1536 | .unreq INPUT_BUF0 |
michael@0 | 1537 | .unreq INPUT_BUF1 |
michael@0 | 1538 | .unreq INPUT_BUF2 |
michael@0 | 1539 | .unreq RGB |
michael@0 | 1540 | .unreq Y |
michael@0 | 1541 | .unreq U |
michael@0 | 1542 | .unreq V |
michael@0 | 1543 | .unreq N |
michael@0 | 1544 | .endfunc |
michael@0 | 1545 | |
michael@0 | 1546 | .purgem do_yuv_to_rgb |
michael@0 | 1547 | .purgem do_yuv_to_rgb_stage1 |
michael@0 | 1548 | .purgem do_yuv_to_rgb_stage2 |
michael@0 | 1549 | .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
michael@0 | 1550 | |
michael@0 | 1551 | .endm |
michael@0 | 1552 | |
michael@0 | 1553 | /*--------------------------------- id ----- bpp R G B */ |
michael@0 | 1554 | generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 |
michael@0 | 1555 | generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 |
michael@0 | 1556 | generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 |
michael@0 | 1557 | generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 |
michael@0 | 1558 | generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 |
michael@0 | 1559 | generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 |
michael@0 | 1560 | |
michael@0 | 1561 | .purgem do_load |
michael@0 | 1562 | .purgem do_store |
michael@0 | 1563 | |
michael@0 | 1564 | /*****************************************************************************/ |
michael@0 | 1565 | |
michael@0 | 1566 | /* |
michael@0 | 1567 | * jsimd_extrgb_ycc_convert_neon |
michael@0 | 1568 | * jsimd_extbgr_ycc_convert_neon |
michael@0 | 1569 | * jsimd_extrgbx_ycc_convert_neon |
michael@0 | 1570 | * jsimd_extbgrx_ycc_convert_neon |
michael@0 | 1571 | * jsimd_extxbgr_ycc_convert_neon |
michael@0 | 1572 | * jsimd_extxrgb_ycc_convert_neon |
michael@0 | 1573 | * |
michael@0 | 1574 | * Colorspace conversion RGB -> YCbCr |
michael@0 | 1575 | */ |
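/*
 * A rough scalar sketch of the conversion performed below (the standard
 * JPEG RGB->YCbCr equations, as in jccolor.c):
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * The per-function constant table stores these factors scaled by 2^16
 * (19595, 38470, 7471, 11059, 21709, 32768, 27439, 5329), and the repeated
 * "32767, 128" pairs form the 32-bit value 128 * 2^16 + 32767, which folds
 * the +128 offset and the rounding bias for Cb/Cr into the accumulators.
 */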
michael@0 | 1576 | |
michael@0 | 1577 | .macro do_store size |
michael@0 | 1578 | .if \size == 8 |
michael@0 | 1579 | vst1.8 {d20}, [Y]! |
michael@0 | 1580 | vst1.8 {d21}, [U]! |
michael@0 | 1581 | vst1.8 {d22}, [V]! |
michael@0 | 1582 | .elseif \size == 4 |
michael@0 | 1583 | vst1.8 {d20[0]}, [Y]! |
michael@0 | 1584 | vst1.8 {d20[1]}, [Y]! |
michael@0 | 1585 | vst1.8 {d20[2]}, [Y]! |
michael@0 | 1586 | vst1.8 {d20[3]}, [Y]! |
michael@0 | 1587 | vst1.8 {d21[0]}, [U]! |
michael@0 | 1588 | vst1.8 {d21[1]}, [U]! |
michael@0 | 1589 | vst1.8 {d21[2]}, [U]! |
michael@0 | 1590 | vst1.8 {d21[3]}, [U]! |
michael@0 | 1591 | vst1.8 {d22[0]}, [V]! |
michael@0 | 1592 | vst1.8 {d22[1]}, [V]! |
michael@0 | 1593 | vst1.8 {d22[2]}, [V]! |
michael@0 | 1594 | vst1.8 {d22[3]}, [V]! |
michael@0 | 1595 | .elseif \size == 2 |
michael@0 | 1596 | vst1.8 {d20[4]}, [Y]! |
michael@0 | 1597 | vst1.8 {d20[5]}, [Y]! |
michael@0 | 1598 | vst1.8 {d21[4]}, [U]! |
michael@0 | 1599 | vst1.8 {d21[5]}, [U]! |
michael@0 | 1600 | vst1.8 {d22[4]}, [V]! |
michael@0 | 1601 | vst1.8 {d22[5]}, [V]! |
michael@0 | 1602 | .elseif \size == 1 |
michael@0 | 1603 | vst1.8 {d20[6]}, [Y]! |
michael@0 | 1604 | vst1.8 {d21[6]}, [U]! |
michael@0 | 1605 | vst1.8 {d22[6]}, [V]! |
michael@0 | 1606 | .else |
michael@0 | 1607 | .error unsupported macroblock size |
michael@0 | 1608 | .endif |
michael@0 | 1609 | .endm |
michael@0 | 1610 | |
michael@0 | 1611 | .macro do_load bpp, size |
michael@0 | 1612 | .if \bpp == 24 |
michael@0 | 1613 | .if \size == 8 |
michael@0 | 1614 | vld3.8 {d10, d11, d12}, [RGB]! |
michael@0 | 1615 | pld [RGB, #128] |
michael@0 | 1616 | .elseif \size == 4 |
michael@0 | 1617 | vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
michael@0 | 1618 | vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
michael@0 | 1619 | vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
michael@0 | 1620 | vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
michael@0 | 1621 | .elseif \size == 2 |
michael@0 | 1622 | vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
michael@0 | 1623 | vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
michael@0 | 1624 | .elseif \size == 1 |
michael@0 | 1625 | vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
michael@0 | 1626 | .else |
michael@0 | 1627 | .error unsupported macroblock size |
michael@0 | 1628 | .endif |
michael@0 | 1629 | .elseif \bpp == 32 |
michael@0 | 1630 | .if \size == 8 |
michael@0 | 1631 | vld4.8 {d10, d11, d12, d13}, [RGB]! |
michael@0 | 1632 | pld [RGB, #128] |
michael@0 | 1633 | .elseif \size == 4 |
michael@0 | 1634 | vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
michael@0 | 1635 | vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
michael@0 | 1636 | vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
michael@0 | 1637 | vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
michael@0 | 1638 | .elseif \size == 2 |
michael@0 | 1639 | vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
michael@0 | 1640 | vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
michael@0 | 1641 | .elseif \size == 1 |
michael@0 | 1642 | vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
michael@0 | 1643 | .else |
michael@0 | 1644 | .error unsupported macroblock size |
michael@0 | 1645 | .endif |
michael@0 | 1646 | .else |
michael@0 | 1647 | .error unsupported bpp |
michael@0 | 1648 | .endif |
michael@0 | 1649 | .endm |
michael@0 | 1650 | |
michael@0 | 1651 | .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
michael@0 | 1652 | |
michael@0 | 1653 | /* |
michael@0 | 1654 | * 2-stage pipelined RGB->YCbCr conversion |
michael@0 | 1655 | */ |
michael@0 | 1656 | |
michael@0 | 1657 | .macro do_rgb_to_yuv_stage1 |
michael@0 | 1658 | vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
michael@0 | 1659 | vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
michael@0 | 1660 | vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
michael@0 | 1661 | vmull.u16 q7, d4, d0[0] |
michael@0 | 1662 | vmlal.u16 q7, d6, d0[1] |
michael@0 | 1663 | vmlal.u16 q7, d8, d0[2] |
michael@0 | 1664 | vmull.u16 q8, d5, d0[0] |
michael@0 | 1665 | vmlal.u16 q8, d7, d0[1] |
michael@0 | 1666 | vmlal.u16 q8, d9, d0[2] |
michael@0 | 1667 | vrev64.32 q9, q1 |
michael@0 | 1668 | vrev64.32 q13, q1 |
michael@0 | 1669 | vmlsl.u16 q9, d4, d0[3] |
michael@0 | 1670 | vmlsl.u16 q9, d6, d1[0] |
michael@0 | 1671 | vmlal.u16 q9, d8, d1[1] |
michael@0 | 1672 | vmlsl.u16 q13, d5, d0[3] |
michael@0 | 1673 | vmlsl.u16 q13, d7, d1[0] |
michael@0 | 1674 | vmlal.u16 q13, d9, d1[1] |
michael@0 | 1675 | vrev64.32 q14, q1 |
michael@0 | 1676 | vrev64.32 q15, q1 |
michael@0 | 1677 | vmlal.u16 q14, d4, d1[1] |
michael@0 | 1678 | vmlsl.u16 q14, d6, d1[2] |
michael@0 | 1679 | vmlsl.u16 q14, d8, d1[3] |
michael@0 | 1680 | vmlal.u16 q15, d5, d1[1] |
michael@0 | 1681 | vmlsl.u16 q15, d7, d1[2] |
michael@0 | 1682 | vmlsl.u16 q15, d9, d1[3] |
michael@0 | 1683 | .endm |
michael@0 | 1684 | |
michael@0 | 1685 | .macro do_rgb_to_yuv_stage2 |
michael@0 | 1686 | vrshrn.u32 d20, q7, #16 |
michael@0 | 1687 | vrshrn.u32 d21, q8, #16 |
michael@0 | 1688 | vshrn.u32 d22, q9, #16 |
michael@0 | 1689 | vshrn.u32 d23, q13, #16 |
michael@0 | 1690 | vshrn.u32 d24, q14, #16 |
michael@0 | 1691 | vshrn.u32 d25, q15, #16 |
michael@0 | 1692 | vmovn.u16 d20, q10 /* d20 = y */ |
michael@0 | 1693 | vmovn.u16 d21, q11 /* d21 = u */ |
michael@0 | 1694 | vmovn.u16 d22, q12 /* d22 = v */ |
michael@0 | 1695 | .endm |
michael@0 | 1696 | |
michael@0 | 1697 | .macro do_rgb_to_yuv |
michael@0 | 1698 | do_rgb_to_yuv_stage1 |
michael@0 | 1699 | do_rgb_to_yuv_stage2 |
michael@0 | 1700 | .endm |
michael@0 | 1701 | |
michael@0 | 1702 | .macro do_rgb_to_yuv_stage2_store_load_stage1 |
michael@0 | 1703 | vrshrn.u32 d20, q7, #16 |
michael@0 | 1704 | vrshrn.u32 d21, q8, #16 |
michael@0 | 1705 | vshrn.u32 d22, q9, #16 |
michael@0 | 1706 | vrev64.32 q9, q1 |
michael@0 | 1707 | vshrn.u32 d23, q13, #16 |
michael@0 | 1708 | vrev64.32 q13, q1 |
michael@0 | 1709 | vshrn.u32 d24, q14, #16 |
michael@0 | 1710 | vshrn.u32 d25, q15, #16 |
michael@0 | 1711 | do_load \bpp, 8 |
michael@0 | 1712 | vmovn.u16 d20, q10 /* d20 = y */ |
michael@0 | 1713 | vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
michael@0 | 1714 | vmovn.u16 d21, q11 /* d21 = u */ |
michael@0 | 1715 | vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
michael@0 | 1716 | vmovn.u16 d22, q12 /* d22 = v */ |
michael@0 | 1717 | vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
michael@0 | 1718 | vmull.u16 q7, d4, d0[0] |
michael@0 | 1719 | vmlal.u16 q7, d6, d0[1] |
michael@0 | 1720 | vmlal.u16 q7, d8, d0[2] |
michael@0 | 1721 | vst1.8 {d20}, [Y]! |
michael@0 | 1722 | vmull.u16 q8, d5, d0[0] |
michael@0 | 1723 | vmlal.u16 q8, d7, d0[1] |
michael@0 | 1724 | vmlal.u16 q8, d9, d0[2] |
michael@0 | 1725 | vmlsl.u16 q9, d4, d0[3] |
michael@0 | 1726 | vmlsl.u16 q9, d6, d1[0] |
michael@0 | 1727 | vmlal.u16 q9, d8, d1[1] |
michael@0 | 1728 | vst1.8 {d21}, [U]! |
michael@0 | 1729 | vmlsl.u16 q13, d5, d0[3] |
michael@0 | 1730 | vmlsl.u16 q13, d7, d1[0] |
michael@0 | 1731 | vmlal.u16 q13, d9, d1[1] |
michael@0 | 1732 | vrev64.32 q14, q1 |
michael@0 | 1733 | vrev64.32 q15, q1 |
michael@0 | 1734 | vmlal.u16 q14, d4, d1[1] |
michael@0 | 1735 | vmlsl.u16 q14, d6, d1[2] |
michael@0 | 1736 | vmlsl.u16 q14, d8, d1[3] |
michael@0 | 1737 | vst1.8 {d22}, [V]! |
michael@0 | 1738 | vmlal.u16 q15, d5, d1[1] |
michael@0 | 1739 | vmlsl.u16 q15, d7, d1[2] |
michael@0 | 1740 | vmlsl.u16 q15, d9, d1[3] |
michael@0 | 1741 | .endm |
michael@0 | 1742 | |
michael@0 | 1743 | .balign 16 |
michael@0 | 1744 | jsimd_\colorid\()_ycc_neon_consts: |
michael@0 | 1745 | .short 19595, 38470, 7471, 11059 |
michael@0 | 1746 | .short 21709, 32768, 27439, 5329 |
michael@0 | 1747 | .short 32767, 128, 32767, 128 |
michael@0 | 1748 | .short 32767, 128, 32767, 128 |
michael@0 | 1749 | |
michael@0 | 1750 | asm_function jsimd_\colorid\()_ycc_convert_neon |
michael@0 | 1751 | OUTPUT_WIDTH .req r0 |
michael@0 | 1752 | INPUT_BUF .req r1 |
michael@0 | 1753 | OUTPUT_BUF .req r2 |
michael@0 | 1754 | OUTPUT_ROW .req r3 |
michael@0 | 1755 | NUM_ROWS .req r4 |
michael@0 | 1756 | |
michael@0 | 1757 | OUTPUT_BUF0 .req r5 |
michael@0 | 1758 | OUTPUT_BUF1 .req r6 |
michael@0 | 1759 | OUTPUT_BUF2 .req OUTPUT_BUF |
michael@0 | 1760 | |
michael@0 | 1761 | RGB .req r7 |
michael@0 | 1762 | Y .req r8 |
michael@0 | 1763 | U .req r9 |
michael@0 | 1764 | V .req r10 |
michael@0 | 1765 | N .req ip |
michael@0 | 1766 | |
michael@0 | 1767 | /* Load constants to d0, d1, d2, d3 */ |
michael@0 | 1768 | adr ip, jsimd_\colorid\()_ycc_neon_consts |
michael@0 | 1769 | vld1.16 {d0, d1, d2, d3}, [ip, :128] |
michael@0 | 1770 | |
michael@0 | 1771 | /* Save ARM registers and handle input arguments */ |
michael@0 | 1772 | push {r4, r5, r6, r7, r8, r9, r10, lr} |
michael@0 | 1773 | ldr NUM_ROWS, [sp, #(4 * 8)] |
michael@0 | 1774 | ldr OUTPUT_BUF0, [OUTPUT_BUF] |
michael@0 | 1775 | ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] |
michael@0 | 1776 | ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] |
michael@0 | 1777 | .unreq OUTPUT_BUF |
michael@0 | 1778 | |
michael@0 | 1779 | /* Save NEON registers */ |
michael@0 | 1780 | vpush {d8-d15} |
michael@0 | 1781 | |
michael@0 | 1782 | /* Outer loop over scanlines */ |
michael@0 | 1783 | cmp NUM_ROWS, #1 |
michael@0 | 1784 | blt 9f |
michael@0 | 1785 | 0: |
michael@0 | 1786 | ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] |
michael@0 | 1787 | ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] |
michael@0 | 1788 | mov N, OUTPUT_WIDTH |
michael@0 | 1789 | ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] |
michael@0 | 1790 | add OUTPUT_ROW, OUTPUT_ROW, #1 |
michael@0 | 1791 | ldr RGB, [INPUT_BUF], #4 |
michael@0 | 1792 | |
michael@0 | 1793 | /* Inner loop over pixels */ |
michael@0 | 1794 | subs N, N, #8 |
michael@0 | 1795 | blt 3f |
michael@0 | 1796 | do_load \bpp, 8 |
michael@0 | 1797 | do_rgb_to_yuv_stage1 |
michael@0 | 1798 | subs N, N, #8 |
michael@0 | 1799 | blt 2f |
michael@0 | 1800 | 1: |
michael@0 | 1801 | do_rgb_to_yuv_stage2_store_load_stage1 |
michael@0 | 1802 | subs N, N, #8 |
michael@0 | 1803 | bge 1b |
michael@0 | 1804 | 2: |
michael@0 | 1805 | do_rgb_to_yuv_stage2 |
michael@0 | 1806 | do_store 8 |
michael@0 | 1807 | tst N, #7 |
michael@0 | 1808 | beq 8f |
michael@0 | 1809 | 3: |
michael@0 | 1810 | tst N, #4 |
michael@0 | 1811 | beq 3f |
michael@0 | 1812 | do_load \bpp, 4 |
michael@0 | 1813 | 3: |
michael@0 | 1814 | tst N, #2 |
michael@0 | 1815 | beq 4f |
michael@0 | 1816 | do_load \bpp, 2 |
michael@0 | 1817 | 4: |
michael@0 | 1818 | tst N, #1 |
michael@0 | 1819 | beq 5f |
michael@0 | 1820 | do_load \bpp, 1 |
michael@0 | 1821 | 5: |
michael@0 | 1822 | do_rgb_to_yuv |
michael@0 | 1823 | tst N, #4 |
michael@0 | 1824 | beq 6f |
michael@0 | 1825 | do_store 4 |
michael@0 | 1826 | 6: |
michael@0 | 1827 | tst N, #2 |
michael@0 | 1828 | beq 7f |
michael@0 | 1829 | do_store 2 |
michael@0 | 1830 | 7: |
michael@0 | 1831 | tst N, #1 |
michael@0 | 1832 | beq 8f |
michael@0 | 1833 | do_store 1 |
michael@0 | 1834 | 8: |
michael@0 | 1835 | subs NUM_ROWS, NUM_ROWS, #1 |
michael@0 | 1836 | bgt 0b |
michael@0 | 1837 | 9: |
michael@0 | 1838 | /* Restore all registers and return */ |
michael@0 | 1839 | vpop {d8-d15} |
michael@0 | 1840 | pop {r4, r5, r6, r7, r8, r9, r10, pc} |
michael@0 | 1841 | |
michael@0 | 1842 | .unreq OUTPUT_WIDTH |
michael@0 | 1843 | .unreq OUTPUT_ROW |
michael@0 | 1844 | .unreq INPUT_BUF |
michael@0 | 1845 | .unreq NUM_ROWS |
michael@0 | 1846 | .unreq OUTPUT_BUF0 |
michael@0 | 1847 | .unreq OUTPUT_BUF1 |
michael@0 | 1848 | .unreq OUTPUT_BUF2 |
michael@0 | 1849 | .unreq RGB |
michael@0 | 1850 | .unreq Y |
michael@0 | 1851 | .unreq U |
michael@0 | 1852 | .unreq V |
michael@0 | 1853 | .unreq N |
michael@0 | 1854 | .endfunc |
michael@0 | 1855 | |
michael@0 | 1856 | .purgem do_rgb_to_yuv |
michael@0 | 1857 | .purgem do_rgb_to_yuv_stage1 |
michael@0 | 1858 | .purgem do_rgb_to_yuv_stage2 |
michael@0 | 1859 | .purgem do_rgb_to_yuv_stage2_store_load_stage1 |
michael@0 | 1860 | |
michael@0 | 1861 | .endm |
michael@0 | 1862 | |
michael@0 | 1863 | /*--------------------------------- id ----- bpp R G B */ |
michael@0 | 1864 | generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 |
michael@0 | 1865 | generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 |
michael@0 | 1866 | generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 |
michael@0 | 1867 | generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 |
michael@0 | 1868 | generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 |
michael@0 | 1869 | generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 |
michael@0 | 1870 | |
michael@0 | 1871 | .purgem do_load |
michael@0 | 1872 | .purgem do_store |
michael@0 | 1873 | |
michael@0 | 1874 | /*****************************************************************************/ |
michael@0 | 1875 | |
michael@0 | 1876 | /* |
michael@0 | 1877 | * Load data into workspace, applying unsigned->signed conversion |
michael@0 | 1878 | * |
michael@0 | 1879 | * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get |
michael@0 | 1880 | * rid of VST1.16 instructions |
michael@0 | 1881 | */ |
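/*
 * Roughly: each 8-bit sample is widened to 16 bits and has CENTERJSAMPLE
 * (128) subtracted via 'vsubl.u8', turning the unsigned [0, 255] range into
 * the signed [-128, 127] range expected by the forward DCT.
 */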
michael@0 | 1882 | |
michael@0 | 1883 | asm_function jsimd_convsamp_neon |
michael@0 | 1884 | SAMPLE_DATA .req r0 |
michael@0 | 1885 | START_COL .req r1 |
michael@0 | 1886 | WORKSPACE .req r2 |
michael@0 | 1887 | TMP1 .req r3 |
michael@0 | 1888 | TMP2 .req r4 |
michael@0 | 1889 | TMP3 .req r5 |
michael@0 | 1890 | TMP4 .req ip |
michael@0 | 1891 | |
michael@0 | 1892 | push {r4, r5} |
michael@0 | 1893 | vmov.u8 d0, #128 |
michael@0 | 1894 | |
michael@0 | 1895 | ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} |
michael@0 | 1896 | add TMP1, TMP1, START_COL |
michael@0 | 1897 | add TMP2, TMP2, START_COL |
michael@0 | 1898 | add TMP3, TMP3, START_COL |
michael@0 | 1899 | add TMP4, TMP4, START_COL |
michael@0 | 1900 | vld1.8 {d16}, [TMP1] |
michael@0 | 1901 | vsubl.u8 q8, d16, d0 |
michael@0 | 1902 | vld1.8 {d18}, [TMP2] |
michael@0 | 1903 | vsubl.u8 q9, d18, d0 |
michael@0 | 1904 | vld1.8 {d20}, [TMP3] |
michael@0 | 1905 | vsubl.u8 q10, d20, d0 |
michael@0 | 1906 | vld1.8 {d22}, [TMP4] |
michael@0 | 1907 | ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} |
michael@0 | 1908 | vsubl.u8 q11, d22, d0 |
michael@0 | 1909 | vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! |
michael@0 | 1910 | add TMP1, TMP1, START_COL |
michael@0 | 1911 | add TMP2, TMP2, START_COL |
michael@0 | 1912 | vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! |
michael@0 | 1913 | add TMP3, TMP3, START_COL |
michael@0 | 1914 | add TMP4, TMP4, START_COL |
michael@0 | 1915 | vld1.8 {d24}, [TMP1] |
michael@0 | 1916 | vsubl.u8 q12, d24, d0 |
michael@0 | 1917 | vld1.8 {d26}, [TMP2] |
michael@0 | 1918 | vsubl.u8 q13, d26, d0 |
michael@0 | 1919 | vld1.8 {d28}, [TMP3] |
michael@0 | 1920 | vsubl.u8 q14, d28, d0 |
michael@0 | 1921 | vld1.8 {d30}, [TMP4] |
michael@0 | 1922 | vsubl.u8 q15, d30, d0 |
michael@0 | 1923 | vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! |
michael@0 | 1924 | vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! |
michael@0 | 1925 | pop {r4, r5} |
michael@0 | 1926 | bx lr |
michael@0 | 1927 | |
michael@0 | 1928 | .unreq SAMPLE_DATA |
michael@0 | 1929 | .unreq START_COL |
michael@0 | 1930 | .unreq WORKSPACE |
michael@0 | 1931 | .unreq TMP1 |
michael@0 | 1932 | .unreq TMP2 |
michael@0 | 1933 | .unreq TMP3 |
michael@0 | 1934 | .unreq TMP4 |
michael@0 | 1935 | .endfunc |
michael@0 | 1936 | |
michael@0 | 1937 | /*****************************************************************************/ |
michael@0 | 1938 | |
michael@0 | 1939 | /* |
michael@0 | 1940 | * jsimd_fdct_ifast_neon |
michael@0 | 1941 | * |
michael@0 | 1942 | * This function contains a fast, not so accurate integer implementation of |
michael@0 | 1943 | * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
michael@0 | 1944 | * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
michael@0 | 1945 | * function from jfdctfst.c. |
michael@0 | 1946 | * |
michael@0 | 1947 | * TODO: can be combined with 'jsimd_convsamp_neon' to get |
michael@0 | 1948 | * rid of a bunch of VLD1.16 instructions |
michael@0 | 1949 | */ |
michael@0 | 1950 | |
michael@0 | 1951 | #define XFIX_0_382683433 d0[0] |
michael@0 | 1952 | #define XFIX_0_541196100 d0[1] |
michael@0 | 1953 | #define XFIX_0_707106781 d0[2] |
michael@0 | 1954 | #define XFIX_1_306562965 d0[3] |
michael@0 | 1955 | |
michael@0 | 1956 | .balign 16 |
michael@0 | 1957 | jsimd_fdct_ifast_neon_consts: |
michael@0 | 1958 | .short (98 * 128) /* XFIX_0_382683433 */ |
michael@0 | 1959 | .short (139 * 128) /* XFIX_0_541196100 */ |
michael@0 | 1960 | .short (181 * 128) /* XFIX_0_707106781 */ |
michael@0 | 1961 | .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ |
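/*
 * A note on the constants above (an informal explanation, not from the
 * original sources): 'vqdmulh.s16' computes (a * b * 2) >> 16, so a constant
 * written as k * 128 yields an effective multiplier of k / 256, matching the
 * 8-bit fixed-point constants (98, 139, 181, 334) used by 'jpeg_fdct_ifast'
 * in jfdctfst.c. Since 1.306562965 cannot be represented directly in a
 * signed Q15 multiplier, only its fractional part (334 - 256) * 128 is
 * stored, and the code adds the input once more to account for the integer
 * part (x * 1.306... = x + x * 0.306...).
 */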
michael@0 | 1962 | |
michael@0 | 1963 | asm_function jsimd_fdct_ifast_neon |
michael@0 | 1964 | |
michael@0 | 1965 | DATA .req r0 |
michael@0 | 1966 | TMP .req ip |
michael@0 | 1967 | |
michael@0 | 1968 | vpush {d8-d15} |
michael@0 | 1969 | |
michael@0 | 1970 | /* Load constants */ |
michael@0 | 1971 | adr TMP, jsimd_fdct_ifast_neon_consts |
michael@0 | 1972 | vld1.16 {d0}, [TMP, :64] |
michael@0 | 1973 | |
michael@0 | 1974 | /* Load all DATA into NEON registers with the following allocation: |
michael@0 | 1975 | * 0 1 2 3 | 4 5 6 7 |
michael@0 | 1976 | * ---------+-------- |
michael@0 | 1977 | * 0 | d16 | d17 | q8 |
michael@0 | 1978 | * 1 | d18 | d19 | q9 |
michael@0 | 1979 | * 2 | d20 | d21 | q10 |
michael@0 | 1980 | * 3 | d22 | d23 | q11 |
michael@0 | 1981 | * 4 | d24 | d25 | q12 |
michael@0 | 1982 | * 5 | d26 | d27 | q13 |
michael@0 | 1983 | * 6 | d28 | d29 | q14 |
michael@0 | 1984 | * 7 | d30 | d31 | q15 |
michael@0 | 1985 | */ |
michael@0 | 1986 | |
michael@0 | 1987 | vld1.16 {d16, d17, d18, d19}, [DATA, :128]! |
michael@0 | 1988 | vld1.16 {d20, d21, d22, d23}, [DATA, :128]! |
michael@0 | 1989 | vld1.16 {d24, d25, d26, d27}, [DATA, :128]! |
michael@0 | 1990 | vld1.16 {d28, d29, d30, d31}, [DATA, :128] |
michael@0 | 1991 | sub DATA, DATA, #(128 - 32) |
michael@0 | 1992 | |
michael@0 | 1993 | mov TMP, #2 |
michael@0 | 1994 | 1: |
michael@0 | 1995 | /* Transpose */ |
michael@0 | 1996 | vtrn.16 q12, q13 |
michael@0 | 1997 | vtrn.16 q10, q11 |
michael@0 | 1998 | vtrn.16 q8, q9 |
michael@0 | 1999 | vtrn.16 q14, q15 |
michael@0 | 2000 | vtrn.32 q9, q11 |
michael@0 | 2001 | vtrn.32 q13, q15 |
michael@0 | 2002 | vtrn.32 q8, q10 |
michael@0 | 2003 | vtrn.32 q12, q14 |
michael@0 | 2004 | vswp d30, d23 |
michael@0 | 2005 | vswp d24, d17 |
michael@0 | 2006 | vswp d26, d19 |
michael@0 | 2007 | /* 1-D FDCT */ |
michael@0 | 2008 | vadd.s16 q2, q11, q12 |
michael@0 | 2009 | vswp d28, d21 |
michael@0 | 2010 | vsub.s16 q12, q11, q12 |
michael@0 | 2011 | vsub.s16 q6, q10, q13 |
michael@0 | 2012 | vadd.s16 q10, q10, q13 |
michael@0 | 2013 | vsub.s16 q7, q9, q14 |
michael@0 | 2014 | vadd.s16 q9, q9, q14 |
michael@0 | 2015 | vsub.s16 q1, q8, q15 |
michael@0 | 2016 | vadd.s16 q8, q8, q15 |
michael@0 | 2017 | vsub.s16 q4, q9, q10 |
michael@0 | 2018 | vsub.s16 q5, q8, q2 |
michael@0 | 2019 | vadd.s16 q3, q9, q10 |
michael@0 | 2020 | vadd.s16 q4, q4, q5 |
michael@0 | 2021 | vadd.s16 q2, q8, q2 |
michael@0 | 2022 | vqdmulh.s16 q4, q4, XFIX_0_707106781 |
michael@0 | 2023 | vadd.s16 q11, q12, q6 |
michael@0 | 2024 | vadd.s16 q8, q2, q3 |
michael@0 | 2025 | vsub.s16 q12, q2, q3 |
michael@0 | 2026 | vadd.s16 q3, q6, q7 |
michael@0 | 2027 | vadd.s16 q7, q7, q1 |
michael@0 | 2028 | vqdmulh.s16 q3, q3, XFIX_0_707106781 |
michael@0 | 2029 | vsub.s16 q6, q11, q7 |
michael@0 | 2030 | vadd.s16 q10, q5, q4 |
michael@0 | 2031 | vqdmulh.s16 q6, q6, XFIX_0_382683433 |
michael@0 | 2032 | vsub.s16 q14, q5, q4 |
michael@0 | 2033 | vqdmulh.s16 q11, q11, XFIX_0_541196100 |
michael@0 | 2034 | vqdmulh.s16 q5, q7, XFIX_1_306562965 |
michael@0 | 2035 | vadd.s16 q4, q1, q3 |
michael@0 | 2036 | vsub.s16 q3, q1, q3 |
michael@0 | 2037 | vadd.s16 q7, q7, q6 |
michael@0 | 2038 | vadd.s16 q11, q11, q6 |
michael@0 | 2039 | vadd.s16 q7, q7, q5 |
michael@0 | 2040 | vadd.s16 q13, q3, q11 |
michael@0 | 2041 | vsub.s16 q11, q3, q11 |
michael@0 | 2042 | vadd.s16 q9, q4, q7 |
michael@0 | 2043 | vsub.s16 q15, q4, q7 |
michael@0 | 2044 | subs TMP, TMP, #1 |
michael@0 | 2045 | bne 1b |
michael@0 | 2046 | |
michael@0 | 2047 | /* store results */ |
michael@0 | 2048 | vst1.16 {d16, d17, d18, d19}, [DATA, :128]! |
michael@0 | 2049 | vst1.16 {d20, d21, d22, d23}, [DATA, :128]! |
michael@0 | 2050 | vst1.16 {d24, d25, d26, d27}, [DATA, :128]! |
michael@0 | 2051 | vst1.16 {d28, d29, d30, d31}, [DATA, :128] |
michael@0 | 2052 | |
michael@0 | 2053 | vpop {d8-d15} |
michael@0 | 2054 | bx lr |
michael@0 | 2055 | |
michael@0 | 2056 | .unreq DATA |
michael@0 | 2057 | .unreq TMP |
michael@0 | 2058 | .endfunc |
michael@0 | 2059 | |
michael@0 | 2060 | /*****************************************************************************/ |
michael@0 | 2061 | |
michael@0 | 2062 | /* |
michael@0 | 2063 | * GLOBAL(void) |
michael@0 | 2064 | * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, |
michael@0 | 2065 | * DCTELEM * workspace); |
michael@0 | 2066 | * |
michael@0 | 2067 | * Note: the code uses 2-stage pipelining in order to improve instruction |
michael@0 | 2068 | * scheduling and eliminate stalls (this provides ~15% better |
michael@0 | 2069 | * performance for this function on both ARM Cortex-A8 and |
michael@0 | 2070 | * ARM Cortex-A9 when compared to the non-pipelined variant). |
michael@0 | 2071 | * The instructions which belong to the second stage use different |
michael@0 | 2072 | * indentation for better readability. |
michael@0 | 2073 | */ |
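/*
 * For reference, a rough scalar equivalent of the per-coefficient work done
 * below (variable names are illustrative only):
 *
 *   sign   = coef >> 15;                      // 0 or -1
 *   temp   = abs(coef) + correction;
 *   temp   = (temp * reciprocal) >> 16;       // unsigned 16x16 -> 32 multiply
 *   temp >>= shift;                           // 'vshl' by the negated shift
 *   result = (temp ^ sign) - sign;            // restore the original sign
 *
 * where 'reciprocal', 'correction' and 'shift' come from the divisors table
 * at offsets 0, 64*2 and 64*6 bytes respectively.
 */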
michael@0 | 2074 | asm_function jsimd_quantize_neon |
michael@0 | 2075 | |
michael@0 | 2076 | COEF_BLOCK .req r0 |
michael@0 | 2077 | DIVISORS .req r1 |
michael@0 | 2078 | WORKSPACE .req r2 |
michael@0 | 2079 | |
michael@0 | 2080 | RECIPROCAL .req DIVISORS |
michael@0 | 2081 | CORRECTION .req r3 |
michael@0 | 2082 | SHIFT .req ip |
michael@0 | 2083 | LOOP_COUNT .req r4 |
michael@0 | 2084 | |
michael@0 | 2085 | vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
michael@0 | 2086 | vabs.s16 q12, q0 |
michael@0 | 2087 | add CORRECTION, DIVISORS, #(64 * 2) |
michael@0 | 2088 | add SHIFT, DIVISORS, #(64 * 6) |
michael@0 | 2089 | vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
michael@0 | 2090 | vabs.s16 q13, q1 |
michael@0 | 2091 | vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
michael@0 | 2092 | vadd.u16 q12, q12, q10 /* add correction */ |
michael@0 | 2093 | vadd.u16 q13, q13, q11 |
michael@0 | 2094 | vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
michael@0 | 2095 | vmull.u16 q11, d25, d17 |
michael@0 | 2096 | vmull.u16 q8, d26, d18 |
michael@0 | 2097 | vmull.u16 q9, d27, d19 |
michael@0 | 2098 | vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
michael@0 | 2099 | vshrn.u32 d20, q10, #16 |
michael@0 | 2100 | vshrn.u32 d21, q11, #16 |
michael@0 | 2101 | vshrn.u32 d22, q8, #16 |
michael@0 | 2102 | vshrn.u32 d23, q9, #16 |
michael@0 | 2103 | vneg.s16 q12, q12 |
michael@0 | 2104 | vneg.s16 q13, q13 |
michael@0 | 2105 | vshr.s16 q2, q0, #15 /* extract sign */ |
michael@0 | 2106 | vshr.s16 q3, q1, #15 |
michael@0 | 2107 | vshl.u16 q14, q10, q12 /* shift */ |
michael@0 | 2108 | vshl.u16 q15, q11, q13 |
michael@0 | 2109 | |
michael@0 | 2110 | push {r4, r5} |
michael@0 | 2111 | mov LOOP_COUNT, #3 |
michael@0 | 2112 | 1: |
michael@0 | 2113 | vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
michael@0 | 2114 | veor.u16 q14, q14, q2 /* restore sign */ |
michael@0 | 2115 | vabs.s16 q12, q0 |
michael@0 | 2116 | vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
michael@0 | 2117 | vabs.s16 q13, q1 |
michael@0 | 2118 | veor.u16 q15, q15, q3 |
michael@0 | 2119 | vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
michael@0 | 2120 | vadd.u16 q12, q12, q10 /* add correction */ |
michael@0 | 2121 | vadd.u16 q13, q13, q11 |
michael@0 | 2122 | vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
michael@0 | 2123 | vmull.u16 q11, d25, d17 |
michael@0 | 2124 | vmull.u16 q8, d26, d18 |
michael@0 | 2125 | vmull.u16 q9, d27, d19 |
michael@0 | 2126 | vsub.u16 q14, q14, q2 |
michael@0 | 2127 | vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
michael@0 | 2128 | vsub.u16 q15, q15, q3 |
michael@0 | 2129 | vshrn.u32 d20, q10, #16 |
michael@0 | 2130 | vshrn.u32 d21, q11, #16 |
michael@0 | 2131 | vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
michael@0 | 2132 | vshrn.u32 d22, q8, #16 |
michael@0 | 2133 | vshrn.u32 d23, q9, #16 |
michael@0 | 2134 | vneg.s16 q12, q12 |
michael@0 | 2135 | vneg.s16 q13, q13 |
michael@0 | 2136 | vshr.s16 q2, q0, #15 /* extract sign */ |
michael@0 | 2137 | vshr.s16 q3, q1, #15 |
michael@0 | 2138 | vshl.u16 q14, q10, q12 /* shift */ |
michael@0 | 2139 | vshl.u16 q15, q11, q13 |
michael@0 | 2140 | subs LOOP_COUNT, LOOP_COUNT, #1 |
michael@0 | 2141 | bne 1b |
michael@0 | 2142 | pop {r4, r5} |
michael@0 | 2143 | |
michael@0 | 2144 | veor.u16 q14, q14, q2 /* restore sign */ |
michael@0 | 2145 | veor.u16 q15, q15, q3 |
michael@0 | 2146 | vsub.u16 q14, q14, q2 |
michael@0 | 2147 | vsub.u16 q15, q15, q3 |
michael@0 | 2148 | vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
michael@0 | 2149 | |
michael@0 | 2150 | bx lr /* return */ |
michael@0 | 2151 | |
michael@0 | 2152 | .unreq COEF_BLOCK |
michael@0 | 2153 | .unreq DIVISORS |
michael@0 | 2154 | .unreq WORKSPACE |
michael@0 | 2155 | .unreq RECIPROCAL |
michael@0 | 2156 | .unreq CORRECTION |
michael@0 | 2157 | .unreq SHIFT |
michael@0 | 2158 | .unreq LOOP_COUNT |
michael@0 | 2159 | .endfunc |
michael@0 | 2160 | |
michael@0 | 2161 | /*****************************************************************************/ |
michael@0 | 2162 | |
michael@0 | 2163 | /* |
michael@0 | 2164 | * GLOBAL(void) |
michael@0 | 2165 | * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, |
michael@0 | 2166 | * JDIMENSION downsampled_width, |
michael@0 | 2167 | * JSAMPARRAY input_data, |
michael@0 | 2168 | * JSAMPARRAY * output_data_ptr); |
michael@0 | 2169 | * |
michael@0 | 2170 | * Note: the use of unaligned writes is the main remaining bottleneck in |
michael@0 | 2171 | * this code. Eliminating it could potentially yield a performance |
michael@0 | 2172 | * improvement of up to tens of percent on Cortex-A8/Cortex-A9. |
michael@0 | 2173 | */ |
michael@0 | 2174 | |
michael@0 | 2175 | /* |
michael@0 | 2176 | * Upsample 16 source pixels to 32 destination pixels. The new 16 source |
michael@0 | 2177 | * pixels are loaded into q0. The previous 16 source pixels are in q1. The |
michael@0 | 2178 | * shifted-by-one source pixels are constructed in q2 by using q0 and q1. |
michael@0 | 2179 | * Register d28 is used for multiplication by 3. Register q15 is used |
michael@0 | 2180 | * for adding +1 bias. |
michael@0 | 2181 | */ |
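/*
 * For each source pixel s[i] this produces the output pair (matching
 * 'h2v1_fancy_upsample' in jdsample.c):
 *
 *   out[2*i]     = (3 * s[i] + s[i-1] + 1) >> 2
 *   out[2*i + 1] = (3 * s[i] + s[i+1] + 2) >> 2
 *
 * The "+2" rounding comes from 'vrshrn #2' and the "+1" bias from the q15
 * constant. The first and last output pixels of a row are plain copies of
 * the first and last source pixels, handled separately in 'upsample_row'.
 */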
michael@0 | 2182 | .macro upsample16 OUTPTR, INPTR |
michael@0 | 2183 | vld1.8 {q0}, [\INPTR]! |
michael@0 | 2184 | vmovl.u8 q8, d0 |
michael@0 | 2185 | vext.8 q2, q1, q0, #15 |
michael@0 | 2186 | vmovl.u8 q9, d1 |
michael@0 | 2187 | vaddw.u8 q10, q15, d4 |
michael@0 | 2188 | vaddw.u8 q11, q15, d5 |
michael@0 | 2189 | vmlal.u8 q8, d4, d28 |
michael@0 | 2190 | vmlal.u8 q9, d5, d28 |
michael@0 | 2191 | vmlal.u8 q10, d0, d28 |
michael@0 | 2192 | vmlal.u8 q11, d1, d28 |
michael@0 | 2193 | vmov q1, q0 /* backup source pixels to q1 */ |
michael@0 | 2194 | vrshrn.u16 d6, q8, #2 |
michael@0 | 2195 | vrshrn.u16 d7, q9, #2 |
michael@0 | 2196 | vshrn.u16 d8, q10, #2 |
michael@0 | 2197 | vshrn.u16 d9, q11, #2 |
michael@0 | 2198 | vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
michael@0 | 2199 | .endm |
michael@0 | 2200 | |
michael@0 | 2201 | /* |
michael@0 | 2202 | * Upsample 32 source pixels to 64 destination pixels. Compared to the |
michael@0 | 2203 | * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for |
michael@0 | 2204 | * the even and odd groups of 16 pixels, which is why no "vmov q1, q0" |
michael@0 | 2205 | * instructions are needed. This unrolling also allows loads and stores to be |
michael@0 | 2206 | * reordered to hide the multiplication latency and reduce stalls. |
michael@0 | 2207 | */ |
michael@0 | 2208 | .macro upsample32 OUTPTR, INPTR |
michael@0 | 2209 | /* even 16 pixels group */ |
michael@0 | 2210 | vld1.8 {q0}, [\INPTR]! |
michael@0 | 2211 | vmovl.u8 q8, d0 |
michael@0 | 2212 | vext.8 q2, q1, q0, #15 |
michael@0 | 2213 | vmovl.u8 q9, d1 |
michael@0 | 2214 | vaddw.u8 q10, q15, d4 |
michael@0 | 2215 | vaddw.u8 q11, q15, d5 |
michael@0 | 2216 | vmlal.u8 q8, d4, d28 |
michael@0 | 2217 | vmlal.u8 q9, d5, d28 |
michael@0 | 2218 | vmlal.u8 q10, d0, d28 |
michael@0 | 2219 | vmlal.u8 q11, d1, d28 |
michael@0 | 2220 | /* odd 16 pixels group */ |
michael@0 | 2221 | vld1.8 {q1}, [\INPTR]! |
michael@0 | 2222 | vrshrn.u16 d6, q8, #2 |
michael@0 | 2223 | vrshrn.u16 d7, q9, #2 |
michael@0 | 2224 | vshrn.u16 d8, q10, #2 |
michael@0 | 2225 | vshrn.u16 d9, q11, #2 |
michael@0 | 2226 | vmovl.u8 q8, d2 |
michael@0 | 2227 | vext.8 q2, q0, q1, #15 |
michael@0 | 2228 | vmovl.u8 q9, d3 |
michael@0 | 2229 | vaddw.u8 q10, q15, d4 |
michael@0 | 2230 | vaddw.u8 q11, q15, d5 |
michael@0 | 2231 | vmlal.u8 q8, d4, d28 |
michael@0 | 2232 | vmlal.u8 q9, d5, d28 |
michael@0 | 2233 | vmlal.u8 q10, d2, d28 |
michael@0 | 2234 | vmlal.u8 q11, d3, d28 |
michael@0 | 2235 | vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
michael@0 | 2236 | vrshrn.u16 d6, q8, #2 |
michael@0 | 2237 | vrshrn.u16 d7, q9, #2 |
michael@0 | 2238 | vshrn.u16 d8, q10, #2 |
michael@0 | 2239 | vshrn.u16 d9, q11, #2 |
michael@0 | 2240 | vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
michael@0 | 2241 | .endm |
michael@0 | 2242 | |
michael@0 | 2243 | /* |
michael@0 | 2244 | * Upsample a row of WIDTH pixels from INPTR to OUTPTR. |
michael@0 | 2245 | */ |
michael@0 | 2246 | .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 |
michael@0 | 2247 | /* special case for the first and last pixels */ |
michael@0 | 2248 | sub \WIDTH, \WIDTH, #1 |
michael@0 | 2249 | add \OUTPTR, \OUTPTR, #1 |
michael@0 | 2250 | ldrb \TMP1, [\INPTR, \WIDTH] |
michael@0 | 2251 | strb \TMP1, [\OUTPTR, \WIDTH, asl #1] |
michael@0 | 2252 | ldrb \TMP1, [\INPTR], #1 |
michael@0 | 2253 | strb \TMP1, [\OUTPTR, #-1] |
michael@0 | 2254 | vmov.8 d3[7], \TMP1 |
michael@0 | 2255 | |
michael@0 | 2256 | subs \WIDTH, \WIDTH, #32 |
michael@0 | 2257 | blt 5f |
michael@0 | 2258 | 0: /* process 32 pixels per iteration */ |
michael@0 | 2259 | upsample32 \OUTPTR, \INPTR |
michael@0 | 2260 | subs \WIDTH, \WIDTH, #32 |
michael@0 | 2261 | bge 0b |
michael@0 | 2262 | 5: |
michael@0 | 2263 | adds \WIDTH, \WIDTH, #16 |
michael@0 | 2264 | blt 1f |
michael@0 | 2265 | 0: /* process 16 pixels if needed */ |
michael@0 | 2266 | upsample16 \OUTPTR, \INPTR |
michael@0 | 2267 | subs \WIDTH, \WIDTH, #16 |
michael@0 | 2268 | 1: |
michael@0 | 2269 | adds \WIDTH, \WIDTH, #16 |
michael@0 | 2270 | beq 9f |
michael@0 | 2271 | |
michael@0 | 2272 | /* load the remaining 1-15 pixels */ |
michael@0 | 2273 | add \INPTR, \INPTR, \WIDTH |
michael@0 | 2274 | tst \WIDTH, #1 |
michael@0 | 2275 | beq 2f |
michael@0 | 2276 | sub \INPTR, \INPTR, #1 |
michael@0 | 2277 | vld1.8 {d0[0]}, [\INPTR] |
michael@0 | 2278 | 2: |
michael@0 | 2279 | tst \WIDTH, #2 |
michael@0 | 2280 | beq 2f |
michael@0 | 2281 | vext.8 d0, d0, d0, #6 |
michael@0 | 2282 | sub \INPTR, \INPTR, #1 |
michael@0 | 2283 | vld1.8 {d0[1]}, [\INPTR] |
michael@0 | 2284 | sub \INPTR, \INPTR, #1 |
michael@0 | 2285 | vld1.8 {d0[0]}, [\INPTR] |
michael@0 | 2286 | 2: |
michael@0 | 2287 | tst \WIDTH, #4 |
michael@0 | 2288 | beq 2f |
michael@0 | 2289 | vrev64.32 d0, d0 |
michael@0 | 2290 | sub \INPTR, \INPTR, #1 |
michael@0 | 2291 | vld1.8 {d0[3]}, [\INPTR] |
michael@0 | 2292 | sub \INPTR, \INPTR, #1 |
michael@0 | 2293 | vld1.8 {d0[2]}, [\INPTR] |
michael@0 | 2294 | sub \INPTR, \INPTR, #1 |
michael@0 | 2295 | vld1.8 {d0[1]}, [\INPTR] |
michael@0 | 2296 | sub \INPTR, \INPTR, #1 |
michael@0 | 2297 | vld1.8 {d0[0]}, [\INPTR] |
michael@0 | 2298 | 2: |
michael@0 | 2299 | tst \WIDTH, #8 |
michael@0 | 2300 | beq 2f |
michael@0 | 2301 | vmov d1, d0 |
michael@0 | 2302 | sub \INPTR, \INPTR, #8 |
michael@0 | 2303 | vld1.8 {d0}, [\INPTR] |
michael@0 | 2304 | 2: /* upsample the remaining pixels */ |
michael@0 | 2305 | vmovl.u8 q8, d0 |
michael@0 | 2306 | vext.8 q2, q1, q0, #15 |
michael@0 | 2307 | vmovl.u8 q9, d1 |
michael@0 | 2308 | vaddw.u8 q10, q15, d4 |
michael@0 | 2309 | vaddw.u8 q11, q15, d5 |
michael@0 | 2310 | vmlal.u8 q8, d4, d28 |
michael@0 | 2311 | vmlal.u8 q9, d5, d28 |
michael@0 | 2312 | vmlal.u8 q10, d0, d28 |
michael@0 | 2313 | vmlal.u8 q11, d1, d28 |
michael@0 | 2314 | vrshrn.u16 d10, q8, #2 |
michael@0 | 2315 | vrshrn.u16 d12, q9, #2 |
michael@0 | 2316 | vshrn.u16 d11, q10, #2 |
michael@0 | 2317 | vshrn.u16 d13, q11, #2 |
michael@0 | 2318 | vzip.8 d10, d11 |
michael@0 | 2319 | vzip.8 d12, d13 |
michael@0 | 2320 | /* store the remaining pixels */ |
michael@0 | 2321 | tst \WIDTH, #8 |
michael@0 | 2322 | beq 2f |
michael@0 | 2323 | vst1.8 {d10, d11}, [\OUTPTR]! |
michael@0 | 2324 | vmov q5, q6 |
michael@0 | 2325 | 2: |
michael@0 | 2326 | tst \WIDTH, #4 |
michael@0 | 2327 | beq 2f |
michael@0 | 2328 | vst1.8 {d10}, [\OUTPTR]! |
michael@0 | 2329 | vmov d10, d11 |
michael@0 | 2330 | 2: |
michael@0 | 2331 | tst \WIDTH, #2 |
michael@0 | 2332 | beq 2f |
michael@0 | 2333 | vst1.8 {d10[0]}, [\OUTPTR]! |
michael@0 | 2334 | vst1.8 {d10[1]}, [\OUTPTR]! |
michael@0 | 2335 | vst1.8 {d10[2]}, [\OUTPTR]! |
michael@0 | 2336 | vst1.8 {d10[3]}, [\OUTPTR]! |
michael@0 | 2337 | vext.8 d10, d10, d10, #4 |
michael@0 | 2338 | 2: |
michael@0 | 2339 | tst \WIDTH, #1 |
michael@0 | 2340 | beq 2f |
michael@0 | 2341 | vst1.8 {d10[0]}, [\OUTPTR]! |
michael@0 | 2342 | vst1.8 {d10[1]}, [\OUTPTR]! |
michael@0 | 2343 | 2: |
michael@0 | 2344 | 9: |
michael@0 | 2345 | .endm |
michael@0 | 2346 | |
michael@0 | 2347 | asm_function jsimd_h2v1_fancy_upsample_neon |
michael@0 | 2348 | |
michael@0 | 2349 | MAX_V_SAMP_FACTOR .req r0 |
michael@0 | 2350 | DOWNSAMPLED_WIDTH .req r1 |
michael@0 | 2351 | INPUT_DATA .req r2 |
michael@0 | 2352 | OUTPUT_DATA_PTR .req r3 |
michael@0 | 2353 | OUTPUT_DATA .req OUTPUT_DATA_PTR |
michael@0 | 2354 | |
michael@0 | 2355 | OUTPTR .req r4 |
michael@0 | 2356 | INPTR .req r5 |
michael@0 | 2357 | WIDTH .req ip |
michael@0 | 2358 | TMP .req lr |
michael@0 | 2359 | |
michael@0 | 2360 | push {r4, r5, r6, lr} |
michael@0 | 2361 | vpush {d8-d15} |
michael@0 | 2362 | |
michael@0 | 2363 | ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] |
michael@0 | 2364 | cmp MAX_V_SAMP_FACTOR, #0 |
michael@0 | 2365 | ble 99f |
michael@0 | 2366 | |
michael@0 | 2367 | /* initialize constants */ |
michael@0 | 2368 | vmov.u8 d28, #3 |
michael@0 | 2369 | vmov.u16 q15, #1 |
michael@0 | 2370 | 11: |
michael@0 | 2371 | ldr INPTR, [INPUT_DATA], #4 |
michael@0 | 2372 | ldr OUTPTR, [OUTPUT_DATA], #4 |
michael@0 | 2373 | mov WIDTH, DOWNSAMPLED_WIDTH |
michael@0 | 2374 | upsample_row OUTPTR, INPTR, WIDTH, TMP |
michael@0 | 2375 | subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 |
michael@0 | 2376 | bgt 11b |
michael@0 | 2377 | |
michael@0 | 2378 | 99: |
michael@0 | 2379 | vpop {d8-d15} |
michael@0 | 2380 | pop {r4, r5, r6, pc} |
michael@0 | 2381 | |
michael@0 | 2382 | .unreq MAX_V_SAMP_FACTOR |
michael@0 | 2383 | .unreq DOWNSAMPLED_WIDTH |
michael@0 | 2384 | .unreq INPUT_DATA |
michael@0 | 2385 | .unreq OUTPUT_DATA_PTR |
michael@0 | 2386 | .unreq OUTPUT_DATA |
michael@0 | 2387 | |
michael@0 | 2388 | .unreq OUTPTR |
michael@0 | 2389 | .unreq INPTR |
michael@0 | 2390 | .unreq WIDTH |
michael@0 | 2391 | .unreq TMP |
michael@0 | 2392 | |
michael@0 | 2393 | .endfunc |
michael@0 | 2394 | |
michael@0 | 2395 | .purgem upsample16 |
michael@0 | 2396 | .purgem upsample32 |
michael@0 | 2397 | .purgem upsample_row |