/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16     \x0, \x1
    vtrn.16     \x2, \x3
    vtrn.32     \x0, \x2
    vtrn.32     \x1, \x3
.endm

#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
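 *
 * The FIX_* constants defined below appear to be the usual ISLOW 13-bit
 * scaled integers, i.e. FIX(x) = round(x * 2^13); for example,
 * FIX_1_175875602 = round(1.175875602 * 8192) = 9633.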
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0  = q4; \
    tmp1  = q5; \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3  = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    ROW0L       .req d16
    ROW0R       .req d17
    ROW1L       .req d18
    ROW1R       .req d19
    ROW2L       .req d20
    ROW2R       .req d21
    ROW3L       .req d22
    ROW3R       .req d23
    ROW4L       .req d24
    ROW4R       .req d25
    ROW5L       .req d26
    ROW5R       .req d27
    ROW6L       .req d28
    ROW6R       .req d29
    ROW7L       .req d30
    ROW7R       .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    ( q8  )
     *   1 | d18     | d19    ( q9  )
     *   2 | d20     | d21    ( q10 )
     *   3 | d22     | d23    ( q11 )
     *   4 | d24     | d25    ( q12 )
     *   5 | d26     | d27    ( q13 )
     *   6 | d28     | d29    ( q14 )
     *   7 | d30     | d31    ( q15 )
     */
    adr         ip, jsimd_idct_islow_neon_consts
    vld1.16     {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16     {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16     {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16    q8, q8, q0
    vld1.16     {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16    q9, q9, q1
    vld1.16     {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16    q10, q10, q2
    vld1.16     {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16    q11, q11, q3
    vld1.16     {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16    q12, q12, q0
    vld1.16     {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16    q14, q14, q2
    vmul.s16    q13, q13, q1
    vld1.16     {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add         ip, ip, #16
    vmul.s16    q15, q15, q3
    vpush       {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16    d4, ROW7L, ROW3L
    vadd.s16    d5, ROW5L, ROW1L
    vmull.s16   q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16   q6, d5, XFIX_1_175875602
    vmull.s16   q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push        {r4, r5}
    vmlal.s16   q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16   q3, ROW0L, ROW4L
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16   q2, ROW2L, XFIX_0_541196100
    vmlal.s16   q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr         r0, r4, r5
    vmov        q4, q6
    vmlsl.s16   q6, ROW5L, XFIX_2_562915447
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16   q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32    q3, q3, #13
    orr         r0, r0, r4
    vmlsl.s16   q4, ROW1L, XFIX_0_899976223
    orr         r0, r0, r5
    vadd.s32    q1, q3, q2
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov        q5, q7
    vadd.s32    q1, q1, q6
    orr         r0, r0, r4
    vmlsl.s16   q7, ROW7L, XFIX_0_899976223
    orr         r0, r0, r5
    vmlal.s16   q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32  ROW1L, q1, #11
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32    q1, q1, q6
    vmlal.s16   q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr         r0, r0, r4
    vmlsl.s16   q5, ROW3L, XFIX_2_562915447
    orr         r0, r0, r5
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16   q6, ROW6L, XFIX_0_541196100
    vsub.s32    q3, q3, q2
    orr         r0, r0, r4
    vrshrn.s32  ROW6L, q1, #11
    orr         r0, r0, r5
    vadd.s32    q1, q3, q5
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32    q3, q3, q5
    vaddl.s16   q5, ROW0L, ROW4L
    orr         r0, r0, r4
    vrshrn.s32  ROW2L, q1, #11
    orr         r0, r0, r5
    vrshrn.s32  ROW5L, q3, #11
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32    q5, q5, #13
    vmlal.s16   q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr         r0, r0, r4
    vadd.s32    q2, q5, q6
    orrs        r0, r0, r5
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    ldrd        r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    orr         r0, r4, r5
    vsub.s32    q3, q1, q4
    pop         {r4, r5}
    vrshrn.s32  ROW7L, q2, #11
    vrshrn.s32  ROW3L, q5, #11
    vrshrn.s32  ROW0L, q6, #11
    vrshrn.s32  ROW4L, q3, #11

    beq         3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16    {d2}, [ip, :64] /* reload constants */
    vadd.s16    d10, ROW7R, ROW3R
    vadd.s16    d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16     ROW6L, ROW7L
    vmull.s16   q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16   q6, d8, XFIX_1_175875602
    vtrn.16     ROW2L, ROW3L
    vmull.s16   q7, d10, XFIX_1_175875602
    vmlal.s16   q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16     ROW0L, ROW1L
    vsubl.s16   q3, ROW0R, ROW4R
    vmull.s16   q2, ROW2R, XFIX_0_541196100
    vmlal.s16   q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16     ROW4L, ROW5L
    vmov        q4, q6
    vmlsl.s16   q6, ROW5R, XFIX_2_562915447
    vmlal.s16   q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32     ROW1L, ROW3L
    vshl.s32    q3, q3, #13
    vmlsl.s16   q4, ROW1R, XFIX_0_899976223
    vtrn.32     ROW4L, ROW6L
    vadd.s32    q1, q3, q2
    vmov        q5, q7
    vadd.s32    q1, q1, q6
    vtrn.32     ROW0L, ROW2L
    vmlsl.s16   q7, ROW7R, XFIX_0_899976223
    vmlal.s16   q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32  ROW1R, q1, #11
    vtrn.32     ROW5L, ROW7L
    vsub.s32    q1, q1, q6
    vmlal.s16   q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16   q5, ROW3R, XFIX_2_562915447
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16   q6, ROW6R, XFIX_0_541196100
    vsub.s32    q3, q3, q2
    vrshrn.s32  ROW6R, q1, #11
    vadd.s32    q1, q3, q5
    vsub.s32    q3, q3, q5
    vaddl.s16   q5, ROW0R, ROW4R
    vrshrn.s32  ROW2R, q1, #11
    vrshrn.s32  ROW5R, q3, #11
    vshl.s32    q5, q5, #13
    vmlal.s16   q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32    q2, q5, q6
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    vsub.s32    q3, q1, q4
    vrshrn.s32  ROW7R, q2, #11
    vrshrn.s32  ROW3R, q5, #11
    vrshrn.s32  ROW0R, q6, #11
    vrshrn.s32  ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16     ROW6R, ROW7R
    vtrn.16     ROW2R, ROW3R
    vtrn.16     ROW0R, ROW1R
    vtrn.16     ROW4R, ROW5R
    vtrn.32     ROW1R, ROW3R
    vtrn.32     ROW4R, ROW6R
    vtrn.32     ROW0R, ROW2R
    vtrn.32     ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16    {d2}, [ip, :64] /* reload constants */
    vmull.s16   q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16   q6, ROW1L, XFIX_1_175875602
    vmlal.s16   q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16   q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16   q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16   q7, ROW3L, XFIX_1_175875602
    vmlal.s16   q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16   q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16   q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vmull.s16   q2, ROW2L, XFIX_0_541196100
    vmlal.s16   q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov        q4, q6
    vmlsl.s16   q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16   q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32    q3, q3, #13
    vmlsl.s16   q4, ROW1L, XFIX_0_899976223
    vadd.s32    q1, q3, q2
    vmov        q5, q7
    vadd.s32    q1, q1, q6
    vmlsl.s16   q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16   q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32   ROW1L, q1, #16
    vsub.s32    q1, q1, q6
    vmlal.s16   q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16   q5, ROW3L, XFIX_2_562915447
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16   q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32    q3, q3, q2
    vshrn.s32   ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32    q1, q3, q5
    vsub.s32    q3, q3, q5
    vaddl.s16   q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vshrn.s32   ROW2L, q1, #16
    vshrn.s32   ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vshl.s32    q5, q5, #13
    vmlal.s16   q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32    q2, q5, q6
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    vsub.s32    q3, q1, q4
    vshrn.s32   ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32   ROW3L, q5, #16
    vshrn.s32   ROW0L, q6, #16
    vshrn.s32   ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16    {d2}, [ip, :64] /* reload constants */
    vmull.s16   q6, ROW5R, XFIX_1_175875602
    vmlal.s16   q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16   q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16   q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16   q7, ROW7R, XFIX_1_175875602
    vmlal.s16   q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16   q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16   q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16   q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vmull.s16   q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16   q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov        q4, q6
    vmlsl.s16   q6, ROW5R, XFIX_2_562915447
    vmlal.s16   q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32    q3, q3, #13
    vmlsl.s16   q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32    q1, q3, q2
    vmov        q5, q7
    vadd.s32    q1, q1, q6
    vmlsl.s16   q7, ROW7R, XFIX_0_899976223
    vmlal.s16   q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32   ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32    q1, q1, q6
    vmlal.s16   q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16   q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16   q6, ROW6R, XFIX_0_541196100
    vsub.s32    q3, q3, q2
    vshrn.s32   ROW6R, q1, #16
    vadd.s32    q1, q3, q5
    vsub.s32    q3, q3, q5
    vaddl.s16   q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vshrn.s32   ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32   ROW5R, q3, #16
    vshl.s32    q5, q5, #13
    vmlal.s16   q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32    q2, q5, q6
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    vsub.s32    q3, q1, q4
    vshrn.s32   ROW7R, q2, #16
    vshrn.s32   ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32   ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32   ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop        {d8-d15} /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16     q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8     q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8      d16, d17
    vtrn.8      d18, d19
    vadd.u8     q8, q8, q0
    vadd.u8     q9, q9, q0
    vtrn.16     q10, q11
    /* Store results to the output buffer */
    ldmia       OUTPUT_BUF!, {TMP1, TMP2}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    vst1.8      {d16}, [TMP1]
    vtrn.8      d20, d21
    vst1.8      {d17}, [TMP2]
    ldmia       OUTPUT_BUF!, {TMP1, TMP2}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    vst1.8      {d18}, [TMP1]
    vadd.u8     q10, q10, q0
    vst1.8      {d19}, [TMP2]
    ldmia       OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    add         TMP3, TMP3, OUTPUT_COL
    add         TMP4, TMP4, OUTPUT_COL
    vtrn.8      d22, d23
    vst1.8      {d20}, [TMP1]
    vadd.u8     q11, q11, q0
    vst1.8      {d21}, [TMP2]
    vst1.8      {d22}, [TMP3]
    vst1.8      {d23}, [TMP4]
    bx          lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16     ROW6L, ROW7L
    vtrn.16     ROW2L, ROW3L
    vtrn.16     ROW0L, ROW1L
    vtrn.16     ROW4L, ROW5L
    vshl.s16    ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32     ROW1L, ROW3L
    vtrn.32     ROW4L, ROW6L
    vtrn.32     ROW0L, ROW2L
    vtrn.32     ROW5L, ROW7L

    cmp         r0, #0
    beq         4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16    ROW1R, ROW0R[1]
    vdup.s16    ROW2R, ROW0R[2]
    vdup.s16    ROW3R, ROW0R[3]
    vdup.s16    ROW4R, ROW0R[0]
    vdup.s16    ROW5R, ROW0R[1]
    vdup.s16    ROW6R, ROW0R[2]
    vdup.s16    ROW7R, ROW0R[3]
    vdup.s16    ROW0R, ROW0R[0]
    b           1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16    {d2}, [ip, :64] /* reload constants */
    vmull.s16   q6, ROW1L, XFIX_1_175875602
    vmlal.s16   q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16   q7, ROW3L, XFIX_1_175875602
    vmlal.s16   q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16   q2, ROW2L, XFIX_0_541196100
    vshll.s16   q3, ROW0L, #13
    vmov        q4, q6
    vmlal.s16   q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16   q4, ROW1L, XFIX_0_899976223
    vadd.s32    q1, q3, q2
    vmov        q5, q7
    vmlal.s16   q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32    q1, q1, q6
    vadd.s32    q6, q6, q6
    vmlsl.s16   q5, ROW3L, XFIX_2_562915447
    vshrn.s32   ROW1L, q1, #16
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32    q3, q3, q2
    vshrn.s32   ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32    q1, q3, q5
    vsub.s32    q3, q3, q5
    vshll.s16   q5, ROW0L, #13
    vshrn.s32   ROW2L, q1, #16
    vshrn.s32   ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vadd.s32    q2, q5, q6
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    vsub.s32    q3, q1, q4
    vshrn.s32   ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32   ROW3L, q5, #16
    vshrn.s32   ROW0L, q6, #16
    vshrn.s32   ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16    {d2}, [ip, :64] /* reload constants */
    vmull.s16   q6, ROW5L, XFIX_1_175875602
    vmlal.s16   q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16   q7, ROW7L, XFIX_1_175875602
    vmlal.s16   q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16   q2, ROW6L, XFIX_0_541196100
    vshll.s16   q3, ROW4L, #13
    vmov        q4, q6
    vmlal.s16   q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16   q4, ROW5L, XFIX_0_899976223
    vadd.s32    q1, q3, q2
    vmov        q5, q7
    vmlal.s16   q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32    q1, q1, q6
    vadd.s32    q6, q6, q6
    vmlsl.s16   q5, ROW7L, XFIX_2_562915447
    vshrn.s32   ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32    q1, q1, q6
    vmull.s16   q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32    q3, q3, q2
    vshrn.s32   ROW6R, q1, #16
    vadd.s32    q1, q3, q5
    vsub.s32    q3, q3, q5
    vshll.s16   q5, ROW4L, #13
    vshrn.s32   ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32   ROW5R, q3, #16
    vadd.s32    q2, q5, q6
    vsub.s32    q1, q5, q6
    vadd.s32    q6, q2, q7
    vsub.s32    q2, q2, q7
    vadd.s32    q5, q1, q4
    vsub.s32    q3, q1, q4
    vshrn.s32   ROW7R, q2, #16
    vshrn.s32   ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32   ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32   ROW4R, q3, #16
    b           2b /* Go to epilogue */

    .unreq      DCT_TABLE
    .unreq      COEF_BLOCK
    .unreq      OUTPUT_BUF
    .unreq      OUTPUT_COL
    .unreq      TMP1
    .unreq      TMP2
    .unreq      TMP3
    .unreq      TMP4

    .unreq      ROW0L
    .unreq      ROW0R
    .unreq      ROW1L
    .unreq      ROW1R
    .unreq      ROW2L
    .unreq      ROW2R
    .unreq      ROW3L
    .unreq      ROW3R
    .unreq      ROW4L
    .unreq      ROW4R
    .unreq      ROW5L
    .unreq      ROW5R
    .unreq      ROW6L
    .unreq      ROW6R
    .unreq      ROW7L
    .unreq      ROW7R
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * In the ARM NEON case, however, some extra additions are required because
 * the VQDMULH instruction cannot handle constants larger than 1, so
 * expressions like "x * 1.082392200" have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition. Overall, there
 * are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH and
 * 35 VADD/VSUB instructions.
 */

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    ( q8  )
     *   1 | d18     | d19    ( q9  )
     *   2 | d20     | d21    ( q10 )
     *   3 | d22     | d23    ( q11 )
     *   4 | d24     | d25    ( q12 )
     *   5 | d26     | d27    ( q13 )
     *   6 | d28     | d29    ( q14 )
     *   7 | d30     | d31    ( q15 )
     */
    adr         ip, jsimd_idct_ifast_neon_consts
    vld1.16     {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16     {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16     {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16    q8, q8, q0
    vld1.16     {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16    q9, q9, q1
    vld1.16     {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16    q10, q10, q2
    vld1.16     {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16    q11, q11, q3
    vld1.16     {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16    q12, q12, q0
    vld1.16     {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16    q14, q14, q2
    vmul.s16    q13, q13, q1
    vld1.16     {d0}, [ip, :64] /* load constants */
    vmul.s16    q15, q15, q3
    vpush       {d8-d13} /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16    q2, q10, q14
    vadd.s16    q14, q10, q14
    vsub.s16    q1, q11, q13
    vadd.s16    q13, q11, q13
    vsub.s16    q5, q9, q15
    vadd.s16    q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16    q3, q1, q1
    vsub.s16    q1, q5, q1
    vadd.s16    q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16    q2, q15, q13
    vadd.s16    q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16    q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16    q10, q10, q14
    vadd.s16    q2, q2, q6
    vsub.s16    q6, q8, q12
    vadd.s16    q12, q8, q12
    vadd.s16    q9, q5, q4
    vadd.s16    q5, q6, q10
    vsub.s16    q10, q6, q10
    vadd.s16    q6, q15, q13
    vadd.s16    q8, q12, q14
    vsub.s16    q3, q6, q3
    vsub.s16    q12, q12, q14
    vsub.s16    q3, q3, q1
    vsub.s16    q1, q9, q1
    vadd.s16    q2, q3, q2
    vsub.s16    q15, q8, q6
    vadd.s16    q1, q1, q2
    vadd.s16    q8, q8, q6
    vadd.s16    q14, q5, q3
    vsub.s16    q9, q5, q3
    vsub.s16    q13, q10, q2
    vadd.s16    q10, q10, q2
    /* Transpose */
    vtrn.16     q8, q9
    vsub.s16    q11, q12, q1
    vtrn.16     q14, q15
    vadd.s16    q12, q12, q1
    vtrn.16     q10, q11
    vtrn.16     q12, q13
    vtrn.32     q9, q11
    vtrn.32     q12, q14
    vtrn.32     q8, q10
    vtrn.32     q13, q15
    vswp        d28, d21
    vswp        d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16    q2, q10, q14
    vswp        d30, d23
    vadd.s16    q14, q10, q14
    vswp        d24, d17
    vsub.s16    q1, q11, q13
    vadd.s16    q13, q11, q13
    vsub.s16    q5, q9, q15
    vadd.s16    q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16    q3, q1, q1
    vsub.s16    q1, q5, q1
    vadd.s16    q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16    q2, q15, q13
    vadd.s16    q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16    q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16    q10, q10, q14
    vadd.s16    q2, q2, q6
    vsub.s16    q6, q8, q12
    vadd.s16    q12, q8, q12
    vadd.s16    q9, q5, q4
    vadd.s16    q5, q6, q10
    vsub.s16    q10, q6, q10
    vadd.s16    q6, q15, q13
    vadd.s16    q8, q12, q14
    vsub.s16    q3, q6, q3
    vsub.s16    q12, q12, q14
    vsub.s16    q3, q3, q1
    vsub.s16    q1, q9, q1
    vadd.s16    q2, q3, q2
    vsub.s16    q15, q8, q6
    vadd.s16    q1, q1, q2
    vadd.s16    q8, q8, q6
    vadd.s16    q14, q5, q3
    vsub.s16    q9, q5, q3
    vsub.s16    q13, q10, q2
    vpop        {d8-d13} /* restore NEON registers */
    vadd.s16    q10, q10, q2
    vsub.s16    q11, q12, q1
    vadd.s16    q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8     q0, #0x80
    vqshrn.s16  d16, q8, #5
    vqshrn.s16  d17, q9, #5
    vqshrn.s16  d18, q10, #5
    vqshrn.s16  d19, q11, #5
    vqshrn.s16  d20, q12, #5
    vqshrn.s16  d21, q13, #5
    vqshrn.s16  d22, q14, #5
    vqshrn.s16  d23, q15, #5
    vadd.u8     q8, q8, q0
    vadd.u8     q9, q9, q0
    vadd.u8     q10, q10, q0
    vadd.u8     q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16     q8, q9
    vtrn.16     q10, q11
    vtrn.32     q8, q10
    vtrn.32     q9, q11
    vtrn.8      d16, d17
    vtrn.8      d18, d19
    /* Store results to the output buffer */
    ldmia       OUTPUT_BUF!, {TMP1, TMP2}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    vst1.8      {d16}, [TMP1]
    vst1.8      {d17}, [TMP2]
    ldmia       OUTPUT_BUF!, {TMP1, TMP2}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    vst1.8      {d18}, [TMP1]
    vtrn.8      d20, d21
    vst1.8      {d19}, [TMP2]
    ldmia       OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    add         TMP3, TMP3, OUTPUT_COL
    add         TMP4, TMP4, OUTPUT_COL
    vst1.8      {d20}, [TMP1]
    vtrn.8      d22, d23
    vst1.8      {d21}, [TMP2]
    vst1.8      {d22}, [TMP3]
    vst1.8      {d23}, [TMP4]
    bx          lr

    .unreq      DCT_TABLE
    .unreq      COEF_BLOCK
    .unreq      OUTPUT_BUF
    .unreq      OUTPUT_COL
    .unreq      TMP1
    .unreq      TMP2
    .unreq      TMP3
    .unreq      TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code that produces reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
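 *
 * Note: the FIX_* constants defined below are 13-bit scaled integers,
 *       FIX(x) = round(x * 2^CONST_BITS) with CONST_BITS = 13, e.g.
 *       FIX(0.211164243) = round(0.211164243 * 8192) = 1730, and the
 *       'shift' argument of the idct_helper macro is the descaling right
 *       shift applied to the 32-bit accumulators of each pass.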
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16   q14, \x4, d2[2]
    vmlal.s16   q14, \x8, d0[0]
    vmlal.s16   q14, \x14, d0[1]

    vmull.s16   q13, \x16, d1[2]
    vmlal.s16   q13, \x12, d1[3]
    vmlal.s16   q13, \x10, d2[0]
    vmlal.s16   q13, \x6, d2[1]

    vmull.s16   q15, \x4, d2[2]
    vmlsl.s16   q15, \x8, d0[0]
    vmlsl.s16   q15, \x14, d0[1]

    vmull.s16   q12, \x16, d0[2]
    vmlal.s16   q12, \x12, d0[3]
    vmlal.s16   q12, \x10, d1[0]
    vmlal.s16   q12, \x6, d1[1]

    vadd.s32    q10, q14, q13
    vsub.s32    q14, q14, q13

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q14, q14, #\shift
        vmovn.s32   \y26, q10
        vmovn.s32   \y29, q14
    .else
        vrshrn.s32  \y26, q10, #\shift
        vrshrn.s32  \y29, q14, #\shift
    .endif

    vadd.s32    q10, q15, q12
    vsub.s32    q15, q15, q12

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q15, q15, #\shift
        vmovn.s32   \y27, q10
        vmovn.s32   \y28, q15
    .else
        vrshrn.s32  \y27, q10, #\shift
        vrshrn.s32  \y28, q15, #\shift
    .endif
.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    vpush       {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr         TMP4, jsimd_idct_4x4_neon_consts
    vld1.16     {d0, d1, d2, d3}, [TMP4, :128]
    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16     {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16     {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add         COEF_BLOCK, COEF_BLOCK, #16
    vld1.16     {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16     {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16     {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16    q2, q2, q9
    vld1.16     {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16    q3, q3, q10
    vmul.s16    q4, q4, q11
    add         DCT_TABLE, DCT_TABLE, #16
    vld1.16     {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16    q5, q5, q12
    vmul.s16    q6, q6, q13
    vld1.16     {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16    q7, q7, q14
    vmul.s16    q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16    q15, #0x80
    vadd.s16    q13, q13, q15
    vadd.s16    q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia       OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL
    add         TMP3, TMP3, OUTPUT_COL
    add         TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32     {d26[0]}, [TMP1]!
    vst1.32     {d27[0]}, [TMP3]!
    vst1.32     {d26[1]}, [TMP2]!
    vst1.32     {d27[1]}, [TMP4]!
#else
    vst1.8      {d26[0]}, [TMP1]!
    vst1.8      {d27[0]}, [TMP3]!
    vst1.8      {d26[1]}, [TMP1]!
    vst1.8      {d27[1]}, [TMP3]!
    vst1.8      {d26[2]}, [TMP1]!
    vst1.8      {d27[2]}, [TMP3]!
    vst1.8      {d26[3]}, [TMP1]!
    vst1.8      {d27[3]}, [TMP3]!

    vst1.8      {d26[4]}, [TMP2]!
    vst1.8      {d27[4]}, [TMP4]!
    vst1.8      {d26[5]}, [TMP2]!
    vst1.8      {d27[5]}, [TMP4]!
    vst1.8      {d26[6]}, [TMP2]!
    vst1.8      {d27[6]}, [TMP4]!
    vst1.8      {d26[7]}, [TMP2]!
    vst1.8      {d27[7]}, [TMP4]!
#endif

    vpop        {d8-d15}
    bx          lr

    .unreq      DCT_TABLE
    .unreq      COEF_BLOCK
    .unreq      OUTPUT_BUF
    .unreq      OUTPUT_COL
    .unreq      TMP1
    .unreq      TMP2
    .unreq      TMP3
    .unreq      TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code that produces reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095  /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785  /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16   q14, \x4, #15
    vmull.s16   q13, \x6, d0[3]
    vmlal.s16   q13, \x10, d0[2]
    vmlal.s16   q13, \x12, d0[1]
    vmlal.s16   q13, \x16, d0[0]

    vadd.s32    q10, q14, q13
    vsub.s32    q14, q14, q13

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q14, q14, #\shift
        vmovn.s32   \y26, q10
        vmovn.s32   \y27, q14
    .else
        vrshrn.s32  \y26, q10, #\shift
        vrshrn.s32  \y27, q14, #\shift
    .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req ip

    vpush       {d8-d15}

    /* Load constants */
    adr         TMP2, jsimd_idct_2x2_neon_consts
    vld1.16     {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16     {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add         COEF_BLOCK, COEF_BLOCK, #16
    vld1.16     {d10, d11}, [COEF_BLOCK, :128]!
    add         COEF_BLOCK, COEF_BLOCK, #16
    vld1.16     {d12, d13}, [COEF_BLOCK, :128]!
    add         COEF_BLOCK, COEF_BLOCK, #16
    vld1.16     {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16     {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16    q2, q2, q9
    vmul.s16    q3, q3, q10
    add         DCT_TABLE, DCT_TABLE, #16
    vld1.16     {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16    q5, q5, q12
    add         DCT_TABLE, DCT_TABLE, #16
    vld1.16     {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16    q6, q6, q13
    add         DCT_TABLE, DCT_TABLE, #16
    vld1.16     {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16    q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16   q13, d6, d0[3]
    vmlal.s16   q13, d10, d0[2]
    vmlal.s16   q13, d12, d0[1]
    vmlal.s16   q13, d16, d0[0]
    vmull.s16   q12, d7, d0[3]
    vmlal.s16   q12, d11, d0[2]
    vmlal.s16   q12, d13, d0[1]
    vmlal.s16   q12, d17, d0[0]
    vshll.s16   q14, d4, #15
    vshll.s16   q15, d5, #15
    vadd.s32    q10, q14, q13
    vsub.s32    q14, q14, q13
    vrshrn.s32  d4, q10, #13
    vrshrn.s32  d6, q14, #13
    vadd.s32    q10, q15, q12
    vsub.s32    q14, q15, q12
    vrshrn.s32  d5, q10, #13
    vrshrn.s32  d7, q14, #13
    vtrn.16     q2, q3
    vtrn.32     q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16    q15, #0x80
    vadd.s16    q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer */
    ldmia       OUTPUT_BUF, {TMP1, TMP2}
    add         TMP1, TMP1, OUTPUT_COL
    add         TMP2, TMP2, OUTPUT_COL

    vst1.8      {d26[0]}, [TMP1]!
    vst1.8      {d27[4]}, [TMP1]!
    vst1.8      {d26[1]}, [TMP2]!
    vst1.8      {d27[5]}, [TMP2]!

    vpop        {d8-d15}
    bx          lr

    .unreq      DCT_TABLE
    .unreq      COEF_BLOCK
    .unreq      OUTPUT_BUF
    .unreq      OUTPUT_COL
    .unreq      TMP1
    .unreq      TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
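
/*
 * For reference, a rough C sketch of what the do_yuv_to_rgb_stage1/2 macros
 * below compute per pixel (variable names and the clamp() helper are
 * illustrative only, and arithmetic right shifts are assumed):
 *
 *   int clamp(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
 *
 *   r = clamp(y + ((22971 * (cr - 128) + (1 << 13)) >> 14));
 *   g = clamp(y + ((-11277 * (cb - 128) - 23401 * (cr - 128) + (1 << 14)) >> 15));
 *   b = clamp(y + ((29033 * (cb - 128) + (1 << 13)) >> 14));
 */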

.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U, :64]!
        vld1.8  {d5}, [V, :64]!
        vld1.8  {d0}, [Y, :64]!
        pld     [U, #64]
        pld     [V, #64]
        pld     [Y, #64]
    .elseif \size == 4
        vld1.8  {d4[0]}, [U]!
        vld1.8  {d4[1]}, [U]!
        vld1.8  {d4[2]}, [U]!
        vld1.8  {d4[3]}, [U]!
        vld1.8  {d5[0]}, [V]!
        vld1.8  {d5[1]}, [V]!
        vld1.8  {d5[2]}, [V]!
        vld1.8  {d5[3]}, [V]!
        vld1.8  {d0[0]}, [Y]!
        vld1.8  {d0[1]}, [Y]!
        vld1.8  {d0[2]}, [Y]!
        vld1.8  {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8  {d4[4]}, [U]!
        vld1.8  {d4[5]}, [U]!
        vld1.8  {d5[4]}, [V]!
        vld1.8  {d5[5]}, [V]!
        vld1.8  {d0[4]}, [Y]!
        vld1.8  {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8    q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8    q4, q1, d5     /* q4 = v - 128 */
    vmull.s16   q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16   q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16   q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16   q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16   q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16   q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16   q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16   q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32  d20, q10, #15
    vrshrn.s32  d21, q11, #15
    vrshrn.s32  d24, q12, #14
    vrshrn.s32  d25, q13, #14
    vrshrn.s32  d28, q14, #14
    vrshrn.s32  d29, q15, #14
    vaddw.u8    q10, q10, d0
    vaddw.u8    q12, q12, d0
    vaddw.u8    q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vqmovun.s16 d1\r_offs, q12
    vqmovun.s16 d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8      {d4}, [U, :64]!
    vrshrn.s32  d20, q10, #15
    vrshrn.s32  d21, q11, #15
    vrshrn.s32  d24, q12, #14
    vrshrn.s32  d25, q13, #14
    vrshrn.s32  d28, q14, #14
    vld1.8      {d5}, [V, :64]!
    vrshrn.s32  d29, q15, #14
    vaddw.u8    q10, q10, d0
    vaddw.u8    q12, q12, d0
    vaddw.u8    q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vld1.8      {d0}, [Y, :64]!
    vqmovun.s16 d1\r_offs, q12
    pld         [U, #64]
    pld         [V, #64]
    pld         [Y, #64]
    vqmovun.s16 d1\b_offs, q14
    vaddw.u8    q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8    q4, q1, d5     /* q4 = v - 128 */
    do_store    \bpp, 8
    vmull.s16   q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16   q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16   q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16   q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16   q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16   q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16   q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16   q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr         ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16     {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push        {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr         NUM_ROWS, [sp, #(4 * 8)]
    ldr         INPUT_BUF0, [INPUT_BUF]
    ldr         INPUT_BUF1, [INPUT_BUF, #4]
    ldr         INPUT_BUF2, [INPUT_BUF, #8]
    .unreq      INPUT_BUF

    /* Save NEON registers */
    vpush       {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8     q5, #255
    vmov.u8     q6, #255

    /* Outer loop over scanlines */
    cmp         NUM_ROWS, #1
    blt         9f
0:
    ldr         Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr         U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov         N, OUTPUT_WIDTH
    ldr         V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add         INPUT_ROW, INPUT_ROW, #1
    ldr         RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs        N, N, #8
    blt         3f
    do_load     8
    do_yuv_to_rgb_stage1
    subs        N, N, #8
    blt         2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs        N, N, #8
    bge         1b
2:
    do_yuv_to_rgb_stage2
    do_store    \bpp, 8
    tst         N, #7
    beq         8f
3:
    tst         N, #4
    beq         3f
    do_load     4
3:
    tst         N, #2
    beq         4f
    do_load     2
4:
    tst         N, #1
    beq         5f
    do_load     1
5:
    do_yuv_to_rgb
    tst         N, #4
    beq         6f
do_store \bpp, 4 michael@0: 6: michael@0: tst N, #2 michael@0: beq 7f michael@0: do_store \bpp, 2 michael@0: 7: michael@0: tst N, #1 michael@0: beq 8f michael@0: do_store \bpp, 1 michael@0: 8: michael@0: subs NUM_ROWS, NUM_ROWS, #1 michael@0: bgt 0b michael@0: 9: michael@0: /* Restore all registers and return */ michael@0: vpop {d8-d15} michael@0: pop {r4, r5, r6, r7, r8, r9, r10, pc} michael@0: michael@0: .unreq OUTPUT_WIDTH michael@0: .unreq INPUT_ROW michael@0: .unreq OUTPUT_BUF michael@0: .unreq NUM_ROWS michael@0: .unreq INPUT_BUF0 michael@0: .unreq INPUT_BUF1 michael@0: .unreq INPUT_BUF2 michael@0: .unreq RGB michael@0: .unreq Y michael@0: .unreq U michael@0: .unreq V michael@0: .unreq N michael@0: .endfunc michael@0: michael@0: .purgem do_yuv_to_rgb michael@0: .purgem do_yuv_to_rgb_stage1 michael@0: .purgem do_yuv_to_rgb_stage2 michael@0: .purgem do_yuv_to_rgb_stage2_store_load_stage1 michael@0: michael@0: .endm michael@0: michael@0: /*--------------------------------- id ----- bpp R G B */ michael@0: generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 michael@0: generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 michael@0: generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 michael@0: generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 michael@0: generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 michael@0: generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 michael@0: michael@0: .purgem do_load michael@0: .purgem do_store michael@0: michael@0: /*****************************************************************************/ michael@0: michael@0: /* michael@0: * jsimd_extrgb_ycc_convert_neon michael@0: * jsimd_extbgr_ycc_convert_neon michael@0: * jsimd_extrgbx_ycc_convert_neon michael@0: * jsimd_extbgrx_ycc_convert_neon michael@0: * jsimd_extxbgr_ycc_convert_neon michael@0: * jsimd_extxrgb_ycc_convert_neon michael@0: * michael@0: * Colorspace conversion RGB -> YCbCr michael@0: */ michael@0: michael@0: .macro do_store size michael@0: .if \size == 8 michael@0: vst1.8 {d20}, [Y]! michael@0: vst1.8 {d21}, [U]! michael@0: vst1.8 {d22}, [V]! michael@0: .elseif \size == 4 michael@0: vst1.8 {d20[0]}, [Y]! michael@0: vst1.8 {d20[1]}, [Y]! michael@0: vst1.8 {d20[2]}, [Y]! michael@0: vst1.8 {d20[3]}, [Y]! michael@0: vst1.8 {d21[0]}, [U]! michael@0: vst1.8 {d21[1]}, [U]! michael@0: vst1.8 {d21[2]}, [U]! michael@0: vst1.8 {d21[3]}, [U]! michael@0: vst1.8 {d22[0]}, [V]! michael@0: vst1.8 {d22[1]}, [V]! michael@0: vst1.8 {d22[2]}, [V]! michael@0: vst1.8 {d22[3]}, [V]! michael@0: .elseif \size == 2 michael@0: vst1.8 {d20[4]}, [Y]! michael@0: vst1.8 {d20[5]}, [Y]! michael@0: vst1.8 {d21[4]}, [U]! michael@0: vst1.8 {d21[5]}, [U]! michael@0: vst1.8 {d22[4]}, [V]! michael@0: vst1.8 {d22[5]}, [V]! michael@0: .elseif \size == 1 michael@0: vst1.8 {d20[6]}, [Y]! michael@0: vst1.8 {d21[6]}, [U]! michael@0: vst1.8 {d22[6]}, [V]! michael@0: .else michael@0: .error unsupported macroblock size michael@0: .endif michael@0: .endm michael@0: michael@0: .macro do_load bpp, size michael@0: .if \bpp == 24 michael@0: .if \size == 8 michael@0: vld3.8 {d10, d11, d12}, [RGB]! michael@0: pld [RGB, #128] michael@0: .elseif \size == 4 michael@0: vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! michael@0: vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! michael@0: vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! michael@0: vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! michael@0: .elseif \size == 2 michael@0: vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! michael@0: vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! 
michael@0: .elseif \size == 1 michael@0: vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! michael@0: .else michael@0: .error unsupported macroblock size michael@0: .endif michael@0: .elseif \bpp == 32 michael@0: .if \size == 8 michael@0: vld4.8 {d10, d11, d12, d13}, [RGB]! michael@0: pld [RGB, #128] michael@0: .elseif \size == 4 michael@0: vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! michael@0: vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! michael@0: vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! michael@0: vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! michael@0: .elseif \size == 2 michael@0: vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! michael@0: vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! michael@0: .elseif \size == 1 michael@0: vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! michael@0: .else michael@0: .error unsupported macroblock size michael@0: .endif michael@0: .else michael@0: .error unsupported bpp michael@0: .endif michael@0: .endm michael@0: michael@0: .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs michael@0: michael@0: /* michael@0: * 2 stage pipelined RGB->YCbCr conversion michael@0: */ michael@0: michael@0: .macro do_rgb_to_yuv_stage1 michael@0: vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ michael@0: vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ michael@0: vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ michael@0: vmull.u16 q7, d4, d0[0] michael@0: vmlal.u16 q7, d6, d0[1] michael@0: vmlal.u16 q7, d8, d0[2] michael@0: vmull.u16 q8, d5, d0[0] michael@0: vmlal.u16 q8, d7, d0[1] michael@0: vmlal.u16 q8, d9, d0[2] michael@0: vrev64.32 q9, q1 michael@0: vrev64.32 q13, q1 michael@0: vmlsl.u16 q9, d4, d0[3] michael@0: vmlsl.u16 q9, d6, d1[0] michael@0: vmlal.u16 q9, d8, d1[1] michael@0: vmlsl.u16 q13, d5, d0[3] michael@0: vmlsl.u16 q13, d7, d1[0] michael@0: vmlal.u16 q13, d9, d1[1] michael@0: vrev64.32 q14, q1 michael@0: vrev64.32 q15, q1 michael@0: vmlal.u16 q14, d4, d1[1] michael@0: vmlsl.u16 q14, d6, d1[2] michael@0: vmlsl.u16 q14, d8, d1[3] michael@0: vmlal.u16 q15, d5, d1[1] michael@0: vmlsl.u16 q15, d7, d1[2] michael@0: vmlsl.u16 q15, d9, d1[3] michael@0: .endm michael@0: michael@0: .macro do_rgb_to_yuv_stage2 michael@0: vrshrn.u32 d20, q7, #16 michael@0: vrshrn.u32 d21, q8, #16 michael@0: vshrn.u32 d22, q9, #16 michael@0: vshrn.u32 d23, q13, #16 michael@0: vshrn.u32 d24, q14, #16 michael@0: vshrn.u32 d25, q15, #16 michael@0: vmovn.u16 d20, q10 /* d20 = y */ michael@0: vmovn.u16 d21, q11 /* d21 = u */ michael@0: vmovn.u16 d22, q12 /* d22 = v */ michael@0: .endm michael@0: michael@0: .macro do_rgb_to_yuv michael@0: do_rgb_to_yuv_stage1 michael@0: do_rgb_to_yuv_stage2 michael@0: .endm michael@0: michael@0: .macro do_rgb_to_yuv_stage2_store_load_stage1 michael@0: vrshrn.u32 d20, q7, #16 michael@0: vrshrn.u32 d21, q8, #16 michael@0: vshrn.u32 d22, q9, #16 michael@0: vrev64.32 q9, q1 michael@0: vshrn.u32 d23, q13, #16 michael@0: vrev64.32 q13, q1 michael@0: vshrn.u32 d24, q14, #16 michael@0: vshrn.u32 d25, q15, #16 michael@0: do_load \bpp, 8 michael@0: vmovn.u16 d20, q10 /* d20 = y */ michael@0: vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ michael@0: vmovn.u16 d21, q11 /* d21 = u */ michael@0: vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ michael@0: vmovn.u16 d22, q12 /* d22 = v */ michael@0: vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ michael@0: vmull.u16 q7, d4, d0[0] michael@0: vmlal.u16 q7, d6, d0[1] michael@0: vmlal.u16 q7, d8, d0[2] michael@0: vst1.8 {d20}, [Y]! 
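michael@0: /* the Y store above and the U/V stores below are interleaved with the
michael@0:  * stage-1 multiply-accumulates of the freshly loaded next block, hiding
michael@0:  * the store latency behind useful work */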
michael@0: vmull.u16 q8, d5, d0[0] michael@0: vmlal.u16 q8, d7, d0[1] michael@0: vmlal.u16 q8, d9, d0[2] michael@0: vmlsl.u16 q9, d4, d0[3] michael@0: vmlsl.u16 q9, d6, d1[0] michael@0: vmlal.u16 q9, d8, d1[1] michael@0: vst1.8 {d21}, [U]! michael@0: vmlsl.u16 q13, d5, d0[3] michael@0: vmlsl.u16 q13, d7, d1[0] michael@0: vmlal.u16 q13, d9, d1[1] michael@0: vrev64.32 q14, q1 michael@0: vrev64.32 q15, q1 michael@0: vmlal.u16 q14, d4, d1[1] michael@0: vmlsl.u16 q14, d6, d1[2] michael@0: vmlsl.u16 q14, d8, d1[3] michael@0: vst1.8 {d22}, [V]! michael@0: vmlal.u16 q15, d5, d1[1] michael@0: vmlsl.u16 q15, d7, d1[2] michael@0: vmlsl.u16 q15, d9, d1[3] michael@0: .endm michael@0: michael@0: .balign 16 michael@0: jsimd_\colorid\()_ycc_neon_consts: michael@0: .short 19595, 38470, 7471, 11059 michael@0: .short 21709, 32768, 27439, 5329 michael@0: .short 32767, 128, 32767, 128 michael@0: .short 32767, 128, 32767, 128 michael@0: michael@0: asm_function jsimd_\colorid\()_ycc_convert_neon michael@0: OUTPUT_WIDTH .req r0 michael@0: INPUT_BUF .req r1 michael@0: OUTPUT_BUF .req r2 michael@0: OUTPUT_ROW .req r3 michael@0: NUM_ROWS .req r4 michael@0: michael@0: OUTPUT_BUF0 .req r5 michael@0: OUTPUT_BUF1 .req r6 michael@0: OUTPUT_BUF2 .req OUTPUT_BUF michael@0: michael@0: RGB .req r7 michael@0: Y .req r8 michael@0: U .req r9 michael@0: V .req r10 michael@0: N .req ip michael@0: michael@0: /* Load constants to d0, d1, d2, d3 */ michael@0: adr ip, jsimd_\colorid\()_ycc_neon_consts michael@0: vld1.16 {d0, d1, d2, d3}, [ip, :128] michael@0: michael@0: /* Save ARM registers and handle input arguments */ michael@0: push {r4, r5, r6, r7, r8, r9, r10, lr} michael@0: ldr NUM_ROWS, [sp, #(4 * 8)] michael@0: ldr OUTPUT_BUF0, [OUTPUT_BUF] michael@0: ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] michael@0: ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] michael@0: .unreq OUTPUT_BUF michael@0: michael@0: /* Save NEON registers */ michael@0: vpush {d8-d15} michael@0: michael@0: /* Outer loop over scanlines */ michael@0: cmp NUM_ROWS, #1 michael@0: blt 9f michael@0: 0: michael@0: ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] michael@0: ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] michael@0: mov N, OUTPUT_WIDTH michael@0: ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] michael@0: add OUTPUT_ROW, OUTPUT_ROW, #1 michael@0: ldr RGB, [INPUT_BUF], #4 michael@0: michael@0: /* Inner loop over pixels */ michael@0: subs N, N, #8 michael@0: blt 3f michael@0: do_load \bpp, 8 michael@0: do_rgb_to_yuv_stage1 michael@0: subs N, N, #8 michael@0: blt 2f michael@0: 1: michael@0: do_rgb_to_yuv_stage2_store_load_stage1 michael@0: subs N, N, #8 michael@0: bge 1b michael@0: 2: michael@0: do_rgb_to_yuv_stage2 michael@0: do_store 8 michael@0: tst N, #7 michael@0: beq 8f michael@0: 3: michael@0: tst N, #4 michael@0: beq 3f michael@0: do_load \bpp, 4 michael@0: 3: michael@0: tst N, #2 michael@0: beq 4f michael@0: do_load \bpp, 2 michael@0: 4: michael@0: tst N, #1 michael@0: beq 5f michael@0: do_load \bpp, 1 michael@0: 5: michael@0: do_rgb_to_yuv michael@0: tst N, #4 michael@0: beq 6f michael@0: do_store 4 michael@0: 6: michael@0: tst N, #2 michael@0: beq 7f michael@0: do_store 2 michael@0: 7: michael@0: tst N, #1 michael@0: beq 8f michael@0: do_store 1 michael@0: 8: michael@0: subs NUM_ROWS, NUM_ROWS, #1 michael@0: bgt 0b michael@0: 9: michael@0: /* Restore all registers and return */ michael@0: vpop {d8-d15} michael@0: pop {r4, r5, r6, r7, r8, r9, r10, pc} michael@0: michael@0: .unreq OUTPUT_WIDTH michael@0: .unreq OUTPUT_ROW michael@0: .unreq INPUT_BUF michael@0: .unreq NUM_ROWS michael@0: 
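michael@0: /* For reference, a rough scalar C model of the per-pixel arithmetic done by
michael@0:  * the conversion function above (a sketch only: it mirrors the Q16
michael@0:  * fixed-point constants from the jsimd_\colorid\()_ycc_neon_consts table,
michael@0:  * and the helper name is purely illustrative, not part of libjpeg-turbo):
michael@0:  *
michael@0:  *   static void rgb_to_ycc_pixel(int r, int g, int b,
michael@0:  *                                int *y, int *cb, int *cr)
michael@0:  *   {
michael@0:  *     *y  = ( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
michael@0:  *     *cb = (-11059 * r - 21709 * g + 32768 * b + (128 << 16) + 32767) >> 16;
michael@0:  *     *cr = ( 32768 * r - 27439 * g -  5329 * b + (128 << 16) + 32767) >> 16;
michael@0:  *   }
michael@0:  *
michael@0:  * i.e. Y = 0.299*R + 0.587*G + 0.114*B, Cb = -0.16874*R - 0.33126*G +
michael@0:  * 0.5*B + 128 and Cr = 0.5*R - 0.41869*G - 0.08131*B + 128, with the
michael@0:  * coefficients scaled by 2^16 (rounded for Y, truncated with a +32767
michael@0:  * bias for Cb/Cr, matching the VRSHRN / VSHRN narrowing shifts above). */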
.unreq OUTPUT_BUF0 michael@0: .unreq OUTPUT_BUF1 michael@0: .unreq OUTPUT_BUF2 michael@0: .unreq RGB michael@0: .unreq Y michael@0: .unreq U michael@0: .unreq V michael@0: .unreq N michael@0: .endfunc michael@0: michael@0: .purgem do_rgb_to_yuv michael@0: .purgem do_rgb_to_yuv_stage1 michael@0: .purgem do_rgb_to_yuv_stage2 michael@0: .purgem do_rgb_to_yuv_stage2_store_load_stage1 michael@0: michael@0: .endm michael@0: michael@0: /*--------------------------------- id ----- bpp R G B */ michael@0: generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 michael@0: generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 michael@0: generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 michael@0: generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 michael@0: generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 michael@0: generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 michael@0: michael@0: .purgem do_load michael@0: .purgem do_store michael@0: michael@0: /*****************************************************************************/ michael@0: michael@0: /* michael@0: * Load data into workspace, applying unsigned->signed conversion michael@0: * michael@0: * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get michael@0: * rid of VST1.16 instructions michael@0: */ michael@0: michael@0: asm_function jsimd_convsamp_neon michael@0: SAMPLE_DATA .req r0 michael@0: START_COL .req r1 michael@0: WORKSPACE .req r2 michael@0: TMP1 .req r3 michael@0: TMP2 .req r4 michael@0: TMP3 .req r5 michael@0: TMP4 .req ip michael@0: michael@0: push {r4, r5} michael@0: vmov.u8 d0, #128 michael@0: michael@0: ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} michael@0: add TMP1, TMP1, START_COL michael@0: add TMP2, TMP2, START_COL michael@0: add TMP3, TMP3, START_COL michael@0: add TMP4, TMP4, START_COL michael@0: vld1.8 {d16}, [TMP1] michael@0: vsubl.u8 q8, d16, d0 michael@0: vld1.8 {d18}, [TMP2] michael@0: vsubl.u8 q9, d18, d0 michael@0: vld1.8 {d20}, [TMP3] michael@0: vsubl.u8 q10, d20, d0 michael@0: vld1.8 {d22}, [TMP4] michael@0: ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} michael@0: vsubl.u8 q11, d22, d0 michael@0: vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! michael@0: add TMP1, TMP1, START_COL michael@0: add TMP2, TMP2, START_COL michael@0: vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! michael@0: add TMP3, TMP3, START_COL michael@0: add TMP4, TMP4, START_COL michael@0: vld1.8 {d24}, [TMP1] michael@0: vsubl.u8 q12, d24, d0 michael@0: vld1.8 {d26}, [TMP2] michael@0: vsubl.u8 q13, d26, d0 michael@0: vld1.8 {d28}, [TMP3] michael@0: vsubl.u8 q14, d28, d0 michael@0: vld1.8 {d30}, [TMP4] michael@0: vsubl.u8 q15, d30, d0 michael@0: vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! michael@0: vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! michael@0: pop {r4, r5} michael@0: bx lr michael@0: michael@0: .unreq SAMPLE_DATA michael@0: .unreq START_COL michael@0: .unreq WORKSPACE michael@0: .unreq TMP1 michael@0: .unreq TMP2 michael@0: .unreq TMP3 michael@0: .unreq TMP4 michael@0: .endfunc michael@0: michael@0: /*****************************************************************************/ michael@0: michael@0: /* michael@0: * jsimd_fdct_ifast_neon michael@0: * michael@0: * This function contains a fast, not so accurate integer implementation of michael@0: * the forward DCT (Discrete Cosine Transform). 
It uses the same calculations michael@0: * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' michael@0: * function from jfdctfst.c michael@0: * michael@0: * TODO: can be combined with 'jsimd_convsamp_neon' to get michael@0: * rid of a bunch of VLD1.16 instructions michael@0: */ michael@0: michael@0: #define XFIX_0_382683433 d0[0] michael@0: #define XFIX_0_541196100 d0[1] michael@0: #define XFIX_0_707106781 d0[2] michael@0: #define XFIX_1_306562965 d0[3] michael@0: michael@0: .balign 16 michael@0: jsimd_fdct_ifast_neon_consts: michael@0: .short (98 * 128) /* XFIX_0_382683433 */ michael@0: .short (139 * 128) /* XFIX_0_541196100 */ michael@0: .short (181 * 128) /* XFIX_0_707106781 */ michael@0: .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ michael@0: michael@0: asm_function jsimd_fdct_ifast_neon michael@0: michael@0: DATA .req r0 michael@0: TMP .req ip michael@0: michael@0: vpush {d8-d15} michael@0: michael@0: /* Load constants */ michael@0: adr TMP, jsimd_fdct_ifast_neon_consts michael@0: vld1.16 {d0}, [TMP, :64] michael@0: michael@0: /* Load all DATA into NEON registers with the following allocation: michael@0: * 0 1 2 3 | 4 5 6 7 michael@0: * ---------+-------- michael@0: * 0 | d16 | d17 | q8 michael@0: * 1 | d18 | d19 | q9 michael@0: * 2 | d20 | d21 | q10 michael@0: * 3 | d22 | d23 | q11 michael@0: * 4 | d24 | d25 | q12 michael@0: * 5 | d26 | d27 | q13 michael@0: * 6 | d28 | d29 | q14 michael@0: * 7 | d30 | d31 | q15 michael@0: */ michael@0: michael@0: vld1.16 {d16, d17, d18, d19}, [DATA, :128]! michael@0: vld1.16 {d20, d21, d22, d23}, [DATA, :128]! michael@0: vld1.16 {d24, d25, d26, d27}, [DATA, :128]! michael@0: vld1.16 {d28, d29, d30, d31}, [DATA, :128] michael@0: sub DATA, DATA, #(128 - 32) michael@0: michael@0: mov TMP, #2 michael@0: 1: michael@0: /* Transpose */ michael@0: vtrn.16 q12, q13 michael@0: vtrn.16 q10, q11 michael@0: vtrn.16 q8, q9 michael@0: vtrn.16 q14, q15 michael@0: vtrn.32 q9, q11 michael@0: vtrn.32 q13, q15 michael@0: vtrn.32 q8, q10 michael@0: vtrn.32 q12, q14 michael@0: vswp d30, d23 michael@0: vswp d24, d17 michael@0: vswp d26, d19 michael@0: /* 1-D FDCT */ michael@0: vadd.s16 q2, q11, q12 michael@0: vswp d28, d21 michael@0: vsub.s16 q12, q11, q12 michael@0: vsub.s16 q6, q10, q13 michael@0: vadd.s16 q10, q10, q13 michael@0: vsub.s16 q7, q9, q14 michael@0: vadd.s16 q9, q9, q14 michael@0: vsub.s16 q1, q8, q15 michael@0: vadd.s16 q8, q8, q15 michael@0: vsub.s16 q4, q9, q10 michael@0: vsub.s16 q5, q8, q2 michael@0: vadd.s16 q3, q9, q10 michael@0: vadd.s16 q4, q4, q5 michael@0: vadd.s16 q2, q8, q2 michael@0: vqdmulh.s16 q4, q4, XFIX_0_707106781 michael@0: vadd.s16 q11, q12, q6 michael@0: vadd.s16 q8, q2, q3 michael@0: vsub.s16 q12, q2, q3 michael@0: vadd.s16 q3, q6, q7 michael@0: vadd.s16 q7, q7, q1 michael@0: vqdmulh.s16 q3, q3, XFIX_0_707106781 michael@0: vsub.s16 q6, q11, q7 michael@0: vadd.s16 q10, q5, q4 michael@0: vqdmulh.s16 q6, q6, XFIX_0_382683433 michael@0: vsub.s16 q14, q5, q4 michael@0: vqdmulh.s16 q11, q11, XFIX_0_541196100 michael@0: vqdmulh.s16 q5, q7, XFIX_1_306562965 michael@0: vadd.s16 q4, q1, q3 michael@0: vsub.s16 q3, q1, q3 michael@0: vadd.s16 q7, q7, q6 michael@0: vadd.s16 q11, q11, q6 michael@0: vadd.s16 q7, q7, q5 michael@0: vadd.s16 q13, q3, q11 michael@0: vsub.s16 q11, q3, q11 michael@0: vadd.s16 q9, q4, q7 michael@0: vsub.s16 q15, q4, q7 michael@0: subs TMP, TMP, #1 michael@0: bne 1b michael@0: michael@0: /* store results */ michael@0: vst1.16 {d16, d17, d18, d19}, [DATA, :128]! 
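michael@0: /* the remaining three quarters of the transformed block are stored below.
michael@0:  * On the constants: VQDMULH.S16 computes roughly (a * b) >> 15, so the
michael@0:  * values in jsimd_fdct_ifast_neon_consts are the jpeg_fdct_ifast scale
michael@0:  * factors (0.382683433, 0.541196100, 0.707106781 and 1.306562965 - 1)
michael@0:  * expressed in Q15 as multiples of 128. */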
michael@0: vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
michael@0: vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
michael@0: vst1.16 {d28, d29, d30, d31}, [DATA, :128]
michael@0:
michael@0: vpop {d8-d15}
michael@0: bx lr
michael@0:
michael@0: .unreq DATA
michael@0: .unreq TMP
michael@0: .endfunc
michael@0:
michael@0: /*****************************************************************************/
michael@0:
michael@0: /*
michael@0:  * GLOBAL(void)
michael@0:  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
michael@0:  *                      DCTELEM * workspace);
michael@0:  *
michael@0:  * Note: the code uses 2 stage pipelining in order to improve instruction
michael@0:  *       scheduling and eliminate stalls (this provides ~15% better
michael@0:  *       performance for this function on both ARM Cortex-A8 and
michael@0:  *       ARM Cortex-A9 when compared to the non-pipelined variant).
michael@0:  *       The instructions which belong to the second stage use different
michael@0:  *       indentation for better readability.
michael@0:  */
michael@0: asm_function jsimd_quantize_neon
michael@0:
michael@0: COEF_BLOCK .req r0
michael@0: DIVISORS .req r1
michael@0: WORKSPACE .req r2
michael@0:
michael@0: RECIPROCAL .req DIVISORS
michael@0: CORRECTION .req r3
michael@0: SHIFT .req ip
michael@0: LOOP_COUNT .req r4
michael@0:
michael@0: vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
michael@0: vabs.s16 q12, q0
michael@0: add CORRECTION, DIVISORS, #(64 * 2)
michael@0: add SHIFT, DIVISORS, #(64 * 6)
michael@0: vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
michael@0: vabs.s16 q13, q1
michael@0: vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
michael@0: vadd.u16 q12, q12, q10 /* add correction */
michael@0: vadd.u16 q13, q13, q11
michael@0: vmull.u16 q10, d24, d16 /* multiply by reciprocal */
michael@0: vmull.u16 q11, d25, d17
michael@0: vmull.u16 q8, d26, d18
michael@0: vmull.u16 q9, d27, d19
michael@0: vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
michael@0: vshrn.u32 d20, q10, #16
michael@0: vshrn.u32 d21, q11, #16
michael@0: vshrn.u32 d22, q8, #16
michael@0: vshrn.u32 d23, q9, #16
michael@0: vneg.s16 q12, q12
michael@0: vneg.s16 q13, q13
michael@0: vshr.s16 q2, q0, #15 /* extract sign */
michael@0: vshr.s16 q3, q1, #15
michael@0: vshl.u16 q14, q10, q12 /* shift */
michael@0: vshl.u16 q15, q11, q13
michael@0:
michael@0: push {r4, r5}
michael@0: mov LOOP_COUNT, #3
michael@0: 1:
michael@0: vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
michael@0: veor.u16 q14, q14, q2 /* restore sign */
michael@0: vabs.s16 q12, q0
michael@0: vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
michael@0: vabs.s16 q13, q1
michael@0: veor.u16 q15, q15, q3
michael@0: vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
michael@0: vadd.u16 q12, q12, q10 /* add correction */
michael@0: vadd.u16 q13, q13, q11
michael@0: vmull.u16 q10, d24, d16 /* multiply by reciprocal */
michael@0: vmull.u16 q11, d25, d17
michael@0: vmull.u16 q8, d26, d18
michael@0: vmull.u16 q9, d27, d19
michael@0: vsub.u16 q14, q14, q2
michael@0: vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
michael@0: vsub.u16 q15, q15, q3
michael@0: vshrn.u32 d20, q10, #16
michael@0: vshrn.u32 d21, q11, #16
michael@0: vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
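michael@0: /* (second pipeline stage: the 16 coefficients finished in the previous
michael@0:  * iteration were just stored above while the current 16 are still in
michael@0:  * flight)
michael@0:  *
michael@0:  * For reference, each coefficient effectively goes through something like
michael@0:  * the following scalar C sketch, assuming the usual libjpeg-turbo divisor
michael@0:  * layout of four 64-entry sub-tables (reciprocal, correction, scale,
michael@0:  * shift); this is a simplified model with illustrative names, not the
michael@0:  * exact reference code:
michael@0:  *
michael@0:  *   int x = workspace[i];
michael@0:  *   unsigned a = (unsigned) abs(x) + correction[i];
michael@0:  *   unsigned q = ((a * (unsigned) reciprocal[i]) >> 16) >> shift[i];
michael@0:  *   coef_block[i] = (x < 0) ? -(int) q : (int) q;
michael@0:  */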
michael@0: vshrn.u32 d22, q8, #16
michael@0: vshrn.u32 d23, q9, #16
michael@0: vneg.s16 q12, q12
michael@0: vneg.s16 q13, q13
michael@0: vshr.s16 q2, q0, #15 /* extract sign */
michael@0: vshr.s16 q3, q1, #15
michael@0: vshl.u16 q14, q10, q12 /* shift */
michael@0: vshl.u16 q15, q11, q13
michael@0: subs LOOP_COUNT, LOOP_COUNT, #1
michael@0: bne 1b
michael@0: pop {r4, r5}
michael@0:
michael@0: veor.u16 q14, q14, q2 /* restore sign */
michael@0: veor.u16 q15, q15, q3
michael@0: vsub.u16 q14, q14, q2
michael@0: vsub.u16 q15, q15, q3
michael@0: vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
michael@0:
michael@0: bx lr /* return */
michael@0:
michael@0: .unreq COEF_BLOCK
michael@0: .unreq DIVISORS
michael@0: .unreq WORKSPACE
michael@0: .unreq RECIPROCAL
michael@0: .unreq CORRECTION
michael@0: .unreq SHIFT
michael@0: .unreq LOOP_COUNT
michael@0: .endfunc
michael@0:
michael@0: /*****************************************************************************/
michael@0:
michael@0: /*
michael@0:  * GLOBAL(void)
michael@0:  * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
michael@0:  *                                 JDIMENSION downsampled_width,
michael@0:  *                                 JSAMPARRAY input_data,
michael@0:  *                                 JSAMPARRAY * output_data_ptr);
michael@0:  *
michael@0:  * Note: the use of unaligned writes is the main remaining bottleneck in
michael@0:  *       this code; solving it could potentially yield up to tens of
michael@0:  *       percent additional performance on Cortex-A8/Cortex-A9.
michael@0:  */
michael@0:
michael@0: /*
michael@0:  * Upsample 16 source pixels to 32 destination pixels. The new 16 source
michael@0:  * pixels are loaded to q0. The previous 16 source pixels are in q1. The
michael@0:  * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
michael@0:  * Register d28 is used for multiplication by 3. Register q15 is used
michael@0:  * for adding +1 bias.
michael@0:  */
michael@0: .macro upsample16 OUTPTR, INPTR
michael@0: vld1.8 {q0}, [\INPTR]!
michael@0: vmovl.u8 q8, d0
michael@0: vext.8 q2, q1, q0, #15
michael@0: vmovl.u8 q9, d1
michael@0: vaddw.u8 q10, q15, d4
michael@0: vaddw.u8 q11, q15, d5
michael@0: vmlal.u8 q8, d4, d28
michael@0: vmlal.u8 q9, d5, d28
michael@0: vmlal.u8 q10, d0, d28
michael@0: vmlal.u8 q11, d1, d28
michael@0: vmov q1, q0 /* backup source pixels to q1 */
michael@0: vrshrn.u16 d6, q8, #2
michael@0: vrshrn.u16 d7, q9, #2
michael@0: vshrn.u16 d8, q10, #2
michael@0: vshrn.u16 d9, q11, #2
michael@0: vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
michael@0: .endm
michael@0:
michael@0: /*
michael@0:  * Upsample 32 source pixels to 64 destination pixels. Compared to the
michael@0:  * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
michael@0:  * the even and odd groups of 16 pixels, which is why the "vmov q1, q0"
michael@0:  * instruction is not needed. This unrolling also allows loads and stores to
michael@0:  * be reordered to compensate for multiplication latency and reduce stalls.
michael@0:  */
michael@0: .macro upsample32 OUTPTR, INPTR
michael@0: /* even 16 pixels group */
michael@0: vld1.8 {q0}, [\INPTR]!
michael@0: vmovl.u8 q8, d0
michael@0: vext.8 q2, q1, q0, #15
michael@0: vmovl.u8 q9, d1
michael@0: vaddw.u8 q10, q15, d4
michael@0: vaddw.u8 q11, q15, d5
michael@0: vmlal.u8 q8, d4, d28
michael@0: vmlal.u8 q9, d5, d28
michael@0: vmlal.u8 q10, d0, d28
michael@0: vmlal.u8 q11, d1, d28
michael@0: /* odd 16 pixels group */
michael@0: vld1.8 {q1}, [\INPTR]!
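michael@0: /* the even group's results are narrowed and stored below while the odd
michael@0:  * group (just loaded into q1) starts its multiplies, so the multiply
michael@0:  * latency of each group is hidden by the other */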
michael@0: vrshrn.u16 d6, q8, #2 michael@0: vrshrn.u16 d7, q9, #2 michael@0: vshrn.u16 d8, q10, #2 michael@0: vshrn.u16 d9, q11, #2 michael@0: vmovl.u8 q8, d2 michael@0: vext.8 q2, q0, q1, #15 michael@0: vmovl.u8 q9, d3 michael@0: vaddw.u8 q10, q15, d4 michael@0: vaddw.u8 q11, q15, d5 michael@0: vmlal.u8 q8, d4, d28 michael@0: vmlal.u8 q9, d5, d28 michael@0: vmlal.u8 q10, d2, d28 michael@0: vmlal.u8 q11, d3, d28 michael@0: vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! michael@0: vrshrn.u16 d6, q8, #2 michael@0: vrshrn.u16 d7, q9, #2 michael@0: vshrn.u16 d8, q10, #2 michael@0: vshrn.u16 d9, q11, #2 michael@0: vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! michael@0: .endm michael@0: michael@0: /* michael@0: * Upsample a row of WIDTH pixels from INPTR to OUTPTR. michael@0: */ michael@0: .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 michael@0: /* special case for the first and last pixels */ michael@0: sub \WIDTH, \WIDTH, #1 michael@0: add \OUTPTR, \OUTPTR, #1 michael@0: ldrb \TMP1, [\INPTR, \WIDTH] michael@0: strb \TMP1, [\OUTPTR, \WIDTH, asl #1] michael@0: ldrb \TMP1, [\INPTR], #1 michael@0: strb \TMP1, [\OUTPTR, #-1] michael@0: vmov.8 d3[7], \TMP1 michael@0: michael@0: subs \WIDTH, \WIDTH, #32 michael@0: blt 5f michael@0: 0: /* process 32 pixels per iteration */ michael@0: upsample32 \OUTPTR, \INPTR michael@0: subs \WIDTH, \WIDTH, #32 michael@0: bge 0b michael@0: 5: michael@0: adds \WIDTH, \WIDTH, #16 michael@0: blt 1f michael@0: 0: /* process 16 pixels if needed */ michael@0: upsample16 \OUTPTR, \INPTR michael@0: subs \WIDTH, \WIDTH, #16 michael@0: 1: michael@0: adds \WIDTH, \WIDTH, #16 michael@0: beq 9f michael@0: michael@0: /* load the remaining 1-15 pixels */ michael@0: add \INPTR, \INPTR, \WIDTH michael@0: tst \WIDTH, #1 michael@0: beq 2f michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[0]}, [\INPTR] michael@0: 2: michael@0: tst \WIDTH, #2 michael@0: beq 2f michael@0: vext.8 d0, d0, d0, #6 michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[1]}, [\INPTR] michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[0]}, [\INPTR] michael@0: 2: michael@0: tst \WIDTH, #4 michael@0: beq 2f michael@0: vrev64.32 d0, d0 michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[3]}, [\INPTR] michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[2]}, [\INPTR] michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[1]}, [\INPTR] michael@0: sub \INPTR, \INPTR, #1 michael@0: vld1.8 {d0[0]}, [\INPTR] michael@0: 2: michael@0: tst \WIDTH, #8 michael@0: beq 2f michael@0: vmov d1, d0 michael@0: sub \INPTR, \INPTR, #8 michael@0: vld1.8 {d0}, [\INPTR] michael@0: 2: /* upsample the remaining pixels */ michael@0: vmovl.u8 q8, d0 michael@0: vext.8 q2, q1, q0, #15 michael@0: vmovl.u8 q9, d1 michael@0: vaddw.u8 q10, q15, d4 michael@0: vaddw.u8 q11, q15, d5 michael@0: vmlal.u8 q8, d4, d28 michael@0: vmlal.u8 q9, d5, d28 michael@0: vmlal.u8 q10, d0, d28 michael@0: vmlal.u8 q11, d1, d28 michael@0: vrshrn.u16 d10, q8, #2 michael@0: vrshrn.u16 d12, q9, #2 michael@0: vshrn.u16 d11, q10, #2 michael@0: vshrn.u16 d13, q11, #2 michael@0: vzip.8 d10, d11 michael@0: vzip.8 d12, d13 michael@0: /* store the remaining pixels */ michael@0: tst \WIDTH, #8 michael@0: beq 2f michael@0: vst1.8 {d10, d11}, [\OUTPTR]! michael@0: vmov q5, q6 michael@0: 2: michael@0: tst \WIDTH, #4 michael@0: beq 2f michael@0: vst1.8 {d10}, [\OUTPTR]! michael@0: vmov d10, d11 michael@0: 2: michael@0: tst \WIDTH, #2 michael@0: beq 2f michael@0: vst1.8 {d10[0]}, [\OUTPTR]! michael@0: vst1.8 {d10[1]}, [\OUTPTR]! michael@0: vst1.8 {d10[2]}, [\OUTPTR]! 
michael@0: vst1.8 {d10[3]}, [\OUTPTR]! michael@0: vext.8 d10, d10, d10, #4 michael@0: 2: michael@0: tst \WIDTH, #1 michael@0: beq 2f michael@0: vst1.8 {d10[0]}, [\OUTPTR]! michael@0: vst1.8 {d10[1]}, [\OUTPTR]! michael@0: 2: michael@0: 9: michael@0: .endm michael@0: michael@0: asm_function jsimd_h2v1_fancy_upsample_neon michael@0: michael@0: MAX_V_SAMP_FACTOR .req r0 michael@0: DOWNSAMPLED_WIDTH .req r1 michael@0: INPUT_DATA .req r2 michael@0: OUTPUT_DATA_PTR .req r3 michael@0: OUTPUT_DATA .req OUTPUT_DATA_PTR michael@0: michael@0: OUTPTR .req r4 michael@0: INPTR .req r5 michael@0: WIDTH .req ip michael@0: TMP .req lr michael@0: michael@0: push {r4, r5, r6, lr} michael@0: vpush {d8-d15} michael@0: michael@0: ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] michael@0: cmp MAX_V_SAMP_FACTOR, #0 michael@0: ble 99f michael@0: michael@0: /* initialize constants */ michael@0: vmov.u8 d28, #3 michael@0: vmov.u16 q15, #1 michael@0: 11: michael@0: ldr INPTR, [INPUT_DATA], #4 michael@0: ldr OUTPTR, [OUTPUT_DATA], #4 michael@0: mov WIDTH, DOWNSAMPLED_WIDTH michael@0: upsample_row OUTPTR, INPTR, WIDTH, TMP michael@0: subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 michael@0: bgt 11b michael@0: michael@0: 99: michael@0: vpop {d8-d15} michael@0: pop {r4, r5, r6, pc} michael@0: michael@0: .unreq MAX_V_SAMP_FACTOR michael@0: .unreq DOWNSAMPLED_WIDTH michael@0: .unreq INPUT_DATA michael@0: .unreq OUTPUT_DATA_PTR michael@0: .unreq OUTPUT_DATA michael@0: michael@0: .unreq OUTPTR michael@0: .unreq INPTR michael@0: .unreq WIDTH michael@0: .unreq TMP michael@0: michael@0: .endfunc michael@0: michael@0: .purgem upsample16 michael@0: .purgem upsample32 michael@0: .purgem upsample_row
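michael@0:
michael@0: /*
michael@0:  * For reference, the h2v1 fancy upsampling implemented above is the usual
michael@0:  * libjpeg "triangle filter": each input pixel produces two output pixels
michael@0:  * weighted 3:1 against its left and right neighbours, and the first and
michael@0:  * last pixels of the row are copied unchanged.  A scalar C sketch
michael@0:  * (illustrative only, with a hypothetical helper name; the vectorized code
michael@0:  * above is the actual implementation):
michael@0:  *
michael@0:  *   static void h2v1_fancy_upsample_row(const unsigned char *in,
michael@0:  *                                       unsigned char *out, int width)
michael@0:  *   {
michael@0:  *     int i;
michael@0:  *     out[0] = in[0];
michael@0:  *     for (i = 1; i < width; i++) {
michael@0:  *       out[2 * i - 1] = (unsigned char) ((3 * in[i - 1] + in[i] + 2) >> 2);
michael@0:  *       out[2 * i]     = (unsigned char) ((3 * in[i] + in[i - 1] + 1) >> 2);
michael@0:  *     }
michael@0:  *     out[2 * width - 1] = in[width - 1];
michael@0:  *   }
michael@0:  *
michael@0:  * The "+ 2" terms correspond to the VRSHRN rounding shifts and the "+ 1"
michael@0:  * bias to the q15 constant added with VADDW in the macros above.
michael@0:  */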