/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm
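
/* An illustration of the data movement (a sketch, not executed code):
 *   x0 = [ a0 a1 a2 a3 ]   vtrn.16 x0, x1   x0 = [ a0 b0 a2 b2 ]
 *   x1 = [ b0 b1 b2 b3 ]  --------------->  x1 = [ a1 b1 a3 b3 ]
 *   x2 = [ c0 c1 c2 c3 ]   vtrn.16 x2, x3   x2 = [ c0 d0 c2 d2 ]
 *   x3 = [ d0 d1 d2 d3 ]  --------------->  x3 = [ c1 d1 c3 d3 ]
 * and the two vtrn.32 then swap the remaining off-diagonal 32-bit pairs:
 *   x0 = [ a0 b0 c0 d0 ]   x1 = [ a1 b1 c1 d1 ]
 *   x2 = [ a2 b2 c2 d2 ]   x3 = [ a3 b3 c3 d3 ]
 */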

#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */
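
/* The constants below are the usual libjpeg 13-bit fixed-point values,
 * i.e. FIX(x) = round(x * 2^13) with CONST_BITS = 13. For example,
 * FIX(0.298631336) = round(0.298631336 * 8192) = 2446.
 */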

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
\
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
\
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
\
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
\
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
\
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
\
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
\
    /* pick up the results */ \
    tmp0  = q4; \
    tmp1  = q5; \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3  = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15}  /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
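    /* The check is interleaved with the arithmetic: the ldrd instructions
     * below reload columns 4-7 of each row into two ARM registers
     * (COEF_BLOCK has already been advanced by 96 bytes, hence the -96
     * rebase in the offsets) and OR them together. The final 'orrs' sets
     * the Z flag if rows 1-7 of the right half are all zero; row 0's
     * right half is kept in r0 for a later test.
     */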
    push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    orr             r0, r0, r5
    vadd.s32        q1, q3, q2
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
    orr             r0, r0, r5
    vadd.s32        q1, q3, q5
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
    orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
    orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr             r0, r0, r4
    vadd.s32        q2, q5, q6
    orrs            r0, r0, r5
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    orr             r0, r4, r5
    vsub.s32        q3, q1, q4
    pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

    beq             3f  /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

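    /* A note on the register naming in pass 2: only the 4x4 quadrants of
     * each half were transposed above; the off-diagonal quadrants are never
     * physically swapped. Pass 2 compensates by simply reading the swapped
     * registers, which is what the 'ROWxY <-> ROWzW' comments below keep
     * track of (e.g. the data for logical row 5, left half, lives in ROW1R).
     */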
1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16

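    /* Descale note: the pass 2 output has to be scaled down by a total of
     * CONST_BITS + PASS1_BITS + 3 = 18 bits (cf. jidctint.c). 16 bits were
     * already dropped by the 'vshrn #16' instructions above, so the
     * 'vqrshrn #2' below performs the remaining rounding shift together
     * with saturating narrowing to 8 bits.
     */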
2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8-d15}  /* restore NEON registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vtrn.16         q10, q11
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vadd.u8         q10, q10, q0
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vtrn.8          d22, d23
    vst1.8          {d20}, [TMP1]
    vadd.u8         q11, q11, q0
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b  /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but not so accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN inverse DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
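
/* A concrete sketch of the conversion: VQDMULH.S16 computes
 * (a * b * 2) >> 16, i.e. it multiplies by a Q15 fraction in (-1, 1).
 * jidctfst.c works with 8-bit fixed-point multipliers (277, 362, 473 and
 * 669 with CONST_BITS = 8), so the fractional parts can be encoded exactly
 * in Q15 by scaling with 128 = 2^15 / 2^8. For example:
 *   XFIX_1_082392200 = (277 - 256) * 128 = 2688
 *   x * (277/256)    = x + vqdmulh(x, 2688)
 * For 2.613125930 the integer part is 2, hence the extra
 * 'vadd.s16 q3, q1, q1' in the code below.
 */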

#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13}  /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13}  /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 * expanding the idct_helper/transpose_4x4 macros and reordering
 * instructions, but readability would suffer somewhat.
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065        /* d0[0] */
    .short -FIX_0_765366865       /* d0[1] */
    .short -FIX_0_211164243       /* d0[2] */
    .short FIX_1_451774981        /* d0[3] */
    .short -FIX_2_172734803       /* d1[0] */
    .short FIX_1_061594337        /* d1[1] */
    .short -FIX_0_509795579       /* d1[2] */
    .short -FIX_0_601344887       /* d1[3] */
    .short FIX_0_899976223        /* d2[0] */
    .short FIX_2_562915447        /* d2[1] */
    .short 1 << (CONST_BITS+1)    /* d2[2] */
    .short 0                      /* d2[3] */

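/* The shift parameter of the macro below can exceed 16, but the immediate
 * of a narrowing shift such as VRSHRN.S32 is limited to the destination
 * element size (16). Larger shifts are therefore split into a 32-bit
 * VRSHR followed by a plain VMOVN narrowing move.
 */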
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 */
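
/* Per jidctred.c, only coefficient rows/columns 0, 1, 3, 5 and 7 contribute
 * to the 2x2 output, which is why rows 2, 4 and 6 are never even loaded in
 * the register allocation below.
 */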

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y27, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y27, q14, #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
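
/* The implementation follows the usual JPEG equations (see jdcolor.c):
 *   R = Y                        + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 * The per-function constant tables store these multipliers in fixed point:
 * 22971 ~= 1.40200 * 2^14, -11277 ~= -0.34414 * 2^15,
 * -23401 ~= -0.71414 * 2^15 and 29033 ~= 1.77200 * 2^14 (the G products
 * are narrowed with '#15', the R/B products with '#14').
 */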


.macro do_load size
.if \size == 8
    vld1.8          {d4}, [U, :64]!
    vld1.8          {d5}, [V, :64]!
    vld1.8          {d0}, [Y, :64]!
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
.elseif \size == 4
    vld1.8          {d4[0]}, [U]!
    vld1.8          {d4[1]}, [U]!
    vld1.8          {d4[2]}, [U]!
    vld1.8          {d4[3]}, [U]!
    vld1.8          {d5[0]}, [V]!
    vld1.8          {d5[1]}, [V]!
    vld1.8          {d5[2]}, [V]!
    vld1.8          {d5[3]}, [V]!
    vld1.8          {d0[0]}, [Y]!
    vld1.8          {d0[1]}, [Y]!
    vld1.8          {d0[2]}, [Y]!
    vld1.8          {d0[3]}, [Y]!
.elseif \size == 2
    vld1.8          {d4[4]}, [U]!
    vld1.8          {d4[5]}, [U]!
    vld1.8          {d5[4]}, [V]!
    vld1.8          {d5[5]}, [V]!
    vld1.8          {d0[4]}, [Y]!
    vld1.8          {d0[5]}, [Y]!
.elseif \size == 1
    vld1.8          {d4[6]}, [U]!
    vld1.8          {d5[6]}, [V]!
    vld1.8          {d0[6]}, [Y]!
.else
    .error unsupported macroblock size
.endif
.endm
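
/* The partial sizes use single-lane loads so that a row whose width is not
 * a multiple of 8 is never read past its end. Note that the lane numbers
 * for sizes 4, 2 and 1 are disjoint (0-3, 4-5 and 6), so a leftover of up
 * to 7 pixels can be assembled as a 4 + 2 + 1 combination in one register.
 */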

.macro do_store bpp, size
.if \bpp == 24
.if \size == 8
    vst3.8          {d10, d11, d12}, [RGB]!
.elseif \size == 4
    vst3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vst3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vst3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vst3.8          {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
    vst3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vst3.8          {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
    vst3.8          {d10[6], d11[6], d12[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
    vst4.8          {d10, d11, d12, d13}, [RGB]!
.elseif \size == 4
    vst4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vst4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vst4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vst4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
    vst4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vst4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
    vst4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.else
    .error unsupported bpp
.endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4  /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5  /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vld1.8          {d0}, [Y, :64]!
    vqmovun.s16     d1\r_offs, q12
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
    vqmovun.s16     d1\b_offs, q14
    vaddw.u8        q3, q1, d4  /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5  /* q4 = v - 128 */
    do_store        \bpp, 8
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0,     0,      0,      0
    .short 22971, -11277, -23401, 29033
    .short -128,  -128,   -128,   -128
    .short -128,  -128,   -128,   -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
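    /* Handle the remaining 1..7 pixels. Only multiples of 8 have been
     * subtracted from N, so its low three bits still equal
     * OUTPUT_WIDTH % 8 and can be tested bit by bit.
     */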
3:
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
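
/* The implementation follows the usual JPEG equations (see jccolor.c):
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 * The constant table at the end of this file stores the multipliers scaled
 * by 2^16, e.g. 19595 ~= 0.29900 * 65536 and 32768 = 0.50000 * 65536.
 */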
|

.macro do_store size
    .if \size == 8
        vst1.8 {d20}, [Y]!
        vst1.8 {d21}, [U]!
        vst1.8 {d22}, [V]!
    .elseif \size == 4
        vst1.8 {d20[0]}, [Y]!
        vst1.8 {d20[1]}, [Y]!
        vst1.8 {d20[2]}, [Y]!
        vst1.8 {d20[3]}, [Y]!
        vst1.8 {d21[0]}, [U]!
        vst1.8 {d21[1]}, [U]!
        vst1.8 {d21[2]}, [U]!
        vst1.8 {d21[3]}, [U]!
        vst1.8 {d22[0]}, [V]!
        vst1.8 {d22[1]}, [V]!
        vst1.8 {d22[2]}, [V]!
        vst1.8 {d22[3]}, [V]!
    .elseif \size == 2
        vst1.8 {d20[4]}, [Y]!
        vst1.8 {d20[5]}, [Y]!
        vst1.8 {d21[4]}, [U]!
        vst1.8 {d21[5]}, [U]!
        vst1.8 {d22[4]}, [V]!
        vst1.8 {d22[5]}, [V]!
    .elseif \size == 1
        vst1.8 {d20[6]}, [Y]!
        vst1.8 {d21[6]}, [U]!
        vst1.8 {d22[6]}, [V]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8 {d10, d11, d12}, [RGB]!
            pld [RGB, #128]
        .elseif \size == 4
            vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8 {d10, d11, d12, d13}, [RGB]!
            pld [RGB, #128]
        .elseif \size == 4
            vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
|
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * Two-stage pipelined RGB -> YCbCr conversion
 */
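
/*
 * Stage 1 widens the 8-bit R, G and B samples and accumulates the
 * weighted sums in 32-bit lanes (q7/q8 for Y, q9/q13 for Cb, q14/q15
 * for Cr); stage 2 narrows those accumulators back down to 8-bit Y, Cb
 * and Cr values. The fused 'do_rgb_to_yuv_stage2_store_load_stage1'
 * variant interleaves the narrowing and stores of one 8-pixel group
 * with the loads and multiplies of the next one to hide latencies.
 */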
|

.macro do_rgb_to_yuv_stage1
    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16   q7, d4, d0[0]
    vmlal.u16   q7, d6, d0[1]
    vmlal.u16   q7, d8, d0[2]
    vmull.u16   q8, d5, d0[0]
    vmlal.u16   q8, d7, d0[1]
    vmlal.u16   q8, d9, d0[2]
    vrev64.32   q9, q1
    vrev64.32   q13, q1
    vmlsl.u16   q9, d4, d0[3]
    vmlsl.u16   q9, d6, d1[0]
    vmlal.u16   q9, d8, d1[1]
    vmlsl.u16   q13, d5, d0[3]
    vmlsl.u16   q13, d7, d1[0]
    vmlal.u16   q13, d9, d1[1]
    vrev64.32   q14, q1
    vrev64.32   q15, q1
    vmlal.u16   q14, d4, d1[1]
    vmlsl.u16   q14, d6, d1[2]
    vmlsl.u16   q14, d8, d1[3]
    vmlal.u16   q15, d5, d1[1]
    vmlsl.u16   q15, d7, d1[2]
    vmlsl.u16   q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32  d20, q7, #16
    vrshrn.u32  d21, q8, #16
    vshrn.u32   d22, q9, #16
    vshrn.u32   d23, q13, #16
    vshrn.u32   d24, q14, #16
    vshrn.u32   d25, q15, #16
    vmovn.u16   d20, q10 /* d20 = y */
    vmovn.u16   d21, q11 /* d21 = u */
    vmovn.u16   d22, q12 /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32  d20, q7, #16
    vrshrn.u32  d21, q8, #16
    vshrn.u32   d22, q9, #16
    vrev64.32   q9, q1
    vshrn.u32   d23, q13, #16
    vrev64.32   q13, q1
    vshrn.u32   d24, q14, #16
    vshrn.u32   d25, q15, #16
    do_load     \bpp, 8
    vmovn.u16   d20, q10 /* d20 = y */
    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
    vmovn.u16   d21, q11 /* d21 = u */
    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
    vmovn.u16   d22, q12 /* d22 = v */
    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16   q7, d4, d0[0]
    vmlal.u16   q7, d6, d0[1]
    vmlal.u16   q7, d8, d0[2]
    vst1.8      {d20}, [Y]!
    vmull.u16   q8, d5, d0[0]
    vmlal.u16   q8, d7, d0[1]
    vmlal.u16   q8, d9, d0[2]
    vmlsl.u16   q9, d4, d0[3]
    vmlsl.u16   q9, d6, d1[0]
    vmlal.u16   q9, d8, d1[1]
    vst1.8      {d21}, [U]!
    vmlsl.u16   q13, d5, d0[3]
    vmlsl.u16   q13, d7, d1[0]
    vmlal.u16   q13, d9, d1[1]
    vrev64.32   q14, q1
    vrev64.32   q15, q1
    vmlal.u16   q14, d4, d1[1]
    vmlsl.u16   q14, d6, d1[2]
    vmlsl.u16   q14, d8, d1[3]
    vst1.8      {d22}, [V]!
    vmlal.u16   q15, d5, d1[1]
    vmlsl.u16   q15, d7, d1[2]
    vmlsl.u16   q15, d9, d1[3]
.endm
|

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471,  11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128,   32767, 128
    .short 32767, 128,   32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr OUTPUT_BUF0, [OUTPUT_BUF]
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq OUTPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load \bpp, 8
    do_rgb_to_yuv_stage1
    subs N, N, #8
    blt 2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load \bpp, 4
3:
    tst N, #2
    beq 4f
    do_load \bpp, 2
4:
    tst N, #1
    beq 5f
    do_load \bpp, 1
5:
    do_rgb_to_yuv
    tst N, #4
    beq 6f
    do_store 4
6:
    tst N, #2
    beq 7f
    do_store 2
7:
    tst N, #1
    beq 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned -> signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */
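
/*
 * In C terms the operation is roughly (a sketch mirroring the scalar
 * convsamp routine, with CENTERJSAMPLE = 128):
 *
 *   for (row = 0; row < 8; row++)
 *     for (col = 0; col < 8; col++)
 *       workspace[row * 8 + col] =
 *           (DCTELEM) sample_data[row][start_col + col] - CENTERJSAMPLE;
 */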
|

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push {r4, r5}
    vmov.u8 d0, #128

    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d16}, [TMP1]
    vsubl.u8 q8, d16, d0
    vld1.8 {d18}, [TMP2]
    vsubl.u8 q9, d18, d0
    vld1.8 {d20}, [TMP3]
    vsubl.u8 q10, d20, d0
    vld1.8 {d22}, [TMP4]
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8 q11, d22, d0
    vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d24}, [TMP1]
    vsubl.u8 q12, d24, d0
    vld1.8 {d26}, [TMP2]
    vsubl.u8 q13, d26, d0
    vld1.8 {d28}, [TMP3]
    vsubl.u8 q14, d28, d0
    vld1.8 {d30}, [TMP4]
    vsubl.u8 q15, d30, d0
    vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop {r4, r5}
    bx lr

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */
|

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */

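/*
 * vqdmulh.s16 computes roughly (a * b * 2) >> 16, i.e. it multiplies by
 * b / 32768, so a constant stored as (k * 128) acts as the multiplier
 * k / 256. This matches the CONST_BITS = 8 scaled constants of
 * jfdctfst.c (e.g. 98 / 256 ~= 0.382683433). 1.306562965 does not fit
 * into the [-1, 1) range of this format, so only its fractional part
 * (334 - 256) * 128 is stored here and the integral part is added back
 * with a separate vadd in the code below.
 */
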
|
asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP, jsimd_fdct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | q8
     *   1 | d18     | d19    | q9
     *   2 | d20     | d21    | q10
     *   3 | d22     | d23    | q11
     *   4 | d24     | d25    | q12
     *   5 | d26     | d27    | q13
     *   6 | d28     | d29    | q14
     *   7 | d30     | d31    | q15
     */

    vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16 {d28, d29, d30, d31}, [DATA, :128]
    sub DATA, DATA, #(128 - 32)

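    /*
     * Two passes over the block: each iteration first transposes the
     * data and then performs a 1-D FDCT, so after the second pass the
     * full 2-D transform is done and the coefficients are back in
     * their original orientation.
     */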
|
    mov TMP, #2
1:
    /* Transpose */
    vtrn.16 q12, q13
    vtrn.16 q10, q11
    vtrn.16 q8, q9
    vtrn.16 q14, q15
    vtrn.32 q9, q11
    vtrn.32 q13, q15
    vtrn.32 q8, q10
    vtrn.32 q12, q14
    vswp d30, d23
    vswp d24, d17
    vswp d26, d19
    /* 1-D FDCT */
    vadd.s16 q2, q11, q12
    vswp d28, d21
    vsub.s16 q12, q11, q12
    vsub.s16 q6, q10, q13
    vadd.s16 q10, q10, q13
    vsub.s16 q7, q9, q14
    vadd.s16 q9, q9, q14
    vsub.s16 q1, q8, q15
    vadd.s16 q8, q8, q15
    vsub.s16 q4, q9, q10
    vsub.s16 q5, q8, q2
    vadd.s16 q3, q9, q10
    vadd.s16 q4, q4, q5
    vadd.s16 q2, q8, q2
    vqdmulh.s16 q4, q4, XFIX_0_707106781
    vadd.s16 q11, q12, q6
    vadd.s16 q8, q2, q3
    vsub.s16 q12, q2, q3
    vadd.s16 q3, q6, q7
    vadd.s16 q7, q7, q1
    vqdmulh.s16 q3, q3, XFIX_0_707106781
    vsub.s16 q6, q11, q7
    vadd.s16 q10, q5, q4
    vqdmulh.s16 q6, q6, XFIX_0_382683433
    vsub.s16 q14, q5, q4
    vqdmulh.s16 q11, q11, XFIX_0_541196100
    vqdmulh.s16 q5, q7, XFIX_1_306562965
    vadd.s16 q4, q1, q3
    vsub.s16 q3, q1, q3
    vadd.s16 q7, q7, q6
    vadd.s16 q11, q11, q6
    vadd.s16 q7, q7, q5
    vadd.s16 q13, q3, q11
    vsub.s16 q11, q3, q11
    vadd.s16 q9, q4, q7
    vsub.s16 q15, q4, q7
    subs TMP, TMP, #1
    bne 1b

    /* store results */
    vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16 {d28, d29, d30, d31}, [DATA, :128]

    vpop {d8-d15}
    bx lr

    .unreq DATA
    .unreq TMP
.endfunc

/*****************************************************************************/

|
/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses two-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
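/*
 * Per coefficient the operation is roughly the following C sketch,
 * given the divisor table layout used below (reciprocals at offset 0,
 * corrections at +64*2 bytes, shift counts at +64*6 bytes):
 *
 *   sign = workspace[i] >> 15;                        (0 or -1)
 *   tmp  = abs(workspace[i]) + correction[i];
 *   tmp  = ((unsigned) (tmp * reciprocal[i])) >> 16;
 *   tmp  = tmp >> shift[i];
 *   coef_block[i] = (tmp ^ sign) - sign;              (restore the sign)
 */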
|
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16 q12, q0
    add CORRECTION, DIVISORS, #(64 * 2)
    add SHIFT, DIVISORS, #(64 * 6)
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32 d20, q10, #16
    vshrn.u32 d21, q11, #16
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13

    push {r4, r5}
    mov LOOP_COUNT, #3
1:
    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16 q14, q14, q2 /* restore sign */
    vabs.s16 q12, q0
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
      veor.u16 q15, q15, q3
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
      vsub.u16 q14, q14, q2
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16 q15, q15, q3
    vshrn.u32 d20, q10, #16
    vshrn.u32 d21, q11, #16
      vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13
    subs LOOP_COUNT, LOOP_COUNT, #1
    bne 1b
    pop {r4, r5}

    veor.u16 q14, q14, q2 /* restore sign */
    veor.u16 q15, q15, q3
    vsub.u16 q14, q14, q2
    vsub.u16 q15, q15, q3
    vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx lr /* return */

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT
.endfunc
|

/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, and resolving it could potentially yield up to tens
 *       of percent better performance on Cortex-A8/Cortex-A9.
 */

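/*
 * The arithmetic matches the scalar h2v1_fancy_upsample ("triangle
 * filter") from jdsample.c; for each row of width w, roughly:
 *
 *   out[0]       = in[0];
 *   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;
 *   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;
 *   out[2*w - 1] = in[w-1];
 */
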
|
/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8 {q0}, [\INPTR]!
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    vmov q1, q0 /* backup source pixels to q1 */
    vrshrn.u16 d6, q8, #2
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2
    vshrn.u16 d9, q11, #2
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
|

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped
 * between the even and odd groups of 16 pixels, which is why no
 * "vmov q1, q0" instruction is needed. This unrolling also allows the
 * loads and stores to be reordered to hide the multiplication latency
 * and reduce stalls.
 */
|
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8 {q0}, [\INPTR]!
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    /* odd 16 pixels group */
    vld1.8 {q1}, [\INPTR]!
    vrshrn.u16 d6, q8, #2
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2
    vshrn.u16 d9, q11, #2
    vmovl.u8 q8, d2
    vext.8 q2, q0, q1, #15
    vmovl.u8 q9, d3
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d2, d28
    vmlal.u8 q11, d3, d28
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16 d6, q8, #2
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2
    vshrn.u16 d9, q11, #2
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
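
/*
 * The first and last output pixels are special-cased up front so that
 * the bulk of the row can go through the regular 16- and 32-pixel
 * paths; the remaining 1-15 pixels of a row are then gathered lane by
 * lane into q0/q1 so that the same 16-wide arithmetic can be reused
 * for the tail.
 */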
|
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub \WIDTH, \WIDTH, #1
    add \OUTPTR, \OUTPTR, #1
    ldrb \TMP1, [\INPTR, \WIDTH]
    strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb \TMP1, [\INPTR], #1
    strb \TMP1, [\OUTPTR, #-1]
    vmov.8 d3[7], \TMP1

    subs \WIDTH, \WIDTH, #32
    blt 5f
0: /* process 32 pixels per iteration */
    upsample32 \OUTPTR, \INPTR
    subs \WIDTH, \WIDTH, #32
    bge 0b
5:
    adds \WIDTH, \WIDTH, #16
    blt 1f
0: /* process 16 pixels if needed */
    upsample16 \OUTPTR, \INPTR
    subs \WIDTH, \WIDTH, #16
1:
    adds \WIDTH, \WIDTH, #16
    beq 9f

    /* load the remaining 1-15 pixels */
    add \INPTR, \INPTR, \WIDTH
    tst \WIDTH, #1
    beq 2f
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #2
    beq 2f
    vext.8 d0, d0, d0, #6
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[1]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #4
    beq 2f
    vrev64.32 d0, d0
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[3]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[2]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[1]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #8
    beq 2f
    vmov d1, d0
    sub \INPTR, \INPTR, #8
    vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    vrshrn.u16 d10, q8, #2
    vrshrn.u16 d12, q9, #2
    vshrn.u16 d11, q10, #2
    vshrn.u16 d13, q11, #2
    vzip.8 d10, d11
    vzip.8 d12, d13
    /* store the remaining pixels */
    tst \WIDTH, #8
    beq 2f
    vst1.8 {d10, d11}, [\OUTPTR]!
    vmov q5, q6
2:
    tst \WIDTH, #4
    beq 2f
    vst1.8 {d10}, [\OUTPTR]!
    vmov d10, d11
2:
    tst \WIDTH, #2
    beq 2f
    vst1.8 {d10[0]}, [\OUTPTR]!
    vst1.8 {d10[1]}, [\OUTPTR]!
    vst1.8 {d10[2]}, [\OUTPTR]!
    vst1.8 {d10[3]}, [\OUTPTR]!
    vext.8 d10, d10, d10, #4
2:
    tst \WIDTH, #1
    beq 2f
    vst1.8 {d10[0]}, [\OUTPTR]!
    vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm

asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push {r4, r5, r6, lr}
    vpush {d8-d15}

    ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp MAX_V_SAMP_FACTOR, #0
    ble 99f

    /* initialize constants */
    vmov.u8 d28, #3
    vmov.u16 q15, #1
11:
    ldr INPTR, [INPUT_DATA], #4
    ldr OUTPTR, [OUTPUT_DATA], #4
    mov WIDTH, DOWNSAMPLED_WIDTH
    upsample_row OUTPTR, INPTR, WIDTH, TMP
    subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt 11b

99:
    vpop {d8-d15}
    pop {r4, r5, r6, pc}

    .unreq MAX_V_SAMP_FACTOR
    .unreq DOWNSAMPLED_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA_PTR
    .unreq OUTPUT_DATA

    .unreq OUTPTR
    .unreq INPTR
    .unreq WIDTH
    .unreq TMP

.endfunc

.purgem upsample16
.purgem upsample32
.purgem upsample_row