1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/ycbcr/yuv_convert_arm.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,228 @@ 1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.5 +// Use of this source code is governed by a BSD-style license that can be 1.6 +// found in the LICENSE file. 1.7 + 1.8 +// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com> 1.9 + 1.10 +#include "yuv_convert.h" 1.11 +#include "ycbcr_to_rgb565.h" 1.12 + 1.13 + 1.14 + 1.15 +#ifdef HAVE_YCBCR_TO_RGB565 1.16 + 1.17 +namespace mozilla { 1.18 + 1.19 +namespace gfx { 1.20 + 1.21 +# if defined(MOZILLA_MAY_SUPPORT_NEON) 1.22 +void __attribute((noinline,optimize("-fomit-frame-pointer"))) 1.23 + yuv42x_to_rgb565_row_neon(uint16 *dst, 1.24 + const uint8 *y, 1.25 + const uint8 *u, 1.26 + const uint8 *v, 1.27 + int n, 1.28 + int oddflag) 1.29 +{ 1.30 + static __attribute__((aligned(16))) uint16 acc_r[8] = { 1.31 + 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840, 1.32 + }; 1.33 + static __attribute__((aligned(16))) uint16 acc_g[8] = { 1.34 + 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312, 1.35 + }; 1.36 + static __attribute__((aligned(16))) uint16 acc_b[8] = { 1.37 + 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832, 1.38 + }; 1.39 + /* 1.40 + * Registers: 1.41 + * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data 1.42 + * q2 : d4, d5 - are used for storing converted RGB data 1.43 + * q3 : d6, d7 - are used for temporary storage 1.44 + * 1.45 + * q4-q7 - reserved 1.46 + * 1.47 + * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data 1.48 + * q10 : d20, d21 1.49 + * q11 : d22, d23 1.50 + * q12 : d24, d25 1.51 + * q13 : d26, d27 1.52 + * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154) 1.53 + */ 1.54 + asm volatile ( 1.55 +".fpu neon\n" 1.56 +/* Allow to build on targets not supporting neon, and force the object file 1.57 + * target to avoid bumping the final binary target */ 1.58 +".arch armv7-a\n" 1.59 +".object_arch armv4t\n" 1.60 +".macro convert_macroblock size\n" 1.61 +/* load up to 16 source pixels */ 1.62 + ".if \\size == 16\n" 1.63 + "pld [%[y], #64]\n" 1.64 + "pld [%[u], #64]\n" 1.65 + "pld [%[v], #64]\n" 1.66 + "vld1.8 {d1}, [%[y]]!\n" 1.67 + "vld1.8 {d3}, [%[y]]!\n" 1.68 + "vld1.8 {d0}, [%[u]]!\n" 1.69 + "vld1.8 {d2}, [%[v]]!\n" 1.70 + ".elseif \\size == 8\n" 1.71 + "vld1.8 {d1}, [%[y]]!\n" 1.72 + "vld1.8 {d0[0]}, [%[u]]!\n" 1.73 + "vld1.8 {d0[1]}, [%[u]]!\n" 1.74 + "vld1.8 {d0[2]}, [%[u]]!\n" 1.75 + "vld1.8 {d0[3]}, [%[u]]!\n" 1.76 + "vld1.8 {d2[0]}, [%[v]]!\n" 1.77 + "vld1.8 {d2[1]}, [%[v]]!\n" 1.78 + "vld1.8 {d2[2]}, [%[v]]!\n" 1.79 + "vld1.8 {d2[3]}, [%[v]]!\n" 1.80 + ".elseif \\size == 4\n" 1.81 + "vld1.8 {d1[0]}, [%[y]]!\n" 1.82 + "vld1.8 {d1[1]}, [%[y]]!\n" 1.83 + "vld1.8 {d1[2]}, [%[y]]!\n" 1.84 + "vld1.8 {d1[3]}, [%[y]]!\n" 1.85 + "vld1.8 {d0[0]}, [%[u]]!\n" 1.86 + "vld1.8 {d0[1]}, [%[u]]!\n" 1.87 + "vld1.8 {d2[0]}, [%[v]]!\n" 1.88 + "vld1.8 {d2[1]}, [%[v]]!\n" 1.89 + ".elseif \\size == 2\n" 1.90 + "vld1.8 {d1[0]}, [%[y]]!\n" 1.91 + "vld1.8 {d1[1]}, [%[y]]!\n" 1.92 + "vld1.8 {d0[0]}, [%[u]]!\n" 1.93 + "vld1.8 {d2[0]}, [%[v]]!\n" 1.94 + ".elseif \\size == 1\n" 1.95 + "vld1.8 {d1[0]}, [%[y]]!\n" 1.96 + "vld1.8 {d0[0]}, [%[u]]!\n" 1.97 + "vld1.8 {d2[0]}, [%[v]]!\n" 1.98 + ".else\n" 1.99 + ".error \"unsupported macroblock size\"\n" 1.100 + ".endif\n" 1.101 + 1.102 + /* d1 - Y data (first 8 bytes) */ 1.103 + /* d3 - Y data (next 8 bytes) */ 1.104 + /* d0 - U data, d2 - V data */ 1.105 + 1.106 + /* split even and odd Y color components */ 1.107 + "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */ 1.108 + /* clip upper and lower boundaries */ 1.109 + "vqadd.u8 q0, q0, q4\n" 1.110 + "vqadd.u8 q1, q1, q4\n" 1.111 + "vqsub.u8 q0, q0, q5\n" 1.112 + "vqsub.u8 q1, q1, q5\n" 1.113 + 1.114 + "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */ 1.115 + 1.116 + "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */ 1.117 + "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */ 1.118 + 1.119 + "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */ 1.120 + "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */ 1.121 + "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */ 1.122 + "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */ 1.123 + "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */ 1.124 + "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */ 1.125 + "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */ 1.126 + "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */ 1.127 + "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */ 1.128 + 1.129 + "vhsub.s16 q3, q8, q10\n" /* calculate even red components */ 1.130 + "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */ 1.131 + "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */ 1.132 + "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */ 1.133 + 1.134 + "vhadd.s16 q3, q8, q11\n" /* calculate even green components */ 1.135 + "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */ 1.136 + "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */ 1.137 + "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */ 1.138 + 1.139 + "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */ 1.140 + "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */ 1.141 + "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */ 1.142 + "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */ 1.143 + 1.144 + "vzip.8 d0, d3\n" /* join even and odd red components */ 1.145 + "vzip.8 d1, d4\n" /* join even and odd green components */ 1.146 + "vzip.8 d2, d5\n" /* join even and odd blue components */ 1.147 + 1.148 + "vshll.u8 q3, d0, #8\n\t" 1.149 + "vshll.u8 q8, d1, #8\n\t" 1.150 + "vshll.u8 q9, d2, #8\n\t" 1.151 + "vsri.u16 q3, q8, #5\t\n" 1.152 + "vsri.u16 q3, q9, #11\t\n" 1.153 + /* store pixel data to memory */ 1.154 + ".if \\size == 16\n" 1.155 + " vst1.16 {d6, d7}, [%[dst]]!\n" 1.156 + " vshll.u8 q3, d3, #8\n\t" 1.157 + " vshll.u8 q8, d4, #8\n\t" 1.158 + " vshll.u8 q9, d5, #8\n\t" 1.159 + " vsri.u16 q3, q8, #5\t\n" 1.160 + " vsri.u16 q3, q9, #11\t\n" 1.161 + " vst1.16 {d6, d7}, [%[dst]]!\n" 1.162 + ".elseif \\size == 8\n" 1.163 + " vst1.16 {d6, d7}, [%[dst]]!\n" 1.164 + ".elseif \\size == 4\n" 1.165 + " vst1.16 {d6}, [%[dst]]!\n" 1.166 + ".elseif \\size == 2\n" 1.167 + " vst1.16 {d6[0]}, [%[dst]]!\n" 1.168 + " vst1.16 {d6[1]}, [%[dst]]!\n" 1.169 + ".elseif \\size == 1\n" 1.170 + " vst1.16 {d6[0]}, [%[dst]]!\n" 1.171 + ".endif\n" 1.172 + ".endm\n" 1.173 + 1.174 + "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */ 1.175 + "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */ 1.176 + "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */ 1.177 + "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */ 1.178 + 1.179 + "vmov.u8 d26, #16\n" 1.180 + "vmov.u8 d27, #149\n" 1.181 + "vmov.u8 d28, #204\n" 1.182 + "vmov.u8 d29, #50\n" 1.183 + "vmov.u8 d30, #104\n" 1.184 + "vmov.u8 d31, #154\n" 1.185 + 1.186 + "cmp %[oddflag], #0\n" 1.187 + "beq 1f\n" 1.188 + "convert_macroblock 1\n" 1.189 + "sub %[n], %[n], #1\n" 1.190 + "1:\n" 1.191 + "subs %[n], %[n], #16\n" 1.192 + "blt 2f\n" 1.193 + "1:\n" 1.194 + "convert_macroblock 16\n" 1.195 + "subs %[n], %[n], #16\n" 1.196 + "bge 1b\n" 1.197 + "2:\n" 1.198 + "tst %[n], #8\n" 1.199 + "beq 3f\n" 1.200 + "convert_macroblock 8\n" 1.201 + "3:\n" 1.202 + "tst %[n], #4\n" 1.203 + "beq 4f\n" 1.204 + "convert_macroblock 4\n" 1.205 + "4:\n" 1.206 + "tst %[n], #2\n" 1.207 + "beq 5f\n" 1.208 + "convert_macroblock 2\n" 1.209 + "5:\n" 1.210 + "tst %[n], #1\n" 1.211 + "beq 6f\n" 1.212 + "convert_macroblock 1\n" 1.213 + "6:\n" 1.214 + ".purgem convert_macroblock\n" 1.215 + : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) 1.216 + : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]), 1.217 + [oddflag] "r" (oddflag) 1.218 + : "cc", "memory", 1.219 + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1.220 + "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */ 1.221 + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", 1.222 + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 1.223 + ); 1.224 +} 1.225 +# endif // MOZILLA_MAY_SUPPORT_NEON 1.226 + 1.227 +} // namespace gfx 1.228 + 1.229 +} // namespace mozilla 1.230 + 1.231 +#endif // HAVE_YCBCR_TO_RGB565