gfx/ycbcr/yuv_convert_arm.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/ycbcr/yuv_convert_arm.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,228 @@
     1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
     1.5 +// Use of this source code is governed by a BSD-style license that can be
     1.6 +// found in the LICENSE file.
     1.7 +
     1.8 +// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
     1.9 +
    1.10 +#include "yuv_convert.h"
    1.11 +#include "ycbcr_to_rgb565.h"
    1.12 +
    1.13 +
    1.14 +
    1.15 +#ifdef HAVE_YCBCR_TO_RGB565
    1.16 +
    1.17 +namespace mozilla {
    1.18 +
    1.19 +namespace gfx {
    1.20 +
    1.21 +#  if defined(MOZILLA_MAY_SUPPORT_NEON)
    1.22 +void __attribute((noinline,optimize("-fomit-frame-pointer")))
    1.23 +    yuv42x_to_rgb565_row_neon(uint16 *dst,
    1.24 +                              const uint8 *y,
    1.25 +                              const uint8 *u,
    1.26 +                              const uint8 *v,
    1.27 +                              int n,
    1.28 +                              int oddflag)
    1.29 +{
              /*
               * Converts one row of planar Y/U/V bytes into packed 16-bit RGB565
               * pixels using a single NEON inline-assembly block.
               *
               *   dst     - output; one 16-bit pixel is stored per Y byte consumed
               *   y, u, v - input planes; in the paired blocks (sizes 2..16) each
               *             U/V byte is applied to two consecutive Y bytes
               *   n       - number of pixels to convert
               *   oddflag - when non-zero, a single pixel is converted first
               *             (consuming one Y, one U and one V byte) and n is
               *             decremented before the paired processing starts;
               *             presumably used for rows starting on an odd x - TODO
               *             confirm against the caller
               *
               * Processing order: optional leading single pixel, then blocks of 16
               * while at least 16 pixels remain, then at most one block each of
               * 8, 4, 2 and 1 pixels selected by the low four bits of n.
               *
               * The accumulator seeds below appear to fold the chroma bias
               * (128 - 16 = 112) and a rounding term of 64 into one constant:
               *   22840 = 204.5 * 112 - 64, 17312 = (104 + 50) * 112 + 64,
               *   28832 = (104 + 154) * 112 - 64.
               *
               * NOTE(review): if n is 0 while oddflag is non-zero, n becomes -17
               * before the tail tests, whose low four bits are all set, so the
               * 8/4/2/1 tail blocks would all run. Callers presumably never pass
               * that combination - confirm.
               */
    1.30 +    static __attribute__((aligned(16))) uint16 acc_r[8] = { /* red accumulator seed; 16-byte aligned for the :128 load into q10 */
    1.31 +        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
    1.32 +    };
    1.33 +    static __attribute__((aligned(16))) uint16 acc_g[8] = { /* green accumulator seed, loaded into q11 */
    1.34 +        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
    1.35 +    };
    1.36 +    static __attribute__((aligned(16))) uint16 acc_b[8] = { /* blue accumulator seed, loaded into q12 */
    1.37 +        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
    1.38 +    };
    1.39 +    /*
    1.40 +     * Registers:
    1.41 +     * q0, q1 : d0, d1, d2, d3  - initial loading of YUV data (d0 = U, d1/d3 = Y, d2 = V); later reused for the narrowed R/G/B bytes
    1.42 +     * q2     : d4, d5          - d4 first holds V >> 1, then d4/d5 hold the narrowed odd green / odd blue bytes
    1.43 +     * q3     : d6, d7          - temporary storage, and finally the packed RGB565 pixels that are stored to dst
    1.44 +     *
    1.45 +     * q4, q5 : d8, d9, d10, d11 - clipping constants (#15, #20, #31, #36); q6, q7 (d12-d15) are not used
    1.46 +     *
    1.47 +     * q8, q9 : d16, d17, d18, d19  - are used for expanded Y data (Y * 149); reused as scratch when packing green and blue
    1.48 +     * q10    : d20, d21            - red accumulator
    1.49 +     * q11    : d22, d23            - green accumulator
    1.50 +     * q12    : d24, d25            - blue accumulator
    1.51 +     * q13    : d26, d27            - constants #16 (d26, not read by the macro) and #149 (d27)
    1.52 +     * q13, q14, q15            - various constants (#16, #149, #204, #50, #104, #154)
    1.53 +     */
    1.54 +    asm volatile (
    1.55 +".fpu neon\n"
    1.56 +/* Allow to build on targets not supporting neon, and force the object file
    1.57 + * target to avoid bumping the final binary target */
    1.58 +".arch armv7-a\n"
    1.59 +".object_arch armv4t\n"
    1.60 +".macro convert_macroblock size\n"
    1.61 +/* load up to 16 source pixels: "size" Y bytes plus the matching U and V bytes; blocks smaller than 16 fill only the lanes they need */
    1.62 +	".if \\size == 16\n"
    1.63 +	    "pld [%[y], #64]\n"
    1.64 +	    "pld [%[u], #64]\n"
    1.65 +	    "pld [%[v], #64]\n"
    1.66 +	    "vld1.8 {d1}, [%[y]]!\n"
    1.67 +	    "vld1.8 {d3}, [%[y]]!\n"
    1.68 +	    "vld1.8 {d0}, [%[u]]!\n"
    1.69 +	    "vld1.8 {d2}, [%[v]]!\n"
    1.70 +	".elseif \\size == 8\n"
    1.71 +	    "vld1.8 {d1}, [%[y]]!\n"
    1.72 +	    "vld1.8 {d0[0]}, [%[u]]!\n"
    1.73 +	    "vld1.8 {d0[1]}, [%[u]]!\n"
    1.74 +	    "vld1.8 {d0[2]}, [%[u]]!\n"
    1.75 +	    "vld1.8 {d0[3]}, [%[u]]!\n"
    1.76 +	    "vld1.8 {d2[0]}, [%[v]]!\n"
    1.77 +	    "vld1.8 {d2[1]}, [%[v]]!\n"
    1.78 +	    "vld1.8 {d2[2]}, [%[v]]!\n"
    1.79 +	    "vld1.8 {d2[3]}, [%[v]]!\n"
    1.80 +	".elseif \\size == 4\n"
    1.81 +	    "vld1.8 {d1[0]}, [%[y]]!\n"
    1.82 +	    "vld1.8 {d1[1]}, [%[y]]!\n"
    1.83 +	    "vld1.8 {d1[2]}, [%[y]]!\n"
    1.84 +	    "vld1.8 {d1[3]}, [%[y]]!\n"
    1.85 +	    "vld1.8 {d0[0]}, [%[u]]!\n"
    1.86 +	    "vld1.8 {d0[1]}, [%[u]]!\n"
    1.87 +	    "vld1.8 {d2[0]}, [%[v]]!\n"
    1.88 +	    "vld1.8 {d2[1]}, [%[v]]!\n"
    1.89 +	".elseif \\size == 2\n"
    1.90 +	    "vld1.8 {d1[0]}, [%[y]]!\n"
    1.91 +	    "vld1.8 {d1[1]}, [%[y]]!\n"
    1.92 +	    "vld1.8 {d0[0]}, [%[u]]!\n"
    1.93 +	    "vld1.8 {d2[0]}, [%[v]]!\n"
    1.94 +	".elseif \\size == 1\n"
    1.95 +	    "vld1.8 {d1[0]}, [%[y]]!\n"
    1.96 +	    "vld1.8 {d0[0]}, [%[u]]!\n"
    1.97 +	    "vld1.8 {d2[0]}, [%[v]]!\n"
    1.98 +	".else\n"
    1.99 +	    ".error \"unsupported macroblock size\"\n"
   1.100 +	".endif\n"
   1.101 +
   1.102 +        /* d1 - Y data (first 8 bytes) */
   1.103 +        /* d3 - Y data (next 8 bytes, only loaded when size == 16) */
   1.104 +        /* d0 - U data, d2 - V data */
   1.105 +
   1.106 +	/* split even and odd Y color components */
   1.107 +	"vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
   1.108 +	/* clip upper and lower boundaries: saturating +15 then -31 on U/V (d0, d2) and +20 then -36 on Y (d1, d3), i.e. clamp to [16,240] / [16,235] and subtract 16 */
   1.109 +	"vqadd.u8    q0, q0, q4\n"
   1.110 +	"vqadd.u8    q1, q1, q4\n"
   1.111 +	"vqsub.u8    q0, q0, q5\n"
   1.112 +	"vqsub.u8    q1, q1, q5\n"
   1.113 +
   1.114 +	"vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
   1.115 +
   1.116 +	"vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
   1.117 +	"vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
   1.118 +
   1.119 +	"vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
   1.120 +	"vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
   1.121 +	"vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
   1.122 +	"vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
   1.123 +	"vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
   1.124 +	"vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
   1.125 +	"vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
   1.126 +	"vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
   1.127 +	"vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 */
   1.128 +
   1.129 +	"vhsub.s16   q3, q8, q10\n"                  /* calculate even red components: (evenY * 149 - red acc) / 2 */
   1.130 +	"vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
   1.131 +	"vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
   1.132 +	"vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
   1.133 +
   1.134 +	"vhadd.s16   q3, q8, q11\n"                  /* calculate even green components: (evenY * 149 + green acc) / 2 */
   1.135 +	"vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
   1.136 +	"vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
   1.137 +	"vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
   1.138 +
   1.139 +	"vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components: (evenY * 149 - blue acc) / 2 */
   1.140 +	"vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
   1.141 +	"vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
   1.142 +	"vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
   1.143 +
   1.144 +	"vzip.8      d0, d3\n"                       /* join even and odd red components */
   1.145 +	"vzip.8      d1, d4\n"                       /* join even and odd green components */
   1.146 +	"vzip.8      d2, d5\n"                       /* join even and odd blue components */
   1.147 +
   1.148 +	"vshll.u8    q3, d0, #8\n\t"                 /* q3 = red << 8 for the first 8 pixels */
   1.149 +	"vshll.u8    q8, d1, #8\n\t"                 /* q8 = green << 8 */
   1.150 +	"vshll.u8    q9, d2, #8\n\t"                 /* q9 = blue << 8 */
   1.151 +	"vsri.u16    q3, q8, #5\t\n"                 /* insert green below the top 5 (red) bits */
   1.152 +	"vsri.u16    q3, q9, #11\t\n"                /* insert blue below the top 11 bits: q3 is now RGB565 */
   1.153 +	/* store pixel data to memory */
   1.154 +	".if \\size == 16\n"
   1.155 +	"    vst1.16 {d6, d7}, [%[dst]]!\n"
   1.156 +	"    vshll.u8    q3, d3, #8\n\t"             /* pack the second 8 pixels from d3 (red), d4 (green), d5 (blue) */
   1.157 +	"    vshll.u8    q8, d4, #8\n\t"
   1.158 +	"    vshll.u8    q9, d5, #8\n\t"
   1.159 +	"    vsri.u16    q3, q8, #5\t\n"
   1.160 +	"    vsri.u16    q3, q9, #11\t\n"
   1.161 +	"    vst1.16 {d6, d7}, [%[dst]]!\n"
   1.162 +	".elseif \\size == 8\n"
   1.163 +	"    vst1.16 {d6, d7}, [%[dst]]!\n"
   1.164 +	".elseif \\size == 4\n"
   1.165 +	"    vst1.16 {d6}, [%[dst]]!\n"
   1.166 +	".elseif \\size == 2\n"
   1.167 +	"    vst1.16 {d6[0]}, [%[dst]]!\n"
   1.168 +	"    vst1.16 {d6[1]}, [%[dst]]!\n"
   1.169 +	".elseif \\size == 1\n"
   1.170 +	"    vst1.16 {d6[0]}, [%[dst]]!\n"
   1.171 +	".endif\n"
   1.172 +	".endm\n"
   1.173 +
   1.174 +	"vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
   1.175 +	"vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
   1.176 +	"vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
   1.177 +	"vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
   1.178 +
   1.179 +	"vmov.u8     d26, #16\n"  /* not read by convert_macroblock */
   1.180 +	"vmov.u8     d27, #149\n" /* Y multiplier */
   1.181 +	"vmov.u8     d28, #204\n" /* V multiplier for red */
   1.182 +	"vmov.u8     d29, #50\n"  /* U multiplier for green */
   1.183 +	"vmov.u8     d30, #104\n" /* V multiplier for green; also the first part of the U multiplier for blue */
   1.184 +	"vmov.u8     d31, #154\n" /* second part of the U multiplier for blue */
   1.185 +
   1.186 +	"cmp         %[oddflag], #0\n"
   1.187 +	"beq         1f\n"
   1.188 +	"convert_macroblock 1\n"                     /* oddflag set: convert one leading pixel on its own */
   1.189 +	"sub         %[n], %[n], #1\n"
   1.190 +    "1:\n"
   1.191 +	"subs        %[n], %[n], #16\n"              /* fewer than 16 pixels left: skip the main loop */
   1.192 +	"blt         2f\n"
   1.193 +    "1:\n"                                       /* main loop head: 16 pixels per iteration */
   1.194 +	"convert_macroblock 16\n"
   1.195 +	"subs        %[n], %[n], #16\n"
   1.196 +	"bge         1b\n"
   1.197 +    "2:\n"                                       /* n is now (leftover - 16), so its low four bits still equal the leftover pixel count */
   1.198 +	"tst         %[n], #8\n"
   1.199 +	"beq         3f\n"
   1.200 +	"convert_macroblock 8\n"
   1.201 +    "3:\n"
   1.202 +	"tst         %[n], #4\n"
   1.203 +	"beq         4f\n"
   1.204 +	"convert_macroblock 4\n"
   1.205 +    "4:\n"
   1.206 +	"tst         %[n], #2\n"
   1.207 +	"beq         5f\n"
   1.208 +	"convert_macroblock 2\n"
   1.209 +    "5:\n"
   1.210 +	"tst         %[n], #1\n"
   1.211 +	"beq         6f\n"
   1.212 +	"convert_macroblock 1\n"
   1.213 +    "6:\n"
   1.214 +	".purgem convert_macroblock\n"               /* drop the assembler macro definition again */
   1.215 +	: [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) /* read-write: the pointers are post-incremented and n is counted down inside the asm */
   1.216 +	: [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
   1.217 +	  [oddflag] "r" (oddflag)
   1.218 +	: "cc", "memory",
   1.219 +	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
   1.220 +	  "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
   1.221 +	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
   1.222 +	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
   1.223 +    );
   1.224 +}
   1.225 +#  endif // MOZILLA_MAY_SUPPORT_NEON
   1.226 +
   1.227 +} // namespace gfx
   1.228 +
   1.229 +} // namespace mozilla
   1.230 +
   1.231 +#endif // HAVE_YCBCR_TO_RGB565

mercurial