media/libyuv/source/rotate_neon.cc

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/rotate_neon.cc	Wed Dec 31 06:09:35 2014 +0100
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
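// vtbl lookup indices used by the 4x8 residual path below: treating each
// {dN, dN+1} pair as a row-major 4x4 block of bytes, the table reads the
// block back column-major, i.e. a 4x4 byte transpose in two vtbl ops.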
static uvec8 kVTbl4x4Transpose =
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // Loops are on blocks of 8. The loop exits when the counter reaches or
    // drops below 0; starting the counter at width - 8 allows for this.
    "sub         %4, #8                        \n"

    // Handle 8x8 blocks. This should be the majority of the plane.
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"

      "vld1.8      {d0}, [r9], %1              \n"
      "vld1.8      {d1}, [r9], %1              \n"
      "vld1.8      {d2}, [r9], %1              \n"
      "vld1.8      {d3}, [r9], %1              \n"
      "vld1.8      {d4}, [r9], %1              \n"
      "vld1.8      {d5}, [r9], %1              \n"
      "vld1.8      {d6}, [r9], %1              \n"
      "vld1.8      {d7}, [r9]                  \n"

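      // Transpose the 8x8 byte block entirely in registers: the vtrn passes
      // swap 8-, 16- and 32-bit lanes across register pairs, and the final
      // vrev16 fixes up byte order within each 16-bit lane.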
      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

      "mov         r9, %2                      \n"

      "vst1.8      {d1}, [r9], %3              \n"
      "vst1.8      {d0}, [r9], %3              \n"
      "vst1.8      {d3}, [r9], %3              \n"
      "vst1.8      {d2}, [r9], %3              \n"
      "vst1.8      {d5}, [r9], %3              \n"
      "vst1.8      {d4}, [r9], %3              \n"
      "vst1.8      {d7}, [r9], %3              \n"
      "vst1.8      {d6}, [r9]                  \n"

      "add         %0, #8                      \n"  // src += 8
      "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %4,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // Add 8 back to the counter. If the result is 0, there are no residuals.
    "adds        %4, #8                        \n"
    "beq         4f                            \n"

    // Some residual: between 1 and 7 source columns left to transpose.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    "cmp         %4, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.32     {d0[0]}, [r9], %1             \n"
    "vld1.32     {d0[1]}, [r9], %1             \n"
    "vld1.32     {d1[0]}, [r9], %1             \n"
    "vld1.32     {d1[1]}, [r9], %1             \n"
    "vld1.32     {d2[0]}, [r9], %1             \n"
    "vld1.32     {d2[1]}, [r9], %1             \n"
    "vld1.32     {d3[0]}, [r9], %1             \n"
    "vld1.32     {d3[1]}, [r9]                 \n"

    "mov         r9, %2                        \n"

    "vld1.8      {q3}, [%5]                    \n"

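    // Shuffle via kVTbl4x4Transpose: each {dN, dN+1} pair holds a row-major
    // 4x4 byte block, and the table lookup emits it column-major.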
    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework the shuffle above to
    // write out with 4 instead of 8 writes.
    "vst1.32     {d4[0]}, [r9], %3             \n"
    "vst1.32     {d4[1]}, [r9], %3             \n"
    "vst1.32     {d5[0]}, [r9], %3             \n"
    "vst1.32     {d5[1]}, [r9]                 \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d0[0]}, [r9], %3             \n"
    "vst1.32     {d0[1]}, [r9], %3             \n"
    "vst1.32     {d1[0]}, [r9], %3             \n"
    "vst1.32     {d1[1]}, [r9]                 \n"

    "add         %0, #4                        \n"  // src += 4
    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %4,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // Some residual; check whether it includes a 2x8 block or less.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld1.16     {d0[0]}, [r9], %1             \n"
    "vld1.16     {d1[0]}, [r9], %1             \n"
    "vld1.16     {d0[1]}, [r9], %1             \n"
    "vld1.16     {d1[1]}, [r9], %1             \n"
    "vld1.16     {d0[2]}, [r9], %1             \n"
    "vld1.16     {d1[2]}, [r9], %1             \n"
    "vld1.16     {d0[3]}, [r9], %1             \n"
    "vld1.16     {d1[3]}, [r9]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d1}, [r9]                    \n"

    "add         %0, #2                        \n"  // src += 2
    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %4,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld1.8      {d0[0]}, [%0], %1             \n"
    "vld1.8      {d0[1]}, [%0], %1             \n"
    "vld1.8      {d0[2]}, [%0], %1             \n"
    "vld1.8      {d0[3]}, [%0], %1             \n"
    "vld1.8      {d0[4]}, [%0], %1             \n"
    "vld1.8      {d0[5]}, [%0], %1             \n"
    "vld1.8      {d0[6]}, [%0], %1             \n"
    "vld1.8      {d0[7]}, [%0]                 \n"

    "vst1.64     {d0}, [%2]                    \n"

    "4:                                        \n"

    : "+r"(src),               // %0
      "+r"(src_stride),        // %1
      "+r"(dst),               // %2
      "+r"(dst_stride),        // %3
      "+r"(width)              // %4
    : "r"(&kVTbl4x4Transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}
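
// For reference, a minimal scalar sketch of the contract the NEON routine
// above implements (illustrative only; this helper name is ours, and
// libyuv's real C fallback lives elsewhere in the library): each of the
// 'width' source columns in an 8-row strip becomes one 8-byte destination
// row, i.e. dst[i][j] = src[j][i]. The NEON path computes the same result
// eight columns at a time, falling back to 4/2/1-column blocks for the
// remainder.
static void TransposeWx8_Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride,
                                int width) {
  for (int i = 0; i < width; ++i) {    // Each source column...
    for (int j = 0; j < 8; ++j) {      // ...becomes one 8-byte dst row.
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}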
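
// vtbl lookup indices used by the UV 4x8 residual path below: a byte-wise
// zip of the two source d registers (d0[0], d1[0], d0[1], d1[1], ...),
// i.e. vzip.8 expressed as a table lookup.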
static uvec8 kVTbl4x4TransposeDi =
  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // Loops are on blocks of 8. The loop exits when the counter reaches or
    // drops below 0; starting the counter at width - 8 allows for this.
    "sub         %6, #8                        \n"

    // Handle 8x8 blocks. This should be the majority of the plane.
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"

      "vld2.8      {d0,  d1},  [r9], %1        \n"
      "vld2.8      {d2,  d3},  [r9], %1        \n"
      "vld2.8      {d4,  d5},  [r9], %1        \n"
      "vld2.8      {d6,  d7},  [r9], %1        \n"
      "vld2.8      {d16, d17}, [r9], %1        \n"
      "vld2.8      {d18, d19}, [r9], %1        \n"
      "vld2.8      {d20, d21}, [r9], %1        \n"
      "vld2.8      {d22, d23}, [r9]            \n"

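      // vld2.8 de-interleaves each row as it loads: U bytes land in the
      // even d registers, V bytes in the odd ones, so the q-register
      // vtrn/vrev cascade below transposes both 8x8 planes at once.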
      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

      "mov         r9, %2                      \n"

      "vst1.8      {d2},  [r9], %3             \n"
      "vst1.8      {d0},  [r9], %3             \n"
      "vst1.8      {d6},  [r9], %3             \n"
      "vst1.8      {d4},  [r9], %3             \n"
      "vst1.8      {d18}, [r9], %3             \n"
      "vst1.8      {d16}, [r9], %3             \n"
      "vst1.8      {d22}, [r9], %3             \n"
      "vst1.8      {d20}, [r9]                 \n"

      "mov         r9, %4                      \n"

      "vst1.8      {d3},  [r9], %5             \n"
      "vst1.8      {d1},  [r9], %5             \n"
      "vst1.8      {d7},  [r9], %5             \n"
      "vst1.8      {d5},  [r9], %5             \n"
      "vst1.8      {d19}, [r9], %5             \n"
      "vst1.8      {d17}, [r9], %5             \n"
      "vst1.8      {d23}, [r9], %5             \n"
      "vst1.8      {d21}, [r9]                 \n"

      "add         %0, #8*2                    \n"  // src   += 8 * 2
      "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %6,  #8                     \n"  // w     -= 8
      "bge         1b                          \n"

    // Add 8 back to the counter. If the result is 0, there are no residuals.
    "adds        %6, #8                        \n"
    "beq         4f                            \n"

    // Some residual: between 1 and 7 source columns left to transpose.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    "cmp         %6, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): Clean this up.
    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.64     {d0}, [r9], %1                \n"
    "vld1.64     {d1}, [r9], %1                \n"
    "vld1.64     {d2}, [r9], %1                \n"
    "vld1.64     {d3}, [r9], %1                \n"
    "vld1.64     {d4}, [r9], %1                \n"
    "vld1.64     {d5}, [r9], %1                \n"
    "vld1.64     {d6}, [r9], %1                \n"
    "vld1.64     {d7}, [r9]                    \n"

    "vld1.8      {q15}, [%7]                   \n"

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         r9, %2                        \n"

    "vst1.32     {d16[0]},  [r9], %3           \n"
    "vst1.32     {d16[1]},  [r9], %3           \n"
    "vst1.32     {d17[0]},  [r9], %3           \n"
    "vst1.32     {d17[1]},  [r9], %3           \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d20[0]}, [r9], %3            \n"
    "vst1.32     {d20[1]}, [r9], %3            \n"
    "vst1.32     {d21[0]}, [r9], %3            \n"
    "vst1.32     {d21[1]}, [r9]                \n"

    "mov         r9, %4                        \n"

    "vst1.32     {d18[0]}, [r9], %5            \n"
    "vst1.32     {d18[1]}, [r9], %5            \n"
    "vst1.32     {d19[0]}, [r9], %5            \n"
    "vst1.32     {d19[1]}, [r9], %5            \n"

    "add         r9, %4, #4                    \n"
    "vst1.32     {d22[0]},  [r9], %5           \n"
    "vst1.32     {d22[1]},  [r9], %5           \n"
    "vst1.32     {d23[0]},  [r9], %5           \n"
    "vst1.32     {d23[1]},  [r9]               \n"

    "add         %0, #4*2                      \n"  // src   += 4 * 2
    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %6,  #4                       \n"  // w     -= 4
    "beq         4f                            \n"

    // Some residual; check whether it includes a 2x8 block or less.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
    "vld2.16     {d1[3], d3[3]}, [r9]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d2}, [r9]                    \n"

    "mov         r9, %4                        \n"

    "vst1.64     {d1}, [r9], %5                \n"
    "vst1.64     {d3}, [r9]                    \n"

    "add         %0, #2*2                      \n"  // src   += 2 * 2
    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %6,  #2                       \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
    "vld2.8      {d0[7], d1[7]}, [%0]          \n"

    "vst1.64     {d0}, [%2]                    \n"
    "vst1.64     {d1}, [%4]                    \n"

    "4:                                        \n"

    : "+r"(src),                 // %0
      "+r"(src_stride),          // %1
      "+r"(dst_a),               // %2
      "+r"(dst_stride_a),        // %3
      "+r"(dst_b),               // %4
      "+r"(dst_stride_b),        // %5
      "+r"(width)                // %6
    : "r"(&kVTbl4x4TransposeDi)  // %7
    // q15 (d30/d31) is loaded with the vtbl indices in the 4x8 path, so it
    // must be listed as clobbered as well.
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"
  );
}
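
// Likewise, a scalar sketch (illustrative only; the helper name is ours) of
// the TransposeUVWx8 contract: 'src' holds 8 rows of interleaved UV pairs
// and 'width' counts pairs; U bytes land transposed in 'dst_a' and V bytes
// in 'dst_b'.
static void TransposeUVWx8_Sketch(const uint8* src, int src_stride,
                                  uint8* dst_a, int dst_stride_a,
                                  uint8* dst_b, int dst_stride_b,
                                  int width) {
  for (int i = 0; i < width; ++i) {    // Each source UV column pair...
    for (int j = 0; j < 8; ++j) {      // ...becomes one row per plane.
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}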
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
