media/libyuv/source/rotate_neon.cc

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/row.h"
    13 #include "libyuv/basic_types.h"
    15 #ifdef __cplusplus
    16 namespace libyuv {
    17 extern "C" {
    18 #endif
    20 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// VTBL shuffle table used by the 4x8 residual path of TransposeWx8_NEON:
// index pattern {0,4,8,12, 1,5,9,13, ...} gathers one byte from each of
// four packed 4-byte rows, producing the transposed 4x4 layout.
static uvec8 kVTbl4x4Transpose =
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
// Transposes a `width` x 8 block of 8-bit pixels: reads 8 rows of `width`
// bytes from `src` (consecutive rows `src_stride` apart) and writes
// `width` rows of 8 bytes to `dst` (rows `dst_stride` apart).  The bulk
// of the plane is processed in 8x8 tiles; residuals of 4, 2 or 1 columns
// are handled by dedicated tail code.
// NOTE(review): r9 is used as a scratch row pointer and listed as a
// clobber; r9 is a reserved/platform register on some ARM ABIs (e.g.
// older iOS) — confirm the target ABI permits clobbering it.
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // Loops are on blocks of 8 columns.  The loop stops when the
    // counter gets to or below 0; starting the counter at width-8
    // allows the last full 8x8 block to be handled inside the loop.
    "sub         %4, #8                        \n"

    // Handle 8x8 blocks.  This should be the majority of the plane.
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"  // r9 walks down the src rows

      // Load eight 8-byte rows, one per d register.
      "vld1.8      {d0}, [r9], %1              \n"
      "vld1.8      {d1}, [r9], %1              \n"
      "vld1.8      {d2}, [r9], %1              \n"
      "vld1.8      {d3}, [r9], %1              \n"
      "vld1.8      {d4}, [r9], %1              \n"
      "vld1.8      {d5}, [r9], %1              \n"
      "vld1.8      {d6}, [r9], %1              \n"
      "vld1.8      {d7}, [r9]                  \n"

      // In-register 8x8 byte transpose: three VTRN passes swap 2x2
      // sub-blocks at 8-, 16- and 32-bit granularity (note the
      // deliberately reversed operand pairing, e.g. d1,d0)...
      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      // ...then VREV16 reverses the bytes within each 16-bit element to
      // undo the ordering left by the reversed register pairing above.
      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

      "mov         r9, %2                      \n"  // r9 walks down the dst rows

      // Store the eight transposed rows; the swapped store order
      // (d1 before d0, d3 before d2, ...) matches the pairing above.
      "vst1.8      {d1}, [r9], %3              \n"
      "vst1.8      {d0}, [r9], %3              \n"
      "vst1.8      {d3}, [r9], %3              \n"
      "vst1.8      {d2}, [r9], %3              \n"
      "vst1.8      {d5}, [r9], %3              \n"
      "vst1.8      {d4}, [r9], %3              \n"
      "vst1.8      {d7}, [r9], %3              \n"
      "vst1.8      {d6}, [r9]                  \n"

      "add         %0, #8                      \n"  // src += 8
      "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %4,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // Add 8 back to the counter.  If the result is 0 there are
    // no residuals.
    "adds        %4, #8                        \n"
    "beq         4f                            \n"

    // Some residual, so between 1 and 7 columns left to transpose.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    "cmp         %4, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block: load eight 4-byte rows packed two-per-d-register, then
    // transpose with the VTBL shuffle table kVTbl4x4Transpose.
    "mov         r9, %0                        \n"
    "vld1.32     {d0[0]}, [r9], %1             \n"
    "vld1.32     {d0[1]}, [r9], %1             \n"
    "vld1.32     {d1[0]}, [r9], %1             \n"
    "vld1.32     {d1[1]}, [r9], %1             \n"
    "vld1.32     {d2[0]}, [r9], %1             \n"
    "vld1.32     {d2[1]}, [r9], %1             \n"
    "vld1.32     {d3[0]}, [r9], %1             \n"
    "vld1.32     {d3[1]}, [r9]                 \n"

    "mov         r9, %2                        \n"

    "vld1.8      {q3}, [%5]                    \n"  // q3 (d6,d7) = shuffle table

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    "vst1.32     {d4[0]}, [r9], %3             \n"
    "vst1.32     {d4[1]}, [r9], %3             \n"
    "vst1.32     {d5[0]}, [r9], %3             \n"
    "vst1.32     {d5[1]}, [r9]                 \n"

    "add         r9, %2, #4                    \n"  // second 4-byte column of dst
    "vst1.32     {d0[0]}, [r9], %3             \n"
    "vst1.32     {d0[1]}, [r9], %3             \n"
    "vst1.32     {d1[0]}, [r9], %3             \n"
    "vst1.32     {d1[1]}, [r9]                 \n"

    "add         %0, #4                        \n"  // src += 4
    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %4,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // Some residual; check to see if it includes a 2x8 block,
    // or less.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block: load eight 2-byte rows alternating into d0/d1, then a
    // single byte-wise VTRN completes the transpose.
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld1.16     {d0[0]}, [r9], %1             \n"
    "vld1.16     {d1[0]}, [r9], %1             \n"
    "vld1.16     {d0[1]}, [r9], %1             \n"
    "vld1.16     {d1[1]}, [r9], %1             \n"
    "vld1.16     {d0[2]}, [r9], %1             \n"
    "vld1.16     {d1[2]}, [r9], %1             \n"
    "vld1.16     {d0[3]}, [r9], %1             \n"
    "vld1.16     {d1[3]}, [r9]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d1}, [r9]                    \n"

    "add         %0, #2                        \n"  // src += 2
    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %4,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block: gather one byte from each of the 8 rows and store them
    // as a single contiguous 8-byte row.
    "3:                                        \n"
    "vld1.8      {d0[0]}, [%0], %1             \n"
    "vld1.8      {d0[1]}, [%0], %1             \n"
    "vld1.8      {d0[2]}, [%0], %1             \n"
    "vld1.8      {d0[3]}, [%0], %1             \n"
    "vld1.8      {d0[4]}, [%0], %1             \n"
    "vld1.8      {d0[5]}, [%0], %1             \n"
    "vld1.8      {d0[6]}, [%0], %1             \n"
    "vld1.8      {d0[7]}, [%0]                 \n"

    "vst1.64     {d0}, [%2]                    \n"

    "4:                                        \n"

    : "+r"(src),               // %0
      "+r"(src_stride),        // %1
      "+r"(dst),               // %2
      "+r"(dst_stride),        // %3
      "+r"(width)              // %4
    : "r"(&kVTbl4x4Transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}
// VTBL shuffle table used by the 4x8 residual path of
// TransposeUVWx8_NEON: index pattern {0,8, 1,9, 2,10, ...} interleaves
// the bytes of the two source d registers pairwise.
static uvec8 kVTbl4x4TransposeDi =
  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
// Transposes a `width` x 8 block of interleaved UV pixels: reads 8 rows
// of `width` UV byte-pairs from `src` (rows `src_stride` apart),
// de-interleaving on load, and writes the transposed U plane to `dst_a`
// (rows `dst_stride_a` apart) and the transposed V plane to `dst_b`
// (rows `dst_stride_b` apart).  Bulk work is done in 8x8 tiles with
// 4/2/1-column residual tails.
// NOTE(review): r9 is clobbered as a scratch row pointer; confirm the
// target ABI allows it (reserved on some platforms, e.g. older iOS).
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // Loops are on blocks of 8 columns.  The loop stops when the
    // counter gets to or below 0; starting the counter at width-8
    // allows the last full 8x8 block to be handled inside the loop.
    "sub         %6, #8                        \n"

    // Handle 8x8 blocks.  This should be the majority of the plane.
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"  // r9 walks down the src rows

      // VLD2 de-interleaves each row: even-index bytes (U) land in the
      // first register of each pair, odd-index bytes (V) in the second.
      "vld2.8      {d0,  d1},  [r9], %1        \n"
      "vld2.8      {d2,  d3},  [r9], %1        \n"
      "vld2.8      {d4,  d5},  [r9], %1        \n"
      "vld2.8      {d6,  d7},  [r9], %1        \n"
      "vld2.8      {d16, d17}, [r9], %1        \n"
      "vld2.8      {d18, d19}, [r9], %1        \n"
      "vld2.8      {d20, d21}, [r9], %1        \n"
      "vld2.8      {d22, d23}, [r9]            \n"

      // In-register transpose of both planes at once: three VTRN passes
      // at 8-, 16- and 32-bit granularity (with deliberately reversed
      // operand pairing, e.g. q1,q0)...
      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      // ...then VREV16 reverses the bytes within each 16-bit element to
      // undo the ordering left by the reversed register pairing above.
      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

      "mov         r9, %2                      \n"  // r9 walks down the dst_a rows

      // Store the transposed U plane (the even d registers).
      "vst1.8      {d2},  [r9], %3             \n"
      "vst1.8      {d0},  [r9], %3             \n"
      "vst1.8      {d6},  [r9], %3             \n"
      "vst1.8      {d4},  [r9], %3             \n"
      "vst1.8      {d18}, [r9], %3             \n"
      "vst1.8      {d16}, [r9], %3             \n"
      "vst1.8      {d22}, [r9], %3             \n"
      "vst1.8      {d20}, [r9]                 \n"

      "mov         r9, %4                      \n"  // r9 walks down the dst_b rows

      // Store the transposed V plane (the odd d registers).
      "vst1.8      {d3},  [r9], %5             \n"
      "vst1.8      {d1},  [r9], %5             \n"
      "vst1.8      {d7},  [r9], %5             \n"
      "vst1.8      {d5},  [r9], %5             \n"
      "vst1.8      {d19}, [r9], %5             \n"
      "vst1.8      {d17}, [r9], %5             \n"
      "vst1.8      {d23}, [r9], %5             \n"
      "vst1.8      {d21}, [r9]                 \n"

      "add         %0, #8*2                    \n"  // src   += 8*2 (8 UV pairs)
      "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %6,  #8                     \n"  // w     -= 8
      "bge         1b                          \n"

    // Add 8 back to the counter.  If the result is 0 there are
    // no residuals.
    "adds        %6, #8                        \n"
    "beq         4f                            \n"

    // Some residual, so between 1 and 7 columns left to transpose.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    "cmp         %6, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block: load eight full UV rows, separate U/V with VTRN, then
    // transpose via the VTBL shuffle table kVTbl4x4TransposeDi.
    "mov         r9, %0                        \n"
    "vld1.64     {d0}, [r9], %1                \n"
    "vld1.64     {d1}, [r9], %1                \n"
    "vld1.64     {d2}, [r9], %1                \n"
    "vld1.64     {d3}, [r9], %1                \n"
    "vld1.64     {d4}, [r9], %1                \n"
    "vld1.64     {d5}, [r9], %1                \n"
    "vld1.64     {d6}, [r9], %1                \n"
    "vld1.64     {d7}, [r9]                    \n"

    "vld1.8      {q15}, [%7]                   \n"  // q15 (d30,d31) = shuffle table

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         r9, %2                        \n"

    // First 4-byte column of dst_a.
    "vst1.32     {d16[0]},  [r9], %3           \n"
    "vst1.32     {d16[1]},  [r9], %3           \n"
    "vst1.32     {d17[0]},  [r9], %3           \n"
    "vst1.32     {d17[1]},  [r9], %3           \n"

    "add         r9, %2, #4                    \n"  // second 4-byte column of dst_a
    "vst1.32     {d20[0]}, [r9], %3            \n"
    "vst1.32     {d20[1]}, [r9], %3            \n"
    "vst1.32     {d21[0]}, [r9], %3            \n"
    "vst1.32     {d21[1]}, [r9]                \n"

    "mov         r9, %4                        \n"

    // First 4-byte column of dst_b.
    "vst1.32     {d18[0]}, [r9], %5            \n"
    "vst1.32     {d18[1]}, [r9], %5            \n"
    "vst1.32     {d19[0]}, [r9], %5            \n"
    "vst1.32     {d19[1]}, [r9], %5            \n"

    "add         r9, %4, #4                    \n"  // second 4-byte column of dst_b
    "vst1.32     {d22[0]},  [r9], %5           \n"
    "vst1.32     {d22[1]},  [r9], %5           \n"
    "vst1.32     {d23[0]},  [r9], %5           \n"
    "vst1.32     {d23[1]},  [r9]               \n"

    "add         %0, #4*2                      \n"  // src   += 4 * 2 (4 UV pairs)
    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %6,  #4                       \n"  // w     -= 4
    "beq         4f                            \n"

    // Some residual; check to see if it includes a 2x8 block,
    // or less.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block: VLD2.16 splits each row's two UV pairs across the
    // U registers (d0/d1) and V registers (d2/d3); VTRN finishes the
    // per-plane transpose.
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
    "vld2.16     {d1[3], d3[3]}, [r9]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d2}, [r9]                    \n"

    "mov         r9, %4                        \n"

    "vst1.64     {d1}, [r9], %5                \n"
    "vst1.64     {d3}, [r9]                    \n"

    "add         %0, #2*2                      \n"  // src   += 2 * 2 (2 UV pairs)
    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %6,  #2                       \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block: gather one UV pair from each of the 8 rows, splitting
    // U into d0 and V into d1, then store one row to each plane.
    "3:                                        \n"
    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
    "vld2.8      {d0[7], d1[7]}, [%0]          \n"

    "vst1.64     {d0}, [%2]                    \n"
    "vst1.64     {d1}, [%4]                    \n"

    "4:                                        \n"

    : "+r"(src),                 // %0
      "+r"(src_stride),          // %1
      "+r"(dst_a),               // %2
      "+r"(dst_stride_a),        // %3
      "+r"(dst_b),               // %4
      "+r"(dst_stride_b),        // %5
      "+r"(width)                // %6
    : "r"(&kVTbl4x4TransposeDi)  // %7
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
   400 #endif
   402 #ifdef __cplusplus
   403 }  // extern "C"
   404 }  // namespace libyuv
   405 #endif

mercurial