media/libyuv/source/rotate.cc

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/rotate.h"
    13 #include "libyuv/cpu_id.h"
    14 #include "libyuv/convert.h"
    15 #include "libyuv/planar_functions.h"
    16 #include "libyuv/row.h"
    18 #ifdef __cplusplus
    19 namespace libyuv {
    20 extern "C" {
    21 #endif
    23 #if !defined(LIBYUV_DISABLE_X86) && \
    24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
    25 #if defined(__APPLE__) && defined(__i386__)
    26 #define DECLARE_FUNCTION(name)                                                 \
    27     ".text                                     \n"                             \
    28     ".private_extern _" #name "                \n"                             \
    29     ".align 4,0x90                             \n"                             \
    30 "_" #name ":                                   \n"
    31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
    32 #define DECLARE_FUNCTION(name)                                                 \
    33     ".text                                     \n"                             \
    34     ".align 4,0x90                             \n"                             \
    35 "_" #name ":                                   \n"
    36 #else
    37 #define DECLARE_FUNCTION(name)                                                 \
    38     ".text                                     \n"                             \
    39     ".align 4,0x90                             \n"                             \
    40 #name ":                                       \n"
    41 #endif
    42 #endif
    44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
    45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
    46 #define HAS_MIRRORROW_NEON
    47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
    48 #define HAS_MIRRORROW_UV_NEON
    49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
    50 #define HAS_TRANSPOSE_WX8_NEON
    51 void TransposeWx8_NEON(const uint8* src, int src_stride,
    52                        uint8* dst, int dst_stride, int width);
    53 #define HAS_TRANSPOSE_UVWX8_NEON
    54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    55                          uint8* dst_a, int dst_stride_a,
    56                          uint8* dst_b, int dst_stride_b,
    57                          int width);
    58 #endif  // defined(__ARM_NEON__)
    60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
    61     defined(__mips__) && \
    62     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
    64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    65                              uint8* dst, int dst_stride, int width);
    67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
    68                                   uint8* dst, int dst_stride, int width);
    69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
    70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    71                                uint8* dst_a, int dst_stride_a,
    72                                uint8* dst_b, int dst_stride_b,
    73                                int width);
    74 #endif  // defined(__mips__)
    76 #if !defined(LIBYUV_DISABLE_X86) && \
    77     defined(_M_IX86) && defined(_MSC_VER)
    78 #define HAS_TRANSPOSE_WX8_SSSE3
    79 __declspec(naked) __declspec(align(16))
    80 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    81                                uint8* dst, int dst_stride, int width) {
    82   __asm {
    83     push      edi
    84     push      esi
    85     push      ebp
    86     mov       eax, [esp + 12 + 4]   // src
    87     mov       edi, [esp + 12 + 8]   // src_stride
    88     mov       edx, [esp + 12 + 12]  // dst
    89     mov       esi, [esp + 12 + 16]  // dst_stride
    90     mov       ecx, [esp + 12 + 20]  // width
    92     // Read in the data from the source pointer.
    93     // First round of bit swap.
    94     align      4
    95  convertloop:
    96     movq      xmm0, qword ptr [eax]
    97     lea       ebp, [eax + 8]
    98     movq      xmm1, qword ptr [eax + edi]
    99     lea       eax, [eax + 2 * edi]
   100     punpcklbw xmm0, xmm1
   101     movq      xmm2, qword ptr [eax]
   102     movdqa    xmm1, xmm0
   103     palignr   xmm1, xmm1, 8
   104     movq      xmm3, qword ptr [eax + edi]
   105     lea       eax, [eax + 2 * edi]
   106     punpcklbw xmm2, xmm3
   107     movdqa    xmm3, xmm2
   108     movq      xmm4, qword ptr [eax]
   109     palignr   xmm3, xmm3, 8
   110     movq      xmm5, qword ptr [eax + edi]
   111     punpcklbw xmm4, xmm5
   112     lea       eax, [eax + 2 * edi]
   113     movdqa    xmm5, xmm4
   114     movq      xmm6, qword ptr [eax]
   115     palignr   xmm5, xmm5, 8
   116     movq      xmm7, qword ptr [eax + edi]
   117     punpcklbw xmm6, xmm7
   118     mov       eax, ebp
   119     movdqa    xmm7, xmm6
   120     palignr   xmm7, xmm7, 8
   121     // Second round of bit swap.
   122     punpcklwd xmm0, xmm2
   123     punpcklwd xmm1, xmm3
   124     movdqa    xmm2, xmm0
   125     movdqa    xmm3, xmm1
   126     palignr   xmm2, xmm2, 8
   127     palignr   xmm3, xmm3, 8
   128     punpcklwd xmm4, xmm6
   129     punpcklwd xmm5, xmm7
   130     movdqa    xmm6, xmm4
   131     movdqa    xmm7, xmm5
   132     palignr   xmm6, xmm6, 8
   133     palignr   xmm7, xmm7, 8
   134     // Third round of bit swap.
   135     // Write to the destination pointer.
   136     punpckldq xmm0, xmm4
   137     movq      qword ptr [edx], xmm0
   138     movdqa    xmm4, xmm0
   139     palignr   xmm4, xmm4, 8
   140     movq      qword ptr [edx + esi], xmm4
   141     lea       edx, [edx + 2 * esi]
   142     punpckldq xmm2, xmm6
   143     movdqa    xmm6, xmm2
   144     palignr   xmm6, xmm6, 8
   145     movq      qword ptr [edx], xmm2
   146     punpckldq xmm1, xmm5
   147     movq      qword ptr [edx + esi], xmm6
   148     lea       edx, [edx + 2 * esi]
   149     movdqa    xmm5, xmm1
   150     movq      qword ptr [edx], xmm1
   151     palignr   xmm5, xmm5, 8
   152     punpckldq xmm3, xmm7
   153     movq      qword ptr [edx + esi], xmm5
   154     lea       edx, [edx + 2 * esi]
   155     movq      qword ptr [edx], xmm3
   156     movdqa    xmm7, xmm3
   157     palignr   xmm7, xmm7, 8
   158     sub       ecx, 8
   159     movq      qword ptr [edx + esi], xmm7
   160     lea       edx, [edx + 2 * esi]
   161     jg        convertloop
   163     pop       ebp
   164     pop       esi
   165     pop       edi
   166     ret
   167   }
   168 }
   170 #define HAS_TRANSPOSE_UVWX8_SSE2
   171 __declspec(naked) __declspec(align(16))
   172 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   173                                 uint8* dst_a, int dst_stride_a,
   174                                 uint8* dst_b, int dst_stride_b,
   175                                 int w) {
   176   __asm {
   177     push      ebx
   178     push      esi
   179     push      edi
   180     push      ebp
   181     mov       eax, [esp + 16 + 4]   // src
   182     mov       edi, [esp + 16 + 8]   // src_stride
   183     mov       edx, [esp + 16 + 12]  // dst_a
   184     mov       esi, [esp + 16 + 16]  // dst_stride_a
   185     mov       ebx, [esp + 16 + 20]  // dst_b
   186     mov       ebp, [esp + 16 + 24]  // dst_stride_b
   187     mov       ecx, esp
   188     sub       esp, 4 + 16
   189     and       esp, ~15
   190     mov       [esp + 16], ecx
   191     mov       ecx, [ecx + 16 + 28]  // w
   193     align      4
   194  convertloop:
   195     // Read in the data from the source pointer.
   196     // First round of bit swap.
   197     movdqa    xmm0, [eax]
   198     movdqa    xmm1, [eax + edi]
   199     lea       eax, [eax + 2 * edi]
   200     movdqa    xmm7, xmm0  // use xmm7 as temp register.
   201     punpcklbw xmm0, xmm1
   202     punpckhbw xmm7, xmm1
   203     movdqa    xmm1, xmm7
   204     movdqa    xmm2, [eax]
   205     movdqa    xmm3, [eax + edi]
   206     lea       eax, [eax + 2 * edi]
   207     movdqa    xmm7, xmm2
   208     punpcklbw xmm2, xmm3
   209     punpckhbw xmm7, xmm3
   210     movdqa    xmm3, xmm7
   211     movdqa    xmm4, [eax]
   212     movdqa    xmm5, [eax + edi]
   213     lea       eax, [eax + 2 * edi]
   214     movdqa    xmm7, xmm4
   215     punpcklbw xmm4, xmm5
   216     punpckhbw xmm7, xmm5
   217     movdqa    xmm5, xmm7
   218     movdqa    xmm6, [eax]
   219     movdqa    xmm7, [eax + edi]
   220     lea       eax, [eax + 2 * edi]
   221     movdqa    [esp], xmm5  // backup xmm5
   222     neg       edi
   223     movdqa    xmm5, xmm6   // use xmm5 as temp register.
   224     punpcklbw xmm6, xmm7
   225     punpckhbw xmm5, xmm7
   226     movdqa    xmm7, xmm5
   227     lea       eax, [eax + 8 * edi + 16]
   228     neg       edi
   229     // Second round of bit swap.
   230     movdqa    xmm5, xmm0
   231     punpcklwd xmm0, xmm2
   232     punpckhwd xmm5, xmm2
   233     movdqa    xmm2, xmm5
   234     movdqa    xmm5, xmm1
   235     punpcklwd xmm1, xmm3
   236     punpckhwd xmm5, xmm3
   237     movdqa    xmm3, xmm5
   238     movdqa    xmm5, xmm4
   239     punpcklwd xmm4, xmm6
   240     punpckhwd xmm5, xmm6
   241     movdqa    xmm6, xmm5
   242     movdqa    xmm5, [esp]  // restore xmm5
   243     movdqa    [esp], xmm6  // backup xmm6
   244     movdqa    xmm6, xmm5    // use xmm6 as temp register.
   245     punpcklwd xmm5, xmm7
   246     punpckhwd xmm6, xmm7
   247     movdqa    xmm7, xmm6
   248     // Third round of bit swap.
   249     // Write to the destination pointer.
   250     movdqa    xmm6, xmm0
   251     punpckldq xmm0, xmm4
   252     punpckhdq xmm6, xmm4
   253     movdqa    xmm4, xmm6
   254     movdqa    xmm6, [esp]  // restore xmm6
   255     movlpd    qword ptr [edx], xmm0
   256     movhpd    qword ptr [ebx], xmm0
   257     movlpd    qword ptr [edx + esi], xmm4
   258     lea       edx, [edx + 2 * esi]
   259     movhpd    qword ptr [ebx + ebp], xmm4
   260     lea       ebx, [ebx + 2 * ebp]
   261     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
   262     punpckldq xmm2, xmm6
   263     movlpd    qword ptr [edx], xmm2
   264     movhpd    qword ptr [ebx], xmm2
   265     punpckhdq xmm0, xmm6
   266     movlpd    qword ptr [edx + esi], xmm0
   267     lea       edx, [edx + 2 * esi]
   268     movhpd    qword ptr [ebx + ebp], xmm0
   269     lea       ebx, [ebx + 2 * ebp]
   270     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
   271     punpckldq xmm1, xmm5
   272     movlpd    qword ptr [edx], xmm1
   273     movhpd    qword ptr [ebx], xmm1
   274     punpckhdq xmm0, xmm5
   275     movlpd    qword ptr [edx + esi], xmm0
   276     lea       edx, [edx + 2 * esi]
   277     movhpd    qword ptr [ebx + ebp], xmm0
   278     lea       ebx, [ebx + 2 * ebp]
   279     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
   280     punpckldq xmm3, xmm7
   281     movlpd    qword ptr [edx], xmm3
   282     movhpd    qword ptr [ebx], xmm3
   283     punpckhdq xmm0, xmm7
   284     sub       ecx, 8
   285     movlpd    qword ptr [edx + esi], xmm0
   286     lea       edx, [edx + 2 * esi]
   287     movhpd    qword ptr [ebx + ebp], xmm0
   288     lea       ebx, [ebx + 2 * ebp]
   289     jg        convertloop
   291     mov       esp, [esp + 16]
   292     pop       ebp
   293     pop       edi
   294     pop       esi
   295     pop       ebx
   296     ret
   297   }
   298 }
   299 #elif !defined(LIBYUV_DISABLE_X86) && \
   300     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
   301 #define HAS_TRANSPOSE_WX8_SSSE3
   302 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   303                                uint8* dst, int dst_stride, int width) {
   304   asm volatile (
   305     // Read in the data from the source pointer.
   306     // First round of bit swap.
   307     ".p2align  2                                 \n"
   308   "1:                                            \n"
   309     "movq       (%0),%%xmm0                      \n"
   310     "movq       (%0,%3),%%xmm1                   \n"
   311     "lea        (%0,%3,2),%0                     \n"
   312     "punpcklbw  %%xmm1,%%xmm0                    \n"
   313     "movq       (%0),%%xmm2                      \n"
   314     "movdqa     %%xmm0,%%xmm1                    \n"
   315     "palignr    $0x8,%%xmm1,%%xmm1               \n"
   316     "movq       (%0,%3),%%xmm3                   \n"
   317     "lea        (%0,%3,2),%0                     \n"
   318     "punpcklbw  %%xmm3,%%xmm2                    \n"
   319     "movdqa     %%xmm2,%%xmm3                    \n"
   320     "movq       (%0),%%xmm4                      \n"
   321     "palignr    $0x8,%%xmm3,%%xmm3               \n"
   322     "movq       (%0,%3),%%xmm5                   \n"
   323     "lea        (%0,%3,2),%0                     \n"
   324     "punpcklbw  %%xmm5,%%xmm4                    \n"
   325     "movdqa     %%xmm4,%%xmm5                    \n"
   326     "movq       (%0),%%xmm6                      \n"
   327     "palignr    $0x8,%%xmm5,%%xmm5               \n"
   328     "movq       (%0,%3),%%xmm7                   \n"
   329     "lea        (%0,%3,2),%0                     \n"
   330     "punpcklbw  %%xmm7,%%xmm6                    \n"
   331     "neg        %3                               \n"
   332     "movdqa     %%xmm6,%%xmm7                    \n"
   333     "lea        0x8(%0,%3,8),%0                  \n"
   334     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   335     "neg        %3                               \n"
   336      // Second round of bit swap.
   337     "punpcklwd  %%xmm2,%%xmm0                    \n"
   338     "punpcklwd  %%xmm3,%%xmm1                    \n"
   339     "movdqa     %%xmm0,%%xmm2                    \n"
   340     "movdqa     %%xmm1,%%xmm3                    \n"
   341     "palignr    $0x8,%%xmm2,%%xmm2               \n"
   342     "palignr    $0x8,%%xmm3,%%xmm3               \n"
   343     "punpcklwd  %%xmm6,%%xmm4                    \n"
   344     "punpcklwd  %%xmm7,%%xmm5                    \n"
   345     "movdqa     %%xmm4,%%xmm6                    \n"
   346     "movdqa     %%xmm5,%%xmm7                    \n"
   347     "palignr    $0x8,%%xmm6,%%xmm6               \n"
   348     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   349     // Third round of bit swap.
   350     // Write to the destination pointer.
   351     "punpckldq  %%xmm4,%%xmm0                    \n"
   352     "movq       %%xmm0,(%1)                      \n"
   353     "movdqa     %%xmm0,%%xmm4                    \n"
   354     "palignr    $0x8,%%xmm4,%%xmm4               \n"
   355     "movq       %%xmm4,(%1,%4)                   \n"
   356     "lea        (%1,%4,2),%1                     \n"
   357     "punpckldq  %%xmm6,%%xmm2                    \n"
   358     "movdqa     %%xmm2,%%xmm6                    \n"
   359     "movq       %%xmm2,(%1)                      \n"
   360     "palignr    $0x8,%%xmm6,%%xmm6               \n"
   361     "punpckldq  %%xmm5,%%xmm1                    \n"
   362     "movq       %%xmm6,(%1,%4)                   \n"
   363     "lea        (%1,%4,2),%1                     \n"
   364     "movdqa     %%xmm1,%%xmm5                    \n"
   365     "movq       %%xmm1,(%1)                      \n"
   366     "palignr    $0x8,%%xmm5,%%xmm5               \n"
   367     "movq       %%xmm5,(%1,%4)                   \n"
   368     "lea        (%1,%4,2),%1                     \n"
   369     "punpckldq  %%xmm7,%%xmm3                    \n"
   370     "movq       %%xmm3,(%1)                      \n"
   371     "movdqa     %%xmm3,%%xmm7                    \n"
   372     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   373     "sub        $0x8,%2                          \n"
   374     "movq       %%xmm7,(%1,%4)                   \n"
   375     "lea        (%1,%4,2),%1                     \n"
   376     "jg         1b                               \n"
   377     : "+r"(src),    // %0
   378       "+r"(dst),    // %1
   379       "+r"(width)   // %2
   380     : "r"((intptr_t)(src_stride)),  // %3
   381       "r"((intptr_t)(dst_stride))   // %4
   382     : "memory", "cc"
   383   #if defined(__SSE2__)
   384       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   385   #endif
   386   );
   387 }
   389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
   390 #define HAS_TRANSPOSE_UVWX8_SSE2
   391 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   392                                     uint8* dst_a, int dst_stride_a,
   393                                     uint8* dst_b, int dst_stride_b,
   394                                     int w);
   395   asm (
   396     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
   397     "push   %ebx                               \n"
   398     "push   %esi                               \n"
   399     "push   %edi                               \n"
   400     "push   %ebp                               \n"
   401     "mov    0x14(%esp),%eax                    \n"
   402     "mov    0x18(%esp),%edi                    \n"
   403     "mov    0x1c(%esp),%edx                    \n"
   404     "mov    0x20(%esp),%esi                    \n"
   405     "mov    0x24(%esp),%ebx                    \n"
   406     "mov    0x28(%esp),%ebp                    \n"
   407     "mov    %esp,%ecx                          \n"
   408     "sub    $0x14,%esp                         \n"
   409     "and    $0xfffffff0,%esp                   \n"
   410     "mov    %ecx,0x10(%esp)                    \n"
   411     "mov    0x2c(%ecx),%ecx                    \n"
   413 "1:                                            \n"
   414     "movdqa (%eax),%xmm0                       \n"
   415     "movdqa (%eax,%edi,1),%xmm1                \n"
   416     "lea    (%eax,%edi,2),%eax                 \n"
   417     "movdqa %xmm0,%xmm7                        \n"
   418     "punpcklbw %xmm1,%xmm0                     \n"
   419     "punpckhbw %xmm1,%xmm7                     \n"
   420     "movdqa %xmm7,%xmm1                        \n"
   421     "movdqa (%eax),%xmm2                       \n"
   422     "movdqa (%eax,%edi,1),%xmm3                \n"
   423     "lea    (%eax,%edi,2),%eax                 \n"
   424     "movdqa %xmm2,%xmm7                        \n"
   425     "punpcklbw %xmm3,%xmm2                     \n"
   426     "punpckhbw %xmm3,%xmm7                     \n"
   427     "movdqa %xmm7,%xmm3                        \n"
   428     "movdqa (%eax),%xmm4                       \n"
   429     "movdqa (%eax,%edi,1),%xmm5                \n"
   430     "lea    (%eax,%edi,2),%eax                 \n"
   431     "movdqa %xmm4,%xmm7                        \n"
   432     "punpcklbw %xmm5,%xmm4                     \n"
   433     "punpckhbw %xmm5,%xmm7                     \n"
   434     "movdqa %xmm7,%xmm5                        \n"
   435     "movdqa (%eax),%xmm6                       \n"
   436     "movdqa (%eax,%edi,1),%xmm7                \n"
   437     "lea    (%eax,%edi,2),%eax                 \n"
   438     "movdqa %xmm5,(%esp)                       \n"
   439     "neg    %edi                               \n"
   440     "movdqa %xmm6,%xmm5                        \n"
   441     "punpcklbw %xmm7,%xmm6                     \n"
   442     "punpckhbw %xmm7,%xmm5                     \n"
   443     "movdqa %xmm5,%xmm7                        \n"
   444     "lea    0x10(%eax,%edi,8),%eax             \n"
   445     "neg    %edi                               \n"
   446     "movdqa %xmm0,%xmm5                        \n"
   447     "punpcklwd %xmm2,%xmm0                     \n"
   448     "punpckhwd %xmm2,%xmm5                     \n"
   449     "movdqa %xmm5,%xmm2                        \n"
   450     "movdqa %xmm1,%xmm5                        \n"
   451     "punpcklwd %xmm3,%xmm1                     \n"
   452     "punpckhwd %xmm3,%xmm5                     \n"
   453     "movdqa %xmm5,%xmm3                        \n"
   454     "movdqa %xmm4,%xmm5                        \n"
   455     "punpcklwd %xmm6,%xmm4                     \n"
   456     "punpckhwd %xmm6,%xmm5                     \n"
   457     "movdqa %xmm5,%xmm6                        \n"
   458     "movdqa (%esp),%xmm5                       \n"
   459     "movdqa %xmm6,(%esp)                       \n"
   460     "movdqa %xmm5,%xmm6                        \n"
   461     "punpcklwd %xmm7,%xmm5                     \n"
   462     "punpckhwd %xmm7,%xmm6                     \n"
   463     "movdqa %xmm6,%xmm7                        \n"
   464     "movdqa %xmm0,%xmm6                        \n"
   465     "punpckldq %xmm4,%xmm0                     \n"
   466     "punpckhdq %xmm4,%xmm6                     \n"
   467     "movdqa %xmm6,%xmm4                        \n"
   468     "movdqa (%esp),%xmm6                       \n"
   469     "movlpd %xmm0,(%edx)                       \n"
   470     "movhpd %xmm0,(%ebx)                       \n"
   471     "movlpd %xmm4,(%edx,%esi,1)                \n"
   472     "lea    (%edx,%esi,2),%edx                 \n"
   473     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
   474     "lea    (%ebx,%ebp,2),%ebx                 \n"
   475     "movdqa %xmm2,%xmm0                        \n"
   476     "punpckldq %xmm6,%xmm2                     \n"
   477     "movlpd %xmm2,(%edx)                       \n"
   478     "movhpd %xmm2,(%ebx)                       \n"
   479     "punpckhdq %xmm6,%xmm0                     \n"
   480     "movlpd %xmm0,(%edx,%esi,1)                \n"
   481     "lea    (%edx,%esi,2),%edx                 \n"
   482     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   483     "lea    (%ebx,%ebp,2),%ebx                 \n"
   484     "movdqa %xmm1,%xmm0                        \n"
   485     "punpckldq %xmm5,%xmm1                     \n"
   486     "movlpd %xmm1,(%edx)                       \n"
   487     "movhpd %xmm1,(%ebx)                       \n"
   488     "punpckhdq %xmm5,%xmm0                     \n"
   489     "movlpd %xmm0,(%edx,%esi,1)                \n"
   490     "lea    (%edx,%esi,2),%edx                 \n"
   491     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   492     "lea    (%ebx,%ebp,2),%ebx                 \n"
   493     "movdqa %xmm3,%xmm0                        \n"
   494     "punpckldq %xmm7,%xmm3                     \n"
   495     "movlpd %xmm3,(%edx)                       \n"
   496     "movhpd %xmm3,(%ebx)                       \n"
   497     "punpckhdq %xmm7,%xmm0                     \n"
   498     "sub    $0x8,%ecx                          \n"
   499     "movlpd %xmm0,(%edx,%esi,1)                \n"
   500     "lea    (%edx,%esi,2),%edx                 \n"
   501     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   502     "lea    (%ebx,%ebp,2),%ebx                 \n"
   503     "jg     1b                                 \n"
   504     "mov    0x10(%esp),%esp                    \n"
   505     "pop    %ebp                               \n"
   506     "pop    %edi                               \n"
   507     "pop    %esi                               \n"
   508     "pop    %ebx                               \n"
   509 #if defined(__native_client__)
   510     "pop    %ecx                               \n"
   511     "and    $0xffffffe0,%ecx                   \n"
   512     "jmp    *%ecx                              \n"
   513 #else
   514     "ret                                       \n"
   515 #endif
   516 );
   517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
   518     defined(__x86_64__)
   519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
   520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
   521 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   522                                     uint8* dst, int dst_stride, int width) {
   523   asm volatile (
   524   // Read in the data from the source pointer.
   525   // First round of bit swap.
   526   ".p2align  2                                 \n"
   527 "1:                                            \n"
   528   "movdqa     (%0),%%xmm0                      \n"
   529   "movdqa     (%0,%3),%%xmm1                   \n"
   530   "lea        (%0,%3,2),%0                     \n"
   531   "movdqa     %%xmm0,%%xmm8                    \n"
   532   "punpcklbw  %%xmm1,%%xmm0                    \n"
   533   "punpckhbw  %%xmm1,%%xmm8                    \n"
   534   "movdqa     (%0),%%xmm2                      \n"
   535   "movdqa     %%xmm0,%%xmm1                    \n"
   536   "movdqa     %%xmm8,%%xmm9                    \n"
   537   "palignr    $0x8,%%xmm1,%%xmm1               \n"
   538   "palignr    $0x8,%%xmm9,%%xmm9               \n"
   539   "movdqa     (%0,%3),%%xmm3                   \n"
   540   "lea        (%0,%3,2),%0                     \n"
   541   "movdqa     %%xmm2,%%xmm10                   \n"
   542   "punpcklbw  %%xmm3,%%xmm2                    \n"
   543   "punpckhbw  %%xmm3,%%xmm10                   \n"
   544   "movdqa     %%xmm2,%%xmm3                    \n"
   545   "movdqa     %%xmm10,%%xmm11                  \n"
   546   "movdqa     (%0),%%xmm4                      \n"
   547   "palignr    $0x8,%%xmm3,%%xmm3               \n"
   548   "palignr    $0x8,%%xmm11,%%xmm11             \n"
   549   "movdqa     (%0,%3),%%xmm5                   \n"
   550   "lea        (%0,%3,2),%0                     \n"
   551   "movdqa     %%xmm4,%%xmm12                   \n"
   552   "punpcklbw  %%xmm5,%%xmm4                    \n"
   553   "punpckhbw  %%xmm5,%%xmm12                   \n"
   554   "movdqa     %%xmm4,%%xmm5                    \n"
   555   "movdqa     %%xmm12,%%xmm13                  \n"
   556   "movdqa     (%0),%%xmm6                      \n"
   557   "palignr    $0x8,%%xmm5,%%xmm5               \n"
   558   "palignr    $0x8,%%xmm13,%%xmm13             \n"
   559   "movdqa     (%0,%3),%%xmm7                   \n"
   560   "lea        (%0,%3,2),%0                     \n"
   561   "movdqa     %%xmm6,%%xmm14                   \n"
   562   "punpcklbw  %%xmm7,%%xmm6                    \n"
   563   "punpckhbw  %%xmm7,%%xmm14                   \n"
   564   "neg        %3                               \n"
   565   "movdqa     %%xmm6,%%xmm7                    \n"
   566   "movdqa     %%xmm14,%%xmm15                  \n"
   567   "lea        0x10(%0,%3,8),%0                 \n"
   568   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   569   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   570   "neg        %3                               \n"
   571    // Second round of bit swap.
   572   "punpcklwd  %%xmm2,%%xmm0                    \n"
   573   "punpcklwd  %%xmm3,%%xmm1                    \n"
   574   "movdqa     %%xmm0,%%xmm2                    \n"
   575   "movdqa     %%xmm1,%%xmm3                    \n"
   576   "palignr    $0x8,%%xmm2,%%xmm2               \n"
   577   "palignr    $0x8,%%xmm3,%%xmm3               \n"
   578   "punpcklwd  %%xmm6,%%xmm4                    \n"
   579   "punpcklwd  %%xmm7,%%xmm5                    \n"
   580   "movdqa     %%xmm4,%%xmm6                    \n"
   581   "movdqa     %%xmm5,%%xmm7                    \n"
   582   "palignr    $0x8,%%xmm6,%%xmm6               \n"
   583   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   584   "punpcklwd  %%xmm10,%%xmm8                   \n"
   585   "punpcklwd  %%xmm11,%%xmm9                   \n"
   586   "movdqa     %%xmm8,%%xmm10                   \n"
   587   "movdqa     %%xmm9,%%xmm11                   \n"
   588   "palignr    $0x8,%%xmm10,%%xmm10             \n"
   589   "palignr    $0x8,%%xmm11,%%xmm11             \n"
   590   "punpcklwd  %%xmm14,%%xmm12                  \n"
   591   "punpcklwd  %%xmm15,%%xmm13                  \n"
   592   "movdqa     %%xmm12,%%xmm14                  \n"
   593   "movdqa     %%xmm13,%%xmm15                  \n"
   594   "palignr    $0x8,%%xmm14,%%xmm14             \n"
   595   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   596   // Third round of bit swap.
   597   // Write to the destination pointer.
   598   "punpckldq  %%xmm4,%%xmm0                    \n"
   599   "movq       %%xmm0,(%1)                      \n"
   600   "movdqa     %%xmm0,%%xmm4                    \n"
   601   "palignr    $0x8,%%xmm4,%%xmm4               \n"
   602   "movq       %%xmm4,(%1,%4)                   \n"
   603   "lea        (%1,%4,2),%1                     \n"
   604   "punpckldq  %%xmm6,%%xmm2                    \n"
   605   "movdqa     %%xmm2,%%xmm6                    \n"
   606   "movq       %%xmm2,(%1)                      \n"
   607   "palignr    $0x8,%%xmm6,%%xmm6               \n"
   608   "punpckldq  %%xmm5,%%xmm1                    \n"
   609   "movq       %%xmm6,(%1,%4)                   \n"
   610   "lea        (%1,%4,2),%1                     \n"
   611   "movdqa     %%xmm1,%%xmm5                    \n"
   612   "movq       %%xmm1,(%1)                      \n"
   613   "palignr    $0x8,%%xmm5,%%xmm5               \n"
   614   "movq       %%xmm5,(%1,%4)                   \n"
   615   "lea        (%1,%4,2),%1                     \n"
   616   "punpckldq  %%xmm7,%%xmm3                    \n"
   617   "movq       %%xmm3,(%1)                      \n"
   618   "movdqa     %%xmm3,%%xmm7                    \n"
   619   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   620   "movq       %%xmm7,(%1,%4)                   \n"
   621   "lea        (%1,%4,2),%1                     \n"
   622   "punpckldq  %%xmm12,%%xmm8                   \n"
   623   "movq       %%xmm8,(%1)                      \n"
   624   "movdqa     %%xmm8,%%xmm12                   \n"
   625   "palignr    $0x8,%%xmm12,%%xmm12             \n"
   626   "movq       %%xmm12,(%1,%4)                  \n"
   627   "lea        (%1,%4,2),%1                     \n"
   628   "punpckldq  %%xmm14,%%xmm10                  \n"
   629   "movdqa     %%xmm10,%%xmm14                  \n"
   630   "movq       %%xmm10,(%1)                     \n"
   631   "palignr    $0x8,%%xmm14,%%xmm14             \n"
   632   "punpckldq  %%xmm13,%%xmm9                   \n"
   633   "movq       %%xmm14,(%1,%4)                  \n"
   634   "lea        (%1,%4,2),%1                     \n"
   635   "movdqa     %%xmm9,%%xmm13                   \n"
   636   "movq       %%xmm9,(%1)                      \n"
   637   "palignr    $0x8,%%xmm13,%%xmm13             \n"
   638   "movq       %%xmm13,(%1,%4)                  \n"
   639   "lea        (%1,%4,2),%1                     \n"
   640   "punpckldq  %%xmm15,%%xmm11                  \n"
   641   "movq       %%xmm11,(%1)                     \n"
   642   "movdqa     %%xmm11,%%xmm15                  \n"
   643   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   644   "sub        $0x10,%2                         \n"
   645   "movq       %%xmm15,(%1,%4)                  \n"
   646   "lea        (%1,%4,2),%1                     \n"
   647   "jg         1b                               \n"
   648   : "+r"(src),    // %0
   649     "+r"(dst),    // %1
   650     "+r"(width)   // %2
   651   : "r"((intptr_t)(src_stride)),  // %3
   652     "r"((intptr_t)(dst_stride))   // %4
   653   : "memory", "cc",
   654     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   655     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
   656 );
   657 }
   659 #define HAS_TRANSPOSE_UVWX8_SSE2
   660 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   661                                 uint8* dst_a, int dst_stride_a,
   662                                 uint8* dst_b, int dst_stride_b,
   663                                 int w) {
   664   asm volatile (
   665   // Read in the data from the source pointer.
   666   // First round of bit swap.
   667   ".p2align  2                                 \n"
   668 "1:                                            \n"
   669   "movdqa     (%0),%%xmm0                      \n"
   670   "movdqa     (%0,%4),%%xmm1                   \n"
   671   "lea        (%0,%4,2),%0                     \n"
   672   "movdqa     %%xmm0,%%xmm8                    \n"
   673   "punpcklbw  %%xmm1,%%xmm0                    \n"
   674   "punpckhbw  %%xmm1,%%xmm8                    \n"
   675   "movdqa     %%xmm8,%%xmm1                    \n"
   676   "movdqa     (%0),%%xmm2                      \n"
   677   "movdqa     (%0,%4),%%xmm3                   \n"
   678   "lea        (%0,%4,2),%0                     \n"
   679   "movdqa     %%xmm2,%%xmm8                    \n"
   680   "punpcklbw  %%xmm3,%%xmm2                    \n"
   681   "punpckhbw  %%xmm3,%%xmm8                    \n"
   682   "movdqa     %%xmm8,%%xmm3                    \n"
   683   "movdqa     (%0),%%xmm4                      \n"
   684   "movdqa     (%0,%4),%%xmm5                   \n"
   685   "lea        (%0,%4,2),%0                     \n"
   686   "movdqa     %%xmm4,%%xmm8                    \n"
   687   "punpcklbw  %%xmm5,%%xmm4                    \n"
   688   "punpckhbw  %%xmm5,%%xmm8                    \n"
   689   "movdqa     %%xmm8,%%xmm5                    \n"
   690   "movdqa     (%0),%%xmm6                      \n"
   691   "movdqa     (%0,%4),%%xmm7                   \n"
   692   "lea        (%0,%4,2),%0                     \n"
   693   "movdqa     %%xmm6,%%xmm8                    \n"
   694   "punpcklbw  %%xmm7,%%xmm6                    \n"
   695   "neg        %4                               \n"
   696   "lea        0x10(%0,%4,8),%0                 \n"
   697   "punpckhbw  %%xmm7,%%xmm8                    \n"
   698   "movdqa     %%xmm8,%%xmm7                    \n"
   699   "neg        %4                               \n"
   700    // Second round of bit swap.
   701   "movdqa     %%xmm0,%%xmm8                    \n"
   702   "movdqa     %%xmm1,%%xmm9                    \n"
   703   "punpckhwd  %%xmm2,%%xmm8                    \n"
   704   "punpckhwd  %%xmm3,%%xmm9                    \n"
   705   "punpcklwd  %%xmm2,%%xmm0                    \n"
   706   "punpcklwd  %%xmm3,%%xmm1                    \n"
   707   "movdqa     %%xmm8,%%xmm2                    \n"
   708   "movdqa     %%xmm9,%%xmm3                    \n"
   709   "movdqa     %%xmm4,%%xmm8                    \n"
   710   "movdqa     %%xmm5,%%xmm9                    \n"
   711   "punpckhwd  %%xmm6,%%xmm8                    \n"
   712   "punpckhwd  %%xmm7,%%xmm9                    \n"
   713   "punpcklwd  %%xmm6,%%xmm4                    \n"
   714   "punpcklwd  %%xmm7,%%xmm5                    \n"
   715   "movdqa     %%xmm8,%%xmm6                    \n"
   716   "movdqa     %%xmm9,%%xmm7                    \n"
   717   // Third round of bit swap.
   718   // Write to the destination pointer.
   719   "movdqa     %%xmm0,%%xmm8                    \n"
   720   "punpckldq  %%xmm4,%%xmm0                    \n"
   721   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
   722   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
   723   "punpckhdq  %%xmm4,%%xmm8                    \n"
   724   "movlpd     %%xmm8,(%1,%5)                   \n"
   725   "lea        (%1,%5,2),%1                     \n"
   726   "movhpd     %%xmm8,(%2,%6)                   \n"
   727   "lea        (%2,%6,2),%2                     \n"
   728   "movdqa     %%xmm2,%%xmm8                    \n"
   729   "punpckldq  %%xmm6,%%xmm2                    \n"
   730   "movlpd     %%xmm2,(%1)                      \n"
   731   "movhpd     %%xmm2,(%2)                      \n"
   732   "punpckhdq  %%xmm6,%%xmm8                    \n"
   733   "movlpd     %%xmm8,(%1,%5)                   \n"
   734   "lea        (%1,%5,2),%1                     \n"
   735   "movhpd     %%xmm8,(%2,%6)                   \n"
   736   "lea        (%2,%6,2),%2                     \n"
   737   "movdqa     %%xmm1,%%xmm8                    \n"
   738   "punpckldq  %%xmm5,%%xmm1                    \n"
   739   "movlpd     %%xmm1,(%1)                      \n"
   740   "movhpd     %%xmm1,(%2)                      \n"
   741   "punpckhdq  %%xmm5,%%xmm8                    \n"
   742   "movlpd     %%xmm8,(%1,%5)                   \n"
   743   "lea        (%1,%5,2),%1                     \n"
   744   "movhpd     %%xmm8,(%2,%6)                   \n"
   745   "lea        (%2,%6,2),%2                     \n"
   746   "movdqa     %%xmm3,%%xmm8                    \n"
   747   "punpckldq  %%xmm7,%%xmm3                    \n"
   748   "movlpd     %%xmm3,(%1)                      \n"
   749   "movhpd     %%xmm3,(%2)                      \n"
   750   "punpckhdq  %%xmm7,%%xmm8                    \n"
   751   "sub        $0x8,%3                          \n"
   752   "movlpd     %%xmm8,(%1,%5)                   \n"
   753   "lea        (%1,%5,2),%1                     \n"
   754   "movhpd     %%xmm8,(%2,%6)                   \n"
   755   "lea        (%2,%6,2),%2                     \n"
   756   "jg         1b                               \n"
   757   : "+r"(src),    // %0
   758     "+r"(dst_a),  // %1
   759     "+r"(dst_b),  // %2
   760     "+r"(w)   // %3
   761   : "r"((intptr_t)(src_stride)),    // %4
   762     "r"((intptr_t)(dst_stride_a)),  // %5
   763     "r"((intptr_t)(dst_stride_b))   // %6
   764   : "memory", "cc",
   765     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   766     "xmm8", "xmm9"
   767 );
   768 }
   769 #endif
   770 #endif
   772 static void TransposeWx8_C(const uint8* src, int src_stride,
   773                            uint8* dst, int dst_stride,
   774                            int width) {
   775   int i;
   776   for (i = 0; i < width; ++i) {
   777     dst[0] = src[0 * src_stride];
   778     dst[1] = src[1 * src_stride];
   779     dst[2] = src[2 * src_stride];
   780     dst[3] = src[3 * src_stride];
   781     dst[4] = src[4 * src_stride];
   782     dst[5] = src[5 * src_stride];
   783     dst[6] = src[6 * src_stride];
   784     dst[7] = src[7 * src_stride];
   785     ++src;
   786     dst += dst_stride;
   787   }
   788 }
   790 static void TransposeWxH_C(const uint8* src, int src_stride,
   791                            uint8* dst, int dst_stride,
   792                            int width, int height) {
   793   int i;
   794   for (i = 0; i < width; ++i) {
   795     int j;
   796     for (j = 0; j < height; ++j) {
   797       dst[i * dst_stride + j] = src[j * src_stride + i];
   798     }
   799   }
   800 }
   802 LIBYUV_API
   803 void TransposePlane(const uint8* src, int src_stride,
   804                     uint8* dst, int dst_stride,
   805                     int width, int height) {
   806   int i = height;
   807   void (*TransposeWx8)(const uint8* src, int src_stride,
   808                        uint8* dst, int dst_stride,
   809                        int width) = TransposeWx8_C;
   810 #if defined(HAS_TRANSPOSE_WX8_NEON)
   811   if (TestCpuFlag(kCpuHasNEON)) {
   812     TransposeWx8 = TransposeWx8_NEON;
   813   }
   814 #endif
   815 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
   816   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
   817     TransposeWx8 = TransposeWx8_SSSE3;
   818   }
   819 #endif
   820 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
   821   if (TestCpuFlag(kCpuHasSSSE3) &&
   822       IS_ALIGNED(width, 16) &&
   823       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
   824     TransposeWx8 = TransposeWx8_FAST_SSSE3;
   825   }
   826 #endif
   827 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
   828   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
   829     if (IS_ALIGNED(width, 4) &&
   830         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
   831       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
   832     } else {
   833       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
   834     }
   835   }
   836 #endif
   838   // Work across the source in 8x8 tiles
   839   while (i >= 8) {
   840     TransposeWx8(src, src_stride, dst, dst_stride, width);
   841     src += 8 * src_stride;    // Go down 8 rows.
   842     dst += 8;                 // Move over 8 columns.
   843     i -= 8;
   844   }
   846   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
   847 }
   849 LIBYUV_API
   850 void RotatePlane90(const uint8* src, int src_stride,
   851                    uint8* dst, int dst_stride,
   852                    int width, int height) {
   853   // Rotate by 90 is a transpose with the source read
   854   // from bottom to top. So set the source pointer to the end
   855   // of the buffer and flip the sign of the source stride.
   856   src += src_stride * (height - 1);
   857   src_stride = -src_stride;
   858   TransposePlane(src, src_stride, dst, dst_stride, width, height);
   859 }
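// Illustrative reference only (not a libyuv entry point): the same 90 degree
// rotation written as a plain scalar loop. Destination row i is source
// column i read from the bottom of the image upward, which is exactly what
// TransposePlane computes above once src points at the last row and the
// stride is negated.
static void RotatePlane90_Reference(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride,
                                    int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i) {      // Destination has 'width' rows...
    for (j = 0; j < height; ++j) {   // ...each 'height' pixels wide.
      dst[i * dst_stride + j] = src[(height - 1 - j) * src_stride + i];
    }
  }
}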
   861 LIBYUV_API
   862 void RotatePlane270(const uint8* src, int src_stride,
   863                     uint8* dst, int dst_stride,
   864                     int width, int height) {
   865   // Rotate by 270 is a transpose with the destination written
   866   // from bottom to top. So set the destination pointer to the end
   867   // of the buffer and flip the sign of the destination stride.
   868   dst += dst_stride * (width - 1);
   869   dst_stride = -dst_stride;
   870   TransposePlane(src, src_stride, dst, dst_stride, width, height);
   871 }
   873 LIBYUV_API
   874 void RotatePlane180(const uint8* src, int src_stride,
   875                     uint8* dst, int dst_stride,
   876                     int width, int height) {
   877   // Swap first and last row and mirror the content. Uses a temporary row.
   878   align_buffer_64(row, width);
   879   const uint8* src_bot = src + src_stride * (height - 1);
   880   uint8* dst_bot = dst + dst_stride * (height - 1);
   881   int half_height = (height + 1) >> 1;
   882   int y;
   883   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
   884   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
   885 #if defined(HAS_MIRRORROW_NEON)
   886   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
   887     MirrorRow = MirrorRow_NEON;
   888   }
   889 #endif
   890 #if defined(HAS_MIRRORROW_SSE2)
   891   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
   892       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   893       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   894     MirrorRow = MirrorRow_SSE2;
   895   }
   896 #endif
   897 #if defined(HAS_MIRRORROW_SSSE3)
   898   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
   899       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   900       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   901     MirrorRow = MirrorRow_SSSE3;
   902   }
   903 #endif
   904 #if defined(HAS_MIRRORROW_AVX2)
   905   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
   906     MirrorRow = MirrorRow_AVX2;
   907   }
   908 #endif
   909 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
   910   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
   911       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
   912       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
   913     MirrorRow = MirrorRow_MIPS_DSPR2;
   914   }
   915 #endif
   916 #if defined(HAS_COPYROW_NEON)
   917   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
   918     CopyRow = CopyRow_NEON;
   919   }
   920 #endif
   921 #if defined(HAS_COPYROW_X86)
   922   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
   923     CopyRow = CopyRow_X86;
   924   }
   925 #endif
   926 #if defined(HAS_COPYROW_SSE2)
   927   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
   928       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   929       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   930     CopyRow = CopyRow_SSE2;
   931   }
   932 #endif
   933 #if defined(HAS_COPYROW_ERMS)
   934   if (TestCpuFlag(kCpuHasERMS)) {
   935     CopyRow = CopyRow_ERMS;
   936   }
   937 #endif
   938 #if defined(HAS_COPYROW_MIPS)
   939   if (TestCpuFlag(kCpuHasMIPS)) {
   940     CopyRow = CopyRow_MIPS;
   941   }
   942 #endif
   944   // Odd height will harmlessly mirror the middle row twice.
   945   for (y = 0; y < half_height; ++y) {
   946     MirrorRow(src, row, width);  // Mirror first row into a buffer
   947     src += src_stride;
   948     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
   949     dst += dst_stride;
   950     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
   951     src_bot -= src_stride;
   952     dst_bot -= dst_stride;
   953   }
   954   free_aligned_buffer_64(row);
   955 }
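// Illustrative reference only: the net effect of the row-swapping loop above
// is a combined horizontal and vertical flip, i.e. destination row r is
// source row (height - 1 - r) mirrored left to right.
static void RotatePlane180_Reference(const uint8* src, int src_stride,
                                     uint8* dst, int dst_stride,
                                     int width, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      dst[(height - 1 - y) * dst_stride + (width - 1 - x)] =
          src[y * src_stride + x];
    }
  }
}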
   957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
   958                              uint8* dst_a, int dst_stride_a,
   959                              uint8* dst_b, int dst_stride_b,
   960                              int width) {
   961   int i;
   962   for (i = 0; i < width; ++i) {
   963     dst_a[0] = src[0 * src_stride + 0];
   964     dst_b[0] = src[0 * src_stride + 1];
   965     dst_a[1] = src[1 * src_stride + 0];
   966     dst_b[1] = src[1 * src_stride + 1];
   967     dst_a[2] = src[2 * src_stride + 0];
   968     dst_b[2] = src[2 * src_stride + 1];
   969     dst_a[3] = src[3 * src_stride + 0];
   970     dst_b[3] = src[3 * src_stride + 1];
   971     dst_a[4] = src[4 * src_stride + 0];
   972     dst_b[4] = src[4 * src_stride + 1];
   973     dst_a[5] = src[5 * src_stride + 0];
   974     dst_b[5] = src[5 * src_stride + 1];
   975     dst_a[6] = src[6 * src_stride + 0];
   976     dst_b[6] = src[6 * src_stride + 1];
   977     dst_a[7] = src[7 * src_stride + 0];
   978     dst_b[7] = src[7 * src_stride + 1];
   979     src += 2;
   980     dst_a += dst_stride_a;
   981     dst_b += dst_stride_b;
   982   }
   983 }
   985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
   986                              uint8* dst_a, int dst_stride_a,
   987                              uint8* dst_b, int dst_stride_b,
   988                              int width, int height) {
   989   int i;
   990   for (i = 0; i < width * 2; i += 2) {
   991     int j;
   992     for (j = 0; j < height; ++j) {
   993       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
   994       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
   995     }
   996   }
   997 }
   999 LIBYUV_API
  1000 void TransposeUV(const uint8* src, int src_stride,
  1001                  uint8* dst_a, int dst_stride_a,
  1002                  uint8* dst_b, int dst_stride_b,
  1003                  int width, int height) {
  1004   int i = height;
  1005   void (*TransposeUVWx8)(const uint8* src, int src_stride,
  1006                          uint8* dst_a, int dst_stride_a,
  1007                          uint8* dst_b, int dst_stride_b,
  1008                          int width) = TransposeUVWx8_C;
  1009 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
  1010   if (TestCpuFlag(kCpuHasNEON)) {
  1011     TransposeUVWx8 = TransposeUVWx8_NEON;
  1012   }
  1013 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  1014   if (TestCpuFlag(kCpuHasSSE2) &&
  1015       IS_ALIGNED(width, 8) &&
  1016       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1017     TransposeUVWx8 = TransposeUVWx8_SSE2;
  1018   }
  1019 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
  1020   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
  1021       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1022     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
  1023   }
  1024 #endif
  1026   // Work through the source in 8x8 tiles.
  1027   while (i >= 8) {
  1028     TransposeUVWx8(src, src_stride,
  1029                    dst_a, dst_stride_a,
  1030                    dst_b, dst_stride_b,
  1031                    width);
  1032     src += 8 * src_stride;    // Go down 8 rows.
  1033     dst_a += 8;               // Move over 8 columns.
  1034     dst_b += 8;               // Move over 8 columns.
  1035     i -= 8;
  1036   }
  1038   TransposeUVWxH_C(src, src_stride,
  1039                    dst_a, dst_stride_a,
  1040                    dst_b, dst_stride_b,
  1041                    width, i);
  1042 }
  1044 LIBYUV_API
  1045 void RotateUV90(const uint8* src, int src_stride,
  1046                 uint8* dst_a, int dst_stride_a,
  1047                 uint8* dst_b, int dst_stride_b,
  1048                 int width, int height) {
  1049   src += src_stride * (height - 1);
  1050   src_stride = -src_stride;
  1052   TransposeUV(src, src_stride,
  1053               dst_a, dst_stride_a,
  1054               dst_b, dst_stride_b,
  1055               width, height);
  1056 }
  1058 LIBYUV_API
  1059 void RotateUV270(const uint8* src, int src_stride,
  1060                  uint8* dst_a, int dst_stride_a,
  1061                  uint8* dst_b, int dst_stride_b,
  1062                  int width, int height) {
  1063   dst_a += dst_stride_a * (width - 1);
  1064   dst_b += dst_stride_b * (width - 1);
  1065   dst_stride_a = -dst_stride_a;
  1066   dst_stride_b = -dst_stride_b;
  1068   TransposeUV(src, src_stride,
  1069               dst_a, dst_stride_a,
  1070               dst_b, dst_stride_b,
  1071               width, height);
  1072 }
  1074 // Rotate 180 is a horizontal and vertical flip.
  1075 LIBYUV_API
  1076 void RotateUV180(const uint8* src, int src_stride,
  1077                  uint8* dst_a, int dst_stride_a,
  1078                  uint8* dst_b, int dst_stride_b,
  1079                  int width, int height) {
  1080   int i;
  1081   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
  1082       MirrorUVRow_C;
  1083 #if defined(HAS_MIRRORUVROW_NEON)
  1084   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
  1085     MirrorRowUV = MirrorUVRow_NEON;
  1086   }
  1087 #elif defined(HAS_MIRRORROW_UV_SSSE3)
  1088   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
  1089       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1090     MirrorRowUV = MirrorUVRow_SSSE3;
  1091   }
  1092 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
  1093   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
  1094       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1095     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  1096   }
  1097 #endif
  1099   dst_a += dst_stride_a * (height - 1);
  1100   dst_b += dst_stride_b * (height - 1);
  1102   for (i = 0; i < height; ++i) {
  1103     MirrorRowUV(src, dst_a, dst_b, width);
  1104     src += src_stride;
  1105     dst_a -= dst_stride_a;
  1106     dst_b -= dst_stride_b;
  1107   }
  1108 }
  1110 LIBYUV_API
  1111 int RotatePlane(const uint8* src, int src_stride,
  1112                 uint8* dst, int dst_stride,
  1113                 int width, int height,
  1114                 enum RotationMode mode) {
  1115   if (!src || width <= 0 || height == 0 || !dst) {
  1116     return -1;
  1117   }
  1119   // Negative height means invert the image.
  1120   if (height < 0) {
  1121     height = -height;
  1122     src = src + (height - 1) * src_stride;
  1123     src_stride = -src_stride;
  1124   }
  1126   switch (mode) {
  1127     case kRotate0:
  1128       // copy frame
  1129       CopyPlane(src, src_stride,
  1130                 dst, dst_stride,
  1131                 width, height);
  1132       return 0;
  1133     case kRotate90:
  1134       RotatePlane90(src, src_stride,
  1135                     dst, dst_stride,
  1136                     width, height);
  1137       return 0;
  1138     case kRotate270:
  1139       RotatePlane270(src, src_stride,
  1140                      dst, dst_stride,
  1141                      width, height);
  1142       return 0;
  1143     case kRotate180:
  1144       RotatePlane180(src, src_stride,
  1145                      dst, dst_stride,
  1146                      width, height);
  1147       return 0;
  1148     default:
  1149       break;
  1150   }
  1151   return -1;
  1152 }
  1154 LIBYUV_API
  1155 int I420Rotate(const uint8* src_y, int src_stride_y,
  1156                const uint8* src_u, int src_stride_u,
  1157                const uint8* src_v, int src_stride_v,
  1158                uint8* dst_y, int dst_stride_y,
  1159                uint8* dst_u, int dst_stride_u,
  1160                uint8* dst_v, int dst_stride_v,
  1161                int width, int height,
  1162                enum RotationMode mode) {
  1163   int halfwidth = (width + 1) >> 1;
  1164   int halfheight = (height + 1) >> 1;
  1165   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
  1166       !dst_y || !dst_u || !dst_v) {
  1167     return -1;
  1168   }
  1170   // Negative height means invert the image.
  1171   if (height < 0) {
  1172     height = -height;
  1173     halfheight = (height + 1) >> 1;
  1174     src_y = src_y + (height - 1) * src_stride_y;
  1175     src_u = src_u + (halfheight - 1) * src_stride_u;
  1176     src_v = src_v + (halfheight - 1) * src_stride_v;
  1177     src_stride_y = -src_stride_y;
  1178     src_stride_u = -src_stride_u;
  1179     src_stride_v = -src_stride_v;
  1180   }
  1182   switch (mode) {
  1183     case kRotate0:
  1184       // copy frame
  1185       return I420Copy(src_y, src_stride_y,
  1186                       src_u, src_stride_u,
  1187                       src_v, src_stride_v,
  1188                       dst_y, dst_stride_y,
  1189                       dst_u, dst_stride_u,
  1190                       dst_v, dst_stride_v,
  1191                       width, height);
  1192     case kRotate90:
  1193       RotatePlane90(src_y, src_stride_y,
  1194                     dst_y, dst_stride_y,
  1195                     width, height);
  1196       RotatePlane90(src_u, src_stride_u,
  1197                     dst_u, dst_stride_u,
  1198                     halfwidth, halfheight);
  1199       RotatePlane90(src_v, src_stride_v,
  1200                     dst_v, dst_stride_v,
  1201                     halfwidth, halfheight);
  1202       return 0;
  1203     case kRotate270:
  1204       RotatePlane270(src_y, src_stride_y,
  1205                      dst_y, dst_stride_y,
  1206                      width, height);
  1207       RotatePlane270(src_u, src_stride_u,
  1208                      dst_u, dst_stride_u,
  1209                      halfwidth, halfheight);
  1210       RotatePlane270(src_v, src_stride_v,
  1211                      dst_v, dst_stride_v,
  1212                      halfwidth, halfheight);
  1213       return 0;
  1214     case kRotate180:
  1215       RotatePlane180(src_y, src_stride_y,
  1216                      dst_y, dst_stride_y,
  1217                      width, height);
  1218       RotatePlane180(src_u, src_stride_u,
  1219                      dst_u, dst_stride_u,
  1220                      halfwidth, halfheight);
  1221       RotatePlane180(src_v, src_stride_v,
  1222                      dst_v, dst_stride_v,
  1223                      halfwidth, halfheight);
  1224       return 0;
  1225     default:
  1226       break;
  1227   }
  1228   return -1;
  1229 }
  1231 LIBYUV_API
  1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
  1233                      const uint8* src_uv, int src_stride_uv,
  1234                      uint8* dst_y, int dst_stride_y,
  1235                      uint8* dst_u, int dst_stride_u,
  1236                      uint8* dst_v, int dst_stride_v,
  1237                      int width, int height,
  1238                      enum RotationMode mode) {
  1239   int halfwidth = (width + 1) >> 1;
  1240   int halfheight = (height + 1) >> 1;
  1241   if (!src_y || !src_uv || width <= 0 || height == 0 ||
  1242       !dst_y || !dst_u || !dst_v) {
  1243     return -1;
  1244   }
  1246   // Negative height means invert the image.
  1247   if (height < 0) {
  1248     height = -height;
  1249     halfheight = (height + 1) >> 1;
  1250     src_y = src_y + (height - 1) * src_stride_y;
  1251     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
  1252     src_stride_y = -src_stride_y;
  1253     src_stride_uv = -src_stride_uv;
  1254   }
  1256   switch (mode) {
  1257     case kRotate0:
  1258       // copy frame
  1259       return NV12ToI420(src_y, src_stride_y,
  1260                         src_uv, src_stride_uv,
  1261                         dst_y, dst_stride_y,
  1262                         dst_u, dst_stride_u,
  1263                         dst_v, dst_stride_v,
  1264                         width, height);
  1265     case kRotate90:
  1266       RotatePlane90(src_y, src_stride_y,
  1267                     dst_y, dst_stride_y,
  1268                     width, height);
  1269       RotateUV90(src_uv, src_stride_uv,
  1270                  dst_u, dst_stride_u,
  1271                  dst_v, dst_stride_v,
  1272                  halfwidth, halfheight);
  1273       return 0;
  1274     case kRotate270:
  1275       RotatePlane270(src_y, src_stride_y,
  1276                      dst_y, dst_stride_y,
  1277                      width, height);
  1278       RotateUV270(src_uv, src_stride_uv,
  1279                   dst_u, dst_stride_u,
  1280                   dst_v, dst_stride_v,
  1281                   halfwidth, halfheight);
  1282       return 0;
  1283     case kRotate180:
  1284       RotatePlane180(src_y, src_stride_y,
  1285                      dst_y, dst_stride_y,
  1286                      width, height);
  1287       RotateUV180(src_uv, src_stride_uv,
  1288                   dst_u, dst_stride_u,
  1289                   dst_v, dst_stride_v,
  1290                   halfwidth, halfheight);
  1291       return 0;
  1292     default:
  1293       break;
  1294   }
  1295   return -1;
  1296 }
  1298 #ifdef __cplusplus
  1299 }  // extern "C"
  1300 }  // namespace libyuv
  1301 #endif
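For reference, a minimal caller-side sketch of the public I420Rotate entry
point above. The buffer layout and names here are hypothetical; the only
detail taken from the code is that kRotate90/kRotate270 swap the two
dimensions, so the destination Y plane must be height x width and the chroma
planes halfheight x halfwidth.

#include "libyuv/rotate.h"

// Rotate a width x height I420 frame by 90 degrees into a height x width
// destination. src_i420 and dst_i420 are assumed to be tightly packed
// I420 buffers of the appropriate size.
int RotateI420By90(const uint8* src_i420, uint8* dst_i420,
                   int width, int height) {
  int hw = (width + 1) >> 1;   // chroma plane width
  int hh = (height + 1) >> 1;  // chroma plane height
  const uint8* src_y = src_i420;
  const uint8* src_u = src_y + width * height;
  const uint8* src_v = src_u + hw * hh;
  uint8* dst_y = dst_i420;                // destination is height x width
  uint8* dst_u = dst_y + width * height;
  uint8* dst_v = dst_u + hw * hh;
  return libyuv::I420Rotate(src_y, width, src_u, hw, src_v, hw,
                            dst_y, height, dst_u, hh, dst_v, hh,
                            width, height, libyuv::kRotate90);
}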
