media/libyuv/source/rotate.cc

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/rotate.h"
    13 #include "libyuv/cpu_id.h"
    14 #include "libyuv/convert.h"
    15 #include "libyuv/planar_functions.h"
    16 #include "libyuv/row.h"
    18 #ifdef __cplusplus
    19 namespace libyuv {
    20 extern "C" {
    21 #endif
    23 #if !defined(LIBYUV_DISABLE_X86) && \
    24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
    25 #if defined(__APPLE__) && defined(__i386__)
    26 #define DECLARE_FUNCTION(name)                                                 \
    27     ".text                                     \n"                             \
    28     ".private_extern _" #name "                \n"                             \
    29     ".align 4,0x90                             \n"                             \
    30 "_" #name ":                                   \n"
    31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
    32 #define DECLARE_FUNCTION(name)                                                 \
    33     ".text                                     \n"                             \
    34     ".align 4,0x90                             \n"                             \
    35 "_" #name ":                                   \n"
    36 #else
    37 #define DECLARE_FUNCTION(name)                                                 \
    38     ".text                                     \n"                             \
    39     ".align 4,0x90                             \n"                             \
    40 #name ":                                       \n"
    41 #endif
    42 #endif
    44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
    45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
    46 #define HAS_MIRRORROW_NEON
    47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
    48 #define HAS_MIRRORROW_UV_NEON
    49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
    50 #define HAS_TRANSPOSE_WX8_NEON
    51 void TransposeWx8_NEON(const uint8* src, int src_stride,
    52                        uint8* dst, int dst_stride, int width);
    53 #define HAS_TRANSPOSE_UVWX8_NEON
    54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    55                          uint8* dst_a, int dst_stride_a,
    56                          uint8* dst_b, int dst_stride_b,
    57                          int width);
    58 #endif  // defined(__ARM_NEON__)
    60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
    61     defined(__mips__) && \
    62     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
    64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    65                              uint8* dst, int dst_stride, int width);
    67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
    68                                   uint8* dst, int dst_stride, int width);
    69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
    70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    71                                uint8* dst_a, int dst_stride_a,
    72                                uint8* dst_b, int dst_stride_b,
    73                                int width);
    74 #endif  // defined(__mips__)
    76 #if !defined(LIBYUV_DISABLE_X86) && \
    77     defined(_M_IX86) && defined(_MSC_VER)
    78 #define HAS_TRANSPOSE_WX8_SSSE3
    79 __declspec(naked) __declspec(align(16))
    80 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    81                                uint8* dst, int dst_stride, int width) {
    82   __asm {
    83     push      edi
    84     push      esi
    85     push      ebp
    86     mov       eax, [esp + 12 + 4]   // src
    87     mov       edi, [esp + 12 + 8]   // src_stride
    88     mov       edx, [esp + 12 + 12]  // dst
    89     mov       esi, [esp + 12 + 16]  // dst_stride
    90     mov       ecx, [esp + 12 + 20]  // width
    92     // Read in the data from the source pointer.
    93     // First round of bit swap.
    94     align      4
    95  convertloop:
    96     movq      xmm0, qword ptr [eax]
    97     lea       ebp, [eax + 8]
    98     movq      xmm1, qword ptr [eax + edi]
    99     lea       eax, [eax + 2 * edi]
   100     punpcklbw xmm0, xmm1
   101     movq      xmm2, qword ptr [eax]
   102     movdqa    xmm1, xmm0
   103     palignr   xmm1, xmm1, 8
   104     movq      xmm3, qword ptr [eax + edi]
   105     lea       eax, [eax + 2 * edi]
   106     punpcklbw xmm2, xmm3
   107     movdqa    xmm3, xmm2
   108     movq      xmm4, qword ptr [eax]
   109     palignr   xmm3, xmm3, 8
   110     movq      xmm5, qword ptr [eax + edi]
   111     punpcklbw xmm4, xmm5
   112     lea       eax, [eax + 2 * edi]
   113     movdqa    xmm5, xmm4
   114     movq      xmm6, qword ptr [eax]
   115     palignr   xmm5, xmm5, 8
   116     movq      xmm7, qword ptr [eax + edi]
   117     punpcklbw xmm6, xmm7
   118     mov       eax, ebp
   119     movdqa    xmm7, xmm6
   120     palignr   xmm7, xmm7, 8
   121     // Second round of bit swap.
   122     punpcklwd xmm0, xmm2
   123     punpcklwd xmm1, xmm3
   124     movdqa    xmm2, xmm0
   125     movdqa    xmm3, xmm1
   126     palignr   xmm2, xmm2, 8
   127     palignr   xmm3, xmm3, 8
   128     punpcklwd xmm4, xmm6
   129     punpcklwd xmm5, xmm7
   130     movdqa    xmm6, xmm4
   131     movdqa    xmm7, xmm5
   132     palignr   xmm6, xmm6, 8
   133     palignr   xmm7, xmm7, 8
   134     // Third round of bit swap.
   135     // Write to the destination pointer.
   136     punpckldq xmm0, xmm4
   137     movq      qword ptr [edx], xmm0
   138     movdqa    xmm4, xmm0
   139     palignr   xmm4, xmm4, 8
   140     movq      qword ptr [edx + esi], xmm4
   141     lea       edx, [edx + 2 * esi]
   142     punpckldq xmm2, xmm6
   143     movdqa    xmm6, xmm2
   144     palignr   xmm6, xmm6, 8
   145     movq      qword ptr [edx], xmm2
   146     punpckldq xmm1, xmm5
   147     movq      qword ptr [edx + esi], xmm6
   148     lea       edx, [edx + 2 * esi]
   149     movdqa    xmm5, xmm1
   150     movq      qword ptr [edx], xmm1
   151     palignr   xmm5, xmm5, 8
   152     punpckldq xmm3, xmm7
   153     movq      qword ptr [edx + esi], xmm5
   154     lea       edx, [edx + 2 * esi]
   155     movq      qword ptr [edx], xmm3
   156     movdqa    xmm7, xmm3
   157     palignr   xmm7, xmm7, 8
   158     sub       ecx, 8
   159     movq      qword ptr [edx + esi], xmm7
   160     lea       edx, [edx + 2 * esi]
   161     jg        convertloop
   163     pop       ebp
   164     pop       esi
   165     pop       edi
   166     ret
   167   }
   168 }
   170 #define HAS_TRANSPOSE_UVWX8_SSE2
   171 __declspec(naked) __declspec(align(16))
   172 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   173                                 uint8* dst_a, int dst_stride_a,
   174                                 uint8* dst_b, int dst_stride_b,
   175                                 int w) {
   176   __asm {
   177     push      ebx
   178     push      esi
   179     push      edi
   180     push      ebp
   181     mov       eax, [esp + 16 + 4]   // src
   182     mov       edi, [esp + 16 + 8]   // src_stride
   183     mov       edx, [esp + 16 + 12]  // dst_a
   184     mov       esi, [esp + 16 + 16]  // dst_stride_a
   185     mov       ebx, [esp + 16 + 20]  // dst_b
   186     mov       ebp, [esp + 16 + 24]  // dst_stride_b
   187     mov       ecx, esp
   188     sub       esp, 4 + 16
   189     and       esp, ~15
   190     mov       [esp + 16], ecx
   191     mov       ecx, [ecx + 16 + 28]  // w
   193     align      4
   194  convertloop:
   195     // Read in the data from the source pointer.
   196     // First round of bit swap.
   197     movdqa    xmm0, [eax]
   198     movdqa    xmm1, [eax + edi]
   199     lea       eax, [eax + 2 * edi]
   200     movdqa    xmm7, xmm0  // use xmm7 as temp register.
   201     punpcklbw xmm0, xmm1
   202     punpckhbw xmm7, xmm1
   203     movdqa    xmm1, xmm7
   204     movdqa    xmm2, [eax]
   205     movdqa    xmm3, [eax + edi]
   206     lea       eax, [eax + 2 * edi]
   207     movdqa    xmm7, xmm2
   208     punpcklbw xmm2, xmm3
   209     punpckhbw xmm7, xmm3
   210     movdqa    xmm3, xmm7
   211     movdqa    xmm4, [eax]
   212     movdqa    xmm5, [eax + edi]
   213     lea       eax, [eax + 2 * edi]
   214     movdqa    xmm7, xmm4
   215     punpcklbw xmm4, xmm5
   216     punpckhbw xmm7, xmm5
   217     movdqa    xmm5, xmm7
   218     movdqa    xmm6, [eax]
   219     movdqa    xmm7, [eax + edi]
   220     lea       eax, [eax + 2 * edi]
   221     movdqa    [esp], xmm5  // backup xmm5
   222     neg       edi
   223     movdqa    xmm5, xmm6   // use xmm5 as temp register.
   224     punpcklbw xmm6, xmm7
   225     punpckhbw xmm5, xmm7
   226     movdqa    xmm7, xmm5
   227     lea       eax, [eax + 8 * edi + 16]
   228     neg       edi
   229     // Second round of bit swap.
   230     movdqa    xmm5, xmm0
   231     punpcklwd xmm0, xmm2
   232     punpckhwd xmm5, xmm2
   233     movdqa    xmm2, xmm5
   234     movdqa    xmm5, xmm1
   235     punpcklwd xmm1, xmm3
   236     punpckhwd xmm5, xmm3
   237     movdqa    xmm3, xmm5
   238     movdqa    xmm5, xmm4
   239     punpcklwd xmm4, xmm6
   240     punpckhwd xmm5, xmm6
   241     movdqa    xmm6, xmm5
   242     movdqa    xmm5, [esp]  // restore xmm5
   243     movdqa    [esp], xmm6  // backup xmm6
   244     movdqa    xmm6, xmm5    // use xmm6 as temp register.
   245     punpcklwd xmm5, xmm7
   246     punpckhwd xmm6, xmm7
   247     movdqa    xmm7, xmm6
   248     // Third round of bit swap.
   249     // Write to the destination pointer.
   250     movdqa    xmm6, xmm0
   251     punpckldq xmm0, xmm4
   252     punpckhdq xmm6, xmm4
   253     movdqa    xmm4, xmm6
   254     movdqa    xmm6, [esp]  // restore xmm6
   255     movlpd    qword ptr [edx], xmm0
   256     movhpd    qword ptr [ebx], xmm0
   257     movlpd    qword ptr [edx + esi], xmm4
   258     lea       edx, [edx + 2 * esi]
   259     movhpd    qword ptr [ebx + ebp], xmm4
   260     lea       ebx, [ebx + 2 * ebp]
   261     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
   262     punpckldq xmm2, xmm6
   263     movlpd    qword ptr [edx], xmm2
   264     movhpd    qword ptr [ebx], xmm2
   265     punpckhdq xmm0, xmm6
   266     movlpd    qword ptr [edx + esi], xmm0
   267     lea       edx, [edx + 2 * esi]
   268     movhpd    qword ptr [ebx + ebp], xmm0
   269     lea       ebx, [ebx + 2 * ebp]
   270     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
   271     punpckldq xmm1, xmm5
   272     movlpd    qword ptr [edx], xmm1
   273     movhpd    qword ptr [ebx], xmm1
   274     punpckhdq xmm0, xmm5
   275     movlpd    qword ptr [edx + esi], xmm0
   276     lea       edx, [edx + 2 * esi]
   277     movhpd    qword ptr [ebx + ebp], xmm0
   278     lea       ebx, [ebx + 2 * ebp]
   279     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
   280     punpckldq xmm3, xmm7
   281     movlpd    qword ptr [edx], xmm3
   282     movhpd    qword ptr [ebx], xmm3
   283     punpckhdq xmm0, xmm7
   284     sub       ecx, 8
   285     movlpd    qword ptr [edx + esi], xmm0
   286     lea       edx, [edx + 2 * esi]
   287     movhpd    qword ptr [ebx + ebp], xmm0
   288     lea       ebx, [ebx + 2 * ebp]
   289     jg        convertloop
   291     mov       esp, [esp + 16]
   292     pop       ebp
   293     pop       edi
   294     pop       esi
   295     pop       ebx
   296     ret
   297   }
   298 }
   299 #elif !defined(LIBYUV_DISABLE_X86) && \
   300     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
   301 #define HAS_TRANSPOSE_WX8_SSSE3
   302 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   303                                uint8* dst, int dst_stride, int width) {
   304   asm volatile (
   305     // Read in the data from the source pointer.
   306     // First round of bit swap.
   307     ".p2align  2                                 \n"
   308   "1:                                            \n"
   309     "movq       (%0),%%xmm0                      \n"
   310     "movq       (%0,%3),%%xmm1                   \n"
   311     "lea        (%0,%3,2),%0                     \n"
   312     "punpcklbw  %%xmm1,%%xmm0                    \n"
   313     "movq       (%0),%%xmm2                      \n"
   314     "movdqa     %%xmm0,%%xmm1                    \n"
   315     "palignr    $0x8,%%xmm1,%%xmm1               \n"
   316     "movq       (%0,%3),%%xmm3                   \n"
   317     "lea        (%0,%3,2),%0                     \n"
   318     "punpcklbw  %%xmm3,%%xmm2                    \n"
   319     "movdqa     %%xmm2,%%xmm3                    \n"
   320     "movq       (%0),%%xmm4                      \n"
   321     "palignr    $0x8,%%xmm3,%%xmm3               \n"
   322     "movq       (%0,%3),%%xmm5                   \n"
   323     "lea        (%0,%3,2),%0                     \n"
   324     "punpcklbw  %%xmm5,%%xmm4                    \n"
   325     "movdqa     %%xmm4,%%xmm5                    \n"
   326     "movq       (%0),%%xmm6                      \n"
   327     "palignr    $0x8,%%xmm5,%%xmm5               \n"
   328     "movq       (%0,%3),%%xmm7                   \n"
   329     "lea        (%0,%3,2),%0                     \n"
   330     "punpcklbw  %%xmm7,%%xmm6                    \n"
   331     "neg        %3                               \n"
   332     "movdqa     %%xmm6,%%xmm7                    \n"
   333     "lea        0x8(%0,%3,8),%0                  \n"
   334     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   335     "neg        %3                               \n"
   336      // Second round of bit swap.
   337     "punpcklwd  %%xmm2,%%xmm0                    \n"
   338     "punpcklwd  %%xmm3,%%xmm1                    \n"
   339     "movdqa     %%xmm0,%%xmm2                    \n"
   340     "movdqa     %%xmm1,%%xmm3                    \n"
   341     "palignr    $0x8,%%xmm2,%%xmm2               \n"
   342     "palignr    $0x8,%%xmm3,%%xmm3               \n"
   343     "punpcklwd  %%xmm6,%%xmm4                    \n"
   344     "punpcklwd  %%xmm7,%%xmm5                    \n"
   345     "movdqa     %%xmm4,%%xmm6                    \n"
   346     "movdqa     %%xmm5,%%xmm7                    \n"
   347     "palignr    $0x8,%%xmm6,%%xmm6               \n"
   348     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   349     // Third round of bit swap.
   350     // Write to the destination pointer.
   351     "punpckldq  %%xmm4,%%xmm0                    \n"
   352     "movq       %%xmm0,(%1)                      \n"
   353     "movdqa     %%xmm0,%%xmm4                    \n"
   354     "palignr    $0x8,%%xmm4,%%xmm4               \n"
   355     "movq       %%xmm4,(%1,%4)                   \n"
   356     "lea        (%1,%4,2),%1                     \n"
   357     "punpckldq  %%xmm6,%%xmm2                    \n"
   358     "movdqa     %%xmm2,%%xmm6                    \n"
   359     "movq       %%xmm2,(%1)                      \n"
   360     "palignr    $0x8,%%xmm6,%%xmm6               \n"
   361     "punpckldq  %%xmm5,%%xmm1                    \n"
   362     "movq       %%xmm6,(%1,%4)                   \n"
   363     "lea        (%1,%4,2),%1                     \n"
   364     "movdqa     %%xmm1,%%xmm5                    \n"
   365     "movq       %%xmm1,(%1)                      \n"
   366     "palignr    $0x8,%%xmm5,%%xmm5               \n"
   367     "movq       %%xmm5,(%1,%4)                   \n"
   368     "lea        (%1,%4,2),%1                     \n"
   369     "punpckldq  %%xmm7,%%xmm3                    \n"
   370     "movq       %%xmm3,(%1)                      \n"
   371     "movdqa     %%xmm3,%%xmm7                    \n"
   372     "palignr    $0x8,%%xmm7,%%xmm7               \n"
   373     "sub        $0x8,%2                          \n"
   374     "movq       %%xmm7,(%1,%4)                   \n"
   375     "lea        (%1,%4,2),%1                     \n"
   376     "jg         1b                               \n"
   377     : "+r"(src),    // %0
   378       "+r"(dst),    // %1
   379       "+r"(width)   // %2
   380     : "r"((intptr_t)(src_stride)),  // %3
   381       "r"((intptr_t)(dst_stride))   // %4
   382     : "memory", "cc"
   383   #if defined(__SSE2__)
   384       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   385   #endif
   386   );
   387 }
   389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
   390 #define HAS_TRANSPOSE_UVWX8_SSE2
   391 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   392                                     uint8* dst_a, int dst_stride_a,
   393                                     uint8* dst_b, int dst_stride_b,
   394                                     int w);
   395   asm (
   396     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
   397     "push   %ebx                               \n"
   398     "push   %esi                               \n"
   399     "push   %edi                               \n"
   400     "push   %ebp                               \n"
   401     "mov    0x14(%esp),%eax                    \n"
   402     "mov    0x18(%esp),%edi                    \n"
   403     "mov    0x1c(%esp),%edx                    \n"
   404     "mov    0x20(%esp),%esi                    \n"
   405     "mov    0x24(%esp),%ebx                    \n"
   406     "mov    0x28(%esp),%ebp                    \n"
   407     "mov    %esp,%ecx                          \n"
   408     "sub    $0x14,%esp                         \n"
   409     "and    $0xfffffff0,%esp                   \n"
   410     "mov    %ecx,0x10(%esp)                    \n"
   411     "mov    0x2c(%ecx),%ecx                    \n"
   413 "1:                                            \n"
   414     "movdqa (%eax),%xmm0                       \n"
   415     "movdqa (%eax,%edi,1),%xmm1                \n"
   416     "lea    (%eax,%edi,2),%eax                 \n"
   417     "movdqa %xmm0,%xmm7                        \n"
   418     "punpcklbw %xmm1,%xmm0                     \n"
   419     "punpckhbw %xmm1,%xmm7                     \n"
   420     "movdqa %xmm7,%xmm1                        \n"
   421     "movdqa (%eax),%xmm2                       \n"
   422     "movdqa (%eax,%edi,1),%xmm3                \n"
   423     "lea    (%eax,%edi,2),%eax                 \n"
   424     "movdqa %xmm2,%xmm7                        \n"
   425     "punpcklbw %xmm3,%xmm2                     \n"
   426     "punpckhbw %xmm3,%xmm7                     \n"
   427     "movdqa %xmm7,%xmm3                        \n"
   428     "movdqa (%eax),%xmm4                       \n"
   429     "movdqa (%eax,%edi,1),%xmm5                \n"
   430     "lea    (%eax,%edi,2),%eax                 \n"
   431     "movdqa %xmm4,%xmm7                        \n"
   432     "punpcklbw %xmm5,%xmm4                     \n"
   433     "punpckhbw %xmm5,%xmm7                     \n"
   434     "movdqa %xmm7,%xmm5                        \n"
   435     "movdqa (%eax),%xmm6                       \n"
   436     "movdqa (%eax,%edi,1),%xmm7                \n"
   437     "lea    (%eax,%edi,2),%eax                 \n"
   438     "movdqa %xmm5,(%esp)                       \n"
   439     "neg    %edi                               \n"
   440     "movdqa %xmm6,%xmm5                        \n"
   441     "punpcklbw %xmm7,%xmm6                     \n"
   442     "punpckhbw %xmm7,%xmm5                     \n"
   443     "movdqa %xmm5,%xmm7                        \n"
   444     "lea    0x10(%eax,%edi,8),%eax             \n"
   445     "neg    %edi                               \n"
   446     "movdqa %xmm0,%xmm5                        \n"
   447     "punpcklwd %xmm2,%xmm0                     \n"
   448     "punpckhwd %xmm2,%xmm5                     \n"
   449     "movdqa %xmm5,%xmm2                        \n"
   450     "movdqa %xmm1,%xmm5                        \n"
   451     "punpcklwd %xmm3,%xmm1                     \n"
   452     "punpckhwd %xmm3,%xmm5                     \n"
   453     "movdqa %xmm5,%xmm3                        \n"
   454     "movdqa %xmm4,%xmm5                        \n"
   455     "punpcklwd %xmm6,%xmm4                     \n"
   456     "punpckhwd %xmm6,%xmm5                     \n"
   457     "movdqa %xmm5,%xmm6                        \n"
   458     "movdqa (%esp),%xmm5                       \n"
   459     "movdqa %xmm6,(%esp)                       \n"
   460     "movdqa %xmm5,%xmm6                        \n"
   461     "punpcklwd %xmm7,%xmm5                     \n"
   462     "punpckhwd %xmm7,%xmm6                     \n"
   463     "movdqa %xmm6,%xmm7                        \n"
   464     "movdqa %xmm0,%xmm6                        \n"
   465     "punpckldq %xmm4,%xmm0                     \n"
   466     "punpckhdq %xmm4,%xmm6                     \n"
   467     "movdqa %xmm6,%xmm4                        \n"
   468     "movdqa (%esp),%xmm6                       \n"
   469     "movlpd %xmm0,(%edx)                       \n"
   470     "movhpd %xmm0,(%ebx)                       \n"
   471     "movlpd %xmm4,(%edx,%esi,1)                \n"
   472     "lea    (%edx,%esi,2),%edx                 \n"
   473     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
   474     "lea    (%ebx,%ebp,2),%ebx                 \n"
   475     "movdqa %xmm2,%xmm0                        \n"
   476     "punpckldq %xmm6,%xmm2                     \n"
   477     "movlpd %xmm2,(%edx)                       \n"
   478     "movhpd %xmm2,(%ebx)                       \n"
   479     "punpckhdq %xmm6,%xmm0                     \n"
   480     "movlpd %xmm0,(%edx,%esi,1)                \n"
   481     "lea    (%edx,%esi,2),%edx                 \n"
   482     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   483     "lea    (%ebx,%ebp,2),%ebx                 \n"
   484     "movdqa %xmm1,%xmm0                        \n"
   485     "punpckldq %xmm5,%xmm1                     \n"
   486     "movlpd %xmm1,(%edx)                       \n"
   487     "movhpd %xmm1,(%ebx)                       \n"
   488     "punpckhdq %xmm5,%xmm0                     \n"
   489     "movlpd %xmm0,(%edx,%esi,1)                \n"
   490     "lea    (%edx,%esi,2),%edx                 \n"
   491     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   492     "lea    (%ebx,%ebp,2),%ebx                 \n"
   493     "movdqa %xmm3,%xmm0                        \n"
   494     "punpckldq %xmm7,%xmm3                     \n"
   495     "movlpd %xmm3,(%edx)                       \n"
   496     "movhpd %xmm3,(%ebx)                       \n"
   497     "punpckhdq %xmm7,%xmm0                     \n"
   498     "sub    $0x8,%ecx                          \n"
   499     "movlpd %xmm0,(%edx,%esi,1)                \n"
   500     "lea    (%edx,%esi,2),%edx                 \n"
   501     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   502     "lea    (%ebx,%ebp,2),%ebx                 \n"
   503     "jg     1b                                 \n"
   504     "mov    0x10(%esp),%esp                    \n"
   505     "pop    %ebp                               \n"
   506     "pop    %edi                               \n"
   507     "pop    %esi                               \n"
   508     "pop    %ebx                               \n"
   509 #if defined(__native_client__)
   510     "pop    %ecx                               \n"
   511     "and    $0xffffffe0,%ecx                   \n"
   512     "jmp    *%ecx                              \n"
   513 #else
   514     "ret                                       \n"
   515 #endif
   516 );
   517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
   518     defined(__x86_64__)
   519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
   520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
   521 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   522                                     uint8* dst, int dst_stride, int width) {
   523   asm volatile (
   524   // Read in the data from the source pointer.
   525   // First round of bit swap.
   526   ".p2align  2                                 \n"
   527 "1:                                            \n"
   528   "movdqa     (%0),%%xmm0                      \n"
   529   "movdqa     (%0,%3),%%xmm1                   \n"
   530   "lea        (%0,%3,2),%0                     \n"
   531   "movdqa     %%xmm0,%%xmm8                    \n"
   532   "punpcklbw  %%xmm1,%%xmm0                    \n"
   533   "punpckhbw  %%xmm1,%%xmm8                    \n"
   534   "movdqa     (%0),%%xmm2                      \n"
   535   "movdqa     %%xmm0,%%xmm1                    \n"
   536   "movdqa     %%xmm8,%%xmm9                    \n"
   537   "palignr    $0x8,%%xmm1,%%xmm1               \n"
   538   "palignr    $0x8,%%xmm9,%%xmm9               \n"
   539   "movdqa     (%0,%3),%%xmm3                   \n"
   540   "lea        (%0,%3,2),%0                     \n"
   541   "movdqa     %%xmm2,%%xmm10                   \n"
   542   "punpcklbw  %%xmm3,%%xmm2                    \n"
   543   "punpckhbw  %%xmm3,%%xmm10                   \n"
   544   "movdqa     %%xmm2,%%xmm3                    \n"
   545   "movdqa     %%xmm10,%%xmm11                  \n"
   546   "movdqa     (%0),%%xmm4                      \n"
   547   "palignr    $0x8,%%xmm3,%%xmm3               \n"
   548   "palignr    $0x8,%%xmm11,%%xmm11             \n"
   549   "movdqa     (%0,%3),%%xmm5                   \n"
   550   "lea        (%0,%3,2),%0                     \n"
   551   "movdqa     %%xmm4,%%xmm12                   \n"
   552   "punpcklbw  %%xmm5,%%xmm4                    \n"
   553   "punpckhbw  %%xmm5,%%xmm12                   \n"
   554   "movdqa     %%xmm4,%%xmm5                    \n"
   555   "movdqa     %%xmm12,%%xmm13                  \n"
   556   "movdqa     (%0),%%xmm6                      \n"
   557   "palignr    $0x8,%%xmm5,%%xmm5               \n"
   558   "palignr    $0x8,%%xmm13,%%xmm13             \n"
   559   "movdqa     (%0,%3),%%xmm7                   \n"
   560   "lea        (%0,%3,2),%0                     \n"
   561   "movdqa     %%xmm6,%%xmm14                   \n"
   562   "punpcklbw  %%xmm7,%%xmm6                    \n"
   563   "punpckhbw  %%xmm7,%%xmm14                   \n"
   564   "neg        %3                               \n"
   565   "movdqa     %%xmm6,%%xmm7                    \n"
   566   "movdqa     %%xmm14,%%xmm15                  \n"
   567   "lea        0x10(%0,%3,8),%0                 \n"
   568   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   569   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   570   "neg        %3                               \n"
   571    // Second round of bit swap.
   572   "punpcklwd  %%xmm2,%%xmm0                    \n"
   573   "punpcklwd  %%xmm3,%%xmm1                    \n"
   574   "movdqa     %%xmm0,%%xmm2                    \n"
   575   "movdqa     %%xmm1,%%xmm3                    \n"
   576   "palignr    $0x8,%%xmm2,%%xmm2               \n"
   577   "palignr    $0x8,%%xmm3,%%xmm3               \n"
   578   "punpcklwd  %%xmm6,%%xmm4                    \n"
   579   "punpcklwd  %%xmm7,%%xmm5                    \n"
   580   "movdqa     %%xmm4,%%xmm6                    \n"
   581   "movdqa     %%xmm5,%%xmm7                    \n"
   582   "palignr    $0x8,%%xmm6,%%xmm6               \n"
   583   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   584   "punpcklwd  %%xmm10,%%xmm8                   \n"
   585   "punpcklwd  %%xmm11,%%xmm9                   \n"
   586   "movdqa     %%xmm8,%%xmm10                   \n"
   587   "movdqa     %%xmm9,%%xmm11                   \n"
   588   "palignr    $0x8,%%xmm10,%%xmm10             \n"
   589   "palignr    $0x8,%%xmm11,%%xmm11             \n"
   590   "punpcklwd  %%xmm14,%%xmm12                  \n"
   591   "punpcklwd  %%xmm15,%%xmm13                  \n"
   592   "movdqa     %%xmm12,%%xmm14                  \n"
   593   "movdqa     %%xmm13,%%xmm15                  \n"
   594   "palignr    $0x8,%%xmm14,%%xmm14             \n"
   595   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   596   // Third round of bit swap.
   597   // Write to the destination pointer.
   598   "punpckldq  %%xmm4,%%xmm0                    \n"
   599   "movq       %%xmm0,(%1)                      \n"
   600   "movdqa     %%xmm0,%%xmm4                    \n"
   601   "palignr    $0x8,%%xmm4,%%xmm4               \n"
   602   "movq       %%xmm4,(%1,%4)                   \n"
   603   "lea        (%1,%4,2),%1                     \n"
   604   "punpckldq  %%xmm6,%%xmm2                    \n"
   605   "movdqa     %%xmm2,%%xmm6                    \n"
   606   "movq       %%xmm2,(%1)                      \n"
   607   "palignr    $0x8,%%xmm6,%%xmm6               \n"
   608   "punpckldq  %%xmm5,%%xmm1                    \n"
   609   "movq       %%xmm6,(%1,%4)                   \n"
   610   "lea        (%1,%4,2),%1                     \n"
   611   "movdqa     %%xmm1,%%xmm5                    \n"
   612   "movq       %%xmm1,(%1)                      \n"
   613   "palignr    $0x8,%%xmm5,%%xmm5               \n"
   614   "movq       %%xmm5,(%1,%4)                   \n"
   615   "lea        (%1,%4,2),%1                     \n"
   616   "punpckldq  %%xmm7,%%xmm3                    \n"
   617   "movq       %%xmm3,(%1)                      \n"
   618   "movdqa     %%xmm3,%%xmm7                    \n"
   619   "palignr    $0x8,%%xmm7,%%xmm7               \n"
   620   "movq       %%xmm7,(%1,%4)                   \n"
   621   "lea        (%1,%4,2),%1                     \n"
   622   "punpckldq  %%xmm12,%%xmm8                   \n"
   623   "movq       %%xmm8,(%1)                      \n"
   624   "movdqa     %%xmm8,%%xmm12                   \n"
   625   "palignr    $0x8,%%xmm12,%%xmm12             \n"
   626   "movq       %%xmm12,(%1,%4)                  \n"
   627   "lea        (%1,%4,2),%1                     \n"
   628   "punpckldq  %%xmm14,%%xmm10                  \n"
   629   "movdqa     %%xmm10,%%xmm14                  \n"
   630   "movq       %%xmm10,(%1)                     \n"
   631   "palignr    $0x8,%%xmm14,%%xmm14             \n"
   632   "punpckldq  %%xmm13,%%xmm9                   \n"
   633   "movq       %%xmm14,(%1,%4)                  \n"
   634   "lea        (%1,%4,2),%1                     \n"
   635   "movdqa     %%xmm9,%%xmm13                   \n"
   636   "movq       %%xmm9,(%1)                      \n"
   637   "palignr    $0x8,%%xmm13,%%xmm13             \n"
   638   "movq       %%xmm13,(%1,%4)                  \n"
   639   "lea        (%1,%4,2),%1                     \n"
   640   "punpckldq  %%xmm15,%%xmm11                  \n"
   641   "movq       %%xmm11,(%1)                     \n"
   642   "movdqa     %%xmm11,%%xmm15                  \n"
   643   "palignr    $0x8,%%xmm15,%%xmm15             \n"
   644   "sub        $0x10,%2                         \n"
   645   "movq       %%xmm15,(%1,%4)                  \n"
   646   "lea        (%1,%4,2),%1                     \n"
   647   "jg         1b                               \n"
   648   : "+r"(src),    // %0
   649     "+r"(dst),    // %1
   650     "+r"(width)   // %2
   651   : "r"((intptr_t)(src_stride)),  // %3
   652     "r"((intptr_t)(dst_stride))   // %4
   653   : "memory", "cc",
   654     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   655     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
   656 );
   657 }
   659 #define HAS_TRANSPOSE_UVWX8_SSE2
   660 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   661                                 uint8* dst_a, int dst_stride_a,
   662                                 uint8* dst_b, int dst_stride_b,
   663                                 int w) {
   664   asm volatile (
   665   // Read in the data from the source pointer.
   666   // First round of bit swap.
   667   ".p2align  2                                 \n"
   668 "1:                                            \n"
   669   "movdqa     (%0),%%xmm0                      \n"
   670   "movdqa     (%0,%4),%%xmm1                   \n"
   671   "lea        (%0,%4,2),%0                     \n"
   672   "movdqa     %%xmm0,%%xmm8                    \n"
   673   "punpcklbw  %%xmm1,%%xmm0                    \n"
   674   "punpckhbw  %%xmm1,%%xmm8                    \n"
   675   "movdqa     %%xmm8,%%xmm1                    \n"
   676   "movdqa     (%0),%%xmm2                      \n"
   677   "movdqa     (%0,%4),%%xmm3                   \n"
   678   "lea        (%0,%4,2),%0                     \n"
   679   "movdqa     %%xmm2,%%xmm8                    \n"
   680   "punpcklbw  %%xmm3,%%xmm2                    \n"
   681   "punpckhbw  %%xmm3,%%xmm8                    \n"
   682   "movdqa     %%xmm8,%%xmm3                    \n"
   683   "movdqa     (%0),%%xmm4                      \n"
   684   "movdqa     (%0,%4),%%xmm5                   \n"
   685   "lea        (%0,%4,2),%0                     \n"
   686   "movdqa     %%xmm4,%%xmm8                    \n"
   687   "punpcklbw  %%xmm5,%%xmm4                    \n"
   688   "punpckhbw  %%xmm5,%%xmm8                    \n"
   689   "movdqa     %%xmm8,%%xmm5                    \n"
   690   "movdqa     (%0),%%xmm6                      \n"
   691   "movdqa     (%0,%4),%%xmm7                   \n"
   692   "lea        (%0,%4,2),%0                     \n"
   693   "movdqa     %%xmm6,%%xmm8                    \n"
   694   "punpcklbw  %%xmm7,%%xmm6                    \n"
   695   "neg        %4                               \n"
   696   "lea        0x10(%0,%4,8),%0                 \n"
   697   "punpckhbw  %%xmm7,%%xmm8                    \n"
   698   "movdqa     %%xmm8,%%xmm7                    \n"
   699   "neg        %4                               \n"
   700    // Second round of bit swap.
   701   "movdqa     %%xmm0,%%xmm8                    \n"
   702   "movdqa     %%xmm1,%%xmm9                    \n"
   703   "punpckhwd  %%xmm2,%%xmm8                    \n"
   704   "punpckhwd  %%xmm3,%%xmm9                    \n"
   705   "punpcklwd  %%xmm2,%%xmm0                    \n"
   706   "punpcklwd  %%xmm3,%%xmm1                    \n"
   707   "movdqa     %%xmm8,%%xmm2                    \n"
   708   "movdqa     %%xmm9,%%xmm3                    \n"
   709   "movdqa     %%xmm4,%%xmm8                    \n"
   710   "movdqa     %%xmm5,%%xmm9                    \n"
   711   "punpckhwd  %%xmm6,%%xmm8                    \n"
   712   "punpckhwd  %%xmm7,%%xmm9                    \n"
   713   "punpcklwd  %%xmm6,%%xmm4                    \n"
   714   "punpcklwd  %%xmm7,%%xmm5                    \n"
   715   "movdqa     %%xmm8,%%xmm6                    \n"
   716   "movdqa     %%xmm9,%%xmm7                    \n"
   717   // Third round of bit swap.
   718   // Write to the destination pointer.
   719   "movdqa     %%xmm0,%%xmm8                    \n"
   720   "punpckldq  %%xmm4,%%xmm0                    \n"
   721   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
   722   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
   723   "punpckhdq  %%xmm4,%%xmm8                    \n"
   724   "movlpd     %%xmm8,(%1,%5)                   \n"
   725   "lea        (%1,%5,2),%1                     \n"
   726   "movhpd     %%xmm8,(%2,%6)                   \n"
   727   "lea        (%2,%6,2),%2                     \n"
   728   "movdqa     %%xmm2,%%xmm8                    \n"
   729   "punpckldq  %%xmm6,%%xmm2                    \n"
   730   "movlpd     %%xmm2,(%1)                      \n"
   731   "movhpd     %%xmm2,(%2)                      \n"
   732   "punpckhdq  %%xmm6,%%xmm8                    \n"
   733   "movlpd     %%xmm8,(%1,%5)                   \n"
   734   "lea        (%1,%5,2),%1                     \n"
   735   "movhpd     %%xmm8,(%2,%6)                   \n"
   736   "lea        (%2,%6,2),%2                     \n"
   737   "movdqa     %%xmm1,%%xmm8                    \n"
   738   "punpckldq  %%xmm5,%%xmm1                    \n"
   739   "movlpd     %%xmm1,(%1)                      \n"
   740   "movhpd     %%xmm1,(%2)                      \n"
   741   "punpckhdq  %%xmm5,%%xmm8                    \n"
   742   "movlpd     %%xmm8,(%1,%5)                   \n"
   743   "lea        (%1,%5,2),%1                     \n"
   744   "movhpd     %%xmm8,(%2,%6)                   \n"
   745   "lea        (%2,%6,2),%2                     \n"
   746   "movdqa     %%xmm3,%%xmm8                    \n"
   747   "punpckldq  %%xmm7,%%xmm3                    \n"
   748   "movlpd     %%xmm3,(%1)                      \n"
   749   "movhpd     %%xmm3,(%2)                      \n"
   750   "punpckhdq  %%xmm7,%%xmm8                    \n"
   751   "sub        $0x8,%3                          \n"
   752   "movlpd     %%xmm8,(%1,%5)                   \n"
   753   "lea        (%1,%5,2),%1                     \n"
   754   "movhpd     %%xmm8,(%2,%6)                   \n"
   755   "lea        (%2,%6,2),%2                     \n"
   756   "jg         1b                               \n"
   757   : "+r"(src),    // %0
   758     "+r"(dst_a),  // %1
   759     "+r"(dst_b),  // %2
   760     "+r"(w)   // %3
   761   : "r"((intptr_t)(src_stride)),    // %4
   762     "r"((intptr_t)(dst_stride_a)),  // %5
   763     "r"((intptr_t)(dst_stride_b))   // %6
   764   : "memory", "cc",
   765     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   766     "xmm8", "xmm9"
   767 );
   768 }
   769 #endif
   770 #endif
   772 static void TransposeWx8_C(const uint8* src, int src_stride,
   773                            uint8* dst, int dst_stride,
   774                            int width) {
   775   int i;
   776   for (i = 0; i < width; ++i) {
   777     dst[0] = src[0 * src_stride];
   778     dst[1] = src[1 * src_stride];
   779     dst[2] = src[2 * src_stride];
   780     dst[3] = src[3 * src_stride];
   781     dst[4] = src[4 * src_stride];
   782     dst[5] = src[5 * src_stride];
   783     dst[6] = src[6 * src_stride];
   784     dst[7] = src[7 * src_stride];
   785     ++src;
   786     dst += dst_stride;
   787   }
   788 }
   790 static void TransposeWxH_C(const uint8* src, int src_stride,
   791                            uint8* dst, int dst_stride,
   792                            int width, int height) {
   793   int i;
   794   for (i = 0; i < width; ++i) {
   795     int j;
   796     for (j = 0; j < height; ++j) {
   797       dst[i * dst_stride + j] = src[j * src_stride + i];
   798     }
   799   }
   800 }
   802 LIBYUV_API
   803 void TransposePlane(const uint8* src, int src_stride,
   804                     uint8* dst, int dst_stride,
   805                     int width, int height) {
   806   int i = height;
   807   void (*TransposeWx8)(const uint8* src, int src_stride,
   808                        uint8* dst, int dst_stride,
   809                        int width) = TransposeWx8_C;
   810 #if defined(HAS_TRANSPOSE_WX8_NEON)
   811   if (TestCpuFlag(kCpuHasNEON)) {
   812     TransposeWx8 = TransposeWx8_NEON;
   813   }
   814 #endif
   815 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
   816   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
   817     TransposeWx8 = TransposeWx8_SSSE3;
   818   }
   819 #endif
   820 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
   821   if (TestCpuFlag(kCpuHasSSSE3) &&
   822       IS_ALIGNED(width, 16) &&
   823       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
   824     TransposeWx8 = TransposeWx8_FAST_SSSE3;
   825   }
   826 #endif
   827 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
   828   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
   829     if (IS_ALIGNED(width, 4) &&
   830         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
   831       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
   832     } else {
   833       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
   834     }
   835   }
   836 #endif
   838   // Work across the source in 8x8 tiles
   839   while (i >= 8) {
   840     TransposeWx8(src, src_stride, dst, dst_stride, width);
   841     src += 8 * src_stride;    // Go down 8 rows.
   842     dst += 8;                 // Move over 8 columns.
   843     i -= 8;
   844   }
   846   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
   847 }
   849 LIBYUV_API
   850 void RotatePlane90(const uint8* src, int src_stride,
   851                    uint8* dst, int dst_stride,
   852                    int width, int height) {
   853   // Rotate by 90 is a transpose with the source read
   854   // from bottom to top. So set the source pointer to the end
   855   // of the buffer and flip the sign of the source stride.
   856   src += src_stride * (height - 1);
   857   src_stride = -src_stride;
   858   TransposePlane(src, src_stride, dst, dst_stride, width, height);
   859 }
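// Illustrative reference only (not a libyuv entry point): the same 90 degree
// rotation written as a plain scalar loop. Destination row i is source
// column i read from the bottom of the image upward, which is exactly what
// TransposePlane computes above once src points at the last row and the
// stride is negated.
static void RotatePlane90_Reference(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride,
                                    int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i) {      // Destination has 'width' rows...
    for (j = 0; j < height; ++j) {   // ...each 'height' pixels wide.
      dst[i * dst_stride + j] = src[(height - 1 - j) * src_stride + i];
    }
  }
}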
   861 LIBYUV_API
   862 void RotatePlane270(const uint8* src, int src_stride,
   863                     uint8* dst, int dst_stride,
   864                     int width, int height) {
   865   // Rotate by 270 is a transpose with the destination written
   866   // from bottom to top. So set the destination pointer to the end
   867   // of the buffer and flip the sign of the destination stride.
   868   dst += dst_stride * (width - 1);
   869   dst_stride = -dst_stride;
   870   TransposePlane(src, src_stride, dst, dst_stride, width, height);
   871 }
   873 LIBYUV_API
   874 void RotatePlane180(const uint8* src, int src_stride,
   875                     uint8* dst, int dst_stride,
   876                     int width, int height) {
   877   // Swap first and last row and mirror the content. Uses a temporary row.
   878   align_buffer_64(row, width);
   879   const uint8* src_bot = src + src_stride * (height - 1);
   880   uint8* dst_bot = dst + dst_stride * (height - 1);
   881   int half_height = (height + 1) >> 1;
   882   int y;
   883   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
   884   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
   885 #if defined(HAS_MIRRORROW_NEON)
   886   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
   887     MirrorRow = MirrorRow_NEON;
   888   }
   889 #endif
   890 #if defined(HAS_MIRRORROW_SSE2)
   891   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
   892       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   893       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   894     MirrorRow = MirrorRow_SSE2;
   895   }
   896 #endif
   897 #if defined(HAS_MIRRORROW_SSSE3)
   898   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
   899       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   900       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   901     MirrorRow = MirrorRow_SSSE3;
   902   }
   903 #endif
   904 #if defined(HAS_MIRRORROW_AVX2)
   905   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
   906     MirrorRow = MirrorRow_AVX2;
   907   }
   908 #endif
   909 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
   910   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
   911       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
   912       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
   913     MirrorRow = MirrorRow_MIPS_DSPR2;
   914   }
   915 #endif
   916 #if defined(HAS_COPYROW_NEON)
   917   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
   918     CopyRow = CopyRow_NEON;
   919   }
   920 #endif
   921 #if defined(HAS_COPYROW_X86)
   922   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
   923     CopyRow = CopyRow_X86;
   924   }
   925 #endif
   926 #if defined(HAS_COPYROW_SSE2)
   927   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
   928       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   929       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   930     CopyRow = CopyRow_SSE2;
   931   }
   932 #endif
   933 #if defined(HAS_COPYROW_ERMS)
   934   if (TestCpuFlag(kCpuHasERMS)) {
   935     CopyRow = CopyRow_ERMS;
   936   }
   937 #endif
   938 #if defined(HAS_COPYROW_MIPS)
   939   if (TestCpuFlag(kCpuHasMIPS)) {
   940     CopyRow = CopyRow_MIPS;
   941   }
   942 #endif
   944   // Odd height will harmlessly mirror the middle row twice.
   945   for (y = 0; y < half_height; ++y) {
   946     MirrorRow(src, row, width);  // Mirror first row into a buffer
   947     src += src_stride;
   948     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
   949     dst += dst_stride;
   950     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
   951     src_bot -= src_stride;
   952     dst_bot -= dst_stride;
   953   }
   954   free_aligned_buffer_64(row);
   955 }
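// Illustrative reference only: the net effect of the row-swapping loop above
// is a combined horizontal and vertical flip, i.e. destination row r is
// source row (height - 1 - r) mirrored left to right.
static void RotatePlane180_Reference(const uint8* src, int src_stride,
                                     uint8* dst, int dst_stride,
                                     int width, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      dst[(height - 1 - y) * dst_stride + (width - 1 - x)] =
          src[y * src_stride + x];
    }
  }
}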
   957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
   958                              uint8* dst_a, int dst_stride_a,
   959                              uint8* dst_b, int dst_stride_b,
   960                              int width) {
   961   int i;
   962   for (i = 0; i < width; ++i) {
   963     dst_a[0] = src[0 * src_stride + 0];
   964     dst_b[0] = src[0 * src_stride + 1];
   965     dst_a[1] = src[1 * src_stride + 0];
   966     dst_b[1] = src[1 * src_stride + 1];
   967     dst_a[2] = src[2 * src_stride + 0];
   968     dst_b[2] = src[2 * src_stride + 1];
   969     dst_a[3] = src[3 * src_stride + 0];
   970     dst_b[3] = src[3 * src_stride + 1];
   971     dst_a[4] = src[4 * src_stride + 0];
   972     dst_b[4] = src[4 * src_stride + 1];
   973     dst_a[5] = src[5 * src_stride + 0];
   974     dst_b[5] = src[5 * src_stride + 1];
   975     dst_a[6] = src[6 * src_stride + 0];
   976     dst_b[6] = src[6 * src_stride + 1];
   977     dst_a[7] = src[7 * src_stride + 0];
   978     dst_b[7] = src[7 * src_stride + 1];
   979     src += 2;
   980     dst_a += dst_stride_a;
   981     dst_b += dst_stride_b;
   982   }
   983 }
   985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
   986                              uint8* dst_a, int dst_stride_a,
   987                              uint8* dst_b, int dst_stride_b,
   988                              int width, int height) {
   989   int i;
   990   for (i = 0; i < width * 2; i += 2) {
   991     int j;
   992     for (j = 0; j < height; ++j) {
   993       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
   994       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
   995     }
   996   }
   997 }
   999 LIBYUV_API
  1000 void TransposeUV(const uint8* src, int src_stride,
  1001                  uint8* dst_a, int dst_stride_a,
  1002                  uint8* dst_b, int dst_stride_b,
  1003                  int width, int height) {
  1004   int i = height;
  1005   void (*TransposeUVWx8)(const uint8* src, int src_stride,
  1006                          uint8* dst_a, int dst_stride_a,
  1007                          uint8* dst_b, int dst_stride_b,
  1008                          int width) = TransposeUVWx8_C;
  1009 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
  1010   if (TestCpuFlag(kCpuHasNEON)) {
  1011     TransposeUVWx8 = TransposeUVWx8_NEON;
  1012   }
  1013 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  1014   if (TestCpuFlag(kCpuHasSSE2) &&
  1015       IS_ALIGNED(width, 8) &&
  1016       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1017     TransposeUVWx8 = TransposeUVWx8_SSE2;
  1018   }
  1019 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
  1020   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
  1021       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1022     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
  1023   }
  1024 #endif
  1026   // Work through the source in 8x8 tiles.
  1027   while (i >= 8) {
  1028     TransposeUVWx8(src, src_stride,
  1029                    dst_a, dst_stride_a,
  1030                    dst_b, dst_stride_b,
  1031                    width);
  1032     src += 8 * src_stride;    // Go down 8 rows.
  1033     dst_a += 8;               // Move over 8 columns.
  1034     dst_b += 8;               // Move over 8 columns.
  1035     i -= 8;
  1036   }
  1038   TransposeUVWxH_C(src, src_stride,
  1039                    dst_a, dst_stride_a,
  1040                    dst_b, dst_stride_b,
  1041                    width, i);
  1042 }
  1044 LIBYUV_API
  1045 void RotateUV90(const uint8* src, int src_stride,
  1046                 uint8* dst_a, int dst_stride_a,
  1047                 uint8* dst_b, int dst_stride_b,
  1048                 int width, int height) {
  1049   src += src_stride * (height - 1);
  1050   src_stride = -src_stride;
  1052   TransposeUV(src, src_stride,
  1053               dst_a, dst_stride_a,
  1054               dst_b, dst_stride_b,
  1055               width, height);
  1056 }
  1058 LIBYUV_API
  1059 void RotateUV270(const uint8* src, int src_stride,
  1060                  uint8* dst_a, int dst_stride_a,
  1061                  uint8* dst_b, int dst_stride_b,
  1062                  int width, int height) {
  1063   dst_a += dst_stride_a * (width - 1);
  1064   dst_b += dst_stride_b * (width - 1);
  1065   dst_stride_a = -dst_stride_a;
  1066   dst_stride_b = -dst_stride_b;
  1068   TransposeUV(src, src_stride,
  1069               dst_a, dst_stride_a,
  1070               dst_b, dst_stride_b,
  1071               width, height);
  1072 }
  1074 // Rotate 180 is a horizontal and vertical flip.
  1075 LIBYUV_API
  1076 void RotateUV180(const uint8* src, int src_stride,
  1077                  uint8* dst_a, int dst_stride_a,
  1078                  uint8* dst_b, int dst_stride_b,
  1079                  int width, int height) {
  1080   int i;
  1081   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
  1082       MirrorUVRow_C;
  1083 #if defined(HAS_MIRRORUVROW_NEON)
  1084   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
  1085     MirrorRowUV = MirrorUVRow_NEON;
  1086   }
  1087 #elif defined(HAS_MIRRORROW_UV_SSSE3)
  1088   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
  1089       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1090     MirrorRowUV = MirrorUVRow_SSSE3;
  1091   }
  1092 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
  1093   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
  1094       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1095     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  1096   }
  1097 #endif
  1099   dst_a += dst_stride_a * (height - 1);
  1100   dst_b += dst_stride_b * (height - 1);
  1102   for (i = 0; i < height; ++i) {
  1103     MirrorRowUV(src, dst_a, dst_b, width);
  1104     src += src_stride;
  1105     dst_a -= dst_stride_a;
  1106     dst_b -= dst_stride_b;
  1107   }
  1108 }
  1110 LIBYUV_API
  1111 int RotatePlane(const uint8* src, int src_stride,
  1112                 uint8* dst, int dst_stride,
  1113                 int width, int height,
  1114                 enum RotationMode mode) {
  1115   if (!src || width <= 0 || height == 0 || !dst) {
  1116     return -1;
  1117   }
  1119   // Negative height means invert the image.
  1120   if (height < 0) {
  1121     height = -height;
  1122     src = src + (height - 1) * src_stride;
  1123     src_stride = -src_stride;
  1124   }
  1126   switch (mode) {
  1127     case kRotate0:
  1128       // copy frame
  1129       CopyPlane(src, src_stride,
  1130                 dst, dst_stride,
  1131                 width, height);
  1132       return 0;
  1133     case kRotate90:
  1134       RotatePlane90(src, src_stride,
  1135                     dst, dst_stride,
  1136                     width, height);
  1137       return 0;
  1138     case kRotate270:
  1139       RotatePlane270(src, src_stride,
  1140                      dst, dst_stride,
  1141                      width, height);
  1142       return 0;
  1143     case kRotate180:
  1144       RotatePlane180(src, src_stride,
  1145                      dst, dst_stride,
  1146                      width, height);
  1147       return 0;
  1148     default:
  1149       break;
  1150   }
  1151   return -1;
  1152 }
  1154 LIBYUV_API
  1155 int I420Rotate(const uint8* src_y, int src_stride_y,
  1156                const uint8* src_u, int src_stride_u,
  1157                const uint8* src_v, int src_stride_v,
  1158                uint8* dst_y, int dst_stride_y,
  1159                uint8* dst_u, int dst_stride_u,
  1160                uint8* dst_v, int dst_stride_v,
  1161                int width, int height,
  1162                enum RotationMode mode) {
  1163   int halfwidth = (width + 1) >> 1;
  1164   int halfheight = (height + 1) >> 1;
  1165   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
  1166       !dst_y || !dst_u || !dst_v) {
  1167     return -1;
  1168   }
  1170   // Negative height means invert the image.
  1171   if (height < 0) {
  1172     height = -height;
  1173     halfheight = (height + 1) >> 1;
  1174     src_y = src_y + (height - 1) * src_stride_y;
  1175     src_u = src_u + (halfheight - 1) * src_stride_u;
  1176     src_v = src_v + (halfheight - 1) * src_stride_v;
  1177     src_stride_y = -src_stride_y;
  1178     src_stride_u = -src_stride_u;
  1179     src_stride_v = -src_stride_v;
  1180   }
  1182   switch (mode) {
  1183     case kRotate0:
  1184       // copy frame
  1185       return I420Copy(src_y, src_stride_y,
  1186                       src_u, src_stride_u,
  1187                       src_v, src_stride_v,
  1188                       dst_y, dst_stride_y,
  1189                       dst_u, dst_stride_u,
  1190                       dst_v, dst_stride_v,
  1191                       width, height);
  1192     case kRotate90:
  1193       RotatePlane90(src_y, src_stride_y,
  1194                     dst_y, dst_stride_y,
  1195                     width, height);
  1196       RotatePlane90(src_u, src_stride_u,
  1197                     dst_u, dst_stride_u,
  1198                     halfwidth, halfheight);
  1199       RotatePlane90(src_v, src_stride_v,
  1200                     dst_v, dst_stride_v,
  1201                     halfwidth, halfheight);
  1202       return 0;
  1203     case kRotate270:
  1204       RotatePlane270(src_y, src_stride_y,
  1205                      dst_y, dst_stride_y,
  1206                      width, height);
  1207       RotatePlane270(src_u, src_stride_u,
  1208                      dst_u, dst_stride_u,
  1209                      halfwidth, halfheight);
  1210       RotatePlane270(src_v, src_stride_v,
  1211                      dst_v, dst_stride_v,
  1212                      halfwidth, halfheight);
  1213       return 0;
  1214     case kRotate180:
  1215       RotatePlane180(src_y, src_stride_y,
  1216                      dst_y, dst_stride_y,
  1217                      width, height);
  1218       RotatePlane180(src_u, src_stride_u,
  1219                      dst_u, dst_stride_u,
  1220                      halfwidth, halfheight);
  1221       RotatePlane180(src_v, src_stride_v,
  1222                      dst_v, dst_stride_v,
  1223                      halfwidth, halfheight);
  1224       return 0;
  1225     default:
  1226       break;
  1227   }
  1228   return -1;
  1229 }
  1231 LIBYUV_API
  1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
  1233                      const uint8* src_uv, int src_stride_uv,
  1234                      uint8* dst_y, int dst_stride_y,
  1235                      uint8* dst_u, int dst_stride_u,
  1236                      uint8* dst_v, int dst_stride_v,
  1237                      int width, int height,
  1238                      enum RotationMode mode) {
  1239   int halfwidth = (width + 1) >> 1;
  1240   int halfheight = (height + 1) >> 1;
  1241   if (!src_y || !src_uv || width <= 0 || height == 0 ||
  1242       !dst_y || !dst_u || !dst_v) {
  1243     return -1;
  1244   }
  1246   // Negative height means invert the image.
  1247   if (height < 0) {
  1248     height = -height;
  1249     halfheight = (height + 1) >> 1;
  1250     src_y = src_y + (height - 1) * src_stride_y;
  1251     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
  1252     src_stride_y = -src_stride_y;
  1253     src_stride_uv = -src_stride_uv;
  1254   }
  1256   switch (mode) {
  1257     case kRotate0:
  1258       // copy frame
  1259       return NV12ToI420(src_y, src_stride_y,
  1260                         src_uv, src_stride_uv,
  1261                         dst_y, dst_stride_y,
  1262                         dst_u, dst_stride_u,
  1263                         dst_v, dst_stride_v,
  1264                         width, height);
  1265     case kRotate90:
  1266       RotatePlane90(src_y, src_stride_y,
  1267                     dst_y, dst_stride_y,
  1268                     width, height);
  1269       RotateUV90(src_uv, src_stride_uv,
  1270                  dst_u, dst_stride_u,
  1271                  dst_v, dst_stride_v,
  1272                  halfwidth, halfheight);
  1273       return 0;
  1274     case kRotate270:
  1275       RotatePlane270(src_y, src_stride_y,
  1276                      dst_y, dst_stride_y,
  1277                      width, height);
  1278       RotateUV270(src_uv, src_stride_uv,
  1279                   dst_u, dst_stride_u,
  1280                   dst_v, dst_stride_v,
  1281                   halfwidth, halfheight);
  1282       return 0;
  1283     case kRotate180:
  1284       RotatePlane180(src_y, src_stride_y,
  1285                      dst_y, dst_stride_y,
  1286                      width, height);
  1287       RotateUV180(src_uv, src_stride_uv,
  1288                   dst_u, dst_stride_u,
  1289                   dst_v, dst_stride_v,
  1290                   halfwidth, halfheight);
  1291       return 0;
  1292     default:
  1293       break;
  1294   }
  1295   return -1;
  1296 }
  1298 #ifdef __cplusplus
  1299 }  // extern "C"
  1300 }  // namespace libyuv
  1301 #endif
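For reference, a minimal caller-side sketch of the public I420Rotate entry
point above. The buffer layout and names here are hypothetical; the only
detail taken from the code is that kRotate90/kRotate270 swap the two
dimensions, so the destination Y plane must be height x width and the chroma
planes halfheight x halfwidth.

#include "libyuv/rotate.h"

// Rotate a width x height I420 frame by 90 degrees into a height x width
// destination. src_i420 and dst_i420 are assumed to be tightly packed
// I420 buffers of the appropriate size.
int RotateI420By90(const uint8* src_i420, uint8* dst_i420,
                   int width, int height) {
  int hw = (width + 1) >> 1;   // chroma plane width
  int hh = (height + 1) >> 1;  // chroma plane height
  const uint8* src_y = src_i420;
  const uint8* src_u = src_y + width * height;
  const uint8* src_v = src_u + hw * hh;
  uint8* dst_y = dst_i420;                // destination is height x width
  uint8* dst_u = dst_y + width * height;
  uint8* dst_v = dst_u + hw * hh;
  return libyuv::I420Rotate(src_y, width, src_u, hw, src_v, hw,
                            dst_y, height, dst_u, hh, dst_v, hh,
                            width, height, libyuv::kRotate90);
}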
