media/libyuv/source/rotate.cc

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/source/rotate.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1301 @@
     1.4 +/*
     1.5 + *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "libyuv/rotate.h"
    1.15 +
    1.16 +#include "libyuv/cpu_id.h"
    1.17 +#include "libyuv/convert.h"
    1.18 +#include "libyuv/planar_functions.h"
    1.19 +#include "libyuv/row.h"
    1.20 +
    1.21 +#ifdef __cplusplus
    1.22 +namespace libyuv {
    1.23 +extern "C" {
    1.24 +#endif
    1.25 +
    1.26 +#if !defined(LIBYUV_DISABLE_X86) && \
    1.27 +    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
    1.28 +#if defined(__APPLE__) && defined(__i386__)
    1.29 +#define DECLARE_FUNCTION(name)                                                 \
    1.30 +    ".text                                     \n"                             \
    1.31 +    ".private_extern _" #name "                \n"                             \
    1.32 +    ".align 4,0x90                             \n"                             \
    1.33 +"_" #name ":                                   \n"
    1.34 +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
    1.35 +#define DECLARE_FUNCTION(name)                                                 \
    1.36 +    ".text                                     \n"                             \
    1.37 +    ".align 4,0x90                             \n"                             \
    1.38 +"_" #name ":                                   \n"
    1.39 +#else
    1.40 +#define DECLARE_FUNCTION(name)                                                 \
    1.41 +    ".text                                     \n"                             \
    1.42 +    ".align 4,0x90                             \n"                             \
    1.43 +#name ":                                       \n"
    1.44 +#endif
    1.45 +#endif
    1.46 +
    1.47 +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
    1.48 +    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
    1.49 +#define HAS_MIRRORROW_NEON
    1.50 +void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
    1.51 +#define HAS_MIRRORROW_UV_NEON
    1.52 +void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
    1.53 +#define HAS_TRANSPOSE_WX8_NEON
    1.54 +void TransposeWx8_NEON(const uint8* src, int src_stride,
    1.55 +                       uint8* dst, int dst_stride, int width);
    1.56 +#define HAS_TRANSPOSE_UVWX8_NEON
    1.57 +void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    1.58 +                         uint8* dst_a, int dst_stride_a,
    1.59 +                         uint8* dst_b, int dst_stride_b,
    1.60 +                         int width);
    1.61 +#endif  // defined(__ARM_NEON__)
    1.62 +
    1.63 +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
    1.64 +    defined(__mips__) && \
    1.65 +    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    1.66 +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
    1.67 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    1.68 +                             uint8* dst, int dst_stride, int width);
    1.69 +
    1.70 +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
    1.71 +                                  uint8* dst, int dst_stride, int width);
    1.72 +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
    1.73 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
    1.74 +                               uint8* dst_a, int dst_stride_a,
    1.75 +                               uint8* dst_b, int dst_stride_b,
    1.76 +                               int width);
    1.77 +#endif  // defined(__mips__)
    1.78 +
    1.79 +#if !defined(LIBYUV_DISABLE_X86) && \
    1.80 +    defined(_M_IX86) && defined(_MSC_VER)
    1.81 +#define HAS_TRANSPOSE_WX8_SSSE3
    1.82 +__declspec(naked) __declspec(align(16))
    1.83 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    1.84 +                               uint8* dst, int dst_stride, int width) {
    1.85 +  __asm {
    1.86 +    push      edi
    1.87 +    push      esi
    1.88 +    push      ebp
    1.89 +    mov       eax, [esp + 12 + 4]   // src
    1.90 +    mov       edi, [esp + 12 + 8]   // src_stride
    1.91 +    mov       edx, [esp + 12 + 12]  // dst
    1.92 +    mov       esi, [esp + 12 + 16]  // dst_stride
    1.93 +    mov       ecx, [esp + 12 + 20]  // width
    1.94 +
    1.95 +    // Read in the data from the source pointer.
    1.96 +    // First round of bit swap.
    1.97 +    align      4
    1.98 + convertloop:
    1.99 +    movq      xmm0, qword ptr [eax]
   1.100 +    lea       ebp, [eax + 8]
   1.101 +    movq      xmm1, qword ptr [eax + edi]
   1.102 +    lea       eax, [eax + 2 * edi]
   1.103 +    punpcklbw xmm0, xmm1
   1.104 +    movq      xmm2, qword ptr [eax]
   1.105 +    movdqa    xmm1, xmm0
   1.106 +    palignr   xmm1, xmm1, 8
   1.107 +    movq      xmm3, qword ptr [eax + edi]
   1.108 +    lea       eax, [eax + 2 * edi]
   1.109 +    punpcklbw xmm2, xmm3
   1.110 +    movdqa    xmm3, xmm2
   1.111 +    movq      xmm4, qword ptr [eax]
   1.112 +    palignr   xmm3, xmm3, 8
   1.113 +    movq      xmm5, qword ptr [eax + edi]
   1.114 +    punpcklbw xmm4, xmm5
   1.115 +    lea       eax, [eax + 2 * edi]
   1.116 +    movdqa    xmm5, xmm4
   1.117 +    movq      xmm6, qword ptr [eax]
   1.118 +    palignr   xmm5, xmm5, 8
   1.119 +    movq      xmm7, qword ptr [eax + edi]
   1.120 +    punpcklbw xmm6, xmm7
   1.121 +    mov       eax, ebp
   1.122 +    movdqa    xmm7, xmm6
   1.123 +    palignr   xmm7, xmm7, 8
   1.124 +    // Second round of bit swap.
   1.125 +    punpcklwd xmm0, xmm2
   1.126 +    punpcklwd xmm1, xmm3
   1.127 +    movdqa    xmm2, xmm0
   1.128 +    movdqa    xmm3, xmm1
   1.129 +    palignr   xmm2, xmm2, 8
   1.130 +    palignr   xmm3, xmm3, 8
   1.131 +    punpcklwd xmm4, xmm6
   1.132 +    punpcklwd xmm5, xmm7
   1.133 +    movdqa    xmm6, xmm4
   1.134 +    movdqa    xmm7, xmm5
   1.135 +    palignr   xmm6, xmm6, 8
   1.136 +    palignr   xmm7, xmm7, 8
   1.137 +    // Third round of bit swap.
   1.138 +    // Write to the destination pointer.
   1.139 +    punpckldq xmm0, xmm4
   1.140 +    movq      qword ptr [edx], xmm0
   1.141 +    movdqa    xmm4, xmm0
   1.142 +    palignr   xmm4, xmm4, 8
   1.143 +    movq      qword ptr [edx + esi], xmm4
   1.144 +    lea       edx, [edx + 2 * esi]
   1.145 +    punpckldq xmm2, xmm6
   1.146 +    movdqa    xmm6, xmm2
   1.147 +    palignr   xmm6, xmm6, 8
   1.148 +    movq      qword ptr [edx], xmm2
   1.149 +    punpckldq xmm1, xmm5
   1.150 +    movq      qword ptr [edx + esi], xmm6
   1.151 +    lea       edx, [edx + 2 * esi]
   1.152 +    movdqa    xmm5, xmm1
   1.153 +    movq      qword ptr [edx], xmm1
   1.154 +    palignr   xmm5, xmm5, 8
   1.155 +    punpckldq xmm3, xmm7
   1.156 +    movq      qword ptr [edx + esi], xmm5
   1.157 +    lea       edx, [edx + 2 * esi]
   1.158 +    movq      qword ptr [edx], xmm3
   1.159 +    movdqa    xmm7, xmm3
   1.160 +    palignr   xmm7, xmm7, 8
   1.161 +    sub       ecx, 8
   1.162 +    movq      qword ptr [edx + esi], xmm7
   1.163 +    lea       edx, [edx + 2 * esi]
   1.164 +    jg        convertloop
   1.165 +
   1.166 +    pop       ebp
   1.167 +    pop       esi
   1.168 +    pop       edi
   1.169 +    ret
   1.170 +  }
   1.171 +}
   1.172 +
   1.173 +#define HAS_TRANSPOSE_UVWX8_SSE2
   1.174 +__declspec(naked) __declspec(align(16))
   1.175 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   1.176 +                                uint8* dst_a, int dst_stride_a,
   1.177 +                                uint8* dst_b, int dst_stride_b,
   1.178 +                                int w) {
   1.179 +  __asm {
   1.180 +    push      ebx
   1.181 +    push      esi
   1.182 +    push      edi
   1.183 +    push      ebp
   1.184 +    mov       eax, [esp + 16 + 4]   // src
   1.185 +    mov       edi, [esp + 16 + 8]   // src_stride
   1.186 +    mov       edx, [esp + 16 + 12]  // dst_a
   1.187 +    mov       esi, [esp + 16 + 16]  // dst_stride_a
   1.188 +    mov       ebx, [esp + 16 + 20]  // dst_b
   1.189 +    mov       ebp, [esp + 16 + 24]  // dst_stride_b
   1.190 +    mov       ecx, esp
   1.191 +    sub       esp, 4 + 16
   1.192 +    and       esp, ~15
   1.193 +    mov       [esp + 16], ecx
   1.194 +    mov       ecx, [ecx + 16 + 28]  // w
   1.195 +
   1.196 +    align      4
   1.197 + convertloop:
   1.198 +    // Read in the data from the source pointer.
   1.199 +    // First round of bit swap.
   1.200 +    movdqa    xmm0, [eax]
   1.201 +    movdqa    xmm1, [eax + edi]
   1.202 +    lea       eax, [eax + 2 * edi]
   1.203 +    movdqa    xmm7, xmm0  // use xmm7 as temp register.
   1.204 +    punpcklbw xmm0, xmm1
   1.205 +    punpckhbw xmm7, xmm1
   1.206 +    movdqa    xmm1, xmm7
   1.207 +    movdqa    xmm2, [eax]
   1.208 +    movdqa    xmm3, [eax + edi]
   1.209 +    lea       eax, [eax + 2 * edi]
   1.210 +    movdqa    xmm7, xmm2
   1.211 +    punpcklbw xmm2, xmm3
   1.212 +    punpckhbw xmm7, xmm3
   1.213 +    movdqa    xmm3, xmm7
   1.214 +    movdqa    xmm4, [eax]
   1.215 +    movdqa    xmm5, [eax + edi]
   1.216 +    lea       eax, [eax + 2 * edi]
   1.217 +    movdqa    xmm7, xmm4
   1.218 +    punpcklbw xmm4, xmm5
   1.219 +    punpckhbw xmm7, xmm5
   1.220 +    movdqa    xmm5, xmm7
   1.221 +    movdqa    xmm6, [eax]
   1.222 +    movdqa    xmm7, [eax + edi]
   1.223 +    lea       eax, [eax + 2 * edi]
   1.224 +    movdqa    [esp], xmm5  // backup xmm5
   1.225 +    neg       edi
   1.226 +    movdqa    xmm5, xmm6   // use xmm5 as temp register.
   1.227 +    punpcklbw xmm6, xmm7
   1.228 +    punpckhbw xmm5, xmm7
   1.229 +    movdqa    xmm7, xmm5
   1.230 +    lea       eax, [eax + 8 * edi + 16]
   1.231 +    neg       edi
   1.232 +    // Second round of bit swap.
   1.233 +    movdqa    xmm5, xmm0
   1.234 +    punpcklwd xmm0, xmm2
   1.235 +    punpckhwd xmm5, xmm2
   1.236 +    movdqa    xmm2, xmm5
   1.237 +    movdqa    xmm5, xmm1
   1.238 +    punpcklwd xmm1, xmm3
   1.239 +    punpckhwd xmm5, xmm3
   1.240 +    movdqa    xmm3, xmm5
   1.241 +    movdqa    xmm5, xmm4
   1.242 +    punpcklwd xmm4, xmm6
   1.243 +    punpckhwd xmm5, xmm6
   1.244 +    movdqa    xmm6, xmm5
   1.245 +    movdqa    xmm5, [esp]  // restore xmm5
   1.246 +    movdqa    [esp], xmm6  // backup xmm6
   1.247 +    movdqa    xmm6, xmm5    // use xmm6 as temp register.
   1.248 +    punpcklwd xmm5, xmm7
   1.249 +    punpckhwd xmm6, xmm7
   1.250 +    movdqa    xmm7, xmm6
   1.251 +    // Third round of bit swap.
   1.252 +    // Write to the destination pointer.
   1.253 +    movdqa    xmm6, xmm0
   1.254 +    punpckldq xmm0, xmm4
   1.255 +    punpckhdq xmm6, xmm4
   1.256 +    movdqa    xmm4, xmm6
   1.257 +    movdqa    xmm6, [esp]  // restore xmm6
   1.258 +    movlpd    qword ptr [edx], xmm0
   1.259 +    movhpd    qword ptr [ebx], xmm0
   1.260 +    movlpd    qword ptr [edx + esi], xmm4
   1.261 +    lea       edx, [edx + 2 * esi]
   1.262 +    movhpd    qword ptr [ebx + ebp], xmm4
   1.263 +    lea       ebx, [ebx + 2 * ebp]
   1.264 +    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
   1.265 +    punpckldq xmm2, xmm6
   1.266 +    movlpd    qword ptr [edx], xmm2
   1.267 +    movhpd    qword ptr [ebx], xmm2
   1.268 +    punpckhdq xmm0, xmm6
   1.269 +    movlpd    qword ptr [edx + esi], xmm0
   1.270 +    lea       edx, [edx + 2 * esi]
   1.271 +    movhpd    qword ptr [ebx + ebp], xmm0
   1.272 +    lea       ebx, [ebx + 2 * ebp]
   1.273 +    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
   1.274 +    punpckldq xmm1, xmm5
   1.275 +    movlpd    qword ptr [edx], xmm1
   1.276 +    movhpd    qword ptr [ebx], xmm1
   1.277 +    punpckhdq xmm0, xmm5
   1.278 +    movlpd    qword ptr [edx + esi], xmm0
   1.279 +    lea       edx, [edx + 2 * esi]
   1.280 +    movhpd    qword ptr [ebx + ebp], xmm0
   1.281 +    lea       ebx, [ebx + 2 * ebp]
   1.282 +    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
   1.283 +    punpckldq xmm3, xmm7
   1.284 +    movlpd    qword ptr [edx], xmm3
   1.285 +    movhpd    qword ptr [ebx], xmm3
   1.286 +    punpckhdq xmm0, xmm7
   1.287 +    sub       ecx, 8
   1.288 +    movlpd    qword ptr [edx + esi], xmm0
   1.289 +    lea       edx, [edx + 2 * esi]
   1.290 +    movhpd    qword ptr [ebx + ebp], xmm0
   1.291 +    lea       ebx, [ebx + 2 * ebp]
   1.292 +    jg        convertloop
   1.293 +
   1.294 +    mov       esp, [esp + 16]
   1.295 +    pop       ebp
   1.296 +    pop       edi
   1.297 +    pop       esi
   1.298 +    pop       ebx
   1.299 +    ret
   1.300 +  }
   1.301 +}
   1.302 +#elif !defined(LIBYUV_DISABLE_X86) && \
   1.303 +    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
   1.304 +#define HAS_TRANSPOSE_WX8_SSSE3
   1.305 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   1.306 +                               uint8* dst, int dst_stride, int width) {
   1.307 +  asm volatile (
   1.308 +    // Read in the data from the source pointer.
   1.309 +    // First round of bit swap.
   1.310 +    ".p2align  2                                 \n"
   1.311 +  "1:                                            \n"
   1.312 +    "movq       (%0),%%xmm0                      \n"
   1.313 +    "movq       (%0,%3),%%xmm1                   \n"
   1.314 +    "lea        (%0,%3,2),%0                     \n"
   1.315 +    "punpcklbw  %%xmm1,%%xmm0                    \n"
   1.316 +    "movq       (%0),%%xmm2                      \n"
   1.317 +    "movdqa     %%xmm0,%%xmm1                    \n"
   1.318 +    "palignr    $0x8,%%xmm1,%%xmm1               \n"
   1.319 +    "movq       (%0,%3),%%xmm3                   \n"
   1.320 +    "lea        (%0,%3,2),%0                     \n"
   1.321 +    "punpcklbw  %%xmm3,%%xmm2                    \n"
   1.322 +    "movdqa     %%xmm2,%%xmm3                    \n"
   1.323 +    "movq       (%0),%%xmm4                      \n"
   1.324 +    "palignr    $0x8,%%xmm3,%%xmm3               \n"
   1.325 +    "movq       (%0,%3),%%xmm5                   \n"
   1.326 +    "lea        (%0,%3,2),%0                     \n"
   1.327 +    "punpcklbw  %%xmm5,%%xmm4                    \n"
   1.328 +    "movdqa     %%xmm4,%%xmm5                    \n"
   1.329 +    "movq       (%0),%%xmm6                      \n"
   1.330 +    "palignr    $0x8,%%xmm5,%%xmm5               \n"
   1.331 +    "movq       (%0,%3),%%xmm7                   \n"
   1.332 +    "lea        (%0,%3,2),%0                     \n"
   1.333 +    "punpcklbw  %%xmm7,%%xmm6                    \n"
   1.334 +    "neg        %3                               \n"
   1.335 +    "movdqa     %%xmm6,%%xmm7                    \n"
   1.336 +    "lea        0x8(%0,%3,8),%0                  \n"
   1.337 +    "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.338 +    "neg        %3                               \n"
   1.339 +     // Second round of bit swap.
   1.340 +    "punpcklwd  %%xmm2,%%xmm0                    \n"
   1.341 +    "punpcklwd  %%xmm3,%%xmm1                    \n"
   1.342 +    "movdqa     %%xmm0,%%xmm2                    \n"
   1.343 +    "movdqa     %%xmm1,%%xmm3                    \n"
   1.344 +    "palignr    $0x8,%%xmm2,%%xmm2               \n"
   1.345 +    "palignr    $0x8,%%xmm3,%%xmm3               \n"
   1.346 +    "punpcklwd  %%xmm6,%%xmm4                    \n"
   1.347 +    "punpcklwd  %%xmm7,%%xmm5                    \n"
   1.348 +    "movdqa     %%xmm4,%%xmm6                    \n"
   1.349 +    "movdqa     %%xmm5,%%xmm7                    \n"
   1.350 +    "palignr    $0x8,%%xmm6,%%xmm6               \n"
   1.351 +    "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.352 +    // Third round of bit swap.
   1.353 +    // Write to the destination pointer.
   1.354 +    "punpckldq  %%xmm4,%%xmm0                    \n"
   1.355 +    "movq       %%xmm0,(%1)                      \n"
   1.356 +    "movdqa     %%xmm0,%%xmm4                    \n"
   1.357 +    "palignr    $0x8,%%xmm4,%%xmm4               \n"
   1.358 +    "movq       %%xmm4,(%1,%4)                   \n"
   1.359 +    "lea        (%1,%4,2),%1                     \n"
   1.360 +    "punpckldq  %%xmm6,%%xmm2                    \n"
   1.361 +    "movdqa     %%xmm2,%%xmm6                    \n"
   1.362 +    "movq       %%xmm2,(%1)                      \n"
   1.363 +    "palignr    $0x8,%%xmm6,%%xmm6               \n"
   1.364 +    "punpckldq  %%xmm5,%%xmm1                    \n"
   1.365 +    "movq       %%xmm6,(%1,%4)                   \n"
   1.366 +    "lea        (%1,%4,2),%1                     \n"
   1.367 +    "movdqa     %%xmm1,%%xmm5                    \n"
   1.368 +    "movq       %%xmm1,(%1)                      \n"
   1.369 +    "palignr    $0x8,%%xmm5,%%xmm5               \n"
   1.370 +    "movq       %%xmm5,(%1,%4)                   \n"
   1.371 +    "lea        (%1,%4,2),%1                     \n"
   1.372 +    "punpckldq  %%xmm7,%%xmm3                    \n"
   1.373 +    "movq       %%xmm3,(%1)                      \n"
   1.374 +    "movdqa     %%xmm3,%%xmm7                    \n"
   1.375 +    "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.376 +    "sub        $0x8,%2                          \n"
   1.377 +    "movq       %%xmm7,(%1,%4)                   \n"
   1.378 +    "lea        (%1,%4,2),%1                     \n"
   1.379 +    "jg         1b                               \n"
   1.380 +    : "+r"(src),    // %0
   1.381 +      "+r"(dst),    // %1
   1.382 +      "+r"(width)   // %2
   1.383 +    : "r"((intptr_t)(src_stride)),  // %3
   1.384 +      "r"((intptr_t)(dst_stride))   // %4
   1.385 +    : "memory", "cc"
   1.386 +  #if defined(__SSE2__)
   1.387 +      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1.388 +  #endif
   1.389 +  );
   1.390 +}
   1.391 +
   1.392 +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
   1.393 +#define HAS_TRANSPOSE_UVWX8_SSE2
   1.394 +extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   1.395 +                                    uint8* dst_a, int dst_stride_a,
   1.396 +                                    uint8* dst_b, int dst_stride_b,
   1.397 +                                    int w);
   1.398 +  asm (
   1.399 +    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
   1.400 +    "push   %ebx                               \n"
   1.401 +    "push   %esi                               \n"
   1.402 +    "push   %edi                               \n"
   1.403 +    "push   %ebp                               \n"
   1.404 +    "mov    0x14(%esp),%eax                    \n"
   1.405 +    "mov    0x18(%esp),%edi                    \n"
   1.406 +    "mov    0x1c(%esp),%edx                    \n"
   1.407 +    "mov    0x20(%esp),%esi                    \n"
   1.408 +    "mov    0x24(%esp),%ebx                    \n"
   1.409 +    "mov    0x28(%esp),%ebp                    \n"
   1.410 +    "mov    %esp,%ecx                          \n"
   1.411 +    "sub    $0x14,%esp                         \n"
   1.412 +    "and    $0xfffffff0,%esp                   \n"
   1.413 +    "mov    %ecx,0x10(%esp)                    \n"
   1.414 +    "mov    0x2c(%ecx),%ecx                    \n"
   1.415 +
   1.416 +"1:                                            \n"
   1.417 +    "movdqa (%eax),%xmm0                       \n"
   1.418 +    "movdqa (%eax,%edi,1),%xmm1                \n"
   1.419 +    "lea    (%eax,%edi,2),%eax                 \n"
   1.420 +    "movdqa %xmm0,%xmm7                        \n"
   1.421 +    "punpcklbw %xmm1,%xmm0                     \n"
   1.422 +    "punpckhbw %xmm1,%xmm7                     \n"
   1.423 +    "movdqa %xmm7,%xmm1                        \n"
   1.424 +    "movdqa (%eax),%xmm2                       \n"
   1.425 +    "movdqa (%eax,%edi,1),%xmm3                \n"
   1.426 +    "lea    (%eax,%edi,2),%eax                 \n"
   1.427 +    "movdqa %xmm2,%xmm7                        \n"
   1.428 +    "punpcklbw %xmm3,%xmm2                     \n"
   1.429 +    "punpckhbw %xmm3,%xmm7                     \n"
   1.430 +    "movdqa %xmm7,%xmm3                        \n"
   1.431 +    "movdqa (%eax),%xmm4                       \n"
   1.432 +    "movdqa (%eax,%edi,1),%xmm5                \n"
   1.433 +    "lea    (%eax,%edi,2),%eax                 \n"
   1.434 +    "movdqa %xmm4,%xmm7                        \n"
   1.435 +    "punpcklbw %xmm5,%xmm4                     \n"
   1.436 +    "punpckhbw %xmm5,%xmm7                     \n"
   1.437 +    "movdqa %xmm7,%xmm5                        \n"
   1.438 +    "movdqa (%eax),%xmm6                       \n"
   1.439 +    "movdqa (%eax,%edi,1),%xmm7                \n"
   1.440 +    "lea    (%eax,%edi,2),%eax                 \n"
   1.441 +    "movdqa %xmm5,(%esp)                       \n"
   1.442 +    "neg    %edi                               \n"
   1.443 +    "movdqa %xmm6,%xmm5                        \n"
   1.444 +    "punpcklbw %xmm7,%xmm6                     \n"
   1.445 +    "punpckhbw %xmm7,%xmm5                     \n"
   1.446 +    "movdqa %xmm5,%xmm7                        \n"
   1.447 +    "lea    0x10(%eax,%edi,8),%eax             \n"
   1.448 +    "neg    %edi                               \n"
   1.449 +    "movdqa %xmm0,%xmm5                        \n"
   1.450 +    "punpcklwd %xmm2,%xmm0                     \n"
   1.451 +    "punpckhwd %xmm2,%xmm5                     \n"
   1.452 +    "movdqa %xmm5,%xmm2                        \n"
   1.453 +    "movdqa %xmm1,%xmm5                        \n"
   1.454 +    "punpcklwd %xmm3,%xmm1                     \n"
   1.455 +    "punpckhwd %xmm3,%xmm5                     \n"
   1.456 +    "movdqa %xmm5,%xmm3                        \n"
   1.457 +    "movdqa %xmm4,%xmm5                        \n"
   1.458 +    "punpcklwd %xmm6,%xmm4                     \n"
   1.459 +    "punpckhwd %xmm6,%xmm5                     \n"
   1.460 +    "movdqa %xmm5,%xmm6                        \n"
   1.461 +    "movdqa (%esp),%xmm5                       \n"
   1.462 +    "movdqa %xmm6,(%esp)                       \n"
   1.463 +    "movdqa %xmm5,%xmm6                        \n"
   1.464 +    "punpcklwd %xmm7,%xmm5                     \n"
   1.465 +    "punpckhwd %xmm7,%xmm6                     \n"
   1.466 +    "movdqa %xmm6,%xmm7                        \n"
   1.467 +    "movdqa %xmm0,%xmm6                        \n"
   1.468 +    "punpckldq %xmm4,%xmm0                     \n"
   1.469 +    "punpckhdq %xmm4,%xmm6                     \n"
   1.470 +    "movdqa %xmm6,%xmm4                        \n"
   1.471 +    "movdqa (%esp),%xmm6                       \n"
   1.472 +    "movlpd %xmm0,(%edx)                       \n"
   1.473 +    "movhpd %xmm0,(%ebx)                       \n"
   1.474 +    "movlpd %xmm4,(%edx,%esi,1)                \n"
   1.475 +    "lea    (%edx,%esi,2),%edx                 \n"
   1.476 +    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
   1.477 +    "lea    (%ebx,%ebp,2),%ebx                 \n"
   1.478 +    "movdqa %xmm2,%xmm0                        \n"
   1.479 +    "punpckldq %xmm6,%xmm2                     \n"
   1.480 +    "movlpd %xmm2,(%edx)                       \n"
   1.481 +    "movhpd %xmm2,(%ebx)                       \n"
   1.482 +    "punpckhdq %xmm6,%xmm0                     \n"
   1.483 +    "movlpd %xmm0,(%edx,%esi,1)                \n"
   1.484 +    "lea    (%edx,%esi,2),%edx                 \n"
   1.485 +    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   1.486 +    "lea    (%ebx,%ebp,2),%ebx                 \n"
   1.487 +    "movdqa %xmm1,%xmm0                        \n"
   1.488 +    "punpckldq %xmm5,%xmm1                     \n"
   1.489 +    "movlpd %xmm1,(%edx)                       \n"
   1.490 +    "movhpd %xmm1,(%ebx)                       \n"
   1.491 +    "punpckhdq %xmm5,%xmm0                     \n"
   1.492 +    "movlpd %xmm0,(%edx,%esi,1)                \n"
   1.493 +    "lea    (%edx,%esi,2),%edx                 \n"
   1.494 +    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   1.495 +    "lea    (%ebx,%ebp,2),%ebx                 \n"
   1.496 +    "movdqa %xmm3,%xmm0                        \n"
   1.497 +    "punpckldq %xmm7,%xmm3                     \n"
   1.498 +    "movlpd %xmm3,(%edx)                       \n"
   1.499 +    "movhpd %xmm3,(%ebx)                       \n"
   1.500 +    "punpckhdq %xmm7,%xmm0                     \n"
   1.501 +    "sub    $0x8,%ecx                          \n"
   1.502 +    "movlpd %xmm0,(%edx,%esi,1)                \n"
   1.503 +    "lea    (%edx,%esi,2),%edx                 \n"
   1.504 +    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
   1.505 +    "lea    (%ebx,%ebp,2),%ebx                 \n"
   1.506 +    "jg     1b                                 \n"
   1.507 +    "mov    0x10(%esp),%esp                    \n"
   1.508 +    "pop    %ebp                               \n"
   1.509 +    "pop    %edi                               \n"
   1.510 +    "pop    %esi                               \n"
   1.511 +    "pop    %ebx                               \n"
   1.512 +#if defined(__native_client__)
   1.513 +    "pop    %ecx                               \n"
   1.514 +    "and    $0xffffffe0,%ecx                   \n"
   1.515 +    "jmp    *%ecx                              \n"
   1.516 +#else
   1.517 +    "ret                                       \n"
   1.518 +#endif
   1.519 +);
   1.520 +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
   1.521 +    defined(__x86_64__)
   1.522 +// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
   1.523 +#define HAS_TRANSPOSE_WX8_FAST_SSSE3
   1.524 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   1.525 +                                    uint8* dst, int dst_stride, int width) {
   1.526 +  asm volatile (
   1.527 +  // Read in the data from the source pointer.
   1.528 +  // First round of bit swap.
   1.529 +  ".p2align  2                                 \n"
   1.530 +"1:                                            \n"
   1.531 +  "movdqa     (%0),%%xmm0                      \n"
   1.532 +  "movdqa     (%0,%3),%%xmm1                   \n"
   1.533 +  "lea        (%0,%3,2),%0                     \n"
   1.534 +  "movdqa     %%xmm0,%%xmm8                    \n"
   1.535 +  "punpcklbw  %%xmm1,%%xmm0                    \n"
   1.536 +  "punpckhbw  %%xmm1,%%xmm8                    \n"
   1.537 +  "movdqa     (%0),%%xmm2                      \n"
   1.538 +  "movdqa     %%xmm0,%%xmm1                    \n"
   1.539 +  "movdqa     %%xmm8,%%xmm9                    \n"
   1.540 +  "palignr    $0x8,%%xmm1,%%xmm1               \n"
   1.541 +  "palignr    $0x8,%%xmm9,%%xmm9               \n"
   1.542 +  "movdqa     (%0,%3),%%xmm3                   \n"
   1.543 +  "lea        (%0,%3,2),%0                     \n"
   1.544 +  "movdqa     %%xmm2,%%xmm10                   \n"
   1.545 +  "punpcklbw  %%xmm3,%%xmm2                    \n"
   1.546 +  "punpckhbw  %%xmm3,%%xmm10                   \n"
   1.547 +  "movdqa     %%xmm2,%%xmm3                    \n"
   1.548 +  "movdqa     %%xmm10,%%xmm11                  \n"
   1.549 +  "movdqa     (%0),%%xmm4                      \n"
   1.550 +  "palignr    $0x8,%%xmm3,%%xmm3               \n"
   1.551 +  "palignr    $0x8,%%xmm11,%%xmm11             \n"
   1.552 +  "movdqa     (%0,%3),%%xmm5                   \n"
   1.553 +  "lea        (%0,%3,2),%0                     \n"
   1.554 +  "movdqa     %%xmm4,%%xmm12                   \n"
   1.555 +  "punpcklbw  %%xmm5,%%xmm4                    \n"
   1.556 +  "punpckhbw  %%xmm5,%%xmm12                   \n"
   1.557 +  "movdqa     %%xmm4,%%xmm5                    \n"
   1.558 +  "movdqa     %%xmm12,%%xmm13                  \n"
   1.559 +  "movdqa     (%0),%%xmm6                      \n"
   1.560 +  "palignr    $0x8,%%xmm5,%%xmm5               \n"
   1.561 +  "palignr    $0x8,%%xmm13,%%xmm13             \n"
   1.562 +  "movdqa     (%0,%3),%%xmm7                   \n"
   1.563 +  "lea        (%0,%3,2),%0                     \n"
   1.564 +  "movdqa     %%xmm6,%%xmm14                   \n"
   1.565 +  "punpcklbw  %%xmm7,%%xmm6                    \n"
   1.566 +  "punpckhbw  %%xmm7,%%xmm14                   \n"
   1.567 +  "neg        %3                               \n"
   1.568 +  "movdqa     %%xmm6,%%xmm7                    \n"
   1.569 +  "movdqa     %%xmm14,%%xmm15                  \n"
   1.570 +  "lea        0x10(%0,%3,8),%0                 \n"
   1.571 +  "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.572 +  "palignr    $0x8,%%xmm15,%%xmm15             \n"
   1.573 +  "neg        %3                               \n"
   1.574 +   // Second round of bit swap.
   1.575 +  "punpcklwd  %%xmm2,%%xmm0                    \n"
   1.576 +  "punpcklwd  %%xmm3,%%xmm1                    \n"
   1.577 +  "movdqa     %%xmm0,%%xmm2                    \n"
   1.578 +  "movdqa     %%xmm1,%%xmm3                    \n"
   1.579 +  "palignr    $0x8,%%xmm2,%%xmm2               \n"
   1.580 +  "palignr    $0x8,%%xmm3,%%xmm3               \n"
   1.581 +  "punpcklwd  %%xmm6,%%xmm4                    \n"
   1.582 +  "punpcklwd  %%xmm7,%%xmm5                    \n"
   1.583 +  "movdqa     %%xmm4,%%xmm6                    \n"
   1.584 +  "movdqa     %%xmm5,%%xmm7                    \n"
   1.585 +  "palignr    $0x8,%%xmm6,%%xmm6               \n"
   1.586 +  "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.587 +  "punpcklwd  %%xmm10,%%xmm8                   \n"
   1.588 +  "punpcklwd  %%xmm11,%%xmm9                   \n"
   1.589 +  "movdqa     %%xmm8,%%xmm10                   \n"
   1.590 +  "movdqa     %%xmm9,%%xmm11                   \n"
   1.591 +  "palignr    $0x8,%%xmm10,%%xmm10             \n"
   1.592 +  "palignr    $0x8,%%xmm11,%%xmm11             \n"
   1.593 +  "punpcklwd  %%xmm14,%%xmm12                  \n"
   1.594 +  "punpcklwd  %%xmm15,%%xmm13                  \n"
   1.595 +  "movdqa     %%xmm12,%%xmm14                  \n"
   1.596 +  "movdqa     %%xmm13,%%xmm15                  \n"
   1.597 +  "palignr    $0x8,%%xmm14,%%xmm14             \n"
   1.598 +  "palignr    $0x8,%%xmm15,%%xmm15             \n"
   1.599 +  // Third round of bit swap.
   1.600 +  // Write to the destination pointer.
   1.601 +  "punpckldq  %%xmm4,%%xmm0                    \n"
   1.602 +  "movq       %%xmm0,(%1)                      \n"
   1.603 +  "movdqa     %%xmm0,%%xmm4                    \n"
   1.604 +  "palignr    $0x8,%%xmm4,%%xmm4               \n"
   1.605 +  "movq       %%xmm4,(%1,%4)                   \n"
   1.606 +  "lea        (%1,%4,2),%1                     \n"
   1.607 +  "punpckldq  %%xmm6,%%xmm2                    \n"
   1.608 +  "movdqa     %%xmm2,%%xmm6                    \n"
   1.609 +  "movq       %%xmm2,(%1)                      \n"
   1.610 +  "palignr    $0x8,%%xmm6,%%xmm6               \n"
   1.611 +  "punpckldq  %%xmm5,%%xmm1                    \n"
   1.612 +  "movq       %%xmm6,(%1,%4)                   \n"
   1.613 +  "lea        (%1,%4,2),%1                     \n"
   1.614 +  "movdqa     %%xmm1,%%xmm5                    \n"
   1.615 +  "movq       %%xmm1,(%1)                      \n"
   1.616 +  "palignr    $0x8,%%xmm5,%%xmm5               \n"
   1.617 +  "movq       %%xmm5,(%1,%4)                   \n"
   1.618 +  "lea        (%1,%4,2),%1                     \n"
   1.619 +  "punpckldq  %%xmm7,%%xmm3                    \n"
   1.620 +  "movq       %%xmm3,(%1)                      \n"
   1.621 +  "movdqa     %%xmm3,%%xmm7                    \n"
   1.622 +  "palignr    $0x8,%%xmm7,%%xmm7               \n"
   1.623 +  "movq       %%xmm7,(%1,%4)                   \n"
   1.624 +  "lea        (%1,%4,2),%1                     \n"
   1.625 +  "punpckldq  %%xmm12,%%xmm8                   \n"
   1.626 +  "movq       %%xmm8,(%1)                      \n"
   1.627 +  "movdqa     %%xmm8,%%xmm12                   \n"
   1.628 +  "palignr    $0x8,%%xmm12,%%xmm12             \n"
   1.629 +  "movq       %%xmm12,(%1,%4)                  \n"
   1.630 +  "lea        (%1,%4,2),%1                     \n"
   1.631 +  "punpckldq  %%xmm14,%%xmm10                  \n"
   1.632 +  "movdqa     %%xmm10,%%xmm14                  \n"
   1.633 +  "movq       %%xmm10,(%1)                     \n"
   1.634 +  "palignr    $0x8,%%xmm14,%%xmm14             \n"
   1.635 +  "punpckldq  %%xmm13,%%xmm9                   \n"
   1.636 +  "movq       %%xmm14,(%1,%4)                  \n"
   1.637 +  "lea        (%1,%4,2),%1                     \n"
   1.638 +  "movdqa     %%xmm9,%%xmm13                   \n"
   1.639 +  "movq       %%xmm9,(%1)                      \n"
   1.640 +  "palignr    $0x8,%%xmm13,%%xmm13             \n"
   1.641 +  "movq       %%xmm13,(%1,%4)                  \n"
   1.642 +  "lea        (%1,%4,2),%1                     \n"
   1.643 +  "punpckldq  %%xmm15,%%xmm11                  \n"
   1.644 +  "movq       %%xmm11,(%1)                     \n"
   1.645 +  "movdqa     %%xmm11,%%xmm15                  \n"
   1.646 +  "palignr    $0x8,%%xmm15,%%xmm15             \n"
   1.647 +  "sub        $0x10,%2                         \n"
   1.648 +  "movq       %%xmm15,(%1,%4)                  \n"
   1.649 +  "lea        (%1,%4,2),%1                     \n"
   1.650 +  "jg         1b                               \n"
   1.651 +  : "+r"(src),    // %0
   1.652 +    "+r"(dst),    // %1
   1.653 +    "+r"(width)   // %2
   1.654 +  : "r"((intptr_t)(src_stride)),  // %3
   1.655 +    "r"((intptr_t)(dst_stride))   // %4
   1.656 +  : "memory", "cc",
   1.657 +    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   1.658 +    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
   1.659 +);
   1.660 +}
   1.661 +
   1.662 +#define HAS_TRANSPOSE_UVWX8_SSE2
   1.663 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   1.664 +                                uint8* dst_a, int dst_stride_a,
   1.665 +                                uint8* dst_b, int dst_stride_b,
   1.666 +                                int w) {
   1.667 +  asm volatile (
   1.668 +  // Read in the data from the source pointer.
   1.669 +  // First round of bit swap.
   1.670 +  ".p2align  2                                 \n"
   1.671 +"1:                                            \n"
   1.672 +  "movdqa     (%0),%%xmm0                      \n"
   1.673 +  "movdqa     (%0,%4),%%xmm1                   \n"
   1.674 +  "lea        (%0,%4,2),%0                     \n"
   1.675 +  "movdqa     %%xmm0,%%xmm8                    \n"
   1.676 +  "punpcklbw  %%xmm1,%%xmm0                    \n"
   1.677 +  "punpckhbw  %%xmm1,%%xmm8                    \n"
   1.678 +  "movdqa     %%xmm8,%%xmm1                    \n"
   1.679 +  "movdqa     (%0),%%xmm2                      \n"
   1.680 +  "movdqa     (%0,%4),%%xmm3                   \n"
   1.681 +  "lea        (%0,%4,2),%0                     \n"
   1.682 +  "movdqa     %%xmm2,%%xmm8                    \n"
   1.683 +  "punpcklbw  %%xmm3,%%xmm2                    \n"
   1.684 +  "punpckhbw  %%xmm3,%%xmm8                    \n"
   1.685 +  "movdqa     %%xmm8,%%xmm3                    \n"
   1.686 +  "movdqa     (%0),%%xmm4                      \n"
   1.687 +  "movdqa     (%0,%4),%%xmm5                   \n"
   1.688 +  "lea        (%0,%4,2),%0                     \n"
   1.689 +  "movdqa     %%xmm4,%%xmm8                    \n"
   1.690 +  "punpcklbw  %%xmm5,%%xmm4                    \n"
   1.691 +  "punpckhbw  %%xmm5,%%xmm8                    \n"
   1.692 +  "movdqa     %%xmm8,%%xmm5                    \n"
   1.693 +  "movdqa     (%0),%%xmm6                      \n"
   1.694 +  "movdqa     (%0,%4),%%xmm7                   \n"
   1.695 +  "lea        (%0,%4,2),%0                     \n"
   1.696 +  "movdqa     %%xmm6,%%xmm8                    \n"
   1.697 +  "punpcklbw  %%xmm7,%%xmm6                    \n"
   1.698 +  "neg        %4                               \n"
   1.699 +  "lea        0x10(%0,%4,8),%0                 \n"
   1.700 +  "punpckhbw  %%xmm7,%%xmm8                    \n"
   1.701 +  "movdqa     %%xmm8,%%xmm7                    \n"
   1.702 +  "neg        %4                               \n"
   1.703 +   // Second round of bit swap.
   1.704 +  "movdqa     %%xmm0,%%xmm8                    \n"
   1.705 +  "movdqa     %%xmm1,%%xmm9                    \n"
   1.706 +  "punpckhwd  %%xmm2,%%xmm8                    \n"
   1.707 +  "punpckhwd  %%xmm3,%%xmm9                    \n"
   1.708 +  "punpcklwd  %%xmm2,%%xmm0                    \n"
   1.709 +  "punpcklwd  %%xmm3,%%xmm1                    \n"
   1.710 +  "movdqa     %%xmm8,%%xmm2                    \n"
   1.711 +  "movdqa     %%xmm9,%%xmm3                    \n"
   1.712 +  "movdqa     %%xmm4,%%xmm8                    \n"
   1.713 +  "movdqa     %%xmm5,%%xmm9                    \n"
   1.714 +  "punpckhwd  %%xmm6,%%xmm8                    \n"
   1.715 +  "punpckhwd  %%xmm7,%%xmm9                    \n"
   1.716 +  "punpcklwd  %%xmm6,%%xmm4                    \n"
   1.717 +  "punpcklwd  %%xmm7,%%xmm5                    \n"
   1.718 +  "movdqa     %%xmm8,%%xmm6                    \n"
   1.719 +  "movdqa     %%xmm9,%%xmm7                    \n"
   1.720 +  // Third round of bit swap.
   1.721 +  // Write to the destination pointer.
   1.722 +  "movdqa     %%xmm0,%%xmm8                    \n"
   1.723 +  "punpckldq  %%xmm4,%%xmm0                    \n"
   1.724 +  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
   1.725 +  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
   1.726 +  "punpckhdq  %%xmm4,%%xmm8                    \n"
   1.727 +  "movlpd     %%xmm8,(%1,%5)                   \n"
   1.728 +  "lea        (%1,%5,2),%1                     \n"
   1.729 +  "movhpd     %%xmm8,(%2,%6)                   \n"
   1.730 +  "lea        (%2,%6,2),%2                     \n"
   1.731 +  "movdqa     %%xmm2,%%xmm8                    \n"
   1.732 +  "punpckldq  %%xmm6,%%xmm2                    \n"
   1.733 +  "movlpd     %%xmm2,(%1)                      \n"
   1.734 +  "movhpd     %%xmm2,(%2)                      \n"
   1.735 +  "punpckhdq  %%xmm6,%%xmm8                    \n"
   1.736 +  "movlpd     %%xmm8,(%1,%5)                   \n"
   1.737 +  "lea        (%1,%5,2),%1                     \n"
   1.738 +  "movhpd     %%xmm8,(%2,%6)                   \n"
   1.739 +  "lea        (%2,%6,2),%2                     \n"
   1.740 +  "movdqa     %%xmm1,%%xmm8                    \n"
   1.741 +  "punpckldq  %%xmm5,%%xmm1                    \n"
   1.742 +  "movlpd     %%xmm1,(%1)                      \n"
   1.743 +  "movhpd     %%xmm1,(%2)                      \n"
   1.744 +  "punpckhdq  %%xmm5,%%xmm8                    \n"
   1.745 +  "movlpd     %%xmm8,(%1,%5)                   \n"
   1.746 +  "lea        (%1,%5,2),%1                     \n"
   1.747 +  "movhpd     %%xmm8,(%2,%6)                   \n"
   1.748 +  "lea        (%2,%6,2),%2                     \n"
   1.749 +  "movdqa     %%xmm3,%%xmm8                    \n"
   1.750 +  "punpckldq  %%xmm7,%%xmm3                    \n"
   1.751 +  "movlpd     %%xmm3,(%1)                      \n"
   1.752 +  "movhpd     %%xmm3,(%2)                      \n"
   1.753 +  "punpckhdq  %%xmm7,%%xmm8                    \n"
   1.754 +  "sub        $0x8,%3                          \n"
   1.755 +  "movlpd     %%xmm8,(%1,%5)                   \n"
   1.756 +  "lea        (%1,%5,2),%1                     \n"
   1.757 +  "movhpd     %%xmm8,(%2,%6)                   \n"
   1.758 +  "lea        (%2,%6,2),%2                     \n"
   1.759 +  "jg         1b                               \n"
   1.760 +  : "+r"(src),    // %0
   1.761 +    "+r"(dst_a),  // %1
   1.762 +    "+r"(dst_b),  // %2
   1.763 +    "+r"(w)   // %3
   1.764 +  : "r"((intptr_t)(src_stride)),    // %4
   1.765 +    "r"((intptr_t)(dst_stride_a)),  // %5
   1.766 +    "r"((intptr_t)(dst_stride_b))   // %6
   1.767 +  : "memory", "cc",
   1.768 +    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
   1.769 +    "xmm8", "xmm9"
   1.770 +);
   1.771 +}
   1.772 +#endif
   1.773 +#endif
   1.774 +
   1.775 +static void TransposeWx8_C(const uint8* src, int src_stride,
   1.776 +                           uint8* dst, int dst_stride,
   1.777 +                           int width) {
   1.778 +  int i;
   1.779 +  for (i = 0; i < width; ++i) {
   1.780 +    dst[0] = src[0 * src_stride];
   1.781 +    dst[1] = src[1 * src_stride];
   1.782 +    dst[2] = src[2 * src_stride];
   1.783 +    dst[3] = src[3 * src_stride];
   1.784 +    dst[4] = src[4 * src_stride];
   1.785 +    dst[5] = src[5 * src_stride];
   1.786 +    dst[6] = src[6 * src_stride];
   1.787 +    dst[7] = src[7 * src_stride];
   1.788 +    ++src;
   1.789 +    dst += dst_stride;
   1.790 +  }
   1.791 +}
   1.792 +
   1.793 +static void TransposeWxH_C(const uint8* src, int src_stride,
   1.794 +                           uint8* dst, int dst_stride,
   1.795 +                           int width, int height) {
   1.796 +  int i;
   1.797 +  for (i = 0; i < width; ++i) {
   1.798 +    int j;
   1.799 +    for (j = 0; j < height; ++j) {
   1.800 +      dst[i * dst_stride + j] = src[j * src_stride + i];
   1.801 +    }
   1.802 +  }
   1.803 +}
   1.804 +
   1.805 +LIBYUV_API
   1.806 +void TransposePlane(const uint8* src, int src_stride,
   1.807 +                    uint8* dst, int dst_stride,
   1.808 +                    int width, int height) {
   1.809 +  int i = height;
   1.810 +  void (*TransposeWx8)(const uint8* src, int src_stride,
   1.811 +                       uint8* dst, int dst_stride,
   1.812 +                       int width) = TransposeWx8_C;
   1.813 +#if defined(HAS_TRANSPOSE_WX8_NEON)
   1.814 +  if (TestCpuFlag(kCpuHasNEON)) {
   1.815 +    TransposeWx8 = TransposeWx8_NEON;
   1.816 +  }
   1.817 +#endif
   1.818 +#if defined(HAS_TRANSPOSE_WX8_SSSE3)
   1.819 +  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
   1.820 +    TransposeWx8 = TransposeWx8_SSSE3;
   1.821 +  }
   1.822 +#endif
   1.823 +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
   1.824 +  if (TestCpuFlag(kCpuHasSSSE3) &&
   1.825 +      IS_ALIGNED(width, 16) &&
   1.826 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
   1.827 +    TransposeWx8 = TransposeWx8_FAST_SSSE3;
   1.828 +  }
   1.829 +#endif
   1.830 +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
   1.831 +  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
   1.832 +    if (IS_ALIGNED(width, 4) &&
   1.833 +        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
   1.834 +      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
   1.835 +    } else {
   1.836 +      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
   1.837 +    }
   1.838 +  }
   1.839 +#endif
   1.840 +
   1.841 +  // Work across the source in 8x8 tiles
   1.842 +  while (i >= 8) {
   1.843 +    TransposeWx8(src, src_stride, dst, dst_stride, width);
   1.844 +    src += 8 * src_stride;    // Go down 8 rows.
   1.845 +    dst += 8;                 // Move over 8 columns.
   1.846 +    i -= 8;
   1.847 +  }
   1.848 +
   1.849 +  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
   1.850 +}
   1.851 +
   1.852 +LIBYUV_API
   1.853 +void RotatePlane90(const uint8* src, int src_stride,
   1.854 +                   uint8* dst, int dst_stride,
   1.855 +                   int width, int height) {
   1.856 +  // Rotate by 90 is a transpose with the source read
   1.857 +  // from bottom to top. So set the source pointer to the end
   1.858 +  // of the buffer and flip the sign of the source stride.
   1.859 +  src += src_stride * (height - 1);
   1.860 +  src_stride = -src_stride;
   1.861 +  TransposePlane(src, src_stride, dst, dst_stride, width, height);
   1.862 +}
   1.863 +
   1.864 +LIBYUV_API
   1.865 +void RotatePlane270(const uint8* src, int src_stride,
   1.866 +                    uint8* dst, int dst_stride,
   1.867 +                    int width, int height) {
   1.868 +  // Rotate by 270 is a transpose with the destination written
   1.869 +  // from bottom to top. So set the destination pointer to the end
   1.870 +  // of the buffer and flip the sign of the destination stride.
   1.871 +  dst += dst_stride * (width - 1);
   1.872 +  dst_stride = -dst_stride;
   1.873 +  TransposePlane(src, src_stride, dst, dst_stride, width, height);
   1.874 +}
   1.875 +
   1.876 +LIBYUV_API
   1.877 +void RotatePlane180(const uint8* src, int src_stride,
   1.878 +                    uint8* dst, int dst_stride,
   1.879 +                    int width, int height) {
   1.880 +  // Swap first and last row and mirror the content. Uses a temporary row.
   1.881 +  align_buffer_64(row, width);
   1.882 +  const uint8* src_bot = src + src_stride * (height - 1);
   1.883 +  uint8* dst_bot = dst + dst_stride * (height - 1);
   1.884 +  int half_height = (height + 1) >> 1;
   1.885 +  int y;
   1.886 +  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
   1.887 +  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
   1.888 +#if defined(HAS_MIRRORROW_NEON)
   1.889 +  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
   1.890 +    MirrorRow = MirrorRow_NEON;
   1.891 +  }
   1.892 +#endif
   1.893 +#if defined(HAS_MIRRORROW_SSE2)
   1.894 +  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
   1.895 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   1.896 +      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   1.897 +    MirrorRow = MirrorRow_SSE2;
   1.898 +  }
   1.899 +#endif
   1.900 +#if defined(HAS_MIRRORROW_SSSE3)
   1.901 +  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
   1.902 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   1.903 +      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   1.904 +    MirrorRow = MirrorRow_SSSE3;
   1.905 +  }
   1.906 +#endif
   1.907 +#if defined(HAS_MIRRORROW_AVX2)
   1.908 +  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
   1.909 +    MirrorRow = MirrorRow_AVX2;
   1.910 +  }
   1.911 +#endif
   1.912 +#if defined(HAS_MIRRORROW_MIPS_DSPR2)
   1.913 +  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
   1.914 +      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
   1.915 +      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
   1.916 +    MirrorRow = MirrorRow_MIPS_DSPR2;
   1.917 +  }
   1.918 +#endif
   1.919 +#if defined(HAS_COPYROW_NEON)
   1.920 +  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
   1.921 +    CopyRow = CopyRow_NEON;
   1.922 +  }
   1.923 +#endif
   1.924 +#if defined(HAS_COPYROW_X86)
   1.925 +  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
   1.926 +    CopyRow = CopyRow_X86;
   1.927 +  }
   1.928 +#endif
   1.929 +#if defined(HAS_COPYROW_SSE2)
   1.930 +  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
   1.931 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
   1.932 +      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
   1.933 +    CopyRow = CopyRow_SSE2;
   1.934 +  }
   1.935 +#endif
   1.936 +#if defined(HAS_COPYROW_ERMS)
   1.937 +  if (TestCpuFlag(kCpuHasERMS)) {
   1.938 +    CopyRow = CopyRow_ERMS;
   1.939 +  }
   1.940 +#endif
   1.941 +#if defined(HAS_COPYROW_MIPS)
   1.942 +  if (TestCpuFlag(kCpuHasMIPS)) {
   1.943 +    CopyRow = CopyRow_MIPS;
   1.944 +  }
   1.945 +#endif
   1.946 +
   1.947 +  // Odd height will harmlessly mirror the middle row twice.
   1.948 +  for (y = 0; y < half_height; ++y) {
   1.949 +    MirrorRow(src, row, width);  // Mirror first row into a buffer
   1.950 +    src += src_stride;
   1.951 +    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
   1.952 +    dst += dst_stride;
   1.953 +    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
   1.954 +    src_bot -= src_stride;
   1.955 +    dst_bot -= dst_stride;
   1.956 +  }
   1.957 +  free_aligned_buffer_64(row);
   1.958 +}
   1.959 +
   1.960 +static void TransposeUVWx8_C(const uint8* src, int src_stride,
   1.961 +                             uint8* dst_a, int dst_stride_a,
   1.962 +                             uint8* dst_b, int dst_stride_b,
   1.963 +                             int width) {
   1.964 +  int i;
   1.965 +  for (i = 0; i < width; ++i) {
   1.966 +    dst_a[0] = src[0 * src_stride + 0];
   1.967 +    dst_b[0] = src[0 * src_stride + 1];
   1.968 +    dst_a[1] = src[1 * src_stride + 0];
   1.969 +    dst_b[1] = src[1 * src_stride + 1];
   1.970 +    dst_a[2] = src[2 * src_stride + 0];
   1.971 +    dst_b[2] = src[2 * src_stride + 1];
   1.972 +    dst_a[3] = src[3 * src_stride + 0];
   1.973 +    dst_b[3] = src[3 * src_stride + 1];
   1.974 +    dst_a[4] = src[4 * src_stride + 0];
   1.975 +    dst_b[4] = src[4 * src_stride + 1];
   1.976 +    dst_a[5] = src[5 * src_stride + 0];
   1.977 +    dst_b[5] = src[5 * src_stride + 1];
   1.978 +    dst_a[6] = src[6 * src_stride + 0];
   1.979 +    dst_b[6] = src[6 * src_stride + 1];
   1.980 +    dst_a[7] = src[7 * src_stride + 0];
   1.981 +    dst_b[7] = src[7 * src_stride + 1];
   1.982 +    src += 2;
   1.983 +    dst_a += dst_stride_a;
   1.984 +    dst_b += dst_stride_b;
   1.985 +  }
   1.986 +}
   1.987 +
   1.988 +static void TransposeUVWxH_C(const uint8* src, int src_stride,
   1.989 +                             uint8* dst_a, int dst_stride_a,
   1.990 +                             uint8* dst_b, int dst_stride_b,
   1.991 +                             int width, int height) {
   1.992 +  int i;
   1.993 +  for (i = 0; i < width * 2; i += 2) {
   1.994 +    int j;
   1.995 +    for (j = 0; j < height; ++j) {
   1.996 +      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
   1.997 +      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
   1.998 +    }
   1.999 +  }
  1.1000 +}
  1.1001 +
  1.1002 +LIBYUV_API
  1.1003 +void TransposeUV(const uint8* src, int src_stride,
  1.1004 +                 uint8* dst_a, int dst_stride_a,
  1.1005 +                 uint8* dst_b, int dst_stride_b,
  1.1006 +                 int width, int height) {
  1.1007 +  int i = height;
  1.1008 +  void (*TransposeUVWx8)(const uint8* src, int src_stride,
  1.1009 +                         uint8* dst_a, int dst_stride_a,
  1.1010 +                         uint8* dst_b, int dst_stride_b,
  1.1011 +                         int width) = TransposeUVWx8_C;
  1.1012 +#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  1.1013 +  if (TestCpuFlag(kCpuHasNEON)) {
  1.1014 +    TransposeUVWx8 = TransposeUVWx8_NEON;
  1.1015 +  }
  1.1016 +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  1.1017 +  if (TestCpuFlag(kCpuHasSSE2) &&
  1.1018 +      IS_ALIGNED(width, 8) &&
  1.1019 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1.1020 +    TransposeUVWx8 = TransposeUVWx8_SSE2;
  1.1021 +  }
  1.1022 +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
  1.1023 +  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
  1.1024 +      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1.1025 +    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
  1.1026 +  }
  1.1027 +#endif
  1.1028 +
  1.1029 +  // Work through the source in 8x8 tiles.
  1.1030 +  while (i >= 8) {
  1.1031 +    TransposeUVWx8(src, src_stride,
  1.1032 +                   dst_a, dst_stride_a,
  1.1033 +                   dst_b, dst_stride_b,
  1.1034 +                   width);
  1.1035 +    src += 8 * src_stride;    // Go down 8 rows.
  1.1036 +    dst_a += 8;               // Move over 8 columns.
  1.1037 +    dst_b += 8;               // Move over 8 columns.
  1.1038 +    i -= 8;
  1.1039 +  }
  1.1040 +
  1.1041 +  TransposeUVWxH_C(src, src_stride,
  1.1042 +                   dst_a, dst_stride_a,
  1.1043 +                   dst_b, dst_stride_b,
  1.1044 +                   width, i);
  1.1045 +}
  1.1046 +
  1.1047 +LIBYUV_API
  1.1048 +void RotateUV90(const uint8* src, int src_stride,
  1.1049 +                uint8* dst_a, int dst_stride_a,
  1.1050 +                uint8* dst_b, int dst_stride_b,
  1.1051 +                int width, int height) {
  1.1052 +  src += src_stride * (height - 1);
  1.1053 +  src_stride = -src_stride;
  1.1054 +
  1.1055 +  TransposeUV(src, src_stride,
  1.1056 +              dst_a, dst_stride_a,
  1.1057 +              dst_b, dst_stride_b,
  1.1058 +              width, height);
  1.1059 +}
  1.1060 +
  1.1061 +LIBYUV_API
  1.1062 +void RotateUV270(const uint8* src, int src_stride,
  1.1063 +                 uint8* dst_a, int dst_stride_a,
  1.1064 +                 uint8* dst_b, int dst_stride_b,
  1.1065 +                 int width, int height) {
  1.1066 +  dst_a += dst_stride_a * (width - 1);
  1.1067 +  dst_b += dst_stride_b * (width - 1);
  1.1068 +  dst_stride_a = -dst_stride_a;
  1.1069 +  dst_stride_b = -dst_stride_b;
  1.1070 +
  1.1071 +  TransposeUV(src, src_stride,
  1.1072 +              dst_a, dst_stride_a,
  1.1073 +              dst_b, dst_stride_b,
  1.1074 +              width, height);
  1.1075 +}
  1.1076 +
  1.1077 +// Rotate 180 is a horizontal and vertical flip.
  1.1078 +LIBYUV_API
  1.1079 +void RotateUV180(const uint8* src, int src_stride,
  1.1080 +                 uint8* dst_a, int dst_stride_a,
  1.1081 +                 uint8* dst_b, int dst_stride_b,
  1.1082 +                 int width, int height) {
  1.1083 +  int i;
  1.1084 +  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
  1.1085 +      MirrorUVRow_C;
  1.1086 +#if defined(HAS_MIRRORUVROW_NEON)
  1.1087 +  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
  1.1088 +    MirrorRowUV = MirrorUVRow_NEON;
  1.1089 +  }
  1.1090 +#elif defined(HAS_MIRRORROW_UV_SSSE3)
  1.1091 +  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
  1.1092 +      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
  1.1093 +    MirrorRowUV = MirrorUVRow_SSSE3;
  1.1094 +  }
  1.1095 +#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
  1.1096 +  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
  1.1097 +      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
  1.1098 +    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  1.1099 +  }
  1.1100 +#endif
  1.1101 +
  1.1102 +  dst_a += dst_stride_a * (height - 1);
  1.1103 +  dst_b += dst_stride_b * (height - 1);
  1.1104 +
  1.1105 +  for (i = 0; i < height; ++i) {
  1.1106 +    MirrorRowUV(src, dst_a, dst_b, width);
  1.1107 +    src += src_stride;
  1.1108 +    dst_a -= dst_stride_a;
  1.1109 +    dst_b -= dst_stride_b;
  1.1110 +  }
  1.1111 +}
  1.1112 +
  1.1113 +LIBYUV_API
  1.1114 +int RotatePlane(const uint8* src, int src_stride,
  1.1115 +                uint8* dst, int dst_stride,
  1.1116 +                int width, int height,
  1.1117 +                enum RotationMode mode) {
  1.1118 +  if (!src || width <= 0 || height == 0 || !dst) {
  1.1119 +    return -1;
  1.1120 +  }
  1.1121 +
  1.1122 +  // Negative height means invert the image.
  1.1123 +  if (height < 0) {
  1.1124 +    height = -height;
  1.1125 +    src = src + (height - 1) * src_stride;
  1.1126 +    src_stride = -src_stride;
  1.1127 +  }
  1.1128 +
  1.1129 +  switch (mode) {
  1.1130 +    case kRotate0:
  1.1131 +      // copy frame
  1.1132 +      CopyPlane(src, src_stride,
  1.1133 +                dst, dst_stride,
  1.1134 +                width, height);
  1.1135 +      return 0;
  1.1136 +    case kRotate90:
  1.1137 +      RotatePlane90(src, src_stride,
  1.1138 +                    dst, dst_stride,
  1.1139 +                    width, height);
  1.1140 +      return 0;
  1.1141 +    case kRotate270:
  1.1142 +      RotatePlane270(src, src_stride,
  1.1143 +                     dst, dst_stride,
  1.1144 +                     width, height);
  1.1145 +      return 0;
  1.1146 +    case kRotate180:
  1.1147 +      RotatePlane180(src, src_stride,
  1.1148 +                     dst, dst_stride,
  1.1149 +                     width, height);
  1.1150 +      return 0;
  1.1151 +    default:
  1.1152 +      break;
  1.1153 +  }
  1.1154 +  return -1;
  1.1155 +}
  1.1156 +
  1.1157 +LIBYUV_API
  1.1158 +int I420Rotate(const uint8* src_y, int src_stride_y,
  1.1159 +               const uint8* src_u, int src_stride_u,
  1.1160 +               const uint8* src_v, int src_stride_v,
  1.1161 +               uint8* dst_y, int dst_stride_y,
  1.1162 +               uint8* dst_u, int dst_stride_u,
  1.1163 +               uint8* dst_v, int dst_stride_v,
  1.1164 +               int width, int height,
  1.1165 +               enum RotationMode mode) {
  1.1166 +  int halfwidth = (width + 1) >> 1;
  1.1167 +  int halfheight = (height + 1) >> 1;
  1.1168 +  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
  1.1169 +      !dst_y || !dst_u || !dst_v) {
  1.1170 +    return -1;
  1.1171 +  }
  1.1172 +
  1.1173 +  // Negative height means invert the image.
  1.1174 +  if (height < 0) {
  1.1175 +    height = -height;
  1.1176 +    halfheight = (height + 1) >> 1;
  1.1177 +    src_y = src_y + (height - 1) * src_stride_y;
  1.1178 +    src_u = src_u + (halfheight - 1) * src_stride_u;
  1.1179 +    src_v = src_v + (halfheight - 1) * src_stride_v;
  1.1180 +    src_stride_y = -src_stride_y;
  1.1181 +    src_stride_u = -src_stride_u;
  1.1182 +    src_stride_v = -src_stride_v;
  1.1183 +  }
  1.1184 +
  1.1185 +  switch (mode) {
  1.1186 +    case kRotate0:
  1.1187 +      // copy frame
  1.1188 +      return I420Copy(src_y, src_stride_y,
  1.1189 +                      src_u, src_stride_u,
  1.1190 +                      src_v, src_stride_v,
  1.1191 +                      dst_y, dst_stride_y,
  1.1192 +                      dst_u, dst_stride_u,
  1.1193 +                      dst_v, dst_stride_v,
  1.1194 +                      width, height);
  1.1195 +    case kRotate90:
  1.1196 +      RotatePlane90(src_y, src_stride_y,
  1.1197 +                    dst_y, dst_stride_y,
  1.1198 +                    width, height);
  1.1199 +      RotatePlane90(src_u, src_stride_u,
  1.1200 +                    dst_u, dst_stride_u,
  1.1201 +                    halfwidth, halfheight);
  1.1202 +      RotatePlane90(src_v, src_stride_v,
  1.1203 +                    dst_v, dst_stride_v,
  1.1204 +                    halfwidth, halfheight);
  1.1205 +      return 0;
  1.1206 +    case kRotate270:
  1.1207 +      RotatePlane270(src_y, src_stride_y,
  1.1208 +                     dst_y, dst_stride_y,
  1.1209 +                     width, height);
  1.1210 +      RotatePlane270(src_u, src_stride_u,
  1.1211 +                     dst_u, dst_stride_u,
  1.1212 +                     halfwidth, halfheight);
  1.1213 +      RotatePlane270(src_v, src_stride_v,
  1.1214 +                     dst_v, dst_stride_v,
  1.1215 +                     halfwidth, halfheight);
  1.1216 +      return 0;
  1.1217 +    case kRotate180:
  1.1218 +      RotatePlane180(src_y, src_stride_y,
  1.1219 +                     dst_y, dst_stride_y,
  1.1220 +                     width, height);
  1.1221 +      RotatePlane180(src_u, src_stride_u,
  1.1222 +                     dst_u, dst_stride_u,
  1.1223 +                     halfwidth, halfheight);
  1.1224 +      RotatePlane180(src_v, src_stride_v,
  1.1225 +                     dst_v, dst_stride_v,
  1.1226 +                     halfwidth, halfheight);
  1.1227 +      return 0;
  1.1228 +    default:
  1.1229 +      break;
  1.1230 +  }
  1.1231 +  return -1;
  1.1232 +}
  1.1233 +
  1.1234 +LIBYUV_API
  1.1235 +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
  1.1236 +                     const uint8* src_uv, int src_stride_uv,
  1.1237 +                     uint8* dst_y, int dst_stride_y,
  1.1238 +                     uint8* dst_u, int dst_stride_u,
  1.1239 +                     uint8* dst_v, int dst_stride_v,
  1.1240 +                     int width, int height,
  1.1241 +                     enum RotationMode mode) {
  1.1242 +  int halfwidth = (width + 1) >> 1;
  1.1243 +  int halfheight = (height + 1) >> 1;
  1.1244 +  if (!src_y || !src_uv || width <= 0 || height == 0 ||
  1.1245 +      !dst_y || !dst_u || !dst_v) {
  1.1246 +    return -1;
  1.1247 +  }
  1.1248 +
  1.1249 +  // Negative height means invert the image.
  1.1250 +  if (height < 0) {
  1.1251 +    height = -height;
  1.1252 +    halfheight = (height + 1) >> 1;
  1.1253 +    src_y = src_y + (height - 1) * src_stride_y;
  1.1254 +    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
  1.1255 +    src_stride_y = -src_stride_y;
  1.1256 +    src_stride_uv = -src_stride_uv;
  1.1257 +  }
  1.1258 +
  1.1259 +  switch (mode) {
  1.1260 +    case kRotate0:
  1.1261 +      // copy frame
  1.1262 +      return NV12ToI420(src_y, src_stride_y,
  1.1263 +                        src_uv, src_stride_uv,
  1.1264 +                        dst_y, dst_stride_y,
  1.1265 +                        dst_u, dst_stride_u,
  1.1266 +                        dst_v, dst_stride_v,
  1.1267 +                        width, height);
  1.1268 +    case kRotate90:
  1.1269 +      RotatePlane90(src_y, src_stride_y,
  1.1270 +                    dst_y, dst_stride_y,
  1.1271 +                    width, height);
  1.1272 +      RotateUV90(src_uv, src_stride_uv,
  1.1273 +                 dst_u, dst_stride_u,
  1.1274 +                 dst_v, dst_stride_v,
  1.1275 +                 halfwidth, halfheight);
  1.1276 +      return 0;
  1.1277 +    case kRotate270:
  1.1278 +      RotatePlane270(src_y, src_stride_y,
  1.1279 +                     dst_y, dst_stride_y,
  1.1280 +                     width, height);
  1.1281 +      RotateUV270(src_uv, src_stride_uv,
  1.1282 +                  dst_u, dst_stride_u,
  1.1283 +                  dst_v, dst_stride_v,
  1.1284 +                  halfwidth, halfheight);
  1.1285 +      return 0;
  1.1286 +    case kRotate180:
  1.1287 +      RotatePlane180(src_y, src_stride_y,
  1.1288 +                     dst_y, dst_stride_y,
  1.1289 +                     width, height);
  1.1290 +      RotateUV180(src_uv, src_stride_uv,
  1.1291 +                  dst_u, dst_stride_u,
  1.1292 +                  dst_v, dst_stride_v,
  1.1293 +                  halfwidth, halfheight);
  1.1294 +      return 0;
  1.1295 +    default:
  1.1296 +      break;
  1.1297 +  }
  1.1298 +  return -1;
  1.1299 +}
  1.1300 +
  1.1301 +#ifdef __cplusplus
  1.1302 +}  // extern "C"
  1.1303 +}  // namespace libyuv
  1.1304 +#endif
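
The changeset above adds the public rotation entry points (TransposePlane, RotatePlane, RotateUV90/180/270, I420Rotate, NV12ToI420Rotate) that are declared in libyuv/rotate.h. As a rough usage sketch — not part of the changeset — rotating an I420 frame by 90 degrees through I420Rotate could look like the following; the QVGA dimensions, buffer management, and unpadded strides are illustrative assumptions.

    // Hedged usage sketch (not part of the changeset): rotate an I420 frame
    // by 90 degrees using the I420Rotate() entry point defined in this file.
    // Frame dimensions and buffer handling here are illustrative assumptions.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "libyuv/rotate.h"

    int main() {
      const int width = 320;   // assumed source size (QVGA)
      const int height = 240;
      const int half_w = (width + 1) >> 1;
      const int half_h = (height + 1) >> 1;

      // I420: full-resolution Y plane plus quarter-resolution U and V planes.
      std::vector<uint8_t> src_y(width * height);
      std::vector<uint8_t> src_u(half_w * half_h);
      std::vector<uint8_t> src_v(half_w * half_h);

      // After a 90-degree rotation the image is height x width, so the
      // destination strides are the rotated row widths.
      std::vector<uint8_t> dst_y(width * height);
      std::vector<uint8_t> dst_u(half_w * half_h);
      std::vector<uint8_t> dst_v(half_w * half_h);

      // Strides equal the plane widths here; real callers may pad rows.
      int ret = libyuv::I420Rotate(src_y.data(), width,
                                   src_u.data(), half_w,
                                   src_v.data(), half_w,
                                   dst_y.data(), height,   // rotated Y stride
                                   dst_u.data(), half_h,   // rotated U stride
                                   dst_v.data(), half_h,   // rotated V stride
                                   width, height,
                                   libyuv::kRotate90);
      if (ret != 0) {
        std::fprintf(stderr, "I420Rotate failed: %d\n", ret);
        return 1;
      }
      return 0;
    }

Internally this path calls RotatePlane90 per plane, which simply points the source at the last row, negates the stride, and defers to TransposePlane, as the code above shows.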
