1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libyuv/source/rotate.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1301 @@ 1.4 +/* 1.5 + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "libyuv/rotate.h" 1.15 + 1.16 +#include "libyuv/cpu_id.h" 1.17 +#include "libyuv/convert.h" 1.18 +#include "libyuv/planar_functions.h" 1.19 +#include "libyuv/row.h" 1.20 + 1.21 +#ifdef __cplusplus 1.22 +namespace libyuv { 1.23 +extern "C" { 1.24 +#endif 1.25 + 1.26 +#if !defined(LIBYUV_DISABLE_X86) && \ 1.27 + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) 1.28 +#if defined(__APPLE__) && defined(__i386__) 1.29 +#define DECLARE_FUNCTION(name) \ 1.30 + ".text \n" \ 1.31 + ".private_extern _" #name " \n" \ 1.32 + ".align 4,0x90 \n" \ 1.33 +"_" #name ": \n" 1.34 +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) 1.35 +#define DECLARE_FUNCTION(name) \ 1.36 + ".text \n" \ 1.37 + ".align 4,0x90 \n" \ 1.38 +"_" #name ": \n" 1.39 +#else 1.40 +#define DECLARE_FUNCTION(name) \ 1.41 + ".text \n" \ 1.42 + ".align 4,0x90 \n" \ 1.43 +#name ": \n" 1.44 +#endif 1.45 +#endif 1.46 + 1.47 +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ 1.48 + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) 1.49 +#define HAS_MIRRORROW_NEON 1.50 +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); 1.51 +#define HAS_MIRRORROW_UV_NEON 1.52 +void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); 1.53 +#define HAS_TRANSPOSE_WX8_NEON 1.54 +void TransposeWx8_NEON(const uint8* src, int src_stride, 1.55 + uint8* dst, int dst_stride, int width); 1.56 +#define HAS_TRANSPOSE_UVWX8_NEON 1.57 +void TransposeUVWx8_NEON(const uint8* src, int src_stride, 1.58 + uint8* dst_a, int dst_stride_a, 1.59 + uint8* dst_b, int dst_stride_b, 1.60 + int width); 1.61 +#endif // defined(__ARM_NEON__) 1.62 + 1.63 +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ 1.64 + defined(__mips__) && \ 1.65 + defined(__mips_dsp) && (__mips_dsp_rev >= 2) 1.66 +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 1.67 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, 1.68 + uint8* dst, int dst_stride, int width); 1.69 + 1.70 +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, 1.71 + uint8* dst, int dst_stride, int width); 1.72 +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 1.73 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, 1.74 + uint8* dst_a, int dst_stride_a, 1.75 + uint8* dst_b, int dst_stride_b, 1.76 + int width); 1.77 +#endif // defined(__mips__) 1.78 + 1.79 +#if !defined(LIBYUV_DISABLE_X86) && \ 1.80 + defined(_M_IX86) && defined(_MSC_VER) 1.81 +#define HAS_TRANSPOSE_WX8_SSSE3 1.82 +__declspec(naked) __declspec(align(16)) 1.83 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, 1.84 + uint8* dst, int dst_stride, int width) { 1.85 + __asm { 1.86 + push edi 1.87 + push esi 1.88 + push ebp 1.89 + mov eax, [esp + 12 + 4] // src 1.90 + mov edi, [esp + 12 + 8] // src_stride 1.91 + mov edx, [esp + 12 + 12] // dst 1.92 + mov esi, [esp + 12 + 16] // dst_stride 1.93 + mov ecx, [esp + 12 + 20] // width 1.94 + 1.95 + 
// Read in the data from the source pointer. 1.96 + // First round of bit swap. 1.97 + align 4 1.98 + convertloop: 1.99 + movq xmm0, qword ptr [eax] 1.100 + lea ebp, [eax + 8] 1.101 + movq xmm1, qword ptr [eax + edi] 1.102 + lea eax, [eax + 2 * edi] 1.103 + punpcklbw xmm0, xmm1 1.104 + movq xmm2, qword ptr [eax] 1.105 + movdqa xmm1, xmm0 1.106 + palignr xmm1, xmm1, 8 1.107 + movq xmm3, qword ptr [eax + edi] 1.108 + lea eax, [eax + 2 * edi] 1.109 + punpcklbw xmm2, xmm3 1.110 + movdqa xmm3, xmm2 1.111 + movq xmm4, qword ptr [eax] 1.112 + palignr xmm3, xmm3, 8 1.113 + movq xmm5, qword ptr [eax + edi] 1.114 + punpcklbw xmm4, xmm5 1.115 + lea eax, [eax + 2 * edi] 1.116 + movdqa xmm5, xmm4 1.117 + movq xmm6, qword ptr [eax] 1.118 + palignr xmm5, xmm5, 8 1.119 + movq xmm7, qword ptr [eax + edi] 1.120 + punpcklbw xmm6, xmm7 1.121 + mov eax, ebp 1.122 + movdqa xmm7, xmm6 1.123 + palignr xmm7, xmm7, 8 1.124 + // Second round of bit swap. 1.125 + punpcklwd xmm0, xmm2 1.126 + punpcklwd xmm1, xmm3 1.127 + movdqa xmm2, xmm0 1.128 + movdqa xmm3, xmm1 1.129 + palignr xmm2, xmm2, 8 1.130 + palignr xmm3, xmm3, 8 1.131 + punpcklwd xmm4, xmm6 1.132 + punpcklwd xmm5, xmm7 1.133 + movdqa xmm6, xmm4 1.134 + movdqa xmm7, xmm5 1.135 + palignr xmm6, xmm6, 8 1.136 + palignr xmm7, xmm7, 8 1.137 + // Third round of bit swap. 1.138 + // Write to the destination pointer. 1.139 + punpckldq xmm0, xmm4 1.140 + movq qword ptr [edx], xmm0 1.141 + movdqa xmm4, xmm0 1.142 + palignr xmm4, xmm4, 8 1.143 + movq qword ptr [edx + esi], xmm4 1.144 + lea edx, [edx + 2 * esi] 1.145 + punpckldq xmm2, xmm6 1.146 + movdqa xmm6, xmm2 1.147 + palignr xmm6, xmm6, 8 1.148 + movq qword ptr [edx], xmm2 1.149 + punpckldq xmm1, xmm5 1.150 + movq qword ptr [edx + esi], xmm6 1.151 + lea edx, [edx + 2 * esi] 1.152 + movdqa xmm5, xmm1 1.153 + movq qword ptr [edx], xmm1 1.154 + palignr xmm5, xmm5, 8 1.155 + punpckldq xmm3, xmm7 1.156 + movq qword ptr [edx + esi], xmm5 1.157 + lea edx, [edx + 2 * esi] 1.158 + movq qword ptr [edx], xmm3 1.159 + movdqa xmm7, xmm3 1.160 + palignr xmm7, xmm7, 8 1.161 + sub ecx, 8 1.162 + movq qword ptr [edx + esi], xmm7 1.163 + lea edx, [edx + 2 * esi] 1.164 + jg convertloop 1.165 + 1.166 + pop ebp 1.167 + pop esi 1.168 + pop edi 1.169 + ret 1.170 + } 1.171 +} 1.172 + 1.173 +#define HAS_TRANSPOSE_UVWX8_SSE2 1.174 +__declspec(naked) __declspec(align(16)) 1.175 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, 1.176 + uint8* dst_a, int dst_stride_a, 1.177 + uint8* dst_b, int dst_stride_b, 1.178 + int w) { 1.179 + __asm { 1.180 + push ebx 1.181 + push esi 1.182 + push edi 1.183 + push ebp 1.184 + mov eax, [esp + 16 + 4] // src 1.185 + mov edi, [esp + 16 + 8] // src_stride 1.186 + mov edx, [esp + 16 + 12] // dst_a 1.187 + mov esi, [esp + 16 + 16] // dst_stride_a 1.188 + mov ebx, [esp + 16 + 20] // dst_b 1.189 + mov ebp, [esp + 16 + 24] // dst_stride_b 1.190 + mov ecx, esp 1.191 + sub esp, 4 + 16 1.192 + and esp, ~15 1.193 + mov [esp + 16], ecx 1.194 + mov ecx, [ecx + 16 + 28] // w 1.195 + 1.196 + align 4 1.197 + convertloop: 1.198 + // Read in the data from the source pointer. 1.199 + // First round of bit swap. 1.200 + movdqa xmm0, [eax] 1.201 + movdqa xmm1, [eax + edi] 1.202 + lea eax, [eax + 2 * edi] 1.203 + movdqa xmm7, xmm0 // use xmm7 as temp register. 
1.204 + punpcklbw xmm0, xmm1 1.205 + punpckhbw xmm7, xmm1 1.206 + movdqa xmm1, xmm7 1.207 + movdqa xmm2, [eax] 1.208 + movdqa xmm3, [eax + edi] 1.209 + lea eax, [eax + 2 * edi] 1.210 + movdqa xmm7, xmm2 1.211 + punpcklbw xmm2, xmm3 1.212 + punpckhbw xmm7, xmm3 1.213 + movdqa xmm3, xmm7 1.214 + movdqa xmm4, [eax] 1.215 + movdqa xmm5, [eax + edi] 1.216 + lea eax, [eax + 2 * edi] 1.217 + movdqa xmm7, xmm4 1.218 + punpcklbw xmm4, xmm5 1.219 + punpckhbw xmm7, xmm5 1.220 + movdqa xmm5, xmm7 1.221 + movdqa xmm6, [eax] 1.222 + movdqa xmm7, [eax + edi] 1.223 + lea eax, [eax + 2 * edi] 1.224 + movdqa [esp], xmm5 // backup xmm5 1.225 + neg edi 1.226 + movdqa xmm5, xmm6 // use xmm5 as temp register. 1.227 + punpcklbw xmm6, xmm7 1.228 + punpckhbw xmm5, xmm7 1.229 + movdqa xmm7, xmm5 1.230 + lea eax, [eax + 8 * edi + 16] 1.231 + neg edi 1.232 + // Second round of bit swap. 1.233 + movdqa xmm5, xmm0 1.234 + punpcklwd xmm0, xmm2 1.235 + punpckhwd xmm5, xmm2 1.236 + movdqa xmm2, xmm5 1.237 + movdqa xmm5, xmm1 1.238 + punpcklwd xmm1, xmm3 1.239 + punpckhwd xmm5, xmm3 1.240 + movdqa xmm3, xmm5 1.241 + movdqa xmm5, xmm4 1.242 + punpcklwd xmm4, xmm6 1.243 + punpckhwd xmm5, xmm6 1.244 + movdqa xmm6, xmm5 1.245 + movdqa xmm5, [esp] // restore xmm5 1.246 + movdqa [esp], xmm6 // backup xmm6 1.247 + movdqa xmm6, xmm5 // use xmm6 as temp register. 1.248 + punpcklwd xmm5, xmm7 1.249 + punpckhwd xmm6, xmm7 1.250 + movdqa xmm7, xmm6 1.251 + // Third round of bit swap. 1.252 + // Write to the destination pointer. 1.253 + movdqa xmm6, xmm0 1.254 + punpckldq xmm0, xmm4 1.255 + punpckhdq xmm6, xmm4 1.256 + movdqa xmm4, xmm6 1.257 + movdqa xmm6, [esp] // restore xmm6 1.258 + movlpd qword ptr [edx], xmm0 1.259 + movhpd qword ptr [ebx], xmm0 1.260 + movlpd qword ptr [edx + esi], xmm4 1.261 + lea edx, [edx + 2 * esi] 1.262 + movhpd qword ptr [ebx + ebp], xmm4 1.263 + lea ebx, [ebx + 2 * ebp] 1.264 + movdqa xmm0, xmm2 // use xmm0 as the temp register. 1.265 + punpckldq xmm2, xmm6 1.266 + movlpd qword ptr [edx], xmm2 1.267 + movhpd qword ptr [ebx], xmm2 1.268 + punpckhdq xmm0, xmm6 1.269 + movlpd qword ptr [edx + esi], xmm0 1.270 + lea edx, [edx + 2 * esi] 1.271 + movhpd qword ptr [ebx + ebp], xmm0 1.272 + lea ebx, [ebx + 2 * ebp] 1.273 + movdqa xmm0, xmm1 // use xmm0 as the temp register. 1.274 + punpckldq xmm1, xmm5 1.275 + movlpd qword ptr [edx], xmm1 1.276 + movhpd qword ptr [ebx], xmm1 1.277 + punpckhdq xmm0, xmm5 1.278 + movlpd qword ptr [edx + esi], xmm0 1.279 + lea edx, [edx + 2 * esi] 1.280 + movhpd qword ptr [ebx + ebp], xmm0 1.281 + lea ebx, [ebx + 2 * ebp] 1.282 + movdqa xmm0, xmm3 // use xmm0 as the temp register. 1.283 + punpckldq xmm3, xmm7 1.284 + movlpd qword ptr [edx], xmm3 1.285 + movhpd qword ptr [ebx], xmm3 1.286 + punpckhdq xmm0, xmm7 1.287 + sub ecx, 8 1.288 + movlpd qword ptr [edx + esi], xmm0 1.289 + lea edx, [edx + 2 * esi] 1.290 + movhpd qword ptr [ebx + ebp], xmm0 1.291 + lea ebx, [ebx + 2 * ebp] 1.292 + jg convertloop 1.293 + 1.294 + mov esp, [esp + 16] 1.295 + pop ebp 1.296 + pop edi 1.297 + pop esi 1.298 + pop ebx 1.299 + ret 1.300 + } 1.301 +} 1.302 +#elif !defined(LIBYUV_DISABLE_X86) && \ 1.303 + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) 1.304 +#define HAS_TRANSPOSE_WX8_SSSE3 1.305 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, 1.306 + uint8* dst, int dst_stride, int width) { 1.307 + asm volatile ( 1.308 + // Read in the data from the source pointer. 1.309 + // First round of bit swap. 
1.310 + ".p2align 2 \n" 1.311 + "1: \n" 1.312 + "movq (%0),%%xmm0 \n" 1.313 + "movq (%0,%3),%%xmm1 \n" 1.314 + "lea (%0,%3,2),%0 \n" 1.315 + "punpcklbw %%xmm1,%%xmm0 \n" 1.316 + "movq (%0),%%xmm2 \n" 1.317 + "movdqa %%xmm0,%%xmm1 \n" 1.318 + "palignr $0x8,%%xmm1,%%xmm1 \n" 1.319 + "movq (%0,%3),%%xmm3 \n" 1.320 + "lea (%0,%3,2),%0 \n" 1.321 + "punpcklbw %%xmm3,%%xmm2 \n" 1.322 + "movdqa %%xmm2,%%xmm3 \n" 1.323 + "movq (%0),%%xmm4 \n" 1.324 + "palignr $0x8,%%xmm3,%%xmm3 \n" 1.325 + "movq (%0,%3),%%xmm5 \n" 1.326 + "lea (%0,%3,2),%0 \n" 1.327 + "punpcklbw %%xmm5,%%xmm4 \n" 1.328 + "movdqa %%xmm4,%%xmm5 \n" 1.329 + "movq (%0),%%xmm6 \n" 1.330 + "palignr $0x8,%%xmm5,%%xmm5 \n" 1.331 + "movq (%0,%3),%%xmm7 \n" 1.332 + "lea (%0,%3,2),%0 \n" 1.333 + "punpcklbw %%xmm7,%%xmm6 \n" 1.334 + "neg %3 \n" 1.335 + "movdqa %%xmm6,%%xmm7 \n" 1.336 + "lea 0x8(%0,%3,8),%0 \n" 1.337 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.338 + "neg %3 \n" 1.339 + // Second round of bit swap. 1.340 + "punpcklwd %%xmm2,%%xmm0 \n" 1.341 + "punpcklwd %%xmm3,%%xmm1 \n" 1.342 + "movdqa %%xmm0,%%xmm2 \n" 1.343 + "movdqa %%xmm1,%%xmm3 \n" 1.344 + "palignr $0x8,%%xmm2,%%xmm2 \n" 1.345 + "palignr $0x8,%%xmm3,%%xmm3 \n" 1.346 + "punpcklwd %%xmm6,%%xmm4 \n" 1.347 + "punpcklwd %%xmm7,%%xmm5 \n" 1.348 + "movdqa %%xmm4,%%xmm6 \n" 1.349 + "movdqa %%xmm5,%%xmm7 \n" 1.350 + "palignr $0x8,%%xmm6,%%xmm6 \n" 1.351 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.352 + // Third round of bit swap. 1.353 + // Write to the destination pointer. 1.354 + "punpckldq %%xmm4,%%xmm0 \n" 1.355 + "movq %%xmm0,(%1) \n" 1.356 + "movdqa %%xmm0,%%xmm4 \n" 1.357 + "palignr $0x8,%%xmm4,%%xmm4 \n" 1.358 + "movq %%xmm4,(%1,%4) \n" 1.359 + "lea (%1,%4,2),%1 \n" 1.360 + "punpckldq %%xmm6,%%xmm2 \n" 1.361 + "movdqa %%xmm2,%%xmm6 \n" 1.362 + "movq %%xmm2,(%1) \n" 1.363 + "palignr $0x8,%%xmm6,%%xmm6 \n" 1.364 + "punpckldq %%xmm5,%%xmm1 \n" 1.365 + "movq %%xmm6,(%1,%4) \n" 1.366 + "lea (%1,%4,2),%1 \n" 1.367 + "movdqa %%xmm1,%%xmm5 \n" 1.368 + "movq %%xmm1,(%1) \n" 1.369 + "palignr $0x8,%%xmm5,%%xmm5 \n" 1.370 + "movq %%xmm5,(%1,%4) \n" 1.371 + "lea (%1,%4,2),%1 \n" 1.372 + "punpckldq %%xmm7,%%xmm3 \n" 1.373 + "movq %%xmm3,(%1) \n" 1.374 + "movdqa %%xmm3,%%xmm7 \n" 1.375 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.376 + "sub $0x8,%2 \n" 1.377 + "movq %%xmm7,(%1,%4) \n" 1.378 + "lea (%1,%4,2),%1 \n" 1.379 + "jg 1b \n" 1.380 + : "+r"(src), // %0 1.381 + "+r"(dst), // %1 1.382 + "+r"(width) // %2 1.383 + : "r"((intptr_t)(src_stride)), // %3 1.384 + "r"((intptr_t)(dst_stride)) // %4 1.385 + : "memory", "cc" 1.386 + #if defined(__SSE2__) 1.387 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.388 + #endif 1.389 + ); 1.390 +} 1.391 + 1.392 +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) 1.393 +#define HAS_TRANSPOSE_UVWX8_SSE2 1.394 +extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, 1.395 + uint8* dst_a, int dst_stride_a, 1.396 + uint8* dst_b, int dst_stride_b, 1.397 + int w); 1.398 + asm ( 1.399 + DECLARE_FUNCTION(TransposeUVWx8_SSE2) 1.400 + "push %ebx \n" 1.401 + "push %esi \n" 1.402 + "push %edi \n" 1.403 + "push %ebp \n" 1.404 + "mov 0x14(%esp),%eax \n" 1.405 + "mov 0x18(%esp),%edi \n" 1.406 + "mov 0x1c(%esp),%edx \n" 1.407 + "mov 0x20(%esp),%esi \n" 1.408 + "mov 0x24(%esp),%ebx \n" 1.409 + "mov 0x28(%esp),%ebp \n" 1.410 + "mov %esp,%ecx \n" 1.411 + "sub $0x14,%esp \n" 1.412 + "and $0xfffffff0,%esp \n" 1.413 + "mov %ecx,0x10(%esp) \n" 1.414 + "mov 0x2c(%ecx),%ecx \n" 1.415 + 1.416 +"1: \n" 1.417 + "movdqa (%eax),%xmm0 \n" 1.418 + "movdqa 
(%eax,%edi,1),%xmm1 \n" 1.419 + "lea (%eax,%edi,2),%eax \n" 1.420 + "movdqa %xmm0,%xmm7 \n" 1.421 + "punpcklbw %xmm1,%xmm0 \n" 1.422 + "punpckhbw %xmm1,%xmm7 \n" 1.423 + "movdqa %xmm7,%xmm1 \n" 1.424 + "movdqa (%eax),%xmm2 \n" 1.425 + "movdqa (%eax,%edi,1),%xmm3 \n" 1.426 + "lea (%eax,%edi,2),%eax \n" 1.427 + "movdqa %xmm2,%xmm7 \n" 1.428 + "punpcklbw %xmm3,%xmm2 \n" 1.429 + "punpckhbw %xmm3,%xmm7 \n" 1.430 + "movdqa %xmm7,%xmm3 \n" 1.431 + "movdqa (%eax),%xmm4 \n" 1.432 + "movdqa (%eax,%edi,1),%xmm5 \n" 1.433 + "lea (%eax,%edi,2),%eax \n" 1.434 + "movdqa %xmm4,%xmm7 \n" 1.435 + "punpcklbw %xmm5,%xmm4 \n" 1.436 + "punpckhbw %xmm5,%xmm7 \n" 1.437 + "movdqa %xmm7,%xmm5 \n" 1.438 + "movdqa (%eax),%xmm6 \n" 1.439 + "movdqa (%eax,%edi,1),%xmm7 \n" 1.440 + "lea (%eax,%edi,2),%eax \n" 1.441 + "movdqa %xmm5,(%esp) \n" 1.442 + "neg %edi \n" 1.443 + "movdqa %xmm6,%xmm5 \n" 1.444 + "punpcklbw %xmm7,%xmm6 \n" 1.445 + "punpckhbw %xmm7,%xmm5 \n" 1.446 + "movdqa %xmm5,%xmm7 \n" 1.447 + "lea 0x10(%eax,%edi,8),%eax \n" 1.448 + "neg %edi \n" 1.449 + "movdqa %xmm0,%xmm5 \n" 1.450 + "punpcklwd %xmm2,%xmm0 \n" 1.451 + "punpckhwd %xmm2,%xmm5 \n" 1.452 + "movdqa %xmm5,%xmm2 \n" 1.453 + "movdqa %xmm1,%xmm5 \n" 1.454 + "punpcklwd %xmm3,%xmm1 \n" 1.455 + "punpckhwd %xmm3,%xmm5 \n" 1.456 + "movdqa %xmm5,%xmm3 \n" 1.457 + "movdqa %xmm4,%xmm5 \n" 1.458 + "punpcklwd %xmm6,%xmm4 \n" 1.459 + "punpckhwd %xmm6,%xmm5 \n" 1.460 + "movdqa %xmm5,%xmm6 \n" 1.461 + "movdqa (%esp),%xmm5 \n" 1.462 + "movdqa %xmm6,(%esp) \n" 1.463 + "movdqa %xmm5,%xmm6 \n" 1.464 + "punpcklwd %xmm7,%xmm5 \n" 1.465 + "punpckhwd %xmm7,%xmm6 \n" 1.466 + "movdqa %xmm6,%xmm7 \n" 1.467 + "movdqa %xmm0,%xmm6 \n" 1.468 + "punpckldq %xmm4,%xmm0 \n" 1.469 + "punpckhdq %xmm4,%xmm6 \n" 1.470 + "movdqa %xmm6,%xmm4 \n" 1.471 + "movdqa (%esp),%xmm6 \n" 1.472 + "movlpd %xmm0,(%edx) \n" 1.473 + "movhpd %xmm0,(%ebx) \n" 1.474 + "movlpd %xmm4,(%edx,%esi,1) \n" 1.475 + "lea (%edx,%esi,2),%edx \n" 1.476 + "movhpd %xmm4,(%ebx,%ebp,1) \n" 1.477 + "lea (%ebx,%ebp,2),%ebx \n" 1.478 + "movdqa %xmm2,%xmm0 \n" 1.479 + "punpckldq %xmm6,%xmm2 \n" 1.480 + "movlpd %xmm2,(%edx) \n" 1.481 + "movhpd %xmm2,(%ebx) \n" 1.482 + "punpckhdq %xmm6,%xmm0 \n" 1.483 + "movlpd %xmm0,(%edx,%esi,1) \n" 1.484 + "lea (%edx,%esi,2),%edx \n" 1.485 + "movhpd %xmm0,(%ebx,%ebp,1) \n" 1.486 + "lea (%ebx,%ebp,2),%ebx \n" 1.487 + "movdqa %xmm1,%xmm0 \n" 1.488 + "punpckldq %xmm5,%xmm1 \n" 1.489 + "movlpd %xmm1,(%edx) \n" 1.490 + "movhpd %xmm1,(%ebx) \n" 1.491 + "punpckhdq %xmm5,%xmm0 \n" 1.492 + "movlpd %xmm0,(%edx,%esi,1) \n" 1.493 + "lea (%edx,%esi,2),%edx \n" 1.494 + "movhpd %xmm0,(%ebx,%ebp,1) \n" 1.495 + "lea (%ebx,%ebp,2),%ebx \n" 1.496 + "movdqa %xmm3,%xmm0 \n" 1.497 + "punpckldq %xmm7,%xmm3 \n" 1.498 + "movlpd %xmm3,(%edx) \n" 1.499 + "movhpd %xmm3,(%ebx) \n" 1.500 + "punpckhdq %xmm7,%xmm0 \n" 1.501 + "sub $0x8,%ecx \n" 1.502 + "movlpd %xmm0,(%edx,%esi,1) \n" 1.503 + "lea (%edx,%esi,2),%edx \n" 1.504 + "movhpd %xmm0,(%ebx,%ebp,1) \n" 1.505 + "lea (%ebx,%ebp,2),%ebx \n" 1.506 + "jg 1b \n" 1.507 + "mov 0x10(%esp),%esp \n" 1.508 + "pop %ebp \n" 1.509 + "pop %edi \n" 1.510 + "pop %esi \n" 1.511 + "pop %ebx \n" 1.512 +#if defined(__native_client__) 1.513 + "pop %ecx \n" 1.514 + "and $0xffffffe0,%ecx \n" 1.515 + "jmp *%ecx \n" 1.516 +#else 1.517 + "ret \n" 1.518 +#endif 1.519 +); 1.520 +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ 1.521 + defined(__x86_64__) 1.522 +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. 
1.523 +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 1.524 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, 1.525 + uint8* dst, int dst_stride, int width) { 1.526 + asm volatile ( 1.527 + // Read in the data from the source pointer. 1.528 + // First round of bit swap. 1.529 + ".p2align 2 \n" 1.530 +"1: \n" 1.531 + "movdqa (%0),%%xmm0 \n" 1.532 + "movdqa (%0,%3),%%xmm1 \n" 1.533 + "lea (%0,%3,2),%0 \n" 1.534 + "movdqa %%xmm0,%%xmm8 \n" 1.535 + "punpcklbw %%xmm1,%%xmm0 \n" 1.536 + "punpckhbw %%xmm1,%%xmm8 \n" 1.537 + "movdqa (%0),%%xmm2 \n" 1.538 + "movdqa %%xmm0,%%xmm1 \n" 1.539 + "movdqa %%xmm8,%%xmm9 \n" 1.540 + "palignr $0x8,%%xmm1,%%xmm1 \n" 1.541 + "palignr $0x8,%%xmm9,%%xmm9 \n" 1.542 + "movdqa (%0,%3),%%xmm3 \n" 1.543 + "lea (%0,%3,2),%0 \n" 1.544 + "movdqa %%xmm2,%%xmm10 \n" 1.545 + "punpcklbw %%xmm3,%%xmm2 \n" 1.546 + "punpckhbw %%xmm3,%%xmm10 \n" 1.547 + "movdqa %%xmm2,%%xmm3 \n" 1.548 + "movdqa %%xmm10,%%xmm11 \n" 1.549 + "movdqa (%0),%%xmm4 \n" 1.550 + "palignr $0x8,%%xmm3,%%xmm3 \n" 1.551 + "palignr $0x8,%%xmm11,%%xmm11 \n" 1.552 + "movdqa (%0,%3),%%xmm5 \n" 1.553 + "lea (%0,%3,2),%0 \n" 1.554 + "movdqa %%xmm4,%%xmm12 \n" 1.555 + "punpcklbw %%xmm5,%%xmm4 \n" 1.556 + "punpckhbw %%xmm5,%%xmm12 \n" 1.557 + "movdqa %%xmm4,%%xmm5 \n" 1.558 + "movdqa %%xmm12,%%xmm13 \n" 1.559 + "movdqa (%0),%%xmm6 \n" 1.560 + "palignr $0x8,%%xmm5,%%xmm5 \n" 1.561 + "palignr $0x8,%%xmm13,%%xmm13 \n" 1.562 + "movdqa (%0,%3),%%xmm7 \n" 1.563 + "lea (%0,%3,2),%0 \n" 1.564 + "movdqa %%xmm6,%%xmm14 \n" 1.565 + "punpcklbw %%xmm7,%%xmm6 \n" 1.566 + "punpckhbw %%xmm7,%%xmm14 \n" 1.567 + "neg %3 \n" 1.568 + "movdqa %%xmm6,%%xmm7 \n" 1.569 + "movdqa %%xmm14,%%xmm15 \n" 1.570 + "lea 0x10(%0,%3,8),%0 \n" 1.571 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.572 + "palignr $0x8,%%xmm15,%%xmm15 \n" 1.573 + "neg %3 \n" 1.574 + // Second round of bit swap. 1.575 + "punpcklwd %%xmm2,%%xmm0 \n" 1.576 + "punpcklwd %%xmm3,%%xmm1 \n" 1.577 + "movdqa %%xmm0,%%xmm2 \n" 1.578 + "movdqa %%xmm1,%%xmm3 \n" 1.579 + "palignr $0x8,%%xmm2,%%xmm2 \n" 1.580 + "palignr $0x8,%%xmm3,%%xmm3 \n" 1.581 + "punpcklwd %%xmm6,%%xmm4 \n" 1.582 + "punpcklwd %%xmm7,%%xmm5 \n" 1.583 + "movdqa %%xmm4,%%xmm6 \n" 1.584 + "movdqa %%xmm5,%%xmm7 \n" 1.585 + "palignr $0x8,%%xmm6,%%xmm6 \n" 1.586 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.587 + "punpcklwd %%xmm10,%%xmm8 \n" 1.588 + "punpcklwd %%xmm11,%%xmm9 \n" 1.589 + "movdqa %%xmm8,%%xmm10 \n" 1.590 + "movdqa %%xmm9,%%xmm11 \n" 1.591 + "palignr $0x8,%%xmm10,%%xmm10 \n" 1.592 + "palignr $0x8,%%xmm11,%%xmm11 \n" 1.593 + "punpcklwd %%xmm14,%%xmm12 \n" 1.594 + "punpcklwd %%xmm15,%%xmm13 \n" 1.595 + "movdqa %%xmm12,%%xmm14 \n" 1.596 + "movdqa %%xmm13,%%xmm15 \n" 1.597 + "palignr $0x8,%%xmm14,%%xmm14 \n" 1.598 + "palignr $0x8,%%xmm15,%%xmm15 \n" 1.599 + // Third round of bit swap. 1.600 + // Write to the destination pointer. 
1.601 + "punpckldq %%xmm4,%%xmm0 \n" 1.602 + "movq %%xmm0,(%1) \n" 1.603 + "movdqa %%xmm0,%%xmm4 \n" 1.604 + "palignr $0x8,%%xmm4,%%xmm4 \n" 1.605 + "movq %%xmm4,(%1,%4) \n" 1.606 + "lea (%1,%4,2),%1 \n" 1.607 + "punpckldq %%xmm6,%%xmm2 \n" 1.608 + "movdqa %%xmm2,%%xmm6 \n" 1.609 + "movq %%xmm2,(%1) \n" 1.610 + "palignr $0x8,%%xmm6,%%xmm6 \n" 1.611 + "punpckldq %%xmm5,%%xmm1 \n" 1.612 + "movq %%xmm6,(%1,%4) \n" 1.613 + "lea (%1,%4,2),%1 \n" 1.614 + "movdqa %%xmm1,%%xmm5 \n" 1.615 + "movq %%xmm1,(%1) \n" 1.616 + "palignr $0x8,%%xmm5,%%xmm5 \n" 1.617 + "movq %%xmm5,(%1,%4) \n" 1.618 + "lea (%1,%4,2),%1 \n" 1.619 + "punpckldq %%xmm7,%%xmm3 \n" 1.620 + "movq %%xmm3,(%1) \n" 1.621 + "movdqa %%xmm3,%%xmm7 \n" 1.622 + "palignr $0x8,%%xmm7,%%xmm7 \n" 1.623 + "movq %%xmm7,(%1,%4) \n" 1.624 + "lea (%1,%4,2),%1 \n" 1.625 + "punpckldq %%xmm12,%%xmm8 \n" 1.626 + "movq %%xmm8,(%1) \n" 1.627 + "movdqa %%xmm8,%%xmm12 \n" 1.628 + "palignr $0x8,%%xmm12,%%xmm12 \n" 1.629 + "movq %%xmm12,(%1,%4) \n" 1.630 + "lea (%1,%4,2),%1 \n" 1.631 + "punpckldq %%xmm14,%%xmm10 \n" 1.632 + "movdqa %%xmm10,%%xmm14 \n" 1.633 + "movq %%xmm10,(%1) \n" 1.634 + "palignr $0x8,%%xmm14,%%xmm14 \n" 1.635 + "punpckldq %%xmm13,%%xmm9 \n" 1.636 + "movq %%xmm14,(%1,%4) \n" 1.637 + "lea (%1,%4,2),%1 \n" 1.638 + "movdqa %%xmm9,%%xmm13 \n" 1.639 + "movq %%xmm9,(%1) \n" 1.640 + "palignr $0x8,%%xmm13,%%xmm13 \n" 1.641 + "movq %%xmm13,(%1,%4) \n" 1.642 + "lea (%1,%4,2),%1 \n" 1.643 + "punpckldq %%xmm15,%%xmm11 \n" 1.644 + "movq %%xmm11,(%1) \n" 1.645 + "movdqa %%xmm11,%%xmm15 \n" 1.646 + "palignr $0x8,%%xmm15,%%xmm15 \n" 1.647 + "sub $0x10,%2 \n" 1.648 + "movq %%xmm15,(%1,%4) \n" 1.649 + "lea (%1,%4,2),%1 \n" 1.650 + "jg 1b \n" 1.651 + : "+r"(src), // %0 1.652 + "+r"(dst), // %1 1.653 + "+r"(width) // %2 1.654 + : "r"((intptr_t)(src_stride)), // %3 1.655 + "r"((intptr_t)(dst_stride)) // %4 1.656 + : "memory", "cc", 1.657 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", 1.658 + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" 1.659 +); 1.660 +} 1.661 + 1.662 +#define HAS_TRANSPOSE_UVWX8_SSE2 1.663 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, 1.664 + uint8* dst_a, int dst_stride_a, 1.665 + uint8* dst_b, int dst_stride_b, 1.666 + int w) { 1.667 + asm volatile ( 1.668 + // Read in the data from the source pointer. 1.669 + // First round of bit swap. 1.670 + ".p2align 2 \n" 1.671 +"1: \n" 1.672 + "movdqa (%0),%%xmm0 \n" 1.673 + "movdqa (%0,%4),%%xmm1 \n" 1.674 + "lea (%0,%4,2),%0 \n" 1.675 + "movdqa %%xmm0,%%xmm8 \n" 1.676 + "punpcklbw %%xmm1,%%xmm0 \n" 1.677 + "punpckhbw %%xmm1,%%xmm8 \n" 1.678 + "movdqa %%xmm8,%%xmm1 \n" 1.679 + "movdqa (%0),%%xmm2 \n" 1.680 + "movdqa (%0,%4),%%xmm3 \n" 1.681 + "lea (%0,%4,2),%0 \n" 1.682 + "movdqa %%xmm2,%%xmm8 \n" 1.683 + "punpcklbw %%xmm3,%%xmm2 \n" 1.684 + "punpckhbw %%xmm3,%%xmm8 \n" 1.685 + "movdqa %%xmm8,%%xmm3 \n" 1.686 + "movdqa (%0),%%xmm4 \n" 1.687 + "movdqa (%0,%4),%%xmm5 \n" 1.688 + "lea (%0,%4,2),%0 \n" 1.689 + "movdqa %%xmm4,%%xmm8 \n" 1.690 + "punpcklbw %%xmm5,%%xmm4 \n" 1.691 + "punpckhbw %%xmm5,%%xmm8 \n" 1.692 + "movdqa %%xmm8,%%xmm5 \n" 1.693 + "movdqa (%0),%%xmm6 \n" 1.694 + "movdqa (%0,%4),%%xmm7 \n" 1.695 + "lea (%0,%4,2),%0 \n" 1.696 + "movdqa %%xmm6,%%xmm8 \n" 1.697 + "punpcklbw %%xmm7,%%xmm6 \n" 1.698 + "neg %4 \n" 1.699 + "lea 0x10(%0,%4,8),%0 \n" 1.700 + "punpckhbw %%xmm7,%%xmm8 \n" 1.701 + "movdqa %%xmm8,%%xmm7 \n" 1.702 + "neg %4 \n" 1.703 + // Second round of bit swap. 
1.704 + "movdqa %%xmm0,%%xmm8 \n" 1.705 + "movdqa %%xmm1,%%xmm9 \n" 1.706 + "punpckhwd %%xmm2,%%xmm8 \n" 1.707 + "punpckhwd %%xmm3,%%xmm9 \n" 1.708 + "punpcklwd %%xmm2,%%xmm0 \n" 1.709 + "punpcklwd %%xmm3,%%xmm1 \n" 1.710 + "movdqa %%xmm8,%%xmm2 \n" 1.711 + "movdqa %%xmm9,%%xmm3 \n" 1.712 + "movdqa %%xmm4,%%xmm8 \n" 1.713 + "movdqa %%xmm5,%%xmm9 \n" 1.714 + "punpckhwd %%xmm6,%%xmm8 \n" 1.715 + "punpckhwd %%xmm7,%%xmm9 \n" 1.716 + "punpcklwd %%xmm6,%%xmm4 \n" 1.717 + "punpcklwd %%xmm7,%%xmm5 \n" 1.718 + "movdqa %%xmm8,%%xmm6 \n" 1.719 + "movdqa %%xmm9,%%xmm7 \n" 1.720 + // Third round of bit swap. 1.721 + // Write to the destination pointer. 1.722 + "movdqa %%xmm0,%%xmm8 \n" 1.723 + "punpckldq %%xmm4,%%xmm0 \n" 1.724 + "movlpd %%xmm0,(%1) \n" // Write back U channel 1.725 + "movhpd %%xmm0,(%2) \n" // Write back V channel 1.726 + "punpckhdq %%xmm4,%%xmm8 \n" 1.727 + "movlpd %%xmm8,(%1,%5) \n" 1.728 + "lea (%1,%5,2),%1 \n" 1.729 + "movhpd %%xmm8,(%2,%6) \n" 1.730 + "lea (%2,%6,2),%2 \n" 1.731 + "movdqa %%xmm2,%%xmm8 \n" 1.732 + "punpckldq %%xmm6,%%xmm2 \n" 1.733 + "movlpd %%xmm2,(%1) \n" 1.734 + "movhpd %%xmm2,(%2) \n" 1.735 + "punpckhdq %%xmm6,%%xmm8 \n" 1.736 + "movlpd %%xmm8,(%1,%5) \n" 1.737 + "lea (%1,%5,2),%1 \n" 1.738 + "movhpd %%xmm8,(%2,%6) \n" 1.739 + "lea (%2,%6,2),%2 \n" 1.740 + "movdqa %%xmm1,%%xmm8 \n" 1.741 + "punpckldq %%xmm5,%%xmm1 \n" 1.742 + "movlpd %%xmm1,(%1) \n" 1.743 + "movhpd %%xmm1,(%2) \n" 1.744 + "punpckhdq %%xmm5,%%xmm8 \n" 1.745 + "movlpd %%xmm8,(%1,%5) \n" 1.746 + "lea (%1,%5,2),%1 \n" 1.747 + "movhpd %%xmm8,(%2,%6) \n" 1.748 + "lea (%2,%6,2),%2 \n" 1.749 + "movdqa %%xmm3,%%xmm8 \n" 1.750 + "punpckldq %%xmm7,%%xmm3 \n" 1.751 + "movlpd %%xmm3,(%1) \n" 1.752 + "movhpd %%xmm3,(%2) \n" 1.753 + "punpckhdq %%xmm7,%%xmm8 \n" 1.754 + "sub $0x8,%3 \n" 1.755 + "movlpd %%xmm8,(%1,%5) \n" 1.756 + "lea (%1,%5,2),%1 \n" 1.757 + "movhpd %%xmm8,(%2,%6) \n" 1.758 + "lea (%2,%6,2),%2 \n" 1.759 + "jg 1b \n" 1.760 + : "+r"(src), // %0 1.761 + "+r"(dst_a), // %1 1.762 + "+r"(dst_b), // %2 1.763 + "+r"(w) // %3 1.764 + : "r"((intptr_t)(src_stride)), // %4 1.765 + "r"((intptr_t)(dst_stride_a)), // %5 1.766 + "r"((intptr_t)(dst_stride_b)) // %6 1.767 + : "memory", "cc", 1.768 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", 1.769 + "xmm8", "xmm9" 1.770 +); 1.771 +} 1.772 +#endif 1.773 +#endif 1.774 + 1.775 +static void TransposeWx8_C(const uint8* src, int src_stride, 1.776 + uint8* dst, int dst_stride, 1.777 + int width) { 1.778 + int i; 1.779 + for (i = 0; i < width; ++i) { 1.780 + dst[0] = src[0 * src_stride]; 1.781 + dst[1] = src[1 * src_stride]; 1.782 + dst[2] = src[2 * src_stride]; 1.783 + dst[3] = src[3 * src_stride]; 1.784 + dst[4] = src[4 * src_stride]; 1.785 + dst[5] = src[5 * src_stride]; 1.786 + dst[6] = src[6 * src_stride]; 1.787 + dst[7] = src[7 * src_stride]; 1.788 + ++src; 1.789 + dst += dst_stride; 1.790 + } 1.791 +} 1.792 + 1.793 +static void TransposeWxH_C(const uint8* src, int src_stride, 1.794 + uint8* dst, int dst_stride, 1.795 + int width, int height) { 1.796 + int i; 1.797 + for (i = 0; i < width; ++i) { 1.798 + int j; 1.799 + for (j = 0; j < height; ++j) { 1.800 + dst[i * dst_stride + j] = src[j * src_stride + i]; 1.801 + } 1.802 + } 1.803 +} 1.804 + 1.805 +LIBYUV_API 1.806 +void TransposePlane(const uint8* src, int src_stride, 1.807 + uint8* dst, int dst_stride, 1.808 + int width, int height) { 1.809 + int i = height; 1.810 + void (*TransposeWx8)(const uint8* src, int src_stride, 1.811 + uint8* dst, int dst_stride, 1.812 + int width) = 
TransposeWx8_C; 1.813 +#if defined(HAS_TRANSPOSE_WX8_NEON) 1.814 + if (TestCpuFlag(kCpuHasNEON)) { 1.815 + TransposeWx8 = TransposeWx8_NEON; 1.816 + } 1.817 +#endif 1.818 +#if defined(HAS_TRANSPOSE_WX8_SSSE3) 1.819 + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { 1.820 + TransposeWx8 = TransposeWx8_SSSE3; 1.821 + } 1.822 +#endif 1.823 +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) 1.824 + if (TestCpuFlag(kCpuHasSSSE3) && 1.825 + IS_ALIGNED(width, 16) && 1.826 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 1.827 + TransposeWx8 = TransposeWx8_FAST_SSSE3; 1.828 + } 1.829 +#endif 1.830 +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) 1.831 + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { 1.832 + if (IS_ALIGNED(width, 4) && 1.833 + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 1.834 + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; 1.835 + } else { 1.836 + TransposeWx8 = TransposeWx8_MIPS_DSPR2; 1.837 + } 1.838 + } 1.839 +#endif 1.840 + 1.841 + // Work across the source in 8x8 tiles 1.842 + while (i >= 8) { 1.843 + TransposeWx8(src, src_stride, dst, dst_stride, width); 1.844 + src += 8 * src_stride; // Go down 8 rows. 1.845 + dst += 8; // Move over 8 columns. 1.846 + i -= 8; 1.847 + } 1.848 + 1.849 + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); 1.850 +} 1.851 + 1.852 +LIBYUV_API 1.853 +void RotatePlane90(const uint8* src, int src_stride, 1.854 + uint8* dst, int dst_stride, 1.855 + int width, int height) { 1.856 + // Rotate by 90 is a transpose with the source read 1.857 + // from bottom to top. So set the source pointer to the end 1.858 + // of the buffer and flip the sign of the source stride. 1.859 + src += src_stride * (height - 1); 1.860 + src_stride = -src_stride; 1.861 + TransposePlane(src, src_stride, dst, dst_stride, width, height); 1.862 +} 1.863 + 1.864 +LIBYUV_API 1.865 +void RotatePlane270(const uint8* src, int src_stride, 1.866 + uint8* dst, int dst_stride, 1.867 + int width, int height) { 1.868 + // Rotate by 270 is a transpose with the destination written 1.869 + // from bottom to top. So set the destination pointer to the end 1.870 + // of the buffer and flip the sign of the destination stride. 1.871 + dst += dst_stride * (width - 1); 1.872 + dst_stride = -dst_stride; 1.873 + TransposePlane(src, src_stride, dst, dst_stride, width, height); 1.874 +} 1.875 + 1.876 +LIBYUV_API 1.877 +void RotatePlane180(const uint8* src, int src_stride, 1.878 + uint8* dst, int dst_stride, 1.879 + int width, int height) { 1.880 + // Swap first and last row and mirror the content. Uses a temporary row. 
1.881 + align_buffer_64(row, width); 1.882 + const uint8* src_bot = src + src_stride * (height - 1); 1.883 + uint8* dst_bot = dst + dst_stride * (height - 1); 1.884 + int half_height = (height + 1) >> 1; 1.885 + int y; 1.886 + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; 1.887 + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; 1.888 +#if defined(HAS_MIRRORROW_NEON) 1.889 + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { 1.890 + MirrorRow = MirrorRow_NEON; 1.891 + } 1.892 +#endif 1.893 +#if defined(HAS_MIRRORROW_SSE2) 1.894 + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && 1.895 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 1.896 + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 1.897 + MirrorRow = MirrorRow_SSE2; 1.898 + } 1.899 +#endif 1.900 +#if defined(HAS_MIRRORROW_SSSE3) 1.901 + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && 1.902 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 1.903 + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 1.904 + MirrorRow = MirrorRow_SSSE3; 1.905 + } 1.906 +#endif 1.907 +#if defined(HAS_MIRRORROW_AVX2) 1.908 + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { 1.909 + MirrorRow = MirrorRow_AVX2; 1.910 + } 1.911 +#endif 1.912 +#if defined(HAS_MIRRORROW_MIPS_DSPR2) 1.913 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && 1.914 + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && 1.915 + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { 1.916 + MirrorRow = MirrorRow_MIPS_DSPR2; 1.917 + } 1.918 +#endif 1.919 +#if defined(HAS_COPYROW_NEON) 1.920 + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { 1.921 + CopyRow = CopyRow_NEON; 1.922 + } 1.923 +#endif 1.924 +#if defined(HAS_COPYROW_X86) 1.925 + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { 1.926 + CopyRow = CopyRow_X86; 1.927 + } 1.928 +#endif 1.929 +#if defined(HAS_COPYROW_SSE2) 1.930 + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && 1.931 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 1.932 + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 1.933 + CopyRow = CopyRow_SSE2; 1.934 + } 1.935 +#endif 1.936 +#if defined(HAS_COPYROW_ERMS) 1.937 + if (TestCpuFlag(kCpuHasERMS)) { 1.938 + CopyRow = CopyRow_ERMS; 1.939 + } 1.940 +#endif 1.941 +#if defined(HAS_COPYROW_MIPS) 1.942 + if (TestCpuFlag(kCpuHasMIPS)) { 1.943 + CopyRow = CopyRow_MIPS; 1.944 + } 1.945 +#endif 1.946 + 1.947 + // Odd height will harmlessly mirror the middle row twice. 
1.948 + for (y = 0; y < half_height; ++y) { 1.949 + MirrorRow(src, row, width); // Mirror first row into a buffer 1.950 + src += src_stride; 1.951 + MirrorRow(src_bot, dst, width); // Mirror last row into first row 1.952 + dst += dst_stride; 1.953 + CopyRow(row, dst_bot, width); // Copy first mirrored row into last 1.954 + src_bot -= src_stride; 1.955 + dst_bot -= dst_stride; 1.956 + } 1.957 + free_aligned_buffer_64(row); 1.958 +} 1.959 + 1.960 +static void TransposeUVWx8_C(const uint8* src, int src_stride, 1.961 + uint8* dst_a, int dst_stride_a, 1.962 + uint8* dst_b, int dst_stride_b, 1.963 + int width) { 1.964 + int i; 1.965 + for (i = 0; i < width; ++i) { 1.966 + dst_a[0] = src[0 * src_stride + 0]; 1.967 + dst_b[0] = src[0 * src_stride + 1]; 1.968 + dst_a[1] = src[1 * src_stride + 0]; 1.969 + dst_b[1] = src[1 * src_stride + 1]; 1.970 + dst_a[2] = src[2 * src_stride + 0]; 1.971 + dst_b[2] = src[2 * src_stride + 1]; 1.972 + dst_a[3] = src[3 * src_stride + 0]; 1.973 + dst_b[3] = src[3 * src_stride + 1]; 1.974 + dst_a[4] = src[4 * src_stride + 0]; 1.975 + dst_b[4] = src[4 * src_stride + 1]; 1.976 + dst_a[5] = src[5 * src_stride + 0]; 1.977 + dst_b[5] = src[5 * src_stride + 1]; 1.978 + dst_a[6] = src[6 * src_stride + 0]; 1.979 + dst_b[6] = src[6 * src_stride + 1]; 1.980 + dst_a[7] = src[7 * src_stride + 0]; 1.981 + dst_b[7] = src[7 * src_stride + 1]; 1.982 + src += 2; 1.983 + dst_a += dst_stride_a; 1.984 + dst_b += dst_stride_b; 1.985 + } 1.986 +} 1.987 + 1.988 +static void TransposeUVWxH_C(const uint8* src, int src_stride, 1.989 + uint8* dst_a, int dst_stride_a, 1.990 + uint8* dst_b, int dst_stride_b, 1.991 + int width, int height) { 1.992 + int i; 1.993 + for (i = 0; i < width * 2; i += 2) { 1.994 + int j; 1.995 + for (j = 0; j < height; ++j) { 1.996 + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; 1.997 + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; 1.998 + } 1.999 + } 1.1000 +} 1.1001 + 1.1002 +LIBYUV_API 1.1003 +void TransposeUV(const uint8* src, int src_stride, 1.1004 + uint8* dst_a, int dst_stride_a, 1.1005 + uint8* dst_b, int dst_stride_b, 1.1006 + int width, int height) { 1.1007 + int i = height; 1.1008 + void (*TransposeUVWx8)(const uint8* src, int src_stride, 1.1009 + uint8* dst_a, int dst_stride_a, 1.1010 + uint8* dst_b, int dst_stride_b, 1.1011 + int width) = TransposeUVWx8_C; 1.1012 +#if defined(HAS_TRANSPOSE_UVWX8_NEON) 1.1013 + if (TestCpuFlag(kCpuHasNEON)) { 1.1014 + TransposeUVWx8 = TransposeUVWx8_NEON; 1.1015 + } 1.1016 +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) 1.1017 + if (TestCpuFlag(kCpuHasSSE2) && 1.1018 + IS_ALIGNED(width, 8) && 1.1019 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 1.1020 + TransposeUVWx8 = TransposeUVWx8_SSE2; 1.1021 + } 1.1022 +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) 1.1023 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && 1.1024 + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 1.1025 + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; 1.1026 + } 1.1027 +#endif 1.1028 + 1.1029 + // Work through the source in 8x8 tiles. 1.1030 + while (i >= 8) { 1.1031 + TransposeUVWx8(src, src_stride, 1.1032 + dst_a, dst_stride_a, 1.1033 + dst_b, dst_stride_b, 1.1034 + width); 1.1035 + src += 8 * src_stride; // Go down 8 rows. 1.1036 + dst_a += 8; // Move over 8 columns. 1.1037 + dst_b += 8; // Move over 8 columns. 
1.1038 + i -= 8; 1.1039 + } 1.1040 + 1.1041 + TransposeUVWxH_C(src, src_stride, 1.1042 + dst_a, dst_stride_a, 1.1043 + dst_b, dst_stride_b, 1.1044 + width, i); 1.1045 +} 1.1046 + 1.1047 +LIBYUV_API 1.1048 +void RotateUV90(const uint8* src, int src_stride, 1.1049 + uint8* dst_a, int dst_stride_a, 1.1050 + uint8* dst_b, int dst_stride_b, 1.1051 + int width, int height) { 1.1052 + src += src_stride * (height - 1); 1.1053 + src_stride = -src_stride; 1.1054 + 1.1055 + TransposeUV(src, src_stride, 1.1056 + dst_a, dst_stride_a, 1.1057 + dst_b, dst_stride_b, 1.1058 + width, height); 1.1059 +} 1.1060 + 1.1061 +LIBYUV_API 1.1062 +void RotateUV270(const uint8* src, int src_stride, 1.1063 + uint8* dst_a, int dst_stride_a, 1.1064 + uint8* dst_b, int dst_stride_b, 1.1065 + int width, int height) { 1.1066 + dst_a += dst_stride_a * (width - 1); 1.1067 + dst_b += dst_stride_b * (width - 1); 1.1068 + dst_stride_a = -dst_stride_a; 1.1069 + dst_stride_b = -dst_stride_b; 1.1070 + 1.1071 + TransposeUV(src, src_stride, 1.1072 + dst_a, dst_stride_a, 1.1073 + dst_b, dst_stride_b, 1.1074 + width, height); 1.1075 +} 1.1076 + 1.1077 +// Rotate 180 is a horizontal and vertical flip. 1.1078 +LIBYUV_API 1.1079 +void RotateUV180(const uint8* src, int src_stride, 1.1080 + uint8* dst_a, int dst_stride_a, 1.1081 + uint8* dst_b, int dst_stride_b, 1.1082 + int width, int height) { 1.1083 + int i; 1.1084 + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = 1.1085 + MirrorUVRow_C; 1.1086 +#if defined(HAS_MIRRORUVROW_NEON) 1.1087 + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { 1.1088 + MirrorRowUV = MirrorUVRow_NEON; 1.1089 + } 1.1090 +#elif defined(HAS_MIRRORROW_UV_SSSE3) 1.1091 + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && 1.1092 + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 1.1093 + MirrorRowUV = MirrorUVRow_SSSE3; 1.1094 + } 1.1095 +#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) 1.1096 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && 1.1097 + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 1.1098 + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; 1.1099 + } 1.1100 +#endif 1.1101 + 1.1102 + dst_a += dst_stride_a * (height - 1); 1.1103 + dst_b += dst_stride_b * (height - 1); 1.1104 + 1.1105 + for (i = 0; i < height; ++i) { 1.1106 + MirrorRowUV(src, dst_a, dst_b, width); 1.1107 + src += src_stride; 1.1108 + dst_a -= dst_stride_a; 1.1109 + dst_b -= dst_stride_b; 1.1110 + } 1.1111 +} 1.1112 + 1.1113 +LIBYUV_API 1.1114 +int RotatePlane(const uint8* src, int src_stride, 1.1115 + uint8* dst, int dst_stride, 1.1116 + int width, int height, 1.1117 + enum RotationMode mode) { 1.1118 + if (!src || width <= 0 || height == 0 || !dst) { 1.1119 + return -1; 1.1120 + } 1.1121 + 1.1122 + // Negative height means invert the image. 
1.1123 + if (height < 0) { 1.1124 + height = -height; 1.1125 + src = src + (height - 1) * src_stride; 1.1126 + src_stride = -src_stride; 1.1127 + } 1.1128 + 1.1129 + switch (mode) { 1.1130 + case kRotate0: 1.1131 + // copy frame 1.1132 + CopyPlane(src, src_stride, 1.1133 + dst, dst_stride, 1.1134 + width, height); 1.1135 + return 0; 1.1136 + case kRotate90: 1.1137 + RotatePlane90(src, src_stride, 1.1138 + dst, dst_stride, 1.1139 + width, height); 1.1140 + return 0; 1.1141 + case kRotate270: 1.1142 + RotatePlane270(src, src_stride, 1.1143 + dst, dst_stride, 1.1144 + width, height); 1.1145 + return 0; 1.1146 + case kRotate180: 1.1147 + RotatePlane180(src, src_stride, 1.1148 + dst, dst_stride, 1.1149 + width, height); 1.1150 + return 0; 1.1151 + default: 1.1152 + break; 1.1153 + } 1.1154 + return -1; 1.1155 +} 1.1156 + 1.1157 +LIBYUV_API 1.1158 +int I420Rotate(const uint8* src_y, int src_stride_y, 1.1159 + const uint8* src_u, int src_stride_u, 1.1160 + const uint8* src_v, int src_stride_v, 1.1161 + uint8* dst_y, int dst_stride_y, 1.1162 + uint8* dst_u, int dst_stride_u, 1.1163 + uint8* dst_v, int dst_stride_v, 1.1164 + int width, int height, 1.1165 + enum RotationMode mode) { 1.1166 + int halfwidth = (width + 1) >> 1; 1.1167 + int halfheight = (height + 1) >> 1; 1.1168 + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || 1.1169 + !dst_y || !dst_u || !dst_v) { 1.1170 + return -1; 1.1171 + } 1.1172 + 1.1173 + // Negative height means invert the image. 1.1174 + if (height < 0) { 1.1175 + height = -height; 1.1176 + halfheight = (height + 1) >> 1; 1.1177 + src_y = src_y + (height - 1) * src_stride_y; 1.1178 + src_u = src_u + (halfheight - 1) * src_stride_u; 1.1179 + src_v = src_v + (halfheight - 1) * src_stride_v; 1.1180 + src_stride_y = -src_stride_y; 1.1181 + src_stride_u = -src_stride_u; 1.1182 + src_stride_v = -src_stride_v; 1.1183 + } 1.1184 + 1.1185 + switch (mode) { 1.1186 + case kRotate0: 1.1187 + // copy frame 1.1188 + return I420Copy(src_y, src_stride_y, 1.1189 + src_u, src_stride_u, 1.1190 + src_v, src_stride_v, 1.1191 + dst_y, dst_stride_y, 1.1192 + dst_u, dst_stride_u, 1.1193 + dst_v, dst_stride_v, 1.1194 + width, height); 1.1195 + case kRotate90: 1.1196 + RotatePlane90(src_y, src_stride_y, 1.1197 + dst_y, dst_stride_y, 1.1198 + width, height); 1.1199 + RotatePlane90(src_u, src_stride_u, 1.1200 + dst_u, dst_stride_u, 1.1201 + halfwidth, halfheight); 1.1202 + RotatePlane90(src_v, src_stride_v, 1.1203 + dst_v, dst_stride_v, 1.1204 + halfwidth, halfheight); 1.1205 + return 0; 1.1206 + case kRotate270: 1.1207 + RotatePlane270(src_y, src_stride_y, 1.1208 + dst_y, dst_stride_y, 1.1209 + width, height); 1.1210 + RotatePlane270(src_u, src_stride_u, 1.1211 + dst_u, dst_stride_u, 1.1212 + halfwidth, halfheight); 1.1213 + RotatePlane270(src_v, src_stride_v, 1.1214 + dst_v, dst_stride_v, 1.1215 + halfwidth, halfheight); 1.1216 + return 0; 1.1217 + case kRotate180: 1.1218 + RotatePlane180(src_y, src_stride_y, 1.1219 + dst_y, dst_stride_y, 1.1220 + width, height); 1.1221 + RotatePlane180(src_u, src_stride_u, 1.1222 + dst_u, dst_stride_u, 1.1223 + halfwidth, halfheight); 1.1224 + RotatePlane180(src_v, src_stride_v, 1.1225 + dst_v, dst_stride_v, 1.1226 + halfwidth, halfheight); 1.1227 + return 0; 1.1228 + default: 1.1229 + break; 1.1230 + } 1.1231 + return -1; 1.1232 +} 1.1233 + 1.1234 +LIBYUV_API 1.1235 +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, 1.1236 + const uint8* src_uv, int src_stride_uv, 1.1237 + uint8* dst_y, int dst_stride_y, 1.1238 + uint8* dst_u, int 
dst_stride_u, 1.1239 + uint8* dst_v, int dst_stride_v, 1.1240 + int width, int height, 1.1241 + enum RotationMode mode) { 1.1242 + int halfwidth = (width + 1) >> 1; 1.1243 + int halfheight = (height + 1) >> 1; 1.1244 + if (!src_y || !src_uv || width <= 0 || height == 0 || 1.1245 + !dst_y || !dst_u || !dst_v) { 1.1246 + return -1; 1.1247 + } 1.1248 + 1.1249 + // Negative height means invert the image. 1.1250 + if (height < 0) { 1.1251 + height = -height; 1.1252 + halfheight = (height + 1) >> 1; 1.1253 + src_y = src_y + (height - 1) * src_stride_y; 1.1254 + src_uv = src_uv + (halfheight - 1) * src_stride_uv; 1.1255 + src_stride_y = -src_stride_y; 1.1256 + src_stride_uv = -src_stride_uv; 1.1257 + } 1.1258 + 1.1259 + switch (mode) { 1.1260 + case kRotate0: 1.1261 + // copy frame 1.1262 + return NV12ToI420(src_y, src_stride_y, 1.1263 + src_uv, src_stride_uv, 1.1264 + dst_y, dst_stride_y, 1.1265 + dst_u, dst_stride_u, 1.1266 + dst_v, dst_stride_v, 1.1267 + width, height); 1.1268 + case kRotate90: 1.1269 + RotatePlane90(src_y, src_stride_y, 1.1270 + dst_y, dst_stride_y, 1.1271 + width, height); 1.1272 + RotateUV90(src_uv, src_stride_uv, 1.1273 + dst_u, dst_stride_u, 1.1274 + dst_v, dst_stride_v, 1.1275 + halfwidth, halfheight); 1.1276 + return 0; 1.1277 + case kRotate270: 1.1278 + RotatePlane270(src_y, src_stride_y, 1.1279 + dst_y, dst_stride_y, 1.1280 + width, height); 1.1281 + RotateUV270(src_uv, src_stride_uv, 1.1282 + dst_u, dst_stride_u, 1.1283 + dst_v, dst_stride_v, 1.1284 + halfwidth, halfheight); 1.1285 + return 0; 1.1286 + case kRotate180: 1.1287 + RotatePlane180(src_y, src_stride_y, 1.1288 + dst_y, dst_stride_y, 1.1289 + width, height); 1.1290 + RotateUV180(src_uv, src_stride_uv, 1.1291 + dst_u, dst_stride_u, 1.1292 + dst_v, dst_stride_v, 1.1293 + halfwidth, halfheight); 1.1294 + return 0; 1.1295 + default: 1.1296 + break; 1.1297 + } 1.1298 + return -1; 1.1299 +} 1.1300 + 1.1301 +#ifdef __cplusplus 1.1302 +} // extern "C" 1.1303 +} // namespace libyuv 1.1304 +#endif
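
The rotation entry points above all reduce to a transpose plus a stride trick: RotatePlane90 points src at the last row and negates src_stride so the transpose reads bottom-to-top, RotatePlane270 applies the same trick on the destination side, and RotatePlane180 mirrors each row while swapping top and bottom rows. A minimal scalar sketch of the 90/270 cases, using a naive transpose in place of the SIMD TransposeWx8 kernels (the *_Naive names are illustrative only, not part of libyuv; uint8 is libyuv's typedef from basic_types.h):

    // Naive byte-plane transpose: dst(row i, col j) = src(row j, col i).
    static void TransposeNaive(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride,
                               int width, int height) {
      for (int i = 0; i < width; ++i) {
        for (int j = 0; j < height; ++j) {
          dst[i * dst_stride + j] = src[j * src_stride + i];
        }
      }
    }

    // 90 degrees: transpose while reading the source bottom-to-top.
    static void Rotate90Naive(const uint8* src, int src_stride,
                              uint8* dst, int dst_stride,
                              int width, int height) {
      src += src_stride * (height - 1);   // start at the last source row
      TransposeNaive(src, -src_stride, dst, dst_stride, width, height);
    }

    // 270 degrees: transpose while writing the destination bottom-to-top.
    static void Rotate270Naive(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride,
                               int width, int height) {
      dst += dst_stride * (width - 1);    // start at the last destination row
      TransposeNaive(src, src_stride, dst, -dst_stride, width, height);
    }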
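
For the 180-degree chroma path, RotateUV180 relies on the MirrorUVRow_* kernels to mirror and de-interleave in one pass: each interleaved UV row is walked backwards while U and V bytes are written into separate planes, and rows are emitted bottom-up. A scalar sketch of that row operation (equivalent in spirit to MirrorUVRow_C, which lives in the row_* sources rather than in this file):

    // Mirror one interleaved UV row into separate U and V rows.
    static void MirrorUVRowNaive(const uint8* src_uv,
                                 uint8* dst_u, uint8* dst_v, int width) {
      src_uv += (width - 1) * 2;          // start at the last UV pair
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[0];             // U byte
        dst_v[x] = src_uv[1];             // V byte
        src_uv -= 2;                      // walk the source backwards
      }
    }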
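
Finally, a usage sketch for the public I420Rotate entry point defined above. Rotating by 90 or 270 degrees swaps the output dimensions, so the destination luma stride becomes the source height and the chroma planes are rotated at halfwidth x halfheight, matching the values computed inside I420Rotate. The contiguous-buffer layout and the RotateI420Frame90 name below are illustrative assumptions, not part of this patch:

    #include "libyuv/rotate.h"

    // Rotate a contiguous width x height I420 frame 90 degrees clockwise.
    // The destination buffer holds a height x width I420 frame (same size).
    int RotateI420Frame90(const uint8* src, uint8* dst, int width, int height) {
      int halfwidth = (width + 1) >> 1;
      int halfheight = (height + 1) >> 1;
      const uint8* src_y = src;                            // assumed layout: Y, U, V
      const uint8* src_u = src_y + width * height;
      const uint8* src_v = src_u + halfwidth * halfheight;
      uint8* dst_y = dst;
      uint8* dst_u = dst_y + width * height;
      uint8* dst_v = dst_u + halfwidth * halfheight;
      // Destination strides use the source height / halfheight because the
      // rotated planes are height pixels wide.
      return libyuv::I420Rotate(src_y, width, src_u, halfwidth, src_v, halfwidth,
                                dst_y, height, dst_u, halfheight, dst_v, halfheight,
                                width, height, libyuv::kRotate90);
    }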